Example #1
0
int CORE_dswptr_ontile(PLASMA_desc descA, int i1, int i2, const int *ipiv, int inc,
                       const double *Akk, int ldak)
{
    double zone  = 1.0;
    int lda;
    int m = descA.mt == 1 ? descA.m : descA.mb;

    if ( descA.nt > 1 ) {
        coreblas_error(1, "Illegal value of descA.nt");
        return -1;
    }
    if ( i1 < 1 ) {
        coreblas_error(2, "Illegal value of i1");
        return -2;
    }
    if ( (i2 < i1) || (i2 > m) ) {
        coreblas_error(3, "Illegal value of i2");
        return -3;
    }

    CORE_dlaswp_ontile(descA, i1, i2, ipiv, inc);

    lda = BLKLDD(descA, 0);
    cblas_dtrsm( CblasColMajor, CblasLeft, CblasLower,
                 CblasNoTrans, CblasUnit,
                 m, descA.n, (zone),
                 Akk,     ldak,
                 A(0, 0), lda );

    return PLASMA_SUCCESS;
}
int CORE_zgetrf_incpiv(int M, int N, int IB,
                       PLASMA_Complex64_t *A, int LDA,
                       int *IPIV, int *INFO)
{
    int i, j, k, sb;
    int iinfo;

    /* Check input arguments */
    *INFO = 0;
    if (M < 0) {
        coreblas_error(1, "Illegal value of M");
        return -1;
    }
    if (N < 0) {
        coreblas_error(2, "Illegal value of N");
        return -2;
    }
    if (IB < 0) {
        coreblas_error(3, "Illegal value of IB");
        return -3;
    }
    if ((LDA < max(1,M)) && (M > 0)) {
        coreblas_error(5, "Illegal value of LDA");
        return -5;
    }

    /* Quick return */
    if ((M == 0) || (N == 0) || (IB == 0))
        return PLASMA_SUCCESS;

    k = min(M, N);

    for(i =0 ; i < k; i += IB) {
        sb = min(IB, k-i);
        /*
         * Factor diagonal and subdiagonal blocks and test for exact singularity.
         */
        iinfo = LAPACKE_zgetf2_work(LAPACK_COL_MAJOR, M-i, sb, &A[LDA*i+i], LDA, &IPIV[i]);
        /*
         * Adjust INFO and the pivot indices.
         */
        if((*INFO == 0) && (iinfo > 0))
            *INFO = iinfo + i;

        if (i+sb < N) {
            CORE_zgessm(
                M-i, N-(i+sb), sb, sb,
                &IPIV[i],
                &A[LDA*i+i], LDA,
                &A[LDA*(i+sb)+i], LDA);
        }

        for(j = i; j < i+sb; j++) {
            IPIV[j] = i + IPIV[j];
        }
    }
    return PLASMA_SUCCESS;
}
int CORE_cgetrf_reclap(int M, int N, 
                       PLASMA_Complex32_t *A, int LDA, 
                       int *IPIV, int *info)
{
    int thidx = info[1];
    int thcnt = min( info[2], M / N );
    int minMN = min(M, N);

    if( M < 0 ) {
        coreblas_error(1, "illegal value of M");
        return -1;
    }
    if( N < 0 ) {
        coreblas_error(2, "illegal value of N");
        return -2;
    }
    if( LDA < max(1, M) ) {
        coreblas_error(5, "illegal value of LDA");
        return -5;
    }
    
    /*
     * Quick return
     */
    if ( (M == 0) || (N == 0) || (thidx >= thcnt) ){
      return PLASMA_SUCCESS;
    }

    *info = 0;
    CORE_cgetrf_reclap_rec( M, minMN, A, LDA, IPIV, info, 
                            thidx, thcnt, 0 );

    if ( N > minMN ) {
        CORE_cgetrf_reclap_update(M, 0, minMN, N-minMN,
                                  A, LDA, IPIV, 
                                  thidx, thcnt);
    }

    return info[0];
}
Example #4
0
int CORE_dgetrf_rectil(const PLASMA_desc A, int *IPIV, int *info)
{
    int ft, lt;
    int thidx = info[1];
    int thcnt = min( info[2], A.mt );
    int minMN = min( A.m, A.n );
    double pivot;

    info[0] = 0;
    info[2] = thcnt;

    if ( A.nt > 1 ) {
        coreblas_error(1, "Illegal value of A.nt");
        info[0] = -1;
        return -1;
    }

    if ( thidx >= thcnt )
      return 0;

    int q = A.mt / thcnt;
    int r = A.mt % thcnt;

    if (thidx < r) {
        q++;
        ft = thidx * q;
        lt = ft + q;
    } else {
        ft = r * (q + 1) + (thidx - r) * q;
        lt = ft + q;
        lt = min( lt, A.mt );
    }

    CORE_dgetrf_rectil_rec( A, IPIV, info, &pivot,
                            thidx, thcnt, 0, minMN, ft, lt);

    if ( A.n > minMN ) {
        CORE_dgetrf_rectil_update( A, IPIV,
                                   0, minMN, A.n-minMN,
                                   thidx, thcnt,
                                   ft, lt);
    }

    return info[0];
}
int CORE_ztsmqr_hetra1( int side, int trans,
                        int m1, int n1, int m2, int n2,
                        int k, int ib,
                        PLASMA_Complex64_t *A1, int lda1,
                        PLASMA_Complex64_t *A2, int lda2,
                        PLASMA_Complex64_t *V, int ldv,
                        PLASMA_Complex64_t *T, int ldt,
                        PLASMA_Complex64_t *WORK, int ldwork)
{
    int i, j;

    if ( (m1 != n1) ) {
        coreblas_error(3, "Illegal value of M1, N1");
        return -3;
    }

    /* in-place transposition of A1 */
    for (j = 0; j < n1; j++){
        A1[j + j*lda1] = conj(A1[j + j*lda1]);

        for (i = j+1; i < m1; i++){
            *WORK = *(A1 + i + j*lda1);
            *(A1 + i + j*lda1) = conj(*(A1 + j + i*lda1));
            *(A1 + j + i*lda1) = conj(*WORK);
        }
    }

    CORE_ztsmqr(side, trans, m1, n1, m2, n2, k, ib, A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork);

    /* in-place transposition of A1 */
    for (j = 0; j < n1; j++){
        A1[j + j*lda1] = conj(A1[j + j*lda1]);

        for (i = j+1; i < m1; i++){
            *WORK = *(A1 + i + j*lda1);
            *(A1 + i + j*lda1) = conj(*(A1 + j + i*lda1));
            *(A1 + j + i*lda1) = conj(*WORK);
        }
    }

    return PLASMA_SUCCESS;
}
Example #6
0
int 
CORE_chbelr(int uplo, int N,
            PLASMA_desc *A,
            PLASMA_Complex32_t *V,
            PLASMA_Complex32_t *TAU,
            int st,
            int ed,
            int eltsize)
{
    int    NB, J1, J2;
    int    len1, len2, t1ed, t2st;
    int    i;
    static PLASMA_Complex32_t zzero = 0.0;
    PLASMA_desc vA=*A;


    /* Check input arguments */
    if (N < 0) {
        coreblas_error(2, "Illegal value of N");
        return -2;
    }
    if (ed <= st) {
        coreblas_error(23, "Illegal value of st and ed (internal)");
        return -23;
    }

    /* Quick return */
    if (N == 0)
        return PLASMA_SUCCESS;

    NB = A->mb;
    if( uplo == PlasmaLower ) {
       /* ========================
        *     LOWER CASE
        * ========================*/
       for (i = ed; i >= st+1 ; i--){
          /* generate Householder to annihilate a(i+k-1,i) within the band */
          *V(i)          = *A(i, (st-1));
          *A(i, (st-1))  = zzero;
          LAPACKE_clarfg_work( 2, A((i-1),(st-1)), V(i), 1, TAU(i));

          /* apply reflector from the left (horizontal row) and from the right for only the diagonal 2x2.*/
          J1    = st;
          J2    = i-2;
          t1ed  = (J2/NB)*NB;
          t2st  = max(t1ed+1,J1);
          len1  = t1ed-J1+1; /* can be negative */
          len2  = J2-t2st+1;
          if(len1>0)CORE_clarfx2(PlasmaLeft, len1 , *V(i), conjf(*TAU(i)), A(i-1, J1  ), ELTLDD(vA, i-1),  A(i,  J1 ), ELTLDD(vA, i) );
          if(len2>0)CORE_clarfx2(PlasmaLeft, len2 , *V(i), conjf(*TAU(i)), A(i-1, t2st), ELTLDD(vA, i-1),  A(i, t2st), ELTLDD(vA, i) );
          CORE_clarfx2c(PlasmaLower, *V(i), *TAU(i), A(i-1, i-1), A(i, i-1), A(i, i)); 
       }
       /* APPLY RIGHT ON THE REMAINING ELEMENT OF KERNEL 1 */
       for (i = ed; i >= st+1 ; i--){
          J1    = i+1;
          J2    = min(ed,N);
          t1ed  = (J2/NB)*NB;
          t2st  = max(t1ed+1,J1);
          len1  = t1ed-J1+1; /* can be negative */
          len2  = J2-t2st+1;
          if(len1>0)CORE_clarfx2(PlasmaRight, len1, *V(i), *TAU(i), A(J1,  i-1), ELTLDD(vA, J1)  , A(J1  , i), ELTLDD(vA, J1)   );
          if(len2>0)CORE_clarfx2(PlasmaRight, len2, *V(i), *TAU(i), A(t2st,i-1), ELTLDD(vA, t2st), A(t2st, i), ELTLDD(vA, t2st) );
       }
    }else{
       /* ========================
        *       UPPER CASE
        * ========================*/
       for (i = ed; i >= st+1 ; i--){
          /* generate Householder to annihilate a(i+k-1,i) within the band*/
          *V(i)          = *A((st-1),  i);
          *A((st-1),  i) = zzero;
          LAPACKE_clarfg_work( 2, A(st-1, i-1), V(i), 1, TAU(i));

          /* apply reflector from the left (horizontal row) and from the right for only the diagonal 2x2.*/
          J1    = st;
          J2    = i-2;
          t1ed  = (J2/NB)*NB;
          t2st  = max(t1ed+1,J1);
          len1  = t1ed-J1+1; /* can be negative */
          len2  = J2-t2st+1;
          if(len1>0)CORE_clarfx2(PlasmaRight, len1, conjf(*V(i)), conjf(*TAU(i)), A(J1,  i-1), ELTLDD(vA, J1)  , A(J1  , i), ELTLDD(vA, J1)   );
          if(len2>0)CORE_clarfx2(PlasmaRight, len2, conjf(*V(i)), conjf(*TAU(i)), A(t2st,i-1), ELTLDD(vA, t2st), A(t2st, i), ELTLDD(vA, t2st) );
          CORE_clarfx2c(PlasmaUpper, *V(i), *TAU(i), A(i-1, i-1), A(i-1, i), A(i,i));
       }
       /* APPLY LEFT ON THE REMAINING ELEMENT OF KERNEL 1 */
       for (i = ed; i >= st+1 ; i--){
          J1    = i+1;
          J2    = min(ed,N);
          t1ed  = (J2/NB)*NB;
          t2st  = max(t1ed+1,J1);
          len1  = t1ed-J1+1; /* can be negative */
          len2  = J2-t2st+1;
          if(len1>0)CORE_clarfx2(PlasmaLeft, len1 , conjf(*V(i)), *TAU(i), A(i-1, J1  ), ELTLDD(vA, i-1),  A(i,  J1 ), ELTLDD(vA, i) );
          if(len2>0)CORE_clarfx2(PlasmaLeft, len2 , conjf(*V(i)), *TAU(i), A(i-1, t2st), ELTLDD(vA, i-1),  A(i, t2st), ELTLDD(vA, i) );
       }
    }  /* end of else for the upper case */

    return PLASMA_SUCCESS;

}
Example #7
0
int CORE_zgeqrt(int M, int N, int IB,
                PLASMA_Complex64_t *A, int LDA,
                PLASMA_Complex64_t *T, int LDT,
                PLASMA_Complex64_t *TAU,
                PLASMA_Complex64_t *WORK)
{
    int i, k, sb;

    /* Check input arguments */
    if (M < 0) {
        coreblas_error(1, "Illegal value of M");
        return -1;
    }
    if (N < 0) {
        coreblas_error(2, "Illegal value of N");
        return -2;
    }
    if ((IB < 0) || ( (IB == 0) && ((M > 0) && (N > 0)) )) {
        coreblas_error(3, "Illegal value of IB");
        return -3;
    }
    if ((LDA < max(1,M)) && (M > 0)) {
        coreblas_error(5, "Illegal value of LDA");
        return -5;
    }
    if ((LDT < max(1,IB)) && (IB > 0)) {
        coreblas_error(7, "Illegal value of LDT");
        return -7;
    }

    /* Quick return */
    if ((M == 0) || (N == 0) || (IB == 0))
        return PLASMA_SUCCESS;

    k = min(M, N);

    for(i = 0; i < k; i += IB) {
        sb = min(IB, k-i);

        LAPACKE_zgeqr2_work(LAPACK_COL_MAJOR, M-i, sb,
                            &A[LDA*i+i], LDA, &TAU[i], WORK);

        LAPACKE_zlarft_work(LAPACK_COL_MAJOR,
            lapack_const(PlasmaForward),
            lapack_const(PlasmaColumnwise),
            M-i, sb,
            &A[LDA*i+i], LDA, &TAU[i],
            &T[LDT*i], LDT);

        if (N > i+sb) {
            LAPACKE_zlarfb_work(
                LAPACK_COL_MAJOR,
                lapack_const(PlasmaLeft),
                lapack_const(PlasmaConjTrans),
                lapack_const(PlasmaForward),
                lapack_const(PlasmaColumnwise),
                M-i, N-i-sb, sb,
                &A[LDA*i+i],      LDA,
                &T[LDT*i],        LDT,
                &A[LDA*(i+sb)+i], LDA,
                WORK, N-i-sb);
        }
    }
    return PLASMA_SUCCESS;
}
Example #8
0
int CORE_dlaswp_ontile(PLASMA_desc descA, int i1, int i2, const int *ipiv, int inc)
{
    int i, j, ip, it;
    double *A1;
    int lda1, lda2;

    /* Change i1 to C notation */
    i1--;

    /* Check parameters */
    if ( descA.nt > 1 ) {
        coreblas_error(1, "Illegal value of descA.nt");
        return -1;
    }
    if ( i1 < 0 ) {
        coreblas_error(2, "Illegal value of i1");
        return -2;
    }
    if ( (i2 <= i1) || (i2 > descA.m) ) {
        coreblas_error(3, "Illegal value of i2");
        return -3;
    }
    if ( ! ( (i2 - i1 - i1%descA.mb -1) < descA.mb ) ) {
        coreblas_error(2, "Illegal value of i1,i2. They have to be part of the same block.");
        return -3;
    }

    if (inc > 0) {
        it = i1 / descA.mb;
        A1 = A(it, 0);
        lda1 = BLKLDD(descA, 0);

        for (j = i1; j < i2; ++j, ipiv+=inc) {
            ip = (*ipiv) - descA.i - 1;
            if ( ip != j )
            {
                it = ip / descA.mb;
                i  = ip % descA.mb;
                lda2 = BLKLDD(descA, it);
                cblas_dswap(descA.n, A1       + j, lda1,
                            A(it, 0) + i, lda2 );
            }
        }
    }
    else
    {
        it = (i2-1) / descA.mb;
        A1 = A(it, 0);
        lda1 = BLKLDD(descA, it);

        i1--;
        ipiv = &ipiv[(1-i2)*inc];
        for (j = i2-1; j > i1; --j, ipiv+=inc) {
            ip = (*ipiv) - descA.i - 1;
            if ( ip != j )
            {
                it = ip / descA.mb;
                i  = ip % descA.mb;
                lda2 = BLKLDD(descA, it);
                cblas_dswap(descA.n, A1       + j, lda1,
                            A(it, 0) + i, lda2 );
            }
        }
    }

    return PLASMA_SUCCESS;
}
Example #9
0
int CORE_zttrfb(int side, int trans, int direct, int storev,
                int M1, int N1, int M2, int N2, int K,
                PLASMA_Complex64_t *A1, int LDA1,
                PLASMA_Complex64_t *A2, int LDA2,
                PLASMA_Complex64_t *V, int LDV,
                PLASMA_Complex64_t *T, int LDT,
                PLASMA_Complex64_t *WORK, int LDWORK)
{
    static PLASMA_Complex64_t zone  =  1.0;
    static PLASMA_Complex64_t mzone = -1.0;

    int j, vi;

    /* Check input arguments */
    if (M1 < 0) {
        coreblas_error(5, "Illegal value of M1");
        return -5;
    }
    if (N1 < 0) {
        coreblas_error(6, "Illegal value of N1");
        return -6;
    }
    if ((M2 < 0) ||
        ( (side == PlasmaRight) && (M1 != M2) ) ) {
        coreblas_error(7, "Illegal value of M2");
        return -7;
    }
    if ((N2 < 0) ||
        ( (side == PlasmaLeft) && (N1 != N2) ) ) {
        coreblas_error(8, "Illegal value of N2");
        return -8;
    }
    if (K < 0) {
        coreblas_error(9, "Illegal value of K");
        return -9;
    }

    /* Quick return */
    if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0))
        return PLASMA_SUCCESS;

    if (storev == PlasmaColumnwise) {

        if (direct == PlasmaForward) {
            /*
             * Let  V =  ( V1 )    (first K rows)
             *           ( V2 )
             * where V2 is non-unit upper triangular
             */
            if (side == PlasmaLeft) {

                /*
                 * Colwise / Forward / Left
                 * -------------------------
                 *
                 * Form  H * A  or  H' * A  where  A = ( A1 )
                 *                                     ( A2 )
                 * where A2 = ( A2_1 )
                 *            ( A2_2 )
                 */

                /*
                 * W = A1 + V' * A2
                 */

                /*
                 * W = A2_2
                 */
                LAPACKE_zlacpy_work(LAPACK_COL_MAJOR,
                    lapack_const(PlasmaUpperLower),
                    K, N2,
                    &A2[M2-K], LDA2, WORK, LDWORK);

                /*
                 * W = V2' * A2_2
                 */
                cblas_ztrmm(
                    CblasColMajor, CblasLeft, CblasUpper,
                    CblasConjTrans, CblasNonUnit, K, N2,
                    CBLAS_SADDR(zone), &V[M2-K], LDV,
                    WORK, LDWORK);

                if (M2 > K) {
                    /*
                     * W = W + V1' * A2_1
                     */
                    cblas_zgemm(
                        CblasColMajor, CblasConjTrans, CblasNoTrans,
                        K, N2, M2-K,
                        CBLAS_SADDR(zone), V, LDV,
                        A2, LDA2,
                        CBLAS_SADDR(zone), WORK, LDWORK);
                }

                /*
                 * W = A1 + W
                 */
                for(j = 0; j < N1; j++) {
                    cblas_zaxpy(K, CBLAS_SADDR(zone),
                            &A1[LDA1*j], 1,
                            &WORK[LDWORK*j], 1);
                }

                /*
                 * A2 = A2 - V * T * W -> W = T * W, A2 = A2 - V * W
                 */

                /*
                 * W = T * W
                 */
                cblas_ztrmm(
                    CblasColMajor, CblasLeft, CblasUpper,
                    (CBLAS_TRANSPOSE)trans, CblasNonUnit, K, N2,
                    CBLAS_SADDR(zone), T, LDT, WORK, LDWORK);
                /*
                 * A1 = A1 - W
                 */
                for(j = 0; j < N1; j++) {
                    cblas_zaxpy(K, CBLAS_SADDR(mzone),
                            &WORK[LDWORK*j], 1,
                            &A1[LDA1*j], 1);
                }

                /*
                 * A2_1 = A2_1 - V1 * W
                 */
                if (M2 > K) {
                    cblas_zgemm(
                        CblasColMajor, CblasNoTrans, CblasNoTrans,
                        M2-K, N2, K,
                        CBLAS_SADDR(mzone), V, LDV,
                        WORK, LDWORK,
                        CBLAS_SADDR(zone), A2, LDA2);
                }

                /*
                 * W = - V2 * W
                 */
                cblas_ztrmm(
                    CblasColMajor, CblasLeft, CblasUpper,
                    CblasNoTrans, CblasNonUnit, K, N2,
                    CBLAS_SADDR(mzone), &V[M2-K], LDV,
                    WORK, LDWORK);

                /*
                 * A2_2 = A2_2 + W
                 */
                for(j = 0; j < N2; j++) {
                    cblas_zaxpy(
                        K, CBLAS_SADDR(zone),
                        &WORK[LDWORK*j], 1,
                        &A2[LDA2*j+(M2-K)], 1);
                }
            }
            else {
                /* 
                 * Colwise / Forward / Right
                 * -------------------------
                 *
                 * Form  H * A  or  H' * A  where:
                 * 
                 *   A  = ( A1 A2 )
                 *
                 *   A2 = ( A2_1 : A2_2 )
                 *
                 *         A2_1 is M2 x (M2-K)
                 *         A2_2 is M2 x K
                 *
                 *   V = ( V_1 ) 
                 *       ( V_2 )
                 *
                 *         V_1 is full and (N2-K) x K
                 *         V_2 is upper triangular and K x K
                 */

                /*
                 * W = ( A1 + A2_1*V_1 + A2_2*V_2 ) * op(T)
                 *
                 *    W  is M x K
                 *    A1 is M x K
                 *    A2 is M x N2 split as (A2_1 A2_2) such as
                 *       A2_1 is (N2-K) x K
                 *       A2_2 is M x K
                 */

                /* W = A2_2 */
                LAPACKE_zlacpy_work(LAPACK_COL_MAJOR,
                    lapack_const(PlasmaUpperLower),
                    M2, K,
                    &A2[LDA2*(N2-K)], LDA2, WORK, LDWORK);

                /* W = W * V_2 --> W = A2_2 * V_2 */
                cblas_ztrmm(
                    CblasColMajor, CblasRight, CblasUpper,
                    CblasNoTrans, CblasNonUnit, M2, K,
                    CBLAS_SADDR(zone), &V[N2-K], LDV,
                    WORK, LDWORK);

                /* W = W + A2_1 * V_1 */
                if (N2 > K) {
                    cblas_zgemm(
                        CblasColMajor, CblasNoTrans, CblasNoTrans,
                        M2, K, N2-K,
                        CBLAS_SADDR(zone), A2, LDA2,
                        V, LDV,
                        CBLAS_SADDR(zone), WORK, LDWORK);
                }

                /* W = A1 + W */
                for (j = 0; j < K; j++) {
                    cblas_zaxpy(M1, CBLAS_SADDR(zone), &A1[LDA1*j], 1,
                                &WORK[LDWORK*j], 1);
                }

                /* W = W * T --> ( A1 + A2_1*V_1 + A2_2*V_2 ) * op(T) */
                cblas_ztrmm(
                    CblasColMajor, CblasRight, CblasUpper,
                    (CBLAS_TRANSPOSE)trans, CblasNonUnit, M2, K,
                    CBLAS_SADDR(zone), T, LDT, WORK, LDWORK);

                /*
                 * A1 = A1 - W
                 */

                for(j = 0; j < K; j++) {
                    cblas_zaxpy(M1, CBLAS_SADDR(mzone), &WORK[LDWORK*j], 1,
                                &A1[LDA1*j], 1);
                }

                /*
                 * A2 = A2 - W * V' --> A2 - W*V_1' - W*V_2'
                 */

                /* A2 = A2 - W * V_1' */
                if (N2 > K) {
                    cblas_zgemm(
                        CblasColMajor, CblasNoTrans, CblasConjTrans,
                        M2, N2-K, K,
                        CBLAS_SADDR(mzone), WORK, LDWORK,
                        V, LDV,
                        CBLAS_SADDR(zone), A2, LDA2);
                }

                /* A2 =  A2 -  W * V_2' */
                cblas_ztrmm(
                    CblasColMajor, CblasRight, CblasUpper,
                    CblasConjTrans, CblasNonUnit, M2, K,
                    CBLAS_SADDR(mzone), &V[N2-K], LDV,
                    WORK, LDWORK);

                for(j = 0; j < K; j++) {
                    cblas_zaxpy(
                        M2, CBLAS_SADDR(zone),
                        &WORK[LDWORK*j], 1,
                        &A2[LDA2*(j+N2-K)], 1);
                }
            }
        }
        else {
            coreblas_error(3, "Not implemented (ColWise / Backward / Left or Right)");
            return PLASMA_ERR_NOT_SUPPORTED;
        }
    }
    else {
        /*
         * Rowwise
         */
        if (direct == PlasmaForward) {
            /*
             * Let  V =  ( V1 V2 )    (V1: first K cols)
             * 
             * where V2 is non-unit lower triangular
             */

            if (side == PlasmaLeft) {

                /*
                 * Rowwise / Forward / Left
                 * -------------------------
                 *
                 * Form  H * A  or  H' * A  where  A = ( A1 )
                 *                                     ( A2 )
                 * where A2 = ( A2_1 )
                 *            ( A2_2 )
                 */

                /* V_2 first element */
                vi = LDV*(M2-K);

                /*
                 * W = A1 + V * A2
                 */

                /*
                 * W = A2_2
                 */
                LAPACKE_zlacpy_work(LAPACK_COL_MAJOR,
                    lapack_const(PlasmaUpperLower),
                    K, N2,
                    &A2[M2-K], LDA2, WORK, LDWORK);

                /*
                 * W = V2 * A2_2
                 */
                //**DB CblasColMajor, CblasLeft, CblasUpper,
                cblas_ztrmm(
                    CblasColMajor, CblasLeft, CblasLower,
                    CblasNoTrans, CblasNonUnit, K, N2,
                    CBLAS_SADDR(zone), &V[vi], LDV,
                    WORK, LDWORK);

                if (M2 > K) {
                    /*
                     * W = W + V1 * A2_1
                     */
                    cblas_zgemm(
                        CblasColMajor, CblasNoTrans, CblasNoTrans,
                        K, N2, M2-K,
                        CBLAS_SADDR(zone), V, LDV,
                        A2, LDA2,
                        CBLAS_SADDR(zone), WORK, LDWORK);
                }

                /*
                 * W = A1 + W
                 */
                for(j = 0; j < N1; j++) {
                    cblas_zaxpy(
                            K, CBLAS_SADDR(zone),
                            &A1[LDA1*j], 1,
                            &WORK[LDWORK*j], 1);
                }

                /*
                 * W = T * W
                 */
                cblas_ztrmm(
                    CblasColMajor, CblasLeft, CblasUpper,
                    (CBLAS_TRANSPOSE)trans, CblasNonUnit, K, N2,
                    CBLAS_SADDR(zone), T, LDT, WORK, LDWORK);

                /*
                 * A1 = A1 - W
                 */
                for(j = 0; j < N1; j++) {
                    cblas_zaxpy(
                            K, CBLAS_SADDR(mzone),
                            &WORK[LDWORK*j], 1,
                            &A1[LDA1*j], 1);
                }

                /*
                 * A2 = A2 - V' * T * W -> A2 = A2 - V' * W
                 */

                /*
                 * A2_1 = A2_1 - V1' * W
                 */
                if (M2 > K) {
                    cblas_zgemm(
                        CblasColMajor, CblasConjTrans, CblasNoTrans,
                        M2-K, N2, K,
                        CBLAS_SADDR(mzone), V, LDV,
                        WORK, LDWORK,
                        CBLAS_SADDR(zone), A2, LDA2);
                }

                /*
                 * W = - V2' * W
                 */
                cblas_ztrmm(
                    CblasColMajor, CblasLeft, CblasLower,
                    CblasConjTrans, CblasNonUnit, K, N2,
                    CBLAS_SADDR(mzone), &V[vi], LDV,
                    WORK, LDWORK);

                /*
                 * A2_2 = A2_2 + W
                 */
                for(j = 0; j < N2; j++) {
                    cblas_zaxpy(
                        K, CBLAS_SADDR(zone),
                        &WORK[LDWORK*j], 1,
                        &A2[LDA2*j+(M2-K)], 1);
                }
            }
            else {
                /* 
                 * Rowwise / Forward / Right
                 * -------------------------
                 *
                 * Form  H * A  or  H' * A  where:
                 * 
                 *   A  = ( A1 A2 )
                 *
                 *   A2 = ( A2_1 : A2_2 )
                 *
                 *         A2_1 is M2 x (M2-K)
                 *         A2_2 is M2 x K
                 *
                 *   V = ( V_1 ) 
                 *       ( V_2 )
                 *
                 *         V_1 is full and (N2-K) x K
                 *         V_2 is lower triangular and K x K
                 */

                /*
                 * W = ( A1 + A2_1*V_1 + A2_2*V_2 ) * op(T)
                 *
                 *    W  is M x K
                 *    A1 is M x K
                 *    A2 is M x N2 split as (A2_1 A2_2) such as
                 *       A2_1 is (N2-K) x K
                 *       A2_2 is M x K
                 */

                /* V_2 and A2_2 first element */
                vi = LDV*(N2-K);

                /* W = A2_2 */
                LAPACKE_zlacpy_work(LAPACK_COL_MAJOR,
                    lapack_const(PlasmaUpperLower),
                    M2, K,
                    &A2[LDA2*(N2-K)], LDA2, WORK, LDWORK);

                /* W = W * V_2' --> W = A2_2 * V_2' */
                cblas_ztrmm(
                    CblasColMajor, CblasRight, CblasLower,
                    CblasConjTrans, CblasNonUnit, M2, K,
                    CBLAS_SADDR(zone), &V[vi], LDV,
                    WORK, LDWORK);

                /* W = W + A2_1 * V_1' */
                if (N2 > K) {
                    cblas_zgemm(
                        CblasColMajor, CblasNoTrans, CblasConjTrans,
                        M2, K, N2-K,
                        CBLAS_SADDR(zone), A2, LDA2,
                        V, LDV,
                        CBLAS_SADDR(zone), WORK, LDWORK);
                }

                /* W = A1 + W */
                for (j = 0; j < K; j++) {
                    cblas_zaxpy(M1, CBLAS_SADDR(zone), &A1[LDA1*j], 1,
                                &WORK[LDWORK*j], 1);
                }

                /* W = W * op(T) --> ( A1 + A2_1*V_1 + A2_2*V_2 ) * op(T) */
                cblas_ztrmm(
                    CblasColMajor, CblasRight, CblasUpper,
                    (CBLAS_TRANSPOSE)trans, CblasNonUnit, M2, K,
                    CBLAS_SADDR(zone), T, LDT, WORK, LDWORK);

                /*
                 * A1 = A1 - W
                 */

                for(j = 0; j < K; j++) {
                    cblas_zaxpy(M1, CBLAS_SADDR(mzone), &WORK[LDWORK*j], 1,
                                &A1[LDA1*j], 1);
                }

                /*
                 * A2 = A2 - W * V --> A2 - W*V_1 - W*V_2
                 */

                /* A2 = A2 - W * V_1 */
                if (N2 > K) {
                    cblas_zgemm(
                        CblasColMajor, CblasNoTrans, CblasNoTrans,
                        M2, N2-K, K,
                        CBLAS_SADDR(mzone), WORK, LDWORK,
                        V, LDV,
                        CBLAS_SADDR(zone), A2, LDA2);
                }

                /* A2 =  A2 -  W * V_2 */
                cblas_ztrmm(
                    CblasColMajor, CblasRight, CblasLower,
                    CblasNoTrans, CblasNonUnit, M2, K,
                    CBLAS_SADDR(mzone), &V[vi], LDV,
                    WORK, LDWORK);

                for(j = 0; j < K; j++) {
                    cblas_zaxpy(
                        M2, CBLAS_SADDR(zone),
                        &WORK[LDWORK*j], 1,
                        &A2[LDA2*(N2-K+j)], 1);
                }

            }
 
        }
        else {
            coreblas_error(3, "Not implemented (Rowwise / Backward / Left or Right)");
            return PLASMA_ERR_NOT_SUPPORTED;
        }
    }
    return PLASMA_SUCCESS;
}
Example #10
0
int CORE_stsrfb(int side, int trans, int direct, int storev,
                int M1, int N1, int M2, int N2, int K,
                float *A1, int LDA1,
                float *A2, int LDA2,
                float *V, int LDV,
                float *T, int LDT,
                float *WORK, int LDWORK)
{
    static float zone  =  1.0;
    static float mzone = -1.0;

    int j;

    /* Check input arguments */
    if (M1 < 0) {
        coreblas_error(5, "Illegal value of M1");
        return -5;
    }
    if (N1 < 0) {
        coreblas_error(6, "Illegal value of N1");
        return -6;
    }
    if ( (M2 < 0) || 
         ( (M2 != M1) && (side == PlasmaRight) ) ){
        coreblas_error(7, "Illegal value of M2");
        return -7;
    }
    if ( (N2 < 0) || 
         ( (N2 != N1) && (side == PlasmaLeft) ) ){
        coreblas_error(8, "Illegal value of N2");
        return -8;
    }
    if (K < 0) {
        coreblas_error(9, "Illegal value of K");
        return -9;
    }

    /* Quick return */
    if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0))
        return PLASMA_SUCCESS;

    if (storev == PlasmaColumnwise) {
        if (direct == PlasmaForward) {
            if (side == PlasmaLeft) {
                /*
                 * B = A1 + V' * A2
                 */
                LAPACKE_slacpy_work(LAPACK_COL_MAJOR,
                    lapack_const(PlasmaUpperLower),
                    K, N1,
                    A1, LDA1, WORK, LDWORK);

                cblas_sgemm(
                    CblasColMajor, CblasTrans, CblasNoTrans,
                    K, N2, M2,
                    (zone), V, LDV,
                    A2, LDA2,
                    (zone), WORK, LDWORK);
                /*
                 * A2 = A2 - V*T*B -> B = T*B, A2 = A2 - V*B
                 */
                cblas_strmm(
                    CblasColMajor, CblasLeft, CblasUpper,
                    (CBLAS_TRANSPOSE)trans, CblasNonUnit, K, N2,
                    (zone), T, LDT, WORK, LDWORK);

                cblas_sgemm(
                    CblasColMajor, CblasNoTrans, CblasNoTrans,
                    M2, N2, K,
                    (mzone), V, LDV,
                    WORK, LDWORK,
                    (zone), A2, LDA2);
                /*
                 * A1 = A1 - B
                 */
                for(j = 0; j < N1; j++) {
                    cblas_saxpy(
                        K, (mzone),
                        &WORK[LDWORK*j], 1,
                        &A1[LDA1*j], 1);
                }
            }
            /*
             * Columnwise / Forward / Right
             */
            else {
                /*
                 * B = A1 + A2 * V
                 */
                LAPACKE_slacpy_work(LAPACK_COL_MAJOR,
                    lapack_const(PlasmaUpperLower),
                    M1, K,
                    A1, LDA1, WORK, LDWORK);

                cblas_sgemm(
                    CblasColMajor, CblasNoTrans, CblasNoTrans,
                    M2, K, N2,
                    (zone), A2, LDA2,
                    V, LDV,
                    (zone), WORK, LDWORK);
                /*
                 * A2 = A2 - B*T*V' -> B = B*T, A2 = A2 - B*V'
                 */
                cblas_strmm(
                    CblasColMajor, CblasRight, CblasUpper,
                    (CBLAS_TRANSPOSE)trans, CblasNonUnit, M1, K,
                    (zone), T, LDT, WORK, LDWORK);

                cblas_sgemm(
                    CblasColMajor, CblasNoTrans, CblasTrans,
                    M2, N2, K,
                    (mzone), WORK, LDWORK,
                    V, LDV,
                    (zone), A2, LDA2);
                /*
                 * A1 = A1 - B
                 */
                for(j = 0; j < K; j++) {
                    cblas_saxpy(
                        M1, (mzone),
                        &WORK[LDWORK*j], 1,
                        &A1[LDA1*j], 1);
                }
            }
        }
        else {
            coreblas_error(3, "Not implemented (ColMajor / Backward / Left or Right)");
            return PLASMA_ERR_NOT_SUPPORTED;
        }
    }
    else {
        if (direct == PlasmaForward) {
            /*
             * Rowwise / Forward / Left
             */
            if (side == PlasmaLeft) {
                /*
                 * B = A1 + V * A2
                 */
                LAPACKE_slacpy_work(LAPACK_COL_MAJOR,
                    lapack_const(PlasmaUpperLower),
                    K, N1,
                    A1, LDA1, WORK, LDWORK);

                cblas_sgemm(
                    CblasColMajor, CblasNoTrans, CblasNoTrans,
                    K, N2, M2,
                    (zone), V, LDV,
                    A2, LDA2,
                    (zone), WORK, LDWORK);
                /*
                 * A2 = A2 - V'*T*B -> B = T*B, A2 = A2 - V'*B
                 */
                cblas_strmm(
                    CblasColMajor, CblasLeft, CblasUpper,
                    (CBLAS_TRANSPOSE)trans, CblasNonUnit, K, N2,
                    (zone), T, LDT, WORK, LDWORK);

                cblas_sgemm(
                    CblasColMajor, CblasTrans, CblasNoTrans,
                    M2, N2, K,
                    (mzone), V, LDV,
                    WORK, LDWORK,
                    (zone), A2, LDA2);
                /*
                 * A1 = A1 - B
                 */
                for(j=0; j<N1; j++) {
                    cblas_saxpy(
                        K, (mzone),
                        &WORK[LDWORK*j], 1,
                        &A1[LDA1*j], 1);
                }
            }
            /*
             * Rowwise / Forward / Right
             */
            else {
                /*
                 * B = A1 + A2 * V'
                 */
                LAPACKE_slacpy_work(LAPACK_COL_MAJOR,
                    lapack_const(PlasmaUpperLower),
                    M1, K,
                    A1, LDA1, WORK, LDWORK);

                cblas_sgemm(
                    CblasColMajor, CblasNoTrans, CblasTrans,
                    M2, K, N2,
                    (zone), A2, LDA2,
                    V, LDV,
                    (zone), WORK, LDWORK);
                /*
                 * A2 = A2 - B*T*V -> B = B*T, A2 = A2 - B*V'
                 */
                cblas_strmm(
                    CblasColMajor, CblasRight, CblasUpper,
                    (CBLAS_TRANSPOSE)trans, CblasNonUnit, M1, K,
                    (zone), T, LDT, WORK, LDWORK);

                cblas_sgemm(
                    CblasColMajor, CblasNoTrans, CblasNoTrans,
                    M2, N2, K,
                    (mzone), WORK, LDWORK,
                    V, LDV,
                    (zone), A2, LDA2);
                /*
                 * A1 = A1 - B
                 */
                for(j = 0; j < K; j++) {
                    cblas_saxpy(
                        M1, (mzone),
                        &WORK[LDWORK*j], 1,
                        &A1[LDA1*j], 1);
                }
            }
        }
        else {
            coreblas_error(3, "Not implemented (RowMajor / Backward / Left or Right)");
            return PLASMA_ERR_NOT_SUPPORTED;
        }
    }
    return PLASMA_SUCCESS;
}
int CORE_dtsmlq_corner( int m1, int n1, int m2, int n2, int m3, int n3,
                        int k, int ib, int nb,
                        double *A1, int lda1,
                        double *A2, int lda2,
                        double *A3, int lda3,
                        double *V, int ldv,
                        double *T, int ldt,
                        double *WORK, int ldwork)
{
    PLASMA_enum side;
    PLASMA_enum trans;
    int i, j;

    if ( m1 != n1 ) {
        coreblas_error(1, "Illegal value of M1, N1");
        return -1;
    }

    /* Rebuild the symmetric block: WORK <- A1 */
    for (i = 0; i < m1; i++)
        for (j = i; j < n1; j++){
            *(WORK + i + j*ldwork) = *(A1 + i + j*lda1);
            if (j > i){
                *(WORK + j + i*ldwork) =  ( *(WORK + i + j*ldwork) );
            }
        }

    /*  Copy the transpose of A2: WORK+nb*ldwork <- A2' */
    for (j = 0; j < n2; j++)
        for (i = 0; i < m2; i++){
            *(WORK + j + (i + nb) * ldwork) = ( *(A2 + i + j*lda2) );
        }

    side = PlasmaRight;
    trans = PlasmaTrans;

    /*  Right application on |A1 A2| */
    CORE_dtsmlq(side, trans, m1, n1, m2, n2, k, ib, 
                WORK, ldwork, A2, lda2, 
                V, ldv, T, ldt, 
                WORK+3*nb*ldwork, ldwork);

    /*  Rebuild the symmetric block: WORK+2*nb*ldwork <- A3 */
    for (i = 0; i < m3; i++)
        for (j = i; j < n3; j++){
            *(WORK + i + (j + 2*nb) * ldwork) = *(A3 + i + j*lda3);
            if (j > i){
                *(WORK + j + (i + 2*nb) * ldwork) =   ( *(WORK + i + (j + 2*nb) * ldwork) );
            }
        }

    /*  Right application on | A2' A3 | */
    CORE_dtsmlq(side, trans, n2, m2, m3, n3, k, ib, 
                WORK+nb*ldwork, ldwork, WORK+2*nb*ldwork, ldwork, 
                V, ldv, T, ldt, 
                WORK + 3*nb*ldwork, ldwork);

    side = PlasmaLeft;
    trans = PlasmaNoTrans;

    /*  Left application on | A1  | */
    /*                      | A2' | */
    CORE_dtsmlq(side, trans, m1, n1, n2, m2, k, ib, 
                WORK, ldwork, WORK+nb*ldwork, ldwork, 
                V, ldv, T, ldt, 
                WORK + 3*nb*ldwork, ldwork);

    /*  Copy back the final result to the upper part of A1 */
    /*  A1 = WORK */
    for (i = 0; i < m1; i++)
        for (j = i; j < n1; j++)
            *(A1 + i + j*lda1) = *(WORK + i + j*ldwork);

    /*  Left application on | A2 | */
    /*                      | A3 | */
    CORE_dtsmlq(side, trans, m2, n2, m3, n3, k, ib, 
                A2, lda2, WORK+2*nb*ldwork, ldwork, 
                V, ldv, T, ldt, 
                WORK + 3*nb*ldwork, ldwork);

    /*  Copy back the final result to the upper part of A3 */
    /*  A3 = WORK+2*nb*ldwork */
    for (i = 0; i < m3; i++)
        for (j = i; j < n3; j++)
            *(A3 + i + j*lda3) = *(WORK + i + (j+ 2*nb) * ldwork);

    return PLASMA_SUCCESS;
}
Example #12
0
int 
CORE_shbrce(int uplo, int N,
            PLASMA_desc *A,
            float *V,
            float *TAU,
            int st,
            int ed,
            int eltsize)
{
    int    NB, J1, J2, J3, KDM2, len, pt;
    int    len1, len2, t1ed, t2st;
    int    i;
    static float zzero = 0.0;
    PLASMA_desc vA=*A;


    /* Check input arguments */
    if (N < 0) {
        coreblas_error(2, "Illegal value of N");
        return -2;
    }
    if (ed <= st) {
        coreblas_error(6, "Illegal value of st and ed (internal)");
        return -6;
    }

    /* Quick return */
    if (N == 0)
        return PLASMA_SUCCESS;

    NB = A->mb;
    KDM2 = A->mb-2;
    if( uplo == PlasmaLower ) {
        /* ========================
         *       LOWER CASE
         * ========================*/
        for (i = ed; i >= st+1 ; i--){
            /* apply Householder from the right. and create newnnz outside the band if J3 < N */
            J1  = ed+1;
            J2  = min((i+1+KDM2), N);
            J3  = min((J2+1), N);
            len = J3-J1+1;
            if(J3>J2)*A(J3,(i-1))=zzero;/* could be removed because A is supposed to be band.*/

            t1ed  = (J3/NB)*NB;
            t2st  = max(t1ed+1,J1);
            len1  = t1ed-J1+1; /* can be negative*/
            len2  = J3-t2st+1;
            if(len1>0)CORE_slarfx2(PlasmaRight, len1, *V(i), *TAU(i), A(J1,  i-1), ELTLDD(vA, J1),   A(J1  , i), ELTLDD(vA, J1)  );
            if(len2>0)CORE_slarfx2(PlasmaRight, len2, *V(i), *TAU(i), A(t2st,i-1), ELTLDD(vA, t2st), A(t2st, i), ELTLDD(vA, t2st));
            /* if nonzero element a(j+kd,j-1) has been created outside the band (if index < N) then eliminate it.*/
            len    = J3-J2; // soit 1 soit 0
            if(len>0){
                /* generate Householder to annihilate a(j+kd,j-1) within the band */
                *V(J3)         = *A(J3,i-1);
                *A(J3,i-1)   = 0.0;
                LAPACKE_slarfg_work( 2, A(J2,i-1), V(J3), 1, TAU(J3));
            }
        }
        /* APPLY LEFT ON THE REMAINING ELEMENT OF KERNEL 2 */
        for (i = ed; i >= st+1 ; i--){
            /* find if there was a nnz created. if yes apply left else nothing to be done.*/
            J2  = min((i+1+KDM2), N);
            J3  = min((J2+1), N);
            len    = J3-J2;
            if(len>0){
                pt    = J2;
                J1    = i;
                J2    = min(ed,N);
                t1ed  = (J2/NB)*NB;
                t2st  = max(t1ed+1,J1);
                len1  = t1ed-J1+1;  /* can be negative*/
                len2  = J2-t2st+1;
                if(len1>0)CORE_slarfx2(PlasmaLeft, len1 , *V(J3), (*TAU(J3)), A(pt, i   ), ELTLDD(vA, pt),  A((pt+1),  i  ), ELTLDD(vA, pt+1) );
                if(len2>0)CORE_slarfx2(PlasmaLeft, len2 , *V(J3), (*TAU(J3)), A(pt, t2st), ELTLDD(vA, pt),  A((pt+1), t2st), ELTLDD(vA, pt+1) );
            }
        }
    } else {
        /* ========================
         *       UPPER CASE
         * ========================*/
        for (i = ed; i >= st+1 ; i--){
            /* apply Householder from the right. and create newnnz outside the band if J3 < N */
            J1  = ed+1;
            J2  = min((i+1+KDM2), N);
            J3  = min((J2+1), N);
            len = J3-J1+1;
            if(J3>J2)*A((i-1), J3)=zzero;/* could be removed because A is supposed to be band.*/

            t1ed  = (J3/NB)*NB;
            t2st  = max(t1ed+1,J1);
            len1  = t1ed-J1+1;  /* can be negative*/
            len2  = J3-t2st+1;
            if(len1>0)CORE_slarfx2(PlasmaLeft, len1 , (*V(i)), *TAU(i), A(i-1, J1  ), ELTLDD(vA, (i-1)),  A(i,  J1 ), ELTLDD(vA, i) );
            if(len2>0)CORE_slarfx2(PlasmaLeft, len2 , (*V(i)), *TAU(i), A(i-1, t2st), ELTLDD(vA, (i-1)),  A(i, t2st), ELTLDD(vA, i) );
            /* if nonzero element a(j+kd,j-1) has been created outside the band (if index < N) then eliminate it.*/
            len    = J3-J2; /* either 1 soit 0*/
            if(len>0){
                /* generate Householder to annihilate a(j+kd,j-1) within the band*/
                *V(J3)         = *A((i-1), J3);
                *A((i-1), J3)  = 0.0;
                LAPACKE_slarfg_work( 2, A((i-1), J2), V(J3), 1, TAU(J3));
            }
        }
        /* APPLY RIGHT ON THE REMAINING ELEMENT OF KERNEL 2*/
        for (i = ed; i >= st+1 ; i--){
            /* find if there was a nnz created. if yes apply right else nothing to be done.*/
            J2  = min((i+1+KDM2), N);
            J3  = min((J2+1), N);
            len    = J3-J2;
            if(len>0){
                pt    = J2;
                J1    = i;
                J2    = min(ed,N);
                t1ed  = (J2/NB)*NB;
                t2st  = max(t1ed+1,J1);
                len1  = t1ed-J1+1;  /* can be negative*/
                len2  = J2-t2st+1;
                if(len1>0)CORE_slarfx2(PlasmaRight, len1 , (*V(J3)), (*TAU(J3)), A(i   , pt), ELTLDD(vA, i),     A(i,    pt+1), ELTLDD(vA, i) );
                if(len2>0)CORE_slarfx2(PlasmaRight, len2 , (*V(J3)), (*TAU(J3)), A(t2st, pt), ELTLDD(vA, t2st),  A(t2st, pt+1), ELTLDD(vA, t2st) );
            }
        }
    } /* end of else for the upper case */

    return PLASMA_SUCCESS;
}
Example #13
0
int CORE_cttmlq(int side, int trans,
                int M1, int N1, int M2, int N2, int K, int IB,
                PLASMA_Complex32_t *A1, int LDA1,
                PLASMA_Complex32_t *A2, int LDA2,
                PLASMA_Complex32_t *V, int LDV,
                PLASMA_Complex32_t *T, int LDT,
                PLASMA_Complex32_t *WORK, int LDWORK)
{
    int i,  i1, i3;
    int NQ, NW;
    int kb;
    int ic = 0;
    int jc = 0;
    int mi1 = M1;
    int mi2 = M2;
    int ni1 = N1;
    int ni2 = N2;

    /* Check input arguments */
    if ((side != PlasmaLeft) && (side != PlasmaRight)) {
        coreblas_error(1, "Illegal value of side");
        return -1;
    }

    /* NQ is the order of Q */
    if (side == PlasmaLeft) {
        NQ = N2;
        NW = IB;
    }
    else {
        NQ = M2;
        NW = N1;
    }

    if ((trans != PlasmaNoTrans) && (trans != PlasmaConjTrans)) {
        coreblas_error(2, "Illegal value of trans");
        return -2;
    }
    if (M1 < 0) {
        coreblas_error(3, "Illegal value of M1");
        return -3;
    }
    if (N1 < 0) {
        coreblas_error(4, "Illegal value of N1");
        return -4;
    }
    if ((M2 < 0) ||
        ( (side == PlasmaRight) && (M1 != M2) ) ) {
        coreblas_error(5, "Illegal value of M2");
        return -5;
    }
    if ((N2 < 0) ||
        ( (side == PlasmaLeft) && (N1 != N2) ) ) {
        coreblas_error(6, "Illegal value of N2");
        return -6;
    }
    if ((K < 0) || 
        ( (side == PlasmaLeft)  && (K > M1) ) ||
        ( (side == PlasmaRight) && (K > N1) ) ) {
        coreblas_error(7, "Illegal value of K");
        return -7;
    }
    if (IB < 0) {
        coreblas_error(8, "Illegal value of IB");
        return -8;
    }
    if (LDA1 < max(1,M1)){
        coreblas_error(10, "Illegal value of LDA1");
        return -10;
    }
    if (LDA2 < max(1,M2)){
        coreblas_error(12, "Illegal value of LDA2");
        return -12;
    }
    if (LDV < max(1,NQ)){
        coreblas_error(14, "Illegal value of LDV");
        return -14;
    }
    if (LDT < max(1,IB)){
        coreblas_error(16, "Illegal value of LDT");
        return -16;
    }
    if (LDWORK < max(1,NW)){
        coreblas_error(18, "Illegal value of LDWORK");
        return -18;
    }

    /* Quick return */
    if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0))
        return PLASMA_SUCCESS;

    if (((side == PlasmaLeft) && (trans == PlasmaNoTrans))
        || ((side == PlasmaRight) && (trans != PlasmaNoTrans))) {
        i1 = 0;
        i3 = IB;
    }
    else {
        i1 = ( ( K-1 ) / IB )*IB;
        i3 = -IB;
    }

    /* Transpose */
    if (trans == PlasmaNoTrans) {
        trans = PlasmaConjTrans;
    }
    else {
        trans = PlasmaNoTrans;
    }

    for (i = i1; (i > -1) && (i < K); i+=i3) {
        kb = min(IB, K-i);

        if (side == PlasmaLeft) {
            mi1 = M1 - i;
            mi2 = i + kb;
            ic = i;
        }
        else {
            /* ni2 must be the number of rows of V */
            ni1 = kb;
            ni2 = i + kb;
            /*if ((K == IB) && (NeqIB > 0)) { ni2 = N2 - i; }*/
            jc = i;
        }

        /*
         * H or H' is applied to C(i:m,1:n)
         */
        CORE_cttrfb(
            side, trans, PlasmaForward, PlasmaRowwise,
            mi1, ni1, mi2, ni2, kb,
            &A1[LDA1*jc+ic], LDA1,
            A2, LDA2,
            &V[i], LDV,
            &T[LDT*i], LDT,
            WORK, LDWORK);
    }
    return PLASMA_SUCCESS;
}
Example #14
0
int CORE_sormqr(int side, int trans,
                int M, int N, int K, int IB,
                float *A, int LDA,
                float *T, int LDT,
                float *C, int LDC,
                float *WORK, int LDWORK)
{
    int i, kb;
    int i1, i3;
    int nq, nw;
    int ic = 0;
    int jc = 0;
    int ni = N;
    int mi = M;

    /* Check input arguments */
    if ((side != PlasmaLeft) && (side != PlasmaRight)) {
        coreblas_error(1, "Illegal value of side");
        return -1;
    }
    /*
     * NQ is the order of Q and NW is the minimum dimension of WORK
     */
    if (side == PlasmaLeft) {
        nq = M;
        nw = N;
    }
    else {
        nq = N;
        nw = M;
    }

    if ((trans != PlasmaNoTrans) && (trans != PlasmaTrans)) {
        coreblas_error(2, "Illegal value of trans");
        return -2;
    }
    if (M < 0) {
        coreblas_error(3, "Illegal value of M");
        return -3;
    }
    if (N < 0) {
        coreblas_error(4, "Illegal value of N");
        return -4;
    }
    if ((K < 0) || (K > nq)) {
        coreblas_error(5, "Illegal value of K");
        return -5;
    }
    if ((IB < 0) || ( (IB == 0) && ((M > 0) && (N > 0)) )) {
        coreblas_error(6, "Illegal value of IB");
        return -6;
    }
    if ((LDA < max(1,nq)) && (nq > 0)) {
        coreblas_error(8, "Illegal value of LDA");
        return -8;
    }
    if ((LDC < max(1,M)) && (M > 0)) {
        coreblas_error(12, "Illegal value of LDC");
        return -12;
    }
    if ((LDWORK < max(1,nw)) && (nw > 0)) {
        coreblas_error(14, "Illegal value of LDWORK");
        return -14;
    }

    /* Quick return */
    if ((M == 0) || (N == 0) || (K == 0))
        return PLASMA_SUCCESS;

    if (((side == PlasmaLeft) && (trans != PlasmaNoTrans))
        || ((side == PlasmaRight) && (trans == PlasmaNoTrans))) {
        i1 = 0;
        i3 = IB;
    }
    else {
        i1 = ( ( K-1 ) / IB )*IB;
        i3 = -IB;
    }

    for(i = i1; (i >- 1) && (i < K); i+=i3 ) {
        kb = min(IB, K-i);

        if (side == PlasmaLeft) {
            /*
             * H or H' is applied to C(i:m,1:n)
             */
            mi = M - i;
            ic = i;
        }
        else {
            /*
             * H or H' is applied to C(1:m,i:n)
             */
            ni = N - i;
            jc = i;
        }
        /*
         * Apply H or H'
         */
        LAPACKE_slarfb_work(LAPACK_COL_MAJOR,
            lapack_const(side),
            lapack_const(trans),
            lapack_const(PlasmaForward),
            lapack_const(PlasmaColumnwise),
            mi, ni, kb,
            &A[LDA*i+i], LDA,
            &T[LDT*i], LDT,
            &C[LDC*jc+ic], LDC,
            WORK, LDWORK);
    }
    return PLASMA_SUCCESS;
}
Example #15
0
/***************************************************************************//**
 *
 * @ingroup CORE_double
 *
 *  ZPAMM  performs one of the matrix-matrix operations
 *
 *                    LEFT                      RIGHT
 *     OP PlasmaW  :  W  = A1 + op(V) * A2  or  W  = A1 + A2 * op(V)
 *     OP PlasmaA2 :  A2 = A2 - op(V) * W   or  A2 = A2 - W * op(V)
 *
 *  where  op( V ) is one of
 *
 *     op( V ) = V   or   op( V ) = V**T   or   op( V ) = V**T,
 *
 *  A1, A2 and W are general matrices, and V is:
 *
 *        l = k: rectangle + triangle
 *        l < k: rectangle + trapezoid
 *        l = 0: rectangle
 *
 *  Size of V, both rowwise and columnwise, is:
 *
 *         ----------------------
 *          side   trans    size
 *         ----------------------
 *          left     N     M x K
 *                   T     K x M
 *          right    N     K x N
 *                   T     N x K
 *         ----------------------
 *
 *  LEFT (columnwise and rowwise):
 *
 *              |    K    |                 |         M         |
 *           _  __________   _              _______________        _
 *              |    |    |                 |             | \
 *     V:       |    |    |            V':  |_____________|___\    K
 *              |    |    | M-L             |                  |
 *           M  |    |    |                 |__________________|   _
 *              |____|    |  _
 *              \    |    |                 |    M - L    | L  |
 *                \  |    |  L
 *           _      \|____|  _
 *
 *
 *  RIGHT (columnwise and rowwise):
 *
 *          |         K         |                   |    N    |
 *          _______________        _             _  __________   _
 *          |             | \                       |    |    |
 *     V':  |_____________|___\    N        V:      |    |    |
 *          |                  |                    |    |    | K-L
 *          |__________________|   _             K  |    |    |
 *                                                  |____|    |  _
 *          |    K - L    | L  |                    \    |    |
 *                                                    \  |    |  L
 *                                               _      \|____|  _
 *
 *  Arguments
 *  ==========
 *
 * @param[in] op
 *
 *         OP specifies which operation to perform:
 *
 *         @arg PlasmaW  : W  = A1 + op(V) * A2  or  W  = A1 + A2 * op(V)
 *         @arg PlasmaA2 : A2 = A2 - op(V) * W   or  A2 = A2 - W * op(V)
 *
 * @param[in] side
 *
 *         SIDE specifies whether  op( V ) multiplies A2
 *         or W from the left or right as follows:
 *
 *         @arg PlasmaLeft  : multiply op( V ) from the left
 *                            OP PlasmaW  :  W  = A1 + op(V) * A2
 *                            OP PlasmaA2 :  A2 = A2 - op(V) * W
 *
 *         @arg PlasmaRight : multiply op( V ) from the right
 *                            OP PlasmaW  :  W  = A1 + A2 * op(V)
 *                            OP PlasmaA2 :  A2 = A2 - W * op(V)
 *
 * @param[in] storev
 *
 *         Indicates how the vectors which define the elementary
 *         reflectors are stored in V:
 *
 *         @arg PlasmaColumnwise
 *         @arg PlasmaRowwise
 *
 * @param[in] M
 *         The number of rows of the A1, A2 and W
 *         If SIDE is PlasmaLeft, the number of rows of op( V )
 *
 * @param[in] N
 *         The number of columns of the A1, A2 and W
 *         If SIDE is PlasmaRight, the number of columns of op( V )
 *
 * @param[in] K
 *         If SIDE is PlasmaLeft, the number of columns of op( V )
 *         If SIDE is PlasmaRight, the number of rows of op( V )
 *
 * @param[in] L
 *         The size of the triangular part of V
 *
 * @param[in] A1
 *         On entry, the M-by-N tile A1.
 *
 * @param[in] LDA1
 *         The leading dimension of the array A1. LDA1 >= max(1,M).
 *
 * @param[in,out] A2
 *         On entry, the M-by-N tile A2.
 *         On exit, if OP is PlasmaA2 A2 is overwritten
 *
 * @param[in] LDA2
 *         The leading dimension of the tile A2. LDA2 >= max(1,M).
 *
 * @param[in] V
 *         The matrix V as described above.
 *         If SIDE is PlasmaLeft : op( V ) is M-by-K
 *         If SIDE is PlasmaRight: op( V ) is K-by-N
 *
 * @param[in] LDV
 *         The leading dimension of the array V.
 *
 * @param[in,out] W
 *         On entry, the M-by-N matrix W.
 *         On exit, W is overwritten either if OP is PlasmaA2 or PlasmaW.
 *         If OP is PlasmaA2, W is an input and is used as a workspace.
 *
 * @param[in] LDW
 *         The leading dimension of array WORK.
 *
 *******************************************************************************
 *
 * @return
 *          \retval PLASMA_SUCCESS successful exit
 *          \retval <0 if -i, the i-th argument had an illegal value
 *
 ******************************************************************************/
int
CORE_dpamm(int op, PLASMA_enum side, PLASMA_enum storev,
           int M, int N, int K, int L,
           const double *A1, int LDA1,
                 double *A2, int LDA2,
           const double *V, int LDV,
                 double *W, int LDW)
{


    int vi2, vi3, uplo, trans, info;

    /* Check input arguments */
    if ((op != PlasmaW) && (op != PlasmaA2)) {
        coreblas_error(1, "Illegal value of op");
        return -1;
    }
    if ((side != PlasmaLeft) && (side != PlasmaRight)) {
        coreblas_error(2, "Illegal value of side");
        return -2;
    }
    if ((storev != PlasmaColumnwise) && (storev != PlasmaRowwise)) {
        coreblas_error(3, "Illegal value of storev");
        return -3;
    }
    if (M < 0) {
        coreblas_error(4, "Illegal value of M");
        return -4;
    }
    if (N < 0) {
        coreblas_error(5, "Illegal value of N");
        return -5;
    }
    if (K < 0) {
        coreblas_error(6, "Illegal value of K");
        return -6;
    }
    if (L < 0) {
        coreblas_error(7, "Illegal value of L");
        return -7;
    }
    if (LDA1 < 0) {
        coreblas_error(9, "Illegal value of LDA1");
        return -9;
    }
    if (LDA2 < 0) {
        coreblas_error(11, "Illegal value of LDA2");
        return -11;
    }
    if (LDV < 0) {
        coreblas_error(13, "Illegal value of LDV");
        return -13;
    }
    if (LDW < 0) {
        coreblas_error(15, "Illegal value of LDW");
        return -15;
    }

    /* Quick return */
    if ((M == 0) || (N == 0) || (K == 0))
        return PLASMA_SUCCESS;

    /*
     * TRANS is set as:
     *
     *        -------------------------------------
     *         side   direct     PlasmaW  PlasmaA2
     *        -------------------------------------
     *         left   colwise       T        N
     *                rowwise       N        T
     *         right  colwise       N        T
     *                rowwise       T        N
     *        -------------------------------------
     */

    /* Columnwise*/
    if (storev == PlasmaColumnwise) {
        uplo = CblasUpper;
        if (side == PlasmaLeft) {
            trans = op == PlasmaA2 ? PlasmaNoTrans : PlasmaTrans;
            vi2 = trans == PlasmaNoTrans ? M - L : K - L;
        }
        else {
            trans = op == PlasmaW ? PlasmaNoTrans : PlasmaTrans;
            vi2 = trans == PlasmaNoTrans ? K - L : N - L;
        }
        vi3 = LDV * L;
    }

    /* Rowwise */
    else {
        uplo = CblasLower;
        if (side == PlasmaLeft) {
            trans = op == PlasmaW ? PlasmaNoTrans : PlasmaTrans;
            vi2 = trans == PlasmaNoTrans ? K - L : M - L;
        }
        else {
            trans = op == PlasmaA2 ? PlasmaNoTrans : PlasmaTrans;
            vi2 = trans == PlasmaNoTrans ? N - L : K - L;
        }
        vi2 *= LDV;
        vi3  = L;
    }

    /**/
    if (op==PlasmaW) {
        info = CORE_dpamm_w(
                side, trans, uplo, M, N, K, L, vi2, vi3,
                A1, LDA1, A2, LDA2, V, LDV, W, LDW);
        if (info != 0)
            return info;
    } else if (op==PlasmaA2) {
        info = CORE_dpamm_a2(
                side, trans, uplo, M, N, K, L, vi2, vi3,
                A2, LDA2, V, LDV, W, LDW);
        if (info != 0)
            return info;
    }

    return PLASMA_SUCCESS;
}
Example #16
0
int CORE_dgelqt(int M, int N, int IB,
                double *A, int LDA,
                double *T, int LDT,
                double *TAU,
                double *WORK)
{
    int i, k, sb;
    int iinfo;

    /* Check input arguments */
    if (M < 0) {
        coreblas_error(1, "Illegal value of M");
        return -1;
    }
    if (N < 0) {
        coreblas_error(2, "Illegal value of N");
        return -2;
    }
    if ((IB < 0) || ( (IB == 0) && ((M > 0) && (N > 0)) )) {
        coreblas_error(3, "Illegal value of IB");
        return -3;
    }
    if ((LDA < max(1,M)) && (M > 0)) {
        coreblas_error(5, "Illegal value of LDA");
        return -5;
    }
    if ((LDT < max(1,IB)) && (IB > 0)) {
        coreblas_error(7, "Illegal value of LDT");
        return -7;
    }

    /* Quick return */
    if ((M == 0) || (N == 0) || (IB == 0))
        return PLASMA_SUCCESS;

    k = min(M, N);

    for(i = 0; i < k; i += IB) {
        sb = min(IB, k-i);

        iinfo = LAPACKE_dgelq2_work(LAPACK_COL_MAJOR, sb, N-i,
                                    &A[LDA*i+i], LDA, &TAU[i], WORK);

        LAPACKE_dlarft_work(LAPACK_COL_MAJOR,
                            lapack_const(PlasmaForward),
                            lapack_const(PlasmaRowwise),
                            N-i, sb,
                            &A[LDA*i+i], LDA, &TAU[i],
                            &T[LDT*i], LDT);

        if (M > i+sb) {
            LAPACKE_dlarfb_work(
                LAPACK_COL_MAJOR,
                lapack_const(PlasmaRight),
                lapack_const(PlasmaNoTrans),
                lapack_const(PlasmaForward),
                lapack_const(PlasmaRowwise),
                M-i-sb, N-i, sb,
                &A[LDA*i+i],      LDA,
                &T[LDT*i],        LDT,
                &A[LDA*i+(i+sb)], LDA,
                WORK, M-i-sb);
        }
    }
    return PLASMA_SUCCESS;
}
Example #17
0
int CORE_dtstrf(int M, int N, int IB, int NB,
                double *U, int LDU,
                double *A, int LDA,
                double *L, int LDL,
                int *IPIV,
                double *WORK, int LDWORK,
                int *INFO)
{
    static double zzero = 0.0;
    static double mzone =-1.0;

    double alpha;
    int i, j, ii, sb;
    int im, ip;

    /* Check input arguments */
    *INFO = 0;
    if (M < 0) {
        coreblas_error(1, "Illegal value of M");
        return -1;
    }
    if (N < 0) {
        coreblas_error(2, "Illegal value of N");
        return -2;
    }
    if (IB < 0) {
        coreblas_error(3, "Illegal value of IB");
        return -3;
    }
    if ((LDU < max(1,NB)) && (NB > 0)) {
        coreblas_error(6, "Illegal value of LDU");
        return -6;
    }
    if ((LDA < max(1,M)) && (M > 0)) {
        coreblas_error(8, "Illegal value of LDA");
        return -8;
    }
    if ((LDL < max(1,IB)) && (IB > 0)) {
        coreblas_error(10, "Illegal value of LDL");
        return -10;
    }

    /* Quick return */
    if ((M == 0) || (N == 0) || (IB == 0))
        return PLASMA_SUCCESS;

    /* Set L to 0 */
    memset(L, 0, LDL*N*sizeof(double));

    ip = 0;
    for (ii = 0; ii < N; ii += IB) {
        sb = min(N-ii, IB);

        for (i = 0; i < sb; i++) {
            im = cblas_idamax(M, &A[LDA*(ii+i)], 1);
            IPIV[ip] = ii+i+1;

            if (fabs(A[LDA*(ii+i)+im]) > fabs(U[LDU*(ii+i)+ii+i])) {
                /*
                 * Swap behind.
                 */
                cblas_dswap(i, &L[LDL*ii+i], LDL, &WORK[im], LDWORK );
                /*
                 * Swap ahead.
                 */
                cblas_dswap(sb-i, &U[LDU*(ii+i)+ii+i], LDU, &A[LDA*(ii+i)+im], LDA );
                /*
                 * Set IPIV.
                 */
                IPIV[ip] = NB + im + 1;

                for (j = 0; j < i; j++) {
                    A[LDA*(ii+j)+im] = zzero;
                }
            }

            if ((*INFO == 0) && (fabs(A[LDA*(ii+i)+im]) == zzero)
                && (fabs(U[LDU*(ii+i)+ii+i]) == zzero)) {
                *INFO = ii+i+1;
            }

            alpha = ((double)1. / U[LDU*(ii+i)+ii+i]);
            cblas_dscal(M, (alpha), &A[LDA*(ii+i)], 1);
            cblas_dcopy(M, &A[LDA*(ii+i)], 1, &WORK[LDWORK*i], 1);
            cblas_dger(
                CblasColMajor, M, sb-i-1,
                (mzone), &A[LDA*(ii+i)], 1,
                &U[LDU*(ii+i+1)+ii+i], LDU,
                &A[LDA*(ii+i+1)], LDA );
            ip = ip+1;
        }
        /*
         * Apply the subpanel to the rest of the panel.
         */
        if(ii+i < N) {
            for(j = ii; j < ii+sb; j++) {
                if (IPIV[j] <= NB) {
                    IPIV[j] = IPIV[j] - ii;
                }
            }

            CORE_dssssm(
                NB, N-(ii+sb), M, N-(ii+sb), sb, sb,
                &U[LDU*(ii+sb)+ii], LDU,
                &A[LDA*(ii+sb)], LDA,
                &L[LDL*ii], LDL,
                WORK, LDWORK, &IPIV[ii]);

            for(j = ii; j < ii+sb; j++) {
                if (IPIV[j] <= NB) {
                    IPIV[j] = IPIV[j] + ii;
                }
            }
        }
    }
    return PLASMA_SUCCESS;
}
Example #18
0
int CORE_dtstrf_cublas(int M, int N, int IB, int NB,
                double *U, int LDU,
                double *A, int LDA,
                double *L, int LDL,
                int *IPIV,
                double *WORK, int LDWORK,
                int *INFO)
{
  static double zzero = 0.0;
  static double mzone =-1.0;
  cublasStatus_t status;
  cudaError_t err;
  
  double alpha;
  int i, j, ii, sb;
  int im, ip;
  
#if CONFIG_VERBOSE
  fprintf(stdout, "%s: M=%d N=%d IB=%d NB=%d U=%p LDU=%d A=%p LDA=%d L=%p LDL=%d IPIV=%p WORK=%p LDWORK=%d\n",
          __FUNCTION__, M, N, IB, NB, U, LDU, A, LDA, L, LDL, IPIV, WORK, LDWORK);
  fflush(stdout);
#endif
  
  /* Check input arguments */
  *INFO = 0;
  if (M < 0) {
    coreblas_error(1, "Illegal value of M");
    return -1;
  }
  if (N < 0) {
    coreblas_error(2, "Illegal value of N");
    return -2;
  }
  if (IB < 0) {
    coreblas_error(3, "Illegal value of IB");
    return -3;
  }
  if ((LDU < max(1,NB)) && (NB > 0)) {
    coreblas_error(6, "Illegal value of LDU");
    return -6;
  }
  if ((LDA < max(1,M)) && (M > 0)) {
    coreblas_error(8, "Illegal value of LDA");
    return -8;
  }
  if ((LDL < max(1,IB)) && (IB > 0)) {
    coreblas_error(10, "Illegal value of LDL");
    return -10;
  }
  
  /* Quick return */
  if ((M == 0) || (N == 0) || (IB == 0))
    return PLASMA_SUCCESS;
  
  /* Set L to 0 */
  err = cudaMemset(L, 0, LDL*N*sizeof(double));
  PLASMA_CUDA_ASSERT(err);
  
  double* dev_ptr = 0;
  err = cudaMalloc((void**)&dev_ptr, 2*sizeof(double));
  PLASMA_CUDA_ASSERT(err);
  double* host_ptr;
  err = cudaMallocHost((void**)&host_ptr, 2*sizeof(double));
  PLASMA_CUDA_ASSERT(err);
  
  int* piv = kaapi_memory_get_host_pointer_and_validate(IPIV);
  
  ip = 0;
  for (ii = 0; ii < N; ii += IB) {
    sb = min(N-ii, IB);
    
    for (i = 0; i < sb; i++) {
      status = cublasIdamax(kaapi_cuda_cublas_handle(),
                            M, &A[LDA*(ii+i)], 1, &im
                            );
      PLASMA_CUBLAS_ASSERT(status);
      
      /* get im */
      err = cudaStreamSynchronize(kaapi_cuda_kernel_stream());
      PLASMA_CUDA_ASSERT(err);
      
      /* ajust index, CUBLAS is 1-based indexing */
      im--;

      piv[ip] = ii+i+1;
      
      core_dtstrf_cmp(kaapi_cuda_kernel_stream(),
                      &A[LDA*(ii+i)+im], &U[LDU*(ii+i)+ii+i], dev_ptr, host_ptr);
      err = cudaStreamSynchronize(kaapi_cuda_kernel_stream());
      PLASMA_CUDA_ASSERT(err);
      
      if (host_ptr[0] == 1.0f) {
        /*
         * Swap behind.
         */
        status = cublasDswap(kaapi_cuda_cublas_handle(),
                   i, &L[LDL*ii+i], LDL, &WORK[im], LDWORK
        );
        PLASMA_CUBLAS_ASSERT(status);
        /*
         * Swap ahead.
         */
        status = cublasDswap(kaapi_cuda_cublas_handle(),
              sb-i, &U[LDU*(ii+i)+ii+i], LDU, &A[LDA*(ii+i)+im], LDA
         );
        PLASMA_CUBLAS_ASSERT(status);
        /*
         * Set IPIV.
         */
        piv[ip] = NB + im + 1;

        core_dtstrf_set_zero(kaapi_cuda_kernel_stream(),
                             A, LDA, i, ii, im, zzero
                        );
      }
      
      core_dtstrf_cmp_zzero_and_get_alpha(kaapi_cuda_kernel_stream(),
                      &A[LDA*(ii+i)+im], &U[LDU*(ii+i)+ii+i], zzero, dev_ptr, host_ptr);
      err = cudaStreamSynchronize(kaapi_cuda_kernel_stream());
      PLASMA_CUDA_ASSERT(err);
      
      if ((*INFO == 0) && (host_ptr[0] == 1.0f)) {
        *INFO = ii+i+1;
      }
      
//      alpha = ((double)1. / U[LDU*(ii+i)+ii+i]);
      alpha = host_ptr[1];
      status = cublasDscal(kaapi_cuda_cublas_handle(),
                           M, &alpha, &A[LDA*(ii+i)], 1
                           );
      PLASMA_CUBLAS_ASSERT(status);
      
      status = cublasDcopy(kaapi_cuda_cublas_handle(),
                  M, &A[LDA*(ii+i)], 1, &WORK[LDWORK*i], 1
        );
      PLASMA_CUBLAS_ASSERT(status);
      
      status = cublasDger(kaapi_cuda_cublas_handle(),
                          M, sb-i-1,
                          &mzone, &A[LDA*(ii+i)], 1,
                          &U[LDU*(ii+i+1)+ii+i], LDU,
                          &A[LDA*(ii+i+1)], LDA
      );
      PLASMA_CUBLAS_ASSERT(status);
      ip = ip+1;
    }
    /*
     * Apply the subpanel to the rest of the panel.
     */
    if(ii+i < N) {
      for(j = ii; j < ii+sb; j++) {
        if (piv[j] <= NB) {
          piv[j] = piv[j] - ii;
        }
      }
      
      CORE_dssssm_cublas_v2(
                  NB, N-(ii+sb), M, N-(ii+sb), sb, sb,
                  &U[LDU*(ii+sb)+ii], LDU,
                  &A[LDA*(ii+sb)], LDA,
                  &L[LDL*ii], LDL,
                  WORK, LDWORK, &piv[ii]
                  );
      err = cudaStreamSynchronize(kaapi_cuda_kernel_stream());
      PLASMA_CUDA_ASSERT(err);
      
      for(j = ii; j < ii+sb; j++) {
        if (piv[j] <= NB) {
          piv[j] = piv[j] + ii;
        }
      }
    }
  }
  
  cudaFreeHost(host_ptr);
  cudaFree(dev_ptr);
  return PLASMA_SUCCESS;
}
Example #19
0
int 
CORE_zgblrx(int uplo, int N,
            PLASMA_desc *A,
            PLASMA_Complex64_t *V,
            PLASMA_Complex64_t *TAU,
            int st,
            int ed,
            int eltsize)
{
    int    NB, J1, J2;
    int    len1, len2, t1ed, t2st;
    int    i;
    PLASMA_desc vA=*A;

    /* Check input arguments */
    if (N < 0) {
        coreblas_error(2, "Illegal value of N");
        return -2;
    }
    if (ed <= st) {
        coreblas_error(6, "Illegal value of st and ed (internal)");
        return -6;
    }

    /* Quick return */
    if (N == 0)
        return PLASMA_SUCCESS;

    NB = A->mb;
    if( uplo == PlasmaLower ){
        /* ========================
         *       LOWER CASE
         * ========================*/
        for (i = ed; i >= st+1 ; i--){
            /* apply reflector from the left (horizontal row) and from the right for only the diagonal 2x2.*/
            J1    = st;
            J2    = i-2;
            t1ed  = (J2/NB)*NB;
            t2st  = max(t1ed+1,J1);
            len1  = t1ed-J1+1; 
            len2  = J2-t2st+1;
            if(len1>0)CORE_zlarfx2(PlasmaLeft, len1 , *V(i), conj(*TAU(i)), A(i-1, J1  ), ELTLDD(vA, (i-1)),  A(i,  J1 ), ELTLDD(vA, i) );
            if(len2>0)CORE_zlarfx2(PlasmaLeft, len2 , *V(i), conj(*TAU(i)), A(i-1, t2st), ELTLDD(vA, (i-1)),  A(i, t2st), ELTLDD(vA, i) );
            CORE_zlarfx2ce(PlasmaLower, V(i), TAU(i), A(i-1,i-1), A(i,i-1), A(i,i)); 
        }
        /* APPLY RIGHT ON THE REMAINING ELEMENT OF KERNEL 1 */
        for (i = ed; i >= st+1 ; i--){
            J1    = i+1;
            J2    = min(ed,N);
            t1ed  = (J2/NB)*NB;
            t2st  = max(t1ed+1,J1);
            len1  = t1ed-J1+1; 
            len2  = J2-t2st+1;
            if(len1>0)CORE_zlarfx2(PlasmaRight, len1, conj(*V(i)), conj(*TAU(i)), A(J1,i-1),   ELTLDD(vA, J1)  , A(J1  , i), ELTLDD(vA, J1)   );
            if(len2>0)CORE_zlarfx2(PlasmaRight, len2, conj(*V(i)), conj(*TAU(i)), A(t2st,i-1), ELTLDD(vA, t2st), A(t2st, i), ELTLDD(vA, t2st) );
        }
    } else {
        /* ========================
         *       UPPER CASE
         * ========================*/
        for (i = ed; i >= st+1 ; i--){
            /* apply reflector from the left (horizontal row) and from the right for only the diagonal 2x2.*/
            J1    = st;
            J2    = i-2;
            t1ed  = (J2/NB)*NB;
            t2st  = max(t1ed+1,J1);
            len1  = t1ed-J1+1; 
            len2  = J2-t2st+1;
            if(len1>0)CORE_zlarfx2(PlasmaRight, len1, conj(*V(i)), conj(*TAU(i)), A(J1,i-1),   ELTLDD(vA, J1)  , A(J1  , i), ELTLDD(vA, J1)   );
            if(len2>0)CORE_zlarfx2(PlasmaRight, len2, conj(*V(i)), conj(*TAU(i)), A(t2st,i-1), ELTLDD(vA, t2st), A(t2st, i), ELTLDD(vA, t2st) );
            CORE_zlarfx2ce(PlasmaUpper, V(i), TAU(i), A(i-1,i-1), A(i-1, i), A(i,i));
        }
        /* APPLY LEFT ON THE REMAINING ELEMENT OF KERNEL 1 */
        for (i = ed; i >= st+1 ; i--){
            J1    = i+1;
            J2    = min(ed,N);
            t1ed  = (J2/NB)*NB;
            t2st  = max(t1ed+1,J1);
            len1  = t1ed-J1+1; 
            len2  = J2-t2st+1;
            if(len1>0)CORE_zlarfx2(PlasmaLeft, len1 , *V(i), conj(*TAU(i)), A(i-1, J1  ), ELTLDD(vA, (i-1)),  A(i,  J1 ), ELTLDD(vA, i) );
            if(len2>0)CORE_zlarfx2(PlasmaLeft, len2 , *V(i), conj(*TAU(i)), A(i-1, t2st), ELTLDD(vA, (i-1)),  A(i, t2st), ELTLDD(vA, i) );
        }
    }  /* end of else for the upper case */

    return PLASMA_SUCCESS;
}
Example #20
0
int CORE_sgessm(int M, int N, int K, int IB,
                int *IPIV,
                float *L, int LDL,
                float *A, int LDA)
{
    static float zone  =  1.0;
    static float mzone = -1.0;
    static int                ione  =  1;

    int i, sb;
    int tmp, tmp2;

    /* Check input arguments */
    if (M < 0) {
        coreblas_error(1, "Illegal value of M");
        return -1;
    }
    if (N < 0) {
        coreblas_error(2, "Illegal value of N");
        return -2;
    }
    if (K < 0) {
        coreblas_error(3, "Illegal value of K");
        return -3;
    }
    if (IB < 0) {
        coreblas_error(4, "Illegal value of IB");
        return -4;
    }
    if ((LDL < max(1,M)) && (M > 0)) {
        coreblas_error(7, "Illegal value of LDL");
        return -7;
    }
    if ((LDA < max(1,M)) && (M > 0)) {
        coreblas_error(9, "Illegal value of LDA");
        return -9;
    }

    /* Quick return */
    if ((M == 0) || (N == 0) || (K == 0) || (IB == 0))
        return PLASMA_SUCCESS;

    for(i = 0; i < K; i += IB) {
        sb = min(IB, K-i);
        /*
         * Apply interchanges to columns I*IB+1:IB*( I+1 )+1.
         */
        tmp  = i+1;
        tmp2 = i+sb;
        LAPACKE_slaswp_work(LAPACK_COL_MAJOR, N, A, LDA, tmp, tmp2, IPIV, ione);
        /*
         * Compute block row of U.
         */
        cblas_strsm(
            CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasUnit,
            sb, N, (zone),
            &L[LDL*i+i], LDL,
            &A[i], LDA );

        if (i+sb < M) {
        /*
        * Update trailing submatrix.
        */
        cblas_sgemm(
            CblasColMajor, CblasNoTrans, CblasNoTrans,
            M-(i+sb), N, sb,
            (mzone), &L[LDL*i+(i+sb)], LDL,
            &A[i], LDA,
            (zone), &A[i+sb], LDA );
        }
    }
    return PLASMA_SUCCESS;
}
Example #21
0
int CORE_ztsmlq(int side, int trans,
                int M1, int N1, int M2, int N2, int K, int IB,
                PLASMA_Complex64_t *A1, int LDA1,
                PLASMA_Complex64_t *A2, int LDA2,
                PLASMA_Complex64_t *V, int LDV,
                PLASMA_Complex64_t *T, int LDT,
                PLASMA_Complex64_t *WORK, int LDWORK)
{
    int i, i1, i3;
    int NW;
    int kb;
    int ic = 0;
    int jc = 0;
    int mi = M1;
    int ni = N1;

    /* Check input arguments */
    if ((side != PlasmaLeft) && (side != PlasmaRight)) {
        coreblas_error(1, "Illegal value of side");
        return -1;
    }

    /* NW is the minimum dimension of WORK */
    if (side == PlasmaLeft) {
        NW = IB;
    }
    else {
        NW = N1;
    }

    if ((trans != PlasmaNoTrans) && (trans != PlasmaConjTrans)) {
        coreblas_error(2, "Illegal value of trans");
        return -2;
    }
    if (M1 < 0) {
        coreblas_error(3, "Illegal value of M1");
        return -3;
    }
    if (N1 < 0) {
        coreblas_error(4, "Illegal value of N1");
        return -4;
    }
    if ( (M2 < 0) || 
         ( (M2 != M1) && (side == PlasmaRight) ) ){
        coreblas_error(5, "Illegal value of M2");
        return -5;
    }
    if ( (N2 < 0) || 
         ( (N2 != N1) && (side == PlasmaLeft) ) ){
        coreblas_error(6, "Illegal value of N2");
        return -6;
    }
    if ((K < 0) || 
        ( (side == PlasmaLeft)  && (K > M1) ) ||
        ( (side == PlasmaRight) && (K > N1) ) ) {
        coreblas_error(7, "Illegal value of K");
        return -7;
    }
    if (IB < 0) {
        coreblas_error(8, "Illegal value of IB");
        return -8;
    }
    if (LDA1 < max(1,M1)){
        coreblas_error(10, "Illegal value of LDA1");
        return -10;
    }
    if (LDA2 < max(1,M2)){
        coreblas_error(12, "Illegal value of LDA2");
        return -12;
    }
    if (LDV < max(1,K)){
        coreblas_error(14, "Illegal value of LDV");
        return -14;
    }
    if (LDT < max(1,IB)){
        coreblas_error(16, "Illegal value of LDT");
        return -16;
    }
    if (LDWORK < max(1,NW)){
        coreblas_error(18, "Illegal value of LDWORK");
        return -18;
    }

    /* Quick return */
    if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0))
        return PLASMA_SUCCESS;

    if (((side == PlasmaLeft) && (trans == PlasmaNoTrans))
        || ((side == PlasmaRight) && (trans != PlasmaNoTrans))) {
        i1 = 0;
        i3 = IB;
    }
    else {
        i1 = ((K-1) / IB)*IB;
        i3 = -IB;
    }

    if (trans == PlasmaNoTrans) {
        trans = PlasmaConjTrans;
    }
    else {
        trans = PlasmaNoTrans;
    }

    for(i = i1; (i > -1) && (i < K); i += i3) {
        kb = min(IB, K-i);

        if (side == PlasmaLeft) {
            /*
             * H or H' is applied to C(i:m,1:n)
             */
            mi = M1 - i;
            ic = i;
        }
        else {
            /*
             * H or H' is applied to C(1:m,i:n)
             */
            ni = N1 - i;
            jc = i;
        }
        /*
         * Apply H or H'
         */
        CORE_ztsrfb(
            side, trans, PlasmaForward, PlasmaRowwise,
            mi, ni, M2, N2, kb,
            &A1[LDA1*jc+ic], LDA1,
            A2, LDA2,
            &V[i], LDV,
            &T[LDT*i], LDT,
            WORK, LDWORK);
    }
    return PLASMA_SUCCESS;
}
Example #22
0
int CORE_zttqrt(int M, int N, int IB,
                PLASMA_Complex64_t *A1, int LDA1,
                PLASMA_Complex64_t *A2, int LDA2,
                PLASMA_Complex64_t *T, int LDT,
                PLASMA_Complex64_t *TAU, PLASMA_Complex64_t *WORK)
{
    static PLASMA_Complex64_t zone  = 1.0;
    static PLASMA_Complex64_t zzero = 0.0;
    static int                ione  = 1;

    PLASMA_Complex64_t alpha;
    int i, j, ii, sb, mi, ni;

    /* Check input arguments */
    if (M < 0) {
        coreblas_error(1, "Illegal value of M");
        return -1;
    }
    if (N < 0) {
        coreblas_error(2, "Illegal value of N");
        return -2;
    }
    if (IB < 0) {
        coreblas_error(3, "Illegal value of IB");
        return -3;
    }
    if ((LDA2 < max(1,M)) && (M > 0)) {
        coreblas_error(7, "Illegal value of LDA2");
        return -7;
    }

    /* Quick return */
    if ((M == 0) || (N == 0) || (IB == 0))
        return PLASMA_SUCCESS;

    for(ii = 0; ii < N; ii += IB) {
        sb = min(N-ii, IB);
        for(i = 0; i < sb; i++) {
            /*
             * Generate elementary reflector H( II*IB+I ) to annihilate
             * A( II*IB+I:mi, II*IB+I ).
             */
            mi = ii + i + 1;
            LAPACKE_zlarfg_work(mi+1, &A1[LDA1*(ii+i)+ii+i], &A2[LDA2*(ii+i)], ione, &TAU[ii+i]);

            if (sb-i-1>0) {
                /*
                 * Apply H( II*IB+I ) to A( II*IB+I:M, II*IB+I+1:II*IB+IB ) from the left.
                 */
                ni = sb-i-1;
                cblas_zcopy(
                    ni,
                    &A1[LDA1*(ii+i+1)+(ii+i)], LDA1,
                    WORK, 1);

#ifdef COMPLEX
                LAPACKE_zlacgv_work(ni, WORK, ione);
#endif
                cblas_zgemv(
                    CblasColMajor, (CBLAS_TRANSPOSE)PlasmaConjTrans,
                    mi, ni,
                    CBLAS_SADDR(zone), &A2[LDA2*(ii+i+1)], LDA2,
                    &A2[LDA2*(ii+i)], 1,
                    CBLAS_SADDR(zone), WORK, 1);
#ifdef COMPLEX
                LAPACKE_zlacgv_work(ni, WORK, ione);
#endif
                alpha = -conj(TAU[ii+i]);
                cblas_zaxpy(
                    ni, CBLAS_SADDR(alpha),
                    WORK, 1,
                    &A1[LDA1*(ii+i+1)+ii+i], LDA1);
#ifdef COMPLEX
                LAPACKE_zlacgv_work(ni, WORK, ione);
#endif
                cblas_zgerc(
                    CblasColMajor, mi, ni,
                    CBLAS_SADDR(alpha), &A2[LDA2*(ii+i)], 1,
                    WORK, 1,
                    &A2[LDA2*(ii+i+1)], LDA2);
            }

            /*
             * Calculate T.
             */

            if (i > 0 ) {

                cblas_zcopy(i, &A2[LDA2*(ii+i)+ii], 1, &WORK[ii], 1);
    
                cblas_ztrmv(
                    CblasColMajor, (CBLAS_UPLO)PlasmaUpper,
                    (CBLAS_TRANSPOSE)PlasmaConjTrans, (CBLAS_DIAG)PlasmaNonUnit,
                    i, &A2[LDA2*ii+ii], LDA2,
                    &WORK[ii], 1);
    
                alpha = -(TAU[ii+i]);
                for(j = 0; j < i; j++) {
                    WORK[ii+j] = alpha * WORK[ii+j];
                }
    
                if (ii > 0) {
                    cblas_zgemv(
                        CblasColMajor, (CBLAS_TRANSPOSE)PlasmaConjTrans, ii, i,
                        CBLAS_SADDR(alpha), &A2[LDA2*ii], LDA2,
                        &A2[LDA2*(ii+i)], 1,
                        CBLAS_SADDR(zzero), WORK, 1);
    
                    cblas_zaxpy(i, CBLAS_SADDR(zone), &WORK[ii], 1, WORK, 1);
                }
    
                cblas_zcopy(i, WORK, 1, &T[LDT*(ii+i)], 1);
    
                cblas_ztrmv(
                    CblasColMajor, (CBLAS_UPLO)PlasmaUpper,
                    (CBLAS_TRANSPOSE)PlasmaNoTrans, (CBLAS_DIAG)PlasmaNonUnit,
                    i, &T[LDT*ii], LDT,
                    &T[LDT*(ii+i)], 1);

            }

            T[LDT*(ii+i)+i] = TAU[ii+i];
        }

        /* Apply Q' to the rest of the matrix to the left  */
        if (N > ii+sb) {
            CORE_zttrfb(
                PlasmaLeft, PlasmaConjTrans,
                PlasmaForward, PlasmaColumnwise,
                sb, N-(ii+sb), ii+sb, N-(ii+sb), sb,
                &A1[LDA1*(ii+sb)+ii], LDA1,
                &A2[LDA2*(ii+sb)], LDA2,
                &A2[LDA2*ii], LDA2,
                &T[LDT*ii], LDT,
                WORK, sb);
        }
    }
    return PLASMA_SUCCESS;
}
int CORE_zpltmg_chebvand( int M, int N, PLASMA_Complex64_t *A, int LDA,
                          int gN, int m0, int n0,
                          PLASMA_Complex64_t *W )
{
    PLASMA_Complex64_t step;
    int i, j, jj;

    /* Check input arguments */
    if (M < 0) {
        coreblas_error(1, "Illegal value of M");
        return -1;
    }
    if (N < 0) {
        coreblas_error(2, "Illegal value of N");
        return -2;
    }
    if ((LDA < max(1,M)) && (M > 0)) {
        coreblas_error(4, "Illegal value of LDA");
        return -4;
    }
    if (m0 < 0) {
        coreblas_error(6, "Illegal value of m0");
        return -6;
    }
    if (n0 < 0) {
        coreblas_error(7, "Illegal value of n0");
        return -7;
    }
    if (gN < n0+N) {
        coreblas_error(5, "Illegal value of gN");
        return -5;
    }

    step = (PLASMA_Complex64_t)1. / (gN - 1.);

    /* Initialize W if required */
    if (m0 == 0) {
        for (j=0, jj=n0; j<N; j++, jj++) {
            W[2*j  ] = 1.;
            W[2*j+1] = jj * step;
        }

        if ( M == 1 ) {
            LAPACKE_zlacpy_work( LAPACK_COL_MAJOR, 'A',
                                 1, N, W, 2, A, LDA );
            return PLASMA_SUCCESS;
        }

        LAPACKE_zlacpy_work( LAPACK_COL_MAJOR, 'A',
                             2, N, W, 2, A, LDA );

        M -= 2;
        A += 2;
    }

    /* Case NB=1, W contains row 0 and 1 and M should be 1 */
    if (m0 == 1) {
        if (M != 1) {
            coreblas_error(1, "Illegal value of M for m0 = 1");
            return -1;
        }

        LAPACKE_zlacpy_work( LAPACK_COL_MAJOR, 'A',
                             1, N, W+1, 2, A, LDA );
        return PLASMA_SUCCESS;
    }

    for (j=0, jj=n0; j<N; j++, jj++) {
        /* First line */
        if (M > 0) {
            A[LDA*j] = 2. * jj * step * W[j*2 + 1] - W[j*2  ];
        }

        /* Second line */
        if (M > 1) {
            A[LDA*j+1] = 2. * jj * step * A[LDA*j  ] - W[j*2+1];
        }

        for (i=2; i<M; i++) {
            A[LDA*j+i] = 2. * jj * step * A[LDA*j+i-1] - A[LDA*j+i-2];
        }
    }

    if ( M == 1 ) {
        cblas_zcopy( N, W+1, 2, W, 2 );
        cblas_zcopy( N, A+M-1, LDA, W+1, 2 );
    }
    else {
        LAPACKE_zlacpy_work( LAPACK_COL_MAJOR, 'A',
                             2, N, A + M - 2, LDA, W, 2 );
    }

    return PLASMA_SUCCESS;
}
Example #24
0
int CORE_zttlqt(int M, int N, int IB,
                PLASMA_Complex64_t *A1, int LDA1,
                PLASMA_Complex64_t *A2, int LDA2,
                PLASMA_Complex64_t *T, int LDT,
                PLASMA_Complex64_t *TAU, PLASMA_Complex64_t *WORK)
{
    static PLASMA_Complex64_t zone  = 1.0;
    static PLASMA_Complex64_t zzero = 0.0;
#ifdef COMPLEX
    static int                ione  = 1;
#endif

    PLASMA_Complex64_t alpha;
    int i, j, l, ii, sb, mi, ni;

    /* Check input arguments */
    if (M < 0) {
        coreblas_error(1, "Illegal value of M");
        return -1;
    }
    if (N < 0) {
        coreblas_error(2, "Illegal value of N");
        return -2;
    }
    if (IB < 0) {
        coreblas_error(3, "Illegal value of IB");
        return -3;
    }
    if ((LDA2 < max(1,M)) && (M > 0)) {
        coreblas_error(7, "Illegal value of LDA2");
        return -7;
    }

    /* Quick return */
    if ((M == 0) || (N == 0) || (IB == 0))
        return PLASMA_SUCCESS;

    /* TODO: Need to check why some cases require
     *  this to not have uninitialized values */
    CORE_zlaset( PlasmaUpperLower, IB, N,
                 0., 0., T, LDT);

    for(ii = 0; ii < M; ii += IB) {
        sb = min(M-ii, IB);
        for(i = 0; i < sb; i++) {
            j  = ii + i;
            mi = sb-i-1;
            ni = min( j + 1, N);
            /*
             * Generate elementary reflector H( II*IB+I ) to annihilate A( II*IB+I, II*IB+I:M ).
             */
#ifdef COMPLEX
            LAPACKE_zlacgv_work(ni, &A2[j], LDA2);
            LAPACKE_zlacgv_work(ione, &A1[LDA1*j+j], LDA1);
#endif
            LAPACKE_zlarfg_work(ni+1, &A1[LDA1*j+j], &A2[j], LDA2, &TAU[j]);

            if (mi > 0) {
                /*
                 * Apply H( j-1 ) to A( j:II+IB-1, j-1:M  ) from the right.
                 */
                cblas_zcopy(
                    mi,
                    &A1[LDA1*j+(j+1)], 1,
                    WORK, 1);

                cblas_zgemv(
                    CblasColMajor, (CBLAS_TRANSPOSE)PlasmaNoTrans,
                    mi, ni,
                    CBLAS_SADDR(zone), &A2[j+1], LDA2,
                    &A2[j], LDA2,
                    CBLAS_SADDR(zone), WORK, 1);

                alpha = -(TAU[j]);
                cblas_zaxpy(
                    mi, CBLAS_SADDR(alpha),
                    WORK, 1,
                    &A1[LDA1*j+j+1], 1);

                cblas_zgerc(
                    CblasColMajor, mi, ni,
                    CBLAS_SADDR(alpha), WORK, 1,
                    &A2[j], LDA2,
                    &A2[j+1], LDA2);
            }

            /*
             * Calculate T.
             */

            if (i > 0 ) {

                l = min(i, max(0, N-ii));
                alpha = -(TAU[j]);

                CORE_zpemv(
                        PlasmaNoTrans, PlasmaRowwise,
                        i , min(j, N), l,
                        alpha, &A2[ii], LDA2,
                        &A2[j], LDA2,
                        zzero, &T[LDT*j], 1,
                        WORK);

                /* T(0:i-1, j) = T(0:i-1, ii:j-1) * T(0:i-1, j) */
                cblas_ztrmv(
                        CblasColMajor, (CBLAS_UPLO)PlasmaUpper,
                        (CBLAS_TRANSPOSE)PlasmaNoTrans,
                        (CBLAS_DIAG)PlasmaNonUnit,
                        i, &T[LDT*ii], LDT,
                        &T[LDT*j], 1);

            }

#ifdef COMPLEX
            LAPACKE_zlacgv_work(ni, &A2[j], LDA2 );
            LAPACKE_zlacgv_work(ione, &A1[LDA1*j+j], LDA1 );
#endif

            T[LDT*j+i] = TAU[j];
        }

        /* Apply Q to the rest of the matrix to the right */
        if (M > ii+sb) {
            mi = M-(ii+sb);
            ni = min(ii+sb, N);
            l  = min(sb, max(0, ni-ii));
            CORE_zparfb(
                PlasmaRight, PlasmaNoTrans,
                PlasmaForward, PlasmaRowwise,
                mi, IB, mi, ni, sb, l,
                &A1[LDA1*ii+ii+sb], LDA1,
                &A2[ii+sb], LDA2,
                &A2[ii], LDA2,
                &T[LDT*ii], LDT,
                WORK, M);

        }
    }
    return PLASMA_SUCCESS;
}
Example #25
0
int CORE_zlatro(PLASMA_enum uplo, PLASMA_enum trans,
                int M, int N,
                const PLASMA_Complex64_t *A, int LDA,
                      PLASMA_Complex64_t *B, int LDB)
{
    int i, j;

    /* Check input arguments */
    if ((uplo != PlasmaUpper) && (uplo != PlasmaLower) && (uplo != PlasmaUpperLower) ) {
        coreblas_error(1, "Illegal value of uplo");
        return -1;
    }
    if ((trans != PlasmaConjTrans) && (trans != PlasmaNoTrans) && (trans != PlasmaTrans) ) {
        coreblas_error(2, "Illegal value of trans");
        return -2;
    }
    if (M < 0) {
        coreblas_error(3, "Illegal value of M");
        return -3;
    }
    if (N < 0) {
        coreblas_error(4, "Illegal value of N");
        return -4;
    }
    if ( (LDA < max(1,M)) && (M > 0) ) {
        coreblas_error(6, "Illegal value of LDA");
        return -6;
    }
    if ( (LDB < max(1,N)) && (N > 0) ) {
        coreblas_error(8, "Illegal value of LDB");
        return -8;
    }

    if (trans == PlasmaNoTrans) {
        CORE_zlacpy(uplo, M, N, A, LDA, B, LDB);
    }
    else {
        if (trans == PlasmaConjTrans) {
            if(uplo == PlasmaUpper) {
                for(j=0; j<N; j++)
                    for(i=0; i<min(j+1,M); i++)
                        B[j+i*LDB] = conj(A[i+j*LDA]);
            }
            else if(uplo == PlasmaLower) {
                for(j=0;j<N;j++)
                    for(i=j;i<M;i++)
                        B[j+i*LDB] = conj(A[i+j*LDA]);
            }
            else {
                for(j=0;j<N;j++)
                    for(i=0;i<M;i++)
                        B[j+i*LDB] = conj(A[i+j*LDA]);
            }
        }
        else {
            if(uplo==PlasmaUpper) {
                for(j=0;j<N;j++)
                    for(i=0;i<min(j+1,M);i++)
                        B[j+i*LDB] = A[i+j*LDA];
            }
            else if(uplo==PlasmaLower) {
                for(j=0;j<N;j++)
                    for(i=j;i<M;i++)
                        B[j+i*LDB] = A[i+j*LDA];
            }
            else {
                for(j=0;j<N;j++)
                    for(i=0;i<M;i++)
                        B[j+i*LDB] = A[i+j*LDA];
            }
        }
    }

    return PLASMA_SUCCESS;
}
Example #26
0
int CORE_ctsqrt(int M, int N, int IB,
                PLASMA_Complex32_t *A1, int LDA1,
                PLASMA_Complex32_t *A2, int LDA2,
                PLASMA_Complex32_t *T, int LDT,
                PLASMA_Complex32_t *TAU, PLASMA_Complex32_t *WORK)
{
    static PLASMA_Complex32_t zone  = 1.0;
    static PLASMA_Complex32_t zzero = 0.0;

    PLASMA_Complex32_t alpha;
    int i, ii, sb;

    /* Check input arguments */
    if (M < 0) {
        coreblas_error(1, "Illegal value of M");
        return -1;
    }
    if (N < 0) {
        coreblas_error(2, "Illegal value of N");
        return -2;
    }
    if (IB < 0) {
        coreblas_error(3, "Illegal value of IB");
        return -3;
    }
    if ((LDA2 < max(1,M)) && (M > 0)) {
        coreblas_error(8, "Illegal value of LDA2");
        return -8;
    }

    /* Quick return */
    if ((M == 0) || (N == 0) || (IB == 0))
        return PLASMA_SUCCESS;

    for(ii = 0; ii < N; ii += IB) {
        sb = min(N-ii, IB);
        for(i = 0; i < sb; i++) {
            /*
             * Generate elementary reflector H( II*IB+I ) to annihilate
             * A( II*IB+I:M, II*IB+I )
             */
            LAPACKE_clarfg_work(M+1, &A1[LDA1*(ii+i)+ii+i], &A2[LDA2*(ii+i)], 1, &TAU[ii+i]);

            if (ii+i+1 < N) {
                /*
                 * Apply H( II*IB+I ) to A( II*IB+I:M, II*IB+I+1:II*IB+IB ) from the left
                 */
                alpha = -conjf(TAU[ii+i]);
                cblas_ccopy(
                    sb-i-1,
                    &A1[LDA1*(ii+i+1)+(ii+i)], LDA1,
                    WORK, 1);
#ifdef COMPLEX
                LAPACKE_clacgv_work(sb-i-1, WORK, 1);
#endif
                cblas_cgemv(
                    CblasColMajor, (CBLAS_TRANSPOSE)PlasmaConjTrans,
                    M, sb-i-1,
                    CBLAS_SADDR(zone), &A2[LDA2*(ii+i+1)], LDA2,
                    &A2[LDA2*(ii+i)], 1,
                    CBLAS_SADDR(zone), WORK, 1);
#ifdef COMPLEX
                LAPACKE_clacgv_work(sb-i-1, WORK, 1 );
#endif
                cblas_caxpy(
                    sb-i-1, CBLAS_SADDR(alpha),
                    WORK, 1,
                    &A1[LDA1*(ii+i+1)+ii+i], LDA1);
#ifdef COMPLEX
                LAPACKE_clacgv_work(sb-i-1, WORK, 1 );
#endif
                cblas_cgerc(
                    CblasColMajor, M, sb-i-1, CBLAS_SADDR(alpha),
                    &A2[LDA2*(ii+i)], 1,
                    WORK, 1,
                    &A2[LDA2*(ii+i+1)], LDA2);
            }
            /*
             * Calculate T
             */
            alpha = -TAU[ii+i];
            cblas_cgemv(
                CblasColMajor, (CBLAS_TRANSPOSE)PlasmaConjTrans, M, i,
                CBLAS_SADDR(alpha), &A2[LDA2*ii], LDA2,
                &A2[LDA2*(ii+i)], 1,
                CBLAS_SADDR(zzero), &T[LDT*(ii+i)], 1);

            cblas_ctrmv(
                CblasColMajor, (CBLAS_UPLO)PlasmaUpper,
                (CBLAS_TRANSPOSE)PlasmaNoTrans, (CBLAS_DIAG)PlasmaNonUnit, i,
                &T[LDT*ii], LDT,
                &T[LDT*(ii+i)], 1);

            T[LDT*(ii+i)+i] = TAU[ii+i];
        }
        if (N > ii+sb) {
            CORE_ctsmqr(
                PlasmaLeft, PlasmaConjTrans,
                sb, N-(ii+sb), M, N-(ii+sb), IB, IB,
                &A1[LDA1*(ii+sb)+ii], LDA1,
                &A2[LDA2*(ii+sb)], LDA2,
                &A2[LDA2*ii], LDA2,
                &T[LDT*ii], LDT,
                WORK, sb);
        }
    }
    return PLASMA_SUCCESS;
}
Example #27
0
/***************************************************************************//**
 *
 * @ingroup CORE_PLASMA_Complex64_t
 *
 *  CORE_zgetf2_nopiv computes an LU factorization of a general diagonal
 *  dominant M-by-N matrix A witout no pivoting and no blocking. It is the
 *  internal function called by CORE_zgetrf_nopiv().
 *
 *  The factorization has the form
 *     A = L * U
 *  where L is lower triangular with unit
 *  diagonal elements (lower trapezoidal if m > n), and U is upper
 *  triangular (upper trapezoidal if m < n).
 *
 *  This is the right-looking Level 3 BLAS version of the algorithm.
 *
 *******************************************************************************
 *
 *  @param[in] M
 *          The number of rows of the matrix A.  M >= 0.
 *
 *  @param[in] N
 *          The number of columns of the matrix A.  N >= 0.
 *
 *  @param[in,out] A
 *          On entry, the M-by-N matrix to be factored.
 *          On exit, the factors L and U from the factorization
 *          A = P*L*U; the unit diagonal elements of L are not stored.
 *
 *  @param[in] LDA
 *          The leading dimension of the array A.  LDA >= max(1,M).
 *
 *******************************************************************************
 *
 * @return
 *         \retval PLASMA_SUCCESS successful exit
 *         \retval <0 if INFO = -k, the k-th argument had an illegal value
 *         \retval >0 if INFO = k, U(k,k) is exactly zero. The factorization
 *              has been completed, but the factor U is exactly
 *              singular, and division by zero will occur if it is used
 *              to solve a system of equations.
 *
 ******************************************************************************/
int
CORE_zgetf2_nopiv(int M, int N,
                  PLASMA_Complex64_t *A, int LDA)
{
    PLASMA_Complex64_t mzone = (PLASMA_Complex64_t)-1.0;
    PLASMA_Complex64_t alpha;
    double sfmin;
    int i, j, k;
    int info;

    /* Check input arguments */
    info = 0;
    if (M < 0) {
        coreblas_error(1, "Illegal value of M");
        return -1;
    }
    if (N < 0) {
        coreblas_error(2, "Illegal value of N");
        return -2;
    }
    if ((LDA < max(1,M)) && (M > 0)) {
        coreblas_error(5, "Illegal value of LDA");
        return -5;
    }

    /* Quick return */
    if ( (M == 0) || (N == 0) )
        return PLASMA_SUCCESS;

    sfmin = LAPACKE_dlamch_work('S');
    k = min(M, N);
    for(i=0 ; i < k; i++) {
        alpha = A[i*LDA+i];
        if ( alpha != (PLASMA_Complex64_t)0.0 ) {
            /* Compute elements J+1:M of J-th column. */
            if (i < M) {
                if ( cabs(alpha) > sfmin ) {
                    alpha = 1.0 / alpha;
                    cblas_zscal( M-i-1, CBLAS_SADDR(alpha), &(A[i*LDA+i+1]), 1);
                } else {
                    for(j=i+1; j<M; j++)
                        A[LDA*i+j] = A[LDA*i+j] / alpha;
                }
            }
        } else if ( info == 0 ) {
            info = i;
            goto end;
        }

        if ( i < k ) {
            /* Update trailing submatrix */
            cblas_zgeru(CblasColMajor,
                        M-i-1, N-i-1, CBLAS_SADDR(mzone),
                        &A[LDA* i   +i+1], 1,
                        &A[LDA*(i+1)+i  ], LDA,
                        &A[LDA*(i+1)+i+1], LDA);
        }
    }

 end:
    return info;
}