Example #1
0
void
GradDpElement :: computeStiffnessMatrix_ku(FloatMatrix &answer, MatResponseMode rMode, TimeStep *tStep)
{
    double dV;
    NLStructuralElement *elem = this->giveNLStructuralElement();
    FloatArray Nk;
    FloatMatrix B, DkuB, Dku;
    StructuralCrossSection *cs = elem->giveStructuralCrossSection();

    answer.clear();

    int nlGeo = elem->giveGeometryMode();

    for ( auto &gp: *elem->giveIntegrationRule(0) ) {

        GradDpMaterialExtensionInterface *dpmat = dynamic_cast< GradDpMaterialExtensionInterface * >(
            cs->giveMaterialInterface(GradDpMaterialExtensionInterfaceType, gp) );
        if ( !dpmat ) {
            OOFEM_ERROR("Material doesn't implement the required DpGrad interface!");
        }

        elem->computeBmatrixAt(gp, B);
        if ( nlGeo == 1 ) {
            if ( elem->domain->giveEngngModel()->giveFormulation() == AL ) {
                elem->computeBmatrixAt(gp, B);
            } else {
                elem->computeBHmatrixAt(gp, B);
            }
        }

        dpmat->givePDGradMatrix_ku(Dku, rMode, gp, tStep);
        this->computeNkappaMatrixAt(gp, Nk);
        dV = elem->computeVolumeAround(gp);
        DkuB.beProductOf(Dku, B);
        answer.plusProductUnsym(Nk, DkuB, -dV);

        if ( dpmat->giveAveragingType() == 2 ) {
            double dl1, dl2, dl3;
            FloatArray Gk;
            FloatMatrix D, DB, LDB;
            FloatMatrix Bk, BktM22, BktM22Gk, BktM12, BktM12Gk, M22(2, 2), M12(2, 2);
            FloatMatrix dL1(1, 3), dL2(1, 3), result1, result2, dLdS, n(2, 2);

            this->computeBkappaMatrixAt(gp, Bk);
            dpmat->givePDGradMatrix_uu(D, rMode, gp, tStep);
            dpmat->givePDGradMatrix_LD(dLdS, rMode, gp, tStep);
            this->computeNonlocalGradient(Gk, gp, tStep);

            dl1 = dLdS.at(3, 3);
            dl2 = dLdS.at(4, 4);
            dl3 = dLdS.at(5, 5);

            n.at(1, 1) = dLdS.at(1, 1);
            n.at(1, 2) = dLdS.at(1, 2);
            n.at(2, 1) = dLdS.at(2, 1);
            n.at(2, 2) = dLdS.at(2, 2);
            // first term Bk^T M22 G L1 D B
            // M22 = n2 \otimes n2
            M22.at(1, 1) = n.at(1, 2) * n.at(1, 2);
            M22.at(1, 2) = n.at(1, 2) * n.at(2, 2);
            M22.at(2, 1) = n.at(2, 2) * n.at(1, 2);
            M22.at(2, 2) = n.at(2, 2) * n.at(2, 2);
            // dL1
            dL1.at(1, 1) = dl1 * n.at(1, 1) * n.at(1, 1) + dl2 *n.at(1, 2) * n.at(1, 2);
            dL1.at(1, 2) = dl1 * n.at(2, 1) * n.at(2, 1) + dl2 *n.at(2, 2) * n.at(2, 2);
            dL1.at(1, 3) = dl1 * n.at(1, 1) * n.at(2, 1) + dl2 *n.at(1, 2) * n.at(2, 2);

            DB.beProductOf(D, B);
            LDB.beProductOf(dL1, DB);
            BktM22.beTProductOf(Bk, M22);
            ///@todo This can't possibly work if this is uncommented (!) / Mikael
            //BktM22Gk.beProductOf(BktM22,Gk);
            result1.beProductOf(BktM22Gk, LDB);
            answer.add(dV, result1);
            // This would be slightly shorter and faster;
            //GkLDB.beProductOf(Gk, LDB);
            //MGkLDB.beProductOf(M22, GkLDB);
            //answer.plusProductUnsym(Bk, MGkLDB, dV);

            // M12 + M21  = n1 \otimes n2 + n2 \otimes n1
            M12.at(1, 1) = n.at(1, 1) * n.at(1, 2) + n.at(1, 2) * n.at(1, 1);
            M12.at(1, 2) = n.at(1, 1) * n.at(2, 2) + n.at(1, 2) * n.at(2, 1);
            M12.at(2, 1) = n.at(2, 1) * n.at(1, 2) + n.at(2, 2) * n.at(1, 1);
            M12.at(2, 2) = n.at(2, 1) * n.at(2, 2) + n.at(2, 2) * n.at(2, 1);
            //dL2
            dL2.at(1, 1) = dl3 * ( n.at(1, 1) * n.at(1, 2) + n.at(1, 1) * n.at(1, 2) );
            dL2.at(1, 2) = dl3 * ( n.at(2, 1) * n.at(2, 2) + n.at(2, 1) * n.at(2, 2) );
            dL2.at(1, 3) = dl3 * ( n.at(1, 2) * n.at(2, 1) + n.at(1, 1) * n.at(2, 2) );

            LDB.beProductOf(dL2, DB);
            BktM12.beTProductOf(Bk, M12);
            ///@todo This can't possibly work if this is uncommented (!) / Mikael
            //BktM12Gk.beProductOf(BktM12,Gk);
            result2.beProductOf(BktM12Gk, LDB);
            answer.add(dV, result2);
            // This would be slightly shorter and faster;
            //GkLDB.beProductOf(Gk, LDB);
            //MGkLDB.beProductOf(M12, GkLDB);
            //answer.plusProductUnsym(Bk, MGkLDB, dV);
        }
    }
}
Example #2
0
/**
    Purpose
    -------
    CGETRF_INCPIV computes an LU factorization of a general M-by-N tile A
    using partial pivoting with row interchanges.

    The factorization has the form

      A = P * L * U

    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 2.5 BLAS version of the algorithm.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in]
    ib      INTEGER
            The inner-blocking size.  IB >= 0.

    @param[in,out]
    hA      COMPLEX array, dimension(LDHA, N), on cpu.
            On entry, only the M-by-IB first panel needs to be identical to dA(1..M, 1..IB).
            On exit, the content is incomplete. Shouldn't be used.

    @param[in]
    ldha    INTEGER
            The leading dimension of the array hA.  LDHA >= max(1,M).

    @param[in,out]
    dA      COMPLEX array, dimension(LDDA, N), on gpu.
            On entry, the M-by-N tile to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,M).

    @param[out]
    hL      COMPLEX array, dimension(LDHL, min(M,N)), on vpu.
            On exit, contains in the upper part the IB-by-K lower triangular tile,
            and in the lower part IB-by-min(M,N) the inverse of the top part.

    @param[in]
    ldhl    INTEGER
            The leading dimension of the array hL.  LDHL >= max(1,2*IB).

    @param[out]
    dL      COMPLEX array, dimension(LDDL, K), on gpu.
            On exit, contains in the upper part the IB-by-min(M,N) lower triangular tile,
            and in the lower part IB-by-min(M,N) the inverse of the top part.

    @param[in]
    lddl    INTEGER
            The leading dimension of the array dL.  LDDL >= max(1,2*IB).

    @param[out]
    ipiv    INTEGER array, dimension min(M,N), on the cpu.
            The pivot indices array.

    @param[out]
    dWORK   COMPLEX array, dimension(LDDWORK, 2*IB), on gpu.
            Workspace.

    @param[in]
    lddwork INTEGER
            The leading dimension of the array dWORK.  LDDWORK >= max(NB, 1).

    @param[out]
    info    INTEGER
            - PLASMA_SUCCESS successful exit
            - < 0 if INFO = -k, the k-th argument had an illegal value
            - > 0 if INFO = k, U(k,k) is exactly zero. The factorization
                has been completed, but the factor U is exactly
                singular, and division by zero will occur if it is used
                to solve a system of equations.

    @ingroup magma_cgesv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_cgetrf_incpiv_gpu( magma_order_t order, magma_int_t m, magma_int_t n, magma_int_t ib,
                         magmaFloatComplex *hA, magma_int_t ldha, magmaFloatComplex *dA, magma_int_t ldda,
                         magmaFloatComplex *hL, magma_int_t ldhl, magmaFloatComplex *dL, magma_int_t lddl,
                         magma_int_t *ipiv,
                         magmaFloatComplex *dwork, magma_int_t lddwork,
                         magma_int_t *info)
{
#define AT(i,j) (dAT + (i)*ib*ldda + (j)*ib)
#define hA(i,j) (hA  + (i)*ib + (j)*ib*ldha)
#define hL(j)   (hL  + (j)*ib*ldhl         )
#define hL2(j)  (hL2 + (j)*ib*ldhl         )
#define dL(j)   (dL  + (j)*ib*lddl         )
#define dL2(j)  (dL2 + (j)*ib*lddl         )

    magmaFloatComplex c_one     = MAGMA_C_ONE;
    magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE;

    magma_int_t iinfo;
    magma_int_t maxm, mindim;
    magma_int_t i, rows, cols, s, ii, sb;
    magmaFloatComplex *dAT;
#ifndef WITHOUTTRTRI
    magmaFloatComplex *dL2 = dL + ib;
    magmaFloatComplex *hL2 = hL + ib;
#endif

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (ldda < max(1,m))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    mindim = min(m, n);
    s      = mindim / ib;

    if ( ib >= mindim ) {
        /* Use CPU code. */
        lapackf77_cgetrf(&m, &n, hA, &ldha, ipiv, info);

#ifndef WITHOUTTRTRI
        CORE_clacpy(PlasmaUpperLower, mindim, mindim,
                    (PLASMA_Complex32_t*)hA, ldha,
                    (PLASMA_Complex32_t*)hL2, ldhl );

        CORE_ctrtri( PlasmaLower, PlasmaUnit, mindim,
                     (PLASMA_Complex32_t*)hL2, ldhl, info );
        if (*info != 0 ) {
            fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info);
        }

        magma_csetmatrix( mindim, mindim, hL2, ldhl, dL2, lddl );
#endif

        if ( order == MagmaRowMajor ) {
            magma_csetmatrix( m, n, hA, ldha, dwork, lddwork );
            magmablas_ctranspose( m, n, dwork, lddwork, dA, ldda );
        } else {
            magma_csetmatrix( m, n, hA, ldha, dA, ldda );
        }
    }
    else {
        /* Use hybrid blocked code. */
        maxm = ((m + 31)/32)*32;

        if ( order == MagmaColMajor ) {
            magmablas_cgetmo_in( dA, dAT, ldda, m, n );
        } else {
            dAT = dA;
        }

        for( i=0; i < s; i++ ) {
            ii = i * ib;
            sb = min(ib, mindim-ii);
            cols = maxm - ii;

            if ( i > 0 ) {
                // download i-th panel
                magmablas_ctranspose( sb, m, AT(0,i), ldda, dwork, maxm );
                magma_cgetmatrix( m, sb, dwork, maxm, hA(0, i), ldha );

                // make sure that gpu queue is empty
                //magma_device_sync();
#ifndef WITHOUTTRTRI
                magma_ctrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                             n - (ii+sb), ib,
                             c_one, dL2(i-1),    lddl,
                                    AT(i-1,i+1), ldda );
#else
                magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             n - (ii+sb), ib,
                             c_one, AT(i-1,i-1), ldda,
                                    AT(i-1,i+1), ldda );
#endif
                magma_cgemm( MagmaNoTrans, MagmaNoTrans,
                             n-(ii+sb), m-ii, ib,
                             c_neg_one, AT(i-1,i+1), ldda,
                                        AT(i,  i-1), ldda,
                             c_one,     AT(i,  i+1), ldda );
            }

            // do the cpu part
            rows = m - ii;
            lapackf77_cgetrf( &rows, &sb, hA(i, i), &ldha, ipiv+ii, &iinfo);
            if ( (*info == 0) && (iinfo > 0) )
                *info = iinfo + ii;

            {
                int j;
                int fin = ii + sb;
                for (j=ii; j < fin; j++) {
                    ipiv[j] = ii + ipiv[j];
                }
            }
            magmablas_claswp( n-ii, AT(0, i), ldda, ii+1, ii+sb, ipiv, 1 );

#ifndef WITHOUTTRTRI
            CORE_clacpy(PlasmaLower, sb, sb,
                        (PLASMA_Complex32_t*)hA(i, i), ldha,
                        (PLASMA_Complex32_t*)hL2(i), ldhl );

            CORE_ctrtri( PlasmaLower, PlasmaUnit, sb,
                         (PLASMA_Complex32_t*)hL2(i), ldhl, info );
            if (*info != 0 ) {
                fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info);
            }
            magma_csetmatrix( sb, sb, hL2(i), ldhl, dL2(i), lddl );
#endif
            // upload i-th panel
            magma_csetmatrix( rows, sb, hA(i, i), ldha, dwork, cols );
            magmablas_ctranspose( rows, sb, dwork, cols, AT(i,i), ldda );

            // do the small non-parallel computations
            if ( s > (i+1) ) {
#ifndef WITHOUTTRTRI
                magma_ctrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                             sb, sb,
                             c_one, dL2(i),     lddl,
                                    AT(i, i+1), ldda);
#else
                magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             sb, sb,
                             c_one, AT(i, i  ), ldda,
                                    AT(i, i+1), ldda);
#endif
                magma_cgemm( MagmaNoTrans, MagmaNoTrans,
                             sb, m-(ii+sb), sb,
                             c_neg_one, AT(i,   i+1), ldda,
                                        AT(i+1, i  ), ldda,
                             c_one,     AT(i+1, i+1), ldda );
            }
            else {
                /* Update of the last panel */
#ifndef WITHOUTTRTRI
                magma_ctrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                             n-mindim, sb,
                             c_one, dL2(i),     lddl,
                                    AT(i, i+1), ldda);
#else
                magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             n-mindim, sb,
                             c_one, AT(i, i  ), ldda,
                                    AT(i, i+1), ldda);
#endif
                /* m-(ii+sb) should be always 0 */
                magma_cgemm( MagmaNoTrans, MagmaNoTrans,
                             n-mindim, m-(ii+sb), sb,
                             c_neg_one, AT(i,   i+1), ldda,
                                        AT(i+1, i  ), ldda,
                             c_one,     AT(i+1, i+1), ldda );
            }
        }

        if ( order == MagmaColMajor ) {
            magmablas_cgetmo_out( dA, dAT, ldda, m, n );
        }
    }
    return *info;
}
Example #3
0
void
GradDpElement :: computeStiffnessMatrix(FloatMatrix &answer, MatResponseMode rMode, TimeStep *tStep)
{
    //set displacement and nonlocal location array
    this->setDisplacementLocationArray();
    this->setNonlocalLocationArray();

    NLStructuralElement *elem = this->giveNLStructuralElement();
    StructuralCrossSection *cs = elem->giveStructuralCrossSection();
    FloatMatrix B, D, DB;
    FloatMatrix DkuB, Dku;
    FloatArray Nk;
    FloatMatrix SNk, gPSigma;
    FloatMatrix lStiff;
    FloatMatrix Bk, LBk;
    FloatMatrix answer_uu, answer_ku, answer_uk, answer_kk;

    int nlGeo = elem->giveGeometryMode();
    bool matStiffSymmFlag = elem->giveCrossSection()->isCharacteristicMtrxSymmetric(rMode);

    for ( auto &gp : *elem->giveIntegrationRule(0) ) {
        GradDpMaterialExtensionInterface *dpmat = dynamic_cast< GradDpMaterialExtensionInterface * >(
            cs->giveMaterialInterface(GradDpMaterialExtensionInterfaceType, gp) );
        if ( !dpmat ) {
            OOFEM_ERROR("Material doesn't implement the required DpGrad interface!");
        }

        double dV = elem->computeVolumeAround(gp);

        if ( nlGeo == 0 ) {
            elem->computeBmatrixAt(gp, B);
        } else if ( nlGeo == 1 ) {
            if ( elem->domain->giveEngngModel()->giveFormulation() == AL ) {
                elem->computeBmatrixAt(gp, B);
            } else {
                elem->computeBHmatrixAt(gp, B);
            }
        }
        this->computeNkappaMatrixAt(gp, Nk);
        this->computeBkappaMatrixAt(gp, Bk);

        dpmat->givePDGradMatrix_uu(D, rMode, gp, tStep);
        dpmat->givePDGradMatrix_ku(Dku, rMode, gp, tStep);
        dpmat->givePDGradMatrix_uk(gPSigma, rMode, gp, tStep);
        dpmat->givePDGradMatrix_kk(lStiff, rMode, gp, tStep);

        /////////////////////////////////////////////////////////////////// uu:
        DB.beProductOf(D, B);
        if ( matStiffSymmFlag ) {
            answer_uu.plusProductSymmUpper(B, DB, dV);
        } else {
            answer_uu.plusProductUnsym(B, DB, dV);
        }

        //////////////////////////////////////////////////////////////////////// ku:
        DkuB.beProductOf(Dku, B);
        answer_ku.plusProductUnsym(Nk, DkuB, -dV);

        if ( dpmat->giveAveragingType() == 2 ) {
            double dl1, dl2, dl3;
            FloatMatrix LDB;
            FloatMatrix GkLDB, MGkLDB;
            FloatMatrix M22, M12;
            FloatMatrix dL1(1, 3), dL2(1, 3), dLdS;
            FloatArray Gk, n1, n2;


            dpmat->givePDGradMatrix_LD(dLdS, rMode, gp, tStep);
            this->computeNonlocalGradient(Gk, gp, tStep);

            dl1 = dLdS.at(3, 3);
            dl2 = dLdS.at(4, 4);
            dl3 = dLdS.at(5, 5);
            n1 = {dLdS.at(1, 1), dLdS.at(2, 1)};
            n2 = {dLdS.at(1, 2), dLdS.at(2, 2)};

            // first term Bk^T M22 G L1 D B
            // M22 = n2 \otimes n2
            M22.plusDyadUnsym(n2, n2, 1.);
            // dL1
            dL1.at(1, 1) = dl1 * n1.at(1) * n1.at(1) + dl2 * n2.at(1) * n2.at(1);
            dL1.at(1, 2) = dl1 * n1.at(2) * n1.at(2) + dl2 * n2.at(2) * n2.at(2);
            dL1.at(1, 3) = dl1 * n1.at(1) * n1.at(2) + dl2 * n2.at(1) * n2.at(2);

            LDB.beProductOf(dL1, DB);
            GkLDB.beProductOf(Gk, LDB);
            MGkLDB.beProductOf(M22, GkLDB);
            answer.plusProductUnsym(Bk, MGkLDB, dV);

            // M12 + M21  = n1 \otimes n2 + n2 \otimes n1
            M12.plusDyadUnsym(n1, n2, 1.);
            M12.plusDyadUnsym(n2, n1, 1.);
            //dL2
            dL2.at(1, 1) = dl3 * ( n1.at(1) * n2.at(1) + n1.at(1) * n2.at(1) );
            dL2.at(1, 2) = dl3 * ( n1.at(2) * n2.at(2) + n1.at(2) * n2.at(2) );
            dL2.at(1, 3) = dl3 * ( n1.at(2) * n2.at(1) + n1.at(1) * n2.at(2) );

            // Bk * ((M12 * L2 + M22 * L1) * DB)
            LDB.beProductOf(dL2, DB);
            GkLDB.beProductOf(Gk, LDB);
            MGkLDB.beProductOf(M12, GkLDB);
            answer.plusProductUnsym(Bk, MGkLDB, dV);
        }

        //////////////////////////////////////////////////////////////////////// uk:
        SNk.beProductOf(gPSigma, Nk);
        answer_uk.plusProductUnsym(B, SNk, -dV); // uk

        /////////////////////////////////////////////////////////////////////// kk:
        answer_kk.plusProductUnsym(Nk, Nk, dV);
        if ( dpmat->giveAveragingType() == 0 || dpmat->giveAveragingType() == 1 ) {
            double l = lStiff.at(1, 1);
            answer_kk.plusProductUnsym(Bk, Bk, l * l * dV);
        } else if ( dpmat->giveAveragingType() == 2 ) {
            LBk.beProductOf(lStiff, Bk);
            answer_kk.plusProductUnsym(Bk, LBk, dV);
        }
    }

    if ( elem->domain->giveEngngModel()->giveFormulation() == AL ) {
        FloatMatrix initialStressMatrix;
        elem->computeInitialStressMatrix(initialStressMatrix, tStep);
        answer_uu.add(initialStressMatrix);
    }

    if ( matStiffSymmFlag ) {
        answer_uu.symmetrized();
    }

    answer.resize(totalSize, totalSize);
    answer.zero();
    answer.assemble(answer_uu, locU);
    answer.assemble(answer_uk, locU, locK);
    answer.assemble(answer_ku, locK, locU);
    answer.assemble(answer_kk,locK);
}
Example #4
0
extern "C" magma_int_t
magma_sgetrf_incpiv_gpu( char storev, magma_int_t m, magma_int_t n, magma_int_t ib,
                         float *hA, magma_int_t ldha, float *dA, magma_int_t ldda,
                         float *hL, magma_int_t ldhl, float *dL, magma_int_t lddl,
                         magma_int_t *ipiv, 
                         float *dwork, magma_int_t lddwork,
                         magma_int_t *info)
{
/*  -- MAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       November 2012

    Purpose
    =======

    SGETRF_INCPIV computes an LU factorization of a general M-by-N tile A
    using partial pivoting with row interchanges.
  
    The factorization has the form
  
      A = P * L * U
  
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).
  
    This is the right-looking Level 2.5 BLAS version of the algorithm.

    Arguments
    =========

    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    IB      (input) INTEGER
            The inner-blocking size.  IB >= 0.

    hA      (input,output) DOUBLE COMPLEX array, dimension(LDHA, N), on cpu.
            On entry, only the M-by-IB first panel needs to be identical to dA(1..M, 1..IB).
            On exit, the content is incomplete. Shouldn't be used.
 
    LDHA    (input) INTEGER
            The leading dimension of the array hA.  LDHA >= max(1,M).
 
    dA      (input,output) DOUBLE COMPLEX array, dimension(LDDA, N) , on gpu.
            On entry, the M-by-N tile to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.
 
    LDDA    (input) INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,M).
 
    hL      (output) DOUBLE COMPLEX array, dimension(LDHL, min(M,N)), on vpu.
            On exit, contains in the upper part the IB-by-K lower triangular tile,
            and in the lower part IB-by-min(M,N) the inverse of the top part.
 
    LDHL    (input) INTEGER
            The leading dimension of the array hL.  LDHL >= max(1,2*IB).
 
    dL      (output) DOUBLE COMPLEX array, dimension(LDDL, K), on gpu.
            On exit, contains in the upper part the IB-by-min(M,N) lower triangular tile,
            and in the lower part IB-by-min(M,N) the inverse of the top part.
 
    LDDL    (input) INTEGER
            The leading dimension of the array dL.  LDDL >= max(1,2*IB).
 
    IPIV    (output) INTEGER array, dimension min(M,N), on the cpu.
            The pivot indices array.
 
    dWORK   (output) DOUBLE COMPLEX array, dimension(LDDWORK, 2*IB), on gpu.
            Workspace.

    LDDWORK (input) INTEGER
            The leading dimension of the array dWORK.  LDDWORK >= max(NB, 1).
 
    INFO    (output) INTEGER
            - PLASMA_SUCCESS successful exit
            - < 0 if INFO = -k, the k-th argument had an illegal value
            - > 0 if INFO = k, U(k,k) is exactly zero. The factorization
                has been completed, but the factor U is exactly
                singular, and division by zero will occur if it is used
                to solve a system of equations.
           
    =====================================================================    */

#define AT(i,j) (dAT + (i)*ib*ldda + (j)*ib)
#define hA(i,j) (hA  + (i)*ib + (j)*ib*ldha)
#define hL(j)   (hL  + (j)*ib*ldhl         )
#define hL2(j)  (hL2 + (j)*ib*ldhl         )
#define dL(j)   (dL  + (j)*ib*lddl         )
#define dL2(j)  (dL2 + (j)*ib*lddl         )

    float c_one     = MAGMA_S_ONE;
    float c_neg_one = MAGMA_S_NEG_ONE;

    magma_int_t iinfo;
    magma_int_t maxm, mindim;
    magma_int_t i, rows, cols, s, ii, sb;
    float *dAT;
#ifndef WITHOUTTRTRI
    float *dL2 = dL + ib;
    float *hL2 = hL + ib;
#endif

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (ldda < max(1,m))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    mindim = min(m, n);
    s      = mindim / ib;

    if ( ib >= mindim ) {
        /* Use CPU code. */
        lapackf77_sgetrf(&m, &n, hA, &ldha, ipiv, info);

#ifndef WITHOUTTRTRI
        CORE_slacpy(PlasmaUpperLower, mindim, mindim, 
                    (float*)hA, ldha, 
                    (float*)hL2, ldhl );

        CORE_strtri( PlasmaLower, PlasmaUnit, mindim, 
                     (float*)hL2, ldhl, info );
        if (*info != 0 ) {
          fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info);
        }          

        magma_ssetmatrix( mindim, mindim, hL2, ldhl, dL2, lddl );
#endif
            
        if ( (storev == 'R') || (storev == 'r') ) {
            magma_ssetmatrix( m, n, hA, ldha, dwork, lddwork );
            magmablas_stranspose( dA, ldda, dwork, lddwork, m, n );
        } else {
            magma_ssetmatrix( m, n, hA, ldha, dA, ldda );
        }
    }
    else {
        /* Use hybrid blocked code. */
        maxm = ((m + 31)/32)*32;

        if ( (storev == 'C') || (storev == 'c') ) {
            magmablas_sgetmo_in( dA, dAT, ldda, m, n );
        } else {
            dAT = dA;
        }
            
        for( i=0; i<s; i++ )
        {
            ii = i * ib;
            sb = min(ib, mindim-ii);
            cols = maxm - ii;

            if ( i>0 ){
                // download i-th panel
                magmablas_stranspose( dwork, maxm, AT(0, i), ldda, sb, m );
                magma_sgetmatrix( m, sb, dwork, maxm, hA(0, i), ldha );
                
                // make sure that gpu queue is empty
                //magma_device_sync();
#ifndef WITHOUTTRTRI
                magma_strmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, 
                             n - (ii+sb), ib, 
                             c_one, dL2(i-1),    lddl, 
                                    AT(i-1,i+1), ldda );
#else
                magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                             n - (ii+sb), ib, 
                             c_one, AT(i-1,i-1), ldda, 
                                    AT(i-1,i+1), ldda );
#endif
                magma_sgemm( MagmaNoTrans, MagmaNoTrans, 
                             n-(ii+sb), m-ii, ib, 
                             c_neg_one, AT(i-1,i+1), ldda, 
                                        AT(i,  i-1), ldda, 
                             c_one,     AT(i,  i+1), ldda );
            }

            // do the cpu part
            rows = m - ii;
            lapackf77_sgetrf( &rows, &sb, hA(i, i), &ldha, ipiv+ii, &iinfo);
            if ( (*info == 0) && (iinfo > 0) )
                *info = iinfo + ii;

            { 
                int j;
                int fin = ii + sb;
                for(j=ii ; j <fin; j++) {
                    ipiv[j] = ii + ipiv[j];
                }
            }
            magmablas_slaswp( n-ii, AT(0, i), ldda, ii+1, ii+sb, ipiv, 1 );

#ifndef WITHOUTTRTRI
            CORE_slacpy(PlasmaLower, sb, sb, 
                        (float*)hA(i, i), ldha, 
                        (float*)hL2(i), ldhl );
            
            CORE_strtri( PlasmaLower, PlasmaUnit, sb, 
                         (float*)hL2(i), ldhl, info );
            if (*info != 0 ) {
              fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info);
            }
            magma_ssetmatrix( sb, sb, hL2(i), ldhl, dL2(i), lddl );
#endif
            // upload i-th panel
            magma_ssetmatrix( rows, sb, hA(i, i), ldha, dwork, cols );
            magmablas_stranspose( AT(i,i), ldda, dwork, cols, rows, sb);

            // do the small non-parallel computations
            if ( s > (i+1) ) {
#ifndef WITHOUTTRTRI
                magma_strmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, 
                             sb, sb, 
                             c_one, dL2(i),     lddl,
                                    AT(i, i+1), ldda);
#else
                magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                             sb, sb, 
                             c_one, AT(i, i  ), ldda,
                                    AT(i, i+1), ldda);
#endif
                magma_sgemm( MagmaNoTrans, MagmaNoTrans, 
                             sb, m-(ii+sb), sb, 
                             c_neg_one, AT(i,   i+1), ldda,
                                        AT(i+1, i  ), ldda, 
                             c_one,     AT(i+1, i+1), ldda );
            }
            else {
                /* Update of the last panel */
#ifndef WITHOUTTRTRI
                magma_strmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, 
                             n-mindim, sb, 
                             c_one, dL2(i),     lddl,
                                    AT(i, i+1), ldda);
#else
                magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, 
                             n-mindim, sb, 
                             c_one, AT(i, i  ), ldda,
                                    AT(i, i+1), ldda);
#endif
                /* m-(ii+sb) should be always 0 */
                magma_sgemm( MagmaNoTrans, MagmaNoTrans, 
                             n-mindim, m-(ii+sb), sb,
                             c_neg_one, AT(i,   i+1), ldda,
                                        AT(i+1, i  ), ldda, 
                             c_one,     AT(i+1, i+1), ldda );
            }
        }

        if ( (storev == 'C') || (storev == 'c') ) {
            magmablas_sgetmo_out( dA, dAT, ldda, m, n );
        }
    }
    return *info;
}