Exemple #1
0
extern "C" void
magma_ctrdtype1cbHLsym_withQ(
    magma_int_t N, magma_int_t NB,
    magmaFloatComplex *A, magma_int_t LDA,
    magmaFloatComplex *V, magmaFloatComplex *TAU,
    magma_int_t st, magma_int_t ed, magma_int_t sweep, magma_int_t Vblksiz)
{
    //magma_int_t    J1, J2, J3, i, j;
    magma_int_t    len, LDX;
    magma_int_t    IONE=1;
    magma_int_t    blkid, vpos, taupos, tpos;
    //magmaFloatComplex conjtmp;
    magmaFloatComplex Z_ONE  =  MAGMA_C_ONE;
    magmaFloatComplex *WORK;
    magma_cmalloc_cpu( &WORK, N );
    
    
    findVTpos(N,NB,Vblksiz,sweep-1,st-1, &vpos, &taupos, &tpos, &blkid);
    //printf("voici vpos %d taupos %d  tpos %d  blkid %d \n", vpos, taupos, tpos, blkid);
    LDX     = LDA-1;
    len     = ed-st+1;
    *V(vpos)  = Z_ONE;
    memcpy(V(vpos+1), A(st+1, st-1), (len-1)*sizeof(magmaFloatComplex));
    memset(A(st+1, st-1), 0, (len-1)*sizeof(magmaFloatComplex));
    /* Eliminate the col  at st-1 */
    lapackf77_clarfg( &len, A(st, st-1), V(vpos+1), &IONE, TAU(taupos) );
    /* apply left and right on A(st:ed,st:ed)*/
    magma_clarfxsym(len,A(st,st),LDX,V(vpos),TAU(taupos));
    //conjtmp = MAGMA_C_CONJ(*TAU(taupos));
    //lapackf77_clarfy("L", &len, V(vpos), &IONE, &conjtmp, A(st,st), &LDX, WORK); //&(MAGMA_C_CONJ(*TAU(taupos)))
    magma_free_cpu(WORK);
}
Exemple #2
0
extern "C" void
magma_ctrdtype3cbHLsym_withQ(
    magma_int_t N, magma_int_t NB,
    magmaFloatComplex *A, magma_int_t LDA,
    magmaFloatComplex *V, magmaFloatComplex *TAU,
    magma_int_t st, magma_int_t ed, magma_int_t sweep, magma_int_t Vblksiz)
{
    //magma_int_t    J1, J2, J3, i, j;
    magma_int_t    len, LDX;
    //magma_int_t    IONE=1;
    magma_int_t    blkid, vpos, taupos, tpos;
    //magmaFloatComplex conjtmp;
    magmaFloatComplex *WORK;
    magma_cmalloc_cpu( &WORK, N );
    
    
    findVTpos(N,NB,Vblksiz,sweep-1,st-1, &vpos, &taupos, &tpos, &blkid);
    LDX    = LDA-1;
    len    = ed-st+1;
    
    /* apply left and right on A(st:ed,st:ed)*/
    magma_clarfxsym(len,A(st,st),LDX,V(vpos),TAU(taupos));
    //conjtmp = MAGMA_C_CONJ(*TAU(taupos));
    //lapackf77_clarfy("L", &len, V(vpos), &IONE,  &(MAGMA_C_CONJ(*TAU(taupos))), A(st,st), &LDX, WORK);
    magma_free_cpu(WORK);
}
Exemple #3
0
extern "C" void
magma_ctrdtype2cbHLsym_withQ(
    magma_int_t N, magma_int_t NB,
    magmaFloatComplex *A, magma_int_t LDA,
    magmaFloatComplex *V, magmaFloatComplex *TAU,
    magma_int_t st, magma_int_t ed, magma_int_t sweep, magma_int_t Vblksiz)
{
    magma_int_t    J1, J2, len, lem, LDX;
    //magma_int_t    i, j;
    magma_int_t    IONE=1;
    magma_int_t    blkid, vpos, taupos, tpos;
    magmaFloatComplex conjtmp;
    magmaFloatComplex Z_ONE  =  MAGMA_C_ONE;
    //magmaFloatComplex WORK[NB];
    magmaFloatComplex *WORK;
    magma_cmalloc_cpu( &WORK, NB );
    
    
    findVTpos(N,NB,Vblksiz,sweep-1,st-1, &vpos, &taupos, &tpos, &blkid);
    LDX    = LDA-1;
    J1     = ed+1;
    J2     = min(ed+NB,N);
    len    = ed-st+1;
    lem    = J2-J1+1;
    if (lem > 0) {
        /* apply remaining right commming from the top block */
        lapackf77_clarfx("R", &lem, &len, V(vpos), TAU(taupos), A(J1, st), &LDX, WORK);
    }
    if (lem > 1) {
        findVTpos(N,NB,Vblksiz,sweep-1,J1-1, &vpos, &taupos, &tpos, &blkid);
        /* remove the first column of the created bulge */
        *V(vpos)  = Z_ONE;
        memcpy(V(vpos+1), A(J1+1, st), (lem-1)*sizeof(magmaFloatComplex));
        memset(A(J1+1, st),0,(lem-1)*sizeof(magmaFloatComplex));
        /* Eliminate the col at st */
        lapackf77_clarfg( &lem, A(J1, st), V(vpos+1), &IONE, TAU(taupos) );
        /* apply left on A(J1:J2,st+1:ed) */
        len = len-1; /* because we start at col st+1 instead of st. col st is the col that has been revomved; */
        conjtmp = MAGMA_C_CONJ(*TAU(taupos));
        lapackf77_clarfx("L", &lem, &len, V(vpos),  &conjtmp, A(J1, st+1), &LDX, WORK);
    }
    magma_free_cpu(WORK);
}
Exemple #4
0
extern "C" void magma_zbulge_applyQ(
    magma_int_t WANTZ, magma_side_t SIDE, magma_int_t NE, magma_int_t N, magma_int_t NB,
    magma_int_t Vblksiz, magmaDoubleComplex *E, magma_int_t LDE,
    magmaDoubleComplex *V, magmaDoubleComplex *TAU, magmaDoubleComplex *T,
    magma_int_t *INFO, magmaDoubleComplex *dV, magmaDoubleComplex *dT,
    magmaDoubleComplex *dE, magma_int_t copytype )
{
    //%===========================
    //%   local variables
    //%===========================
    magmaDoubleComplex c_zero = MAGMA_Z_ZERO;
    magmaDoubleComplex c_one  = MAGMA_Z_ONE;
    
    magma_int_t LDT, LDV, firstcolj;
    magma_int_t bg, nbGblk, rownbm, k, m, n;
    magma_int_t st, ed, fst, vlen, vnb, colj, len;
    magma_int_t blkid, vpos, taupos, tpos;
    //magmaDoubleComplex *WORK;
    magma_int_t LWORK;
    magma_int_t  cur_blksiz, avai_blksiz, ncolinvolvd;
    magma_int_t  nbgr, colst, coled, versionL, versionR;
    magma_int_t blkcnt=-1;

    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );
    
    *INFO=0;
    versionL = 113;
    versionR = 92;
    LDT      = Vblksiz;
    LDV      = NB+Vblksiz-1;
    //blklen = LDV*Vblksiz;
    nbGblk   = plasma_ceildiv((N-1), Vblksiz);
    //magma_zmalloc_cpu( &WORK, LWORK );

    /* find the size of the matrix T V*/
    findVTsiz(N, NB, Vblksiz, &blkcnt, &LDV);
    /* Copy E & V & T to the GPU in dE and dV and dT
     * depending on copytype:
     * 1: mean copy only V
     * 2: mean copy V and T
     * 3: mean copy V, T and E
     * */
    if (copytype > 0) magma_zsetmatrix( LDV, blkcnt*Vblksiz, V, LDV, dV, LDV );
    if (copytype > 1) magma_zsetmatrix( LDT, blkcnt*Vblksiz, T, LDT, dT, LDT );
    if (copytype > 2) magma_zsetmatrix( N, NE, E, N, dE, N );
    magmaDoubleComplex *dwork;
    //ldwork  = NE;
    LWORK   = 2*N*max(Vblksiz, 64);
    if (MAGMA_SUCCESS != magma_zmalloc( &dwork, LWORK )) {
        printf ("!!!!  magma_zbulge_applyQ magma_alloc failed for: dwork\n" );
        exit(-1);
    }

    /* SIDE LEFT  meaning apply E = Q*E = (q_1*q_2*.....*q_n) * E ==> so traverse Vs in reverse order (forward) from q_n to q_1
     *            Also E is splitten by row meaning each apply consist in a block of row (horizontal block) */
    /* SIDE RIGHT meaning apply E = E*Q = E * (q_1*q_2*.....*q_n) ==> so tarverse Vs in normal  order (forward) from q_1 to q_n
     *            Also E is splitten by col meaning each apply consist in a block of col (vertical block) */

    /* WANTZ = 1 meaning E is IDENTITY so form Q using optimized update.
     *         So we use the reverse order from small q to large one,
     *         so from q_n to q_1 so Left update to Identity.
     *         Use versionL 113 because in 114 we need to update the whole matrix and not in icreasing order.
     * WANTZ = 2 meaning E is a full matrix and need to be updated from Left or Right so use normal update
     * */
    if (WANTZ == 1) {
        versionL=113;
        SIDE = MagmaLeft;
        //set the matrix to Identity here to avoid copying it from the CPU
        magmablas_zlaset( MagmaFull, N, N, c_zero, c_one, dE, N );
    }
    


    printf("  APPLY Q_v115 GPU with  N %d   NB %d   Vblksiz %d SIDE %c versionL %d versionR %d WANTZ %d \n",
           (int) N, (int) NB, (int) Vblksiz, SIDE, (int) versionL, (int) versionR, (int) WANTZ);


#if defined(USESTREAM)
    magma_int_t N2=N/2;
    magma_int_t N1=N-N2;
    printf("using stream\n");
    magma_queue_t stream[2];
    magma_queue_create( &stream[0] );
    magma_queue_create( &stream[1] );
#endif
    

    if (SIDE == MagmaLeft) {
        if (versionL == 113) {
            for (bg = nbGblk; bg > 0; bg--) {
                firstcolj = (bg-1)*Vblksiz + 1;
                if (bg == nbGblk)
                    rownbm = plasma_ceildiv((N-(firstcolj)), NB);  // last blk has size=1 used for complex to handle A(N,N-1)
                else
                    rownbm = plasma_ceildiv((N-(firstcolj+1)), NB);
                
                for (m = rownbm; m > 0; m--) {
                    vlen = 0;
                    vnb  = 0;
                    colj = (bg-1)*Vblksiz; // for k=0; I compute the fst and then can remove it from the loop
                    fst  = (rownbm -m)*NB+colj +1;
                    for (k=0; k < Vblksiz; k++) {
                        colj = (bg-1)*Vblksiz + k;
                        st   = (rownbm -m)*NB+colj +1;
                        ed   = min(st+NB-1, N-1);
                        if (st > ed) break;
                        if ((st == ed) && (colj != N-2)) break;
                        vlen=ed-fst+1;
                        vnb=k+1;
                    }
                    colst     = (bg-1)*Vblksiz;
                    findVTpos(N, NB, Vblksiz, colst, fst, &vpos, &taupos, &tpos, &blkid);
                    printf("voici bg %d m %d  vlen %d  vnb %d fcolj %d vpos %d taupos %d \n", (int) bg, (int) m, (int) vlen, (int) vnb, (int) colst+1, (int) vpos+1, (int) taupos+1);
                    if ((vlen > 0) && (vnb > 0)) {
                        if (WANTZ == 1) {
                            len =  N-colst;
                            magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, vlen, len, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,colst), LDE, dwork, len);
                        } else {
                            magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, vlen, NE, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,0), LDE, dwork, NE);
                        }
                    }
                }
            }
        } else if (versionL == 114) {
            rownbm = plasma_ceildiv((N-1), NB);
            for (m = rownbm; m > 0; m--) {
                ncolinvolvd = min(N-1, m*NB);
                avai_blksiz=min(Vblksiz, ncolinvolvd);
                nbgr = plasma_ceildiv(ncolinvolvd, avai_blksiz);
                for (n = nbgr; n > 0; n--) {
                    vlen = 0;
                    vnb  = 0;
                    cur_blksiz = min(ncolinvolvd-(n-1)*avai_blksiz, avai_blksiz);
                    colst = (n-1)*avai_blksiz;
                    coled = colst + cur_blksiz -1;
                    fst   = (rownbm -m)*NB+colst +1;
                    for (colj=colst; colj <= coled; colj++) {
                        st = (rownbm -m)*NB+colj +1;
                        ed = min(st+NB-1, N-1);
                        if (st > ed) break;
                        if ((st == ed) && (colj != N-2)) break;
                        vlen=ed-fst+1;
                        vnb=vnb+1;
                    }
                    findVTpos(N, NB, Vblksiz, colst, fst, &vpos, &taupos, &tpos, &blkid);
                    //printf("voici bg %d m %d  vlen %d  vnb %d fcolj %d vpos %d taupos %d \n", bg, m, vlen, vnb, colst+1, vpos+1, taupos+1);
                    if ((vlen > 0) && (vnb > 0)) {
                        #if defined(USESTREAM)
                        magmablasSetKernelStream(stream[0]);
                        magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, vlen, N1, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,0), LDE, dwork, N1);
                        magmablasSetKernelStream(stream[1]);
                        magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, vlen, N2, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,N1), LDE, &dwork[N1*Vblksiz], N2);
                        #else
                        magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, vlen, NE, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,0), LDE, dwork, NE);
                        #endif
                    }
                }
            }
        }
    } else if (SIDE == MagmaRight) {
        if (versionR == 91) {
            for (bg =1; bg <= nbGblk; bg++) {
                firstcolj = (bg-1)*Vblksiz + 1;
                rownbm    = plasma_ceildiv((N-(firstcolj+1)), NB);
                if (bg == nbGblk) rownbm    = plasma_ceildiv((N-(firstcolj)), NB);  // last blk has size=1 used for complex to handle A(N,N-1)
                for (m = 1; m <= rownbm; m++) {
                    vlen = 0;
                    vnb  = 0;
                    // for k=0; I compute the fst and then can remove it from the loop
                    colj = (bg-1)*Vblksiz;
                    fst  = (rownbm -m)*NB+colj +1;
                    for (k=0; k < Vblksiz; k++) {
                        colj = (bg-1)*Vblksiz + k;
                        st   = (rownbm -m)*NB+colj +1;
                        ed   = min(st+NB-1, N-1);
                        if (st > ed) break;
                        if ((st == ed) && (colj != N-2)) break;
                        vlen=ed-fst+1;
                        vnb=k+1;
                    }
                    colj     = (bg-1)*Vblksiz;
                    findVTpos(N, NB, Vblksiz, colj, fst, &vpos, &taupos, &tpos, &blkid);
                    //printf("voici bg %d m %d  vlen %d  vnb %d fcolj %d vpos %d taupos %d \n", bg, m, vlen, vnb, colj, vpos, taupos);
                    if ((vlen > 0) && (vnb > 0)) {
                        #if defined(USESTREAM)
                        magmablasSetKernelStream(stream[0]);
                        magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, N1, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(0, fst), LDE, dwork, N1);
                        magmablasSetKernelStream(stream[1]);
                        magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, N2, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(N1, fst), LDE, &dwork[N1*Vblksiz], N2);
                        #else
                        magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, NE, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(0, fst), LDE, dwork, NE);
                        #endif
                    }
                }
            }
        } else if (versionR == 92) {
            rownbm = plasma_ceildiv((N-1), NB);
            for (m = 1; m <= rownbm; m++) {
                ncolinvolvd = min(N-1, m*NB);
                avai_blksiz=min(Vblksiz, ncolinvolvd);
                nbgr = plasma_ceildiv(ncolinvolvd, avai_blksiz);
                for (n = 1; n <= nbgr; n++) {
                    vlen = 0;
                    vnb  = 0;
                    cur_blksiz = min(ncolinvolvd-(n-1)*avai_blksiz, avai_blksiz);
                    colst = (n-1)*avai_blksiz;
                    coled = colst + cur_blksiz -1;
                    fst   = (rownbm -m)*NB+colst +1;
                    for (colj=colst; colj <= coled; colj++) {
                        st = (rownbm -m)*NB+colj +1;
                        ed = min(st+NB-1, N-1);
                        if (st > ed) break;
                        if ((st == ed) && (colj != N-2)) break;
                        vlen=ed-fst+1;
                        vnb=vnb+1;
                    }
                    findVTpos(N, NB, Vblksiz, colst, fst, &vpos, &taupos, &tpos, &blkid);
                    if ((vlen > 0) && (vnb > 0)) {
                        #if defined(USESTREAM)
                        magmablasSetKernelStream(stream[0]);
                        magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, N1, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(0, fst), LDE, dwork, N1);
                        magmablasSetKernelStream(stream[1]);
                        magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, N2, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(N1, fst), LDE, &dwork[N1*Vblksiz], N2);
                        #else
                        magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, NE, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(0, fst), LDE, dwork, NE);
                        #endif
                    }
                }
            }
        }
    } else {
            printf("ERROR SIDE %d\n", SIDE);
    }

#if defined(USESTREAM)
    magma_queue_destroy( stream[0] );
    magma_queue_destroy( stream[1] );
#endif
    magmablasSetKernelStream( orig_stream );
}
/***************************************************************************
 *          TYPE 2-BAND Lower-columnwise-Householder
 ***************************************************************************/
void
CORE_zhbtype2cb(int N, int NB,
                PLASMA_Complex64_t *A, int LDA,
                PLASMA_Complex64_t *V, PLASMA_Complex64_t *TAU,
                int st, int ed, int sweep, int Vblksiz, int WANTZ,
                PLASMA_Complex64_t *WORK)
{
    PLASMA_Complex64_t ctmp;
    int J1, J2, len, lem, LDX;
    int blkid, vpos, taupos, tpos;

    if( WANTZ == 0 ) {
        vpos   = ((sweep+1)%2)*N + st;
        taupos = ((sweep+1)%2)*N + st;
    } else {
        findVTpos(N, NB, Vblksiz, sweep, st,
                  &vpos, &taupos, &tpos, &blkid);
    }

    LDX = LDA-1;
    J1  = ed+1;
    J2  = min(ed+NB,N-1);
    len = ed-st+1;
    lem = J2-J1+1;

    if( lem > 0 ) {
        /* Apply remaining right commming from the top block */
        LAPACKE_zlarfx_work(LAPACK_COL_MAJOR, lapack_const(PlasmaRight),
                            lem, len, V(vpos), *(TAU(taupos)), A(J1, st), LDX, WORK);
    }

    if( lem > 1 ) {
        if( WANTZ == 0 ) {
            vpos   = ((sweep+1)%2)*N + J1;
            taupos = ((sweep+1)%2)*N + J1;
        } else {
            findVTpos(N,NB,Vblksiz,sweep,J1, &vpos, &taupos, &tpos, &blkid);
        }

        /* Remove the first column of the created bulge */
        *V(vpos) = 1.;

        memcpy(V(vpos+1), A(J1+1, st), (lem-1)*sizeof(PLASMA_Complex64_t));
        memset(A(J1+1, st), 0, (lem-1)*sizeof(PLASMA_Complex64_t));

        /* Eliminate the col at st */
        LAPACKE_zlarfg_work( lem, A(J1, st), V(vpos+1), 1, TAU(taupos) );

        /*
         * Apply left on A(J1:J2,st+1:ed)
         * We decrease len because we start at col st+1 instead of st.
         * col st is the col that has been revomved;
         */
        len = len-1;

        ctmp = conj(*TAU(taupos));
        LAPACKE_zlarfx_work(LAPACK_COL_MAJOR, lapack_const(PlasmaLeft),
                            lem, len, V(vpos), ctmp, A(J1, st+1), LDX, WORK);
    }
    return;
}
void plasma_pzlarft_blgtrd(plasma_context_t *plasma)
{
    int my_core_id = PLASMA_RANK;
    int cores_num  = plasma->world_size;
    /*===========================*/
    int N, NB,Vblksiz;
    PLASMA_Complex64_t *V;
    PLASMA_Complex64_t *T;
    PLASMA_Complex64_t *TAU;
    PLASMA_sequence *sequence;
    PLASMA_request *request;
    /*===========================
     *   local variables
     *===========================*/
    int LDT, LDV;
    int Vm, Vn, mt, nt;
    int myrow, mycol, blkj, blki;
    int firstrow;
    int blkid,vpos,taupos,tpos;
    int blkpercore,blkcnt, myid;


    plasma_unpack_args_8(N, NB, Vblksiz, V, T, TAU, sequence, request);
    if (sequence->status != PLASMA_SUCCESS)
        return;

    /* Quick return */
    if (N == 0){
        return;
    }
    if (NB == 0){
        return;
    }
    if (NB == 1){
        return;
    }

    findVTsiz(N, NB, Vblksiz, &blkcnt, &LDV);
    blkpercore = blkcnt/cores_num;
    blkpercore = blkpercore==0 ? 1:blkpercore;
    LDT        = Vblksiz;    
    LDV        = NB+Vblksiz-1;

    /*========================================
     * compute the T's in parallel.
     * The Ts are independent so each core pick
     * a T and compute it. The loop is based on 
     * the version 113 of the pzunmqr_blgtrd.c
     * which go over the losange block_column 
     * by block column. but it is not important 
     * here the order because Ts are independent.
     * ========================================
    */ 
    nt  = plasma_ceildiv((N-1),Vblksiz);
    for (blkj=nt-1; blkj>=0; blkj--) {
        /* the index of the first row on the top of block (blkj) */ 
        firstrow = blkj * Vblksiz + 1;
        /*find the number of tile for this block */
        if( blkj == nt-1 )
            mt = plasma_ceildiv( N -  firstrow,    NB);
        else
            mt = plasma_ceildiv( N - (firstrow+1), NB);
        /*loop over the tiles find the size of the Vs and apply it */
        for (blki=mt; blki>0; blki--) {
            /*calculate the size of each losange of Vs= (Vm,Vn)*/
            myrow     = firstrow + (mt-blki)*NB;
            mycol     = blkj*Vblksiz;
            Vm = min( NB+Vblksiz-1, N-myrow);
            if( ( blkj == nt-1 ) && ( blki == mt ) ){
                Vn = min (Vblksiz, Vm);
            } else {
                Vn = min (Vblksiz, Vm-1);
            }
            /*calculate the pointer to the Vs and the Ts.
             * Note that Vs and Ts have special storage done
             * by the bulgechasing function*/
            findVTpos(N,NB,Vblksiz,mycol,myrow, &vpos, &taupos, &tpos, &blkid);
            myid = blkid/blkpercore;
            if( my_core_id==(myid%cores_num) ){
                if( ( Vm > 0 ) && ( Vn > 0 ) ){
                    LAPACKE_zlarft_work(LAPACK_COL_MAJOR, 
                                  lapack_const(PlasmaForward), 
                                  lapack_const(PlasmaColumnwise),
                                  Vm, Vn, V(vpos), LDV, TAU(taupos), T(tpos), LDT);
                }
            }
        }
    }
}