/*
 * findVTpos -- legacy entry point kept so that code written against the old
 * reduction routine keeps working.
 *
 * It forwards to magma_bulge_findVTAUTpos() using the leading dimensions of
 * the old storage layout:
 *     ldv = NB + Vblksiz
 *     ldt = Vblksiz
 *
 * N, NB, Vblksiz, sweep and st are passed through unchanged.
 * Vpos, TAUpos, Tpos and myblkid are output pointers handed directly to
 * magma_bulge_findVTAUTpos(), which is responsible for filling them.
 *
 * The previous in-place implementation that used to follow the forwarding
 * call was unreachable (it sat after an unconditional return) and has been
 * removed.
 */
void findVTpos(magma_int_t N, magma_int_t NB, magma_int_t Vblksiz, magma_int_t sweep, magma_int_t st, magma_int_t *Vpos, magma_int_t *TAUpos, magma_int_t *Tpos, magma_int_t *myblkid)
    {
        magma_int_t ldv = NB + Vblksiz;
        magma_int_t ldt = Vblksiz;

        magma_bulge_findVTAUTpos(N,  NB, Vblksiz,  sweep,  st,  ldv, ldt,
                               Vpos, TAUpos, Tpos, myblkid);
    }
    /*
     * findVTsiz -- count the blocks needed for a matrix of order N with
     * band width NB and column-block width Vblksiz.
     *
     * On return:
     *   *blkcnt = sum, over every column block, of the number of NB-row
     *             blocks it spans;
     *   *LDV    = NB + Vblksiz.
     *
     * NOTE(review): the column-block count uses plasma_ceildiv while the
     * per-block row counts use magma_ceildiv; presumably both compute the
     * same ceiling division -- confirm and unify.
     */
    void findVTsiz(magma_int_t N, magma_int_t NB, magma_int_t Vblksiz, magma_int_t *blkcnt, magma_int_t *LDV)
    {
        magma_int_t total = 0;
        magma_int_t ncols = plasma_ceildiv((N-1),Vblksiz);
        magma_int_t last  = ncols - 1;
        magma_int_t c;

        for (c = 0; c < ncols; c++)
        {
            magma_int_t first_sweep = c * Vblksiz;
            magma_int_t rows_in_blk;

            /* The last column block covers one more row than the others. */
            if (c != last)
                rows_in_blk = magma_ceildiv((N-(first_sweep+2)),NB);
            else
                rows_in_blk = magma_ceildiv((N-(first_sweep+1)),NB);

            total += rows_in_blk;
        }

        *blkcnt = total;
        *LDV    = NB + Vblksiz;
    }
Beispiel #3
0
/*
 * magma_zbulge_applyQ -- apply the block Householder reflectors stored in
 * V/T to the matrix dE on the GPU, through repeated calls to
 * magma_zlarfb_gpu().
 *
 * Arguments (as used by this routine):
 *   WANTZ     1: dE is overwritten with the identity on the GPU and SIDE is
 *                forced to MagmaLeft, so that Q itself is formed;
 *             otherwise dE is updated as a general matrix.
 *   SIDE      MagmaLeft  -> blocks are traversed in reverse order;
 *             MagmaRight -> blocks are traversed in forward order.
 *             Any other value only prints an error message.
 *   NE        number of columns (left) / rows (right) of E being updated.
 *   N, NB     matrix order and band width used to locate the reflectors.
 *   Vblksiz   number of reflectors per block.
 *   E, LDE    host copy of E (copied only when copytype > 2) and the
 *             leading dimension passed to magma_zlarfb_gpu for dE.
 *   V, T      host arrays with the reflectors and their T factors.
 *   TAU       not referenced by this routine.
 *   INFO      set to 0 on entry; never changed afterwards.
 *   dV,dT,dE  device buffers for V, T and E.
 *   copytype  1: copy V; 2: copy V and T; 3: copy V, T and E to the GPU.
 *
 * The variant selectors are hard-coded (versionL = 113, versionR = 92), so
 * the 114 and 91 branches are currently not taken.
 *
 * A device workspace of 2*N*max(Vblksiz,64) elements is allocated for the
 * duration of the call; if the allocation fails the process exits.
 *
 * NOTE(review): the host->device copy of E and the identity initialisation
 * use N as the leading dimension of dE, whereas the larfb calls use LDE --
 * presumably callers pass LDE == N; confirm.
 */
extern "C" void magma_zbulge_applyQ(
    magma_int_t WANTZ, magma_side_t SIDE, magma_int_t NE, magma_int_t N, magma_int_t NB,
    magma_int_t Vblksiz, magmaDoubleComplex *E, magma_int_t LDE,
    magmaDoubleComplex *V, magmaDoubleComplex *TAU, magmaDoubleComplex *T,
    magma_int_t *INFO, magmaDoubleComplex *dV, magmaDoubleComplex *dT,
    magmaDoubleComplex *dE, magma_int_t copytype )
{
    //%===========================
    //%   local variables
    //%===========================
    magmaDoubleComplex c_zero = MAGMA_Z_ZERO;
    magmaDoubleComplex c_one  = MAGMA_Z_ONE;
    
    magma_int_t LDT, LDV, firstcolj;
    magma_int_t bg, nbGblk, rownbm, k, m, n;
    magma_int_t st, ed, fst, vlen, vnb, colj, len;
    magma_int_t blkid, vpos, taupos, tpos;
    magma_int_t LWORK;
    magma_int_t  cur_blksiz, avai_blksiz, ncolinvolvd;
    magma_int_t  nbgr, colst, coled, versionL, versionR;
    magma_int_t blkcnt=-1;

    // Remember the caller's kernel stream so it can be restored on exit.
    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );
    
    *INFO=0;
    versionL = 113;
    versionR = 92;
    LDT      = Vblksiz;
    LDV      = NB+Vblksiz-1;   // overwritten by findVTsiz() below
    nbGblk   = plasma_ceildiv((N-1), Vblksiz);

    /* find the size of the matrices T and V */
    findVTsiz(N, NB, Vblksiz, &blkcnt, &LDV);
    /* Copy E & V & T to the GPU in dE and dV and dT
     * depending on copytype:
     * 1: copy only V
     * 2: copy V and T
     * 3: copy V, T and E
     * */
    if (copytype > 0) magma_zsetmatrix( LDV, blkcnt*Vblksiz, V, LDV, dV, LDV );
    if (copytype > 1) magma_zsetmatrix( LDT, blkcnt*Vblksiz, T, LDT, dT, LDT );
    if (copytype > 2) magma_zsetmatrix( N, NE, E, N, dE, N );
    magmaDoubleComplex *dwork;
    LWORK   = 2*N*max(Vblksiz, 64);
    if (MAGMA_SUCCESS != magma_zmalloc( &dwork, LWORK )) {
        printf ("!!!!  magma_zbulge_applyQ magma_alloc failed for: dwork\n" );
        exit(-1);
    }

    /* SIDE LEFT  means apply E = Q*E = (q_1*q_2*.....*q_n) * E, so the Vs are traversed in reverse order, from q_n to q_1.
     *            E is split by rows: each apply works on a block of rows (horizontal block). */
    /* SIDE RIGHT means apply E = E*Q = E * (q_1*q_2*.....*q_n), so the Vs are traversed in normal order, from q_1 to q_n.
     *            E is split by columns: each apply works on a block of columns (vertical block). */

    /* WANTZ = 1 means E is the IDENTITY, so form Q using the optimized update.
     *         The reverse order is used, from the small q to the large one,
     *         i.e. from q_n to q_1, as a Left update of the Identity.
     *         versionL 113 is used because 114 needs to update the whole matrix and not in increasing order.
     * WANTZ = 2 means E is a full matrix that is updated from the Left or the Right with the normal update.
     * */
    if (WANTZ == 1) {
        versionL=113;
        SIDE = MagmaLeft;
        // set the matrix to Identity here to avoid copying it from the CPU
        magmablas_zlaset( MagmaFull, N, N, c_zero, c_one, dE, N );
    }
    


    printf("  APPLY Q_v115 GPU with  N %d   NB %d   Vblksiz %d SIDE %c versionL %d versionR %d WANTZ %d \n",
           (int) N, (int) NB, (int) Vblksiz, SIDE, (int) versionL, (int) versionR, (int) WANTZ);


#if defined(USESTREAM)
    // With USESTREAM the update of E is split in two halves, one per stream.
    magma_int_t N2=N/2;
    magma_int_t N1=N-N2;
    printf("using stream\n");
    magma_queue_t stream[2];
    magma_queue_create( &stream[0] );
    magma_queue_create( &stream[1] );
#endif
    

    if (SIDE == MagmaLeft) {
        if (versionL == 113) {
            // Walk the column blocks from the last one to the first one.
            for (bg = nbGblk; bg > 0; bg--) {
                firstcolj = (bg-1)*Vblksiz + 1;
                if (bg == nbGblk)
                    rownbm = plasma_ceildiv((N-(firstcolj)), NB);  // last blk has size=1 used for complex to handle A(N,N-1)
                else
                    rownbm = plasma_ceildiv((N-(firstcolj+1)), NB);
                
                for (m = rownbm; m > 0; m--) {
                    vlen = 0;
                    vnb  = 0;
                    colj = (bg-1)*Vblksiz; // for k=0; I compute the fst and then can remove it from the loop
                    fst  = (rownbm -m)*NB+colj +1;
                    // Determine how many reflectors (vnb) and rows (vlen) this block holds.
                    for (k=0; k < Vblksiz; k++) {
                        colj = (bg-1)*Vblksiz + k;
                        st   = (rownbm -m)*NB+colj +1;
                        ed   = min(st+NB-1, N-1);
                        if (st > ed) break;
                        if ((st == ed) && (colj != N-2)) break;
                        vlen=ed-fst+1;
                        vnb=k+1;
                    }
                    colst     = (bg-1)*Vblksiz;
                    findVTpos(N, NB, Vblksiz, colst, fst, &vpos, &taupos, &tpos, &blkid);
                    printf("voici bg %d m %d  vlen %d  vnb %d fcolj %d vpos %d taupos %d \n", (int) bg, (int) m, (int) vlen, (int) vnb, (int) colst+1, (int) vpos+1, (int) taupos+1);
                    if ((vlen > 0) && (vnb > 0)) {
                        if (WANTZ == 1) {
                            // Forming Q from the identity: only columns colst..N-1 are updated.
                            len =  N-colst;
                            magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, vlen, len, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,colst), LDE, dwork, len);
                        } else {
                            magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, vlen, NE, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,0), LDE, dwork, NE);
                        }
                    }
                }
            }
        } else if (versionL == 114) {
            rownbm = plasma_ceildiv((N-1), NB);
            for (m = rownbm; m > 0; m--) {
                ncolinvolvd = min(N-1, m*NB);
                avai_blksiz=min(Vblksiz, ncolinvolvd);
                nbgr = plasma_ceildiv(ncolinvolvd, avai_blksiz);
                for (n = nbgr; n > 0; n--) {
                    vlen = 0;
                    vnb  = 0;
                    cur_blksiz = min(ncolinvolvd-(n-1)*avai_blksiz, avai_blksiz);
                    colst = (n-1)*avai_blksiz;
                    coled = colst + cur_blksiz -1;
                    fst   = (rownbm -m)*NB+colst +1;
                    for (colj=colst; colj <= coled; colj++) {
                        st = (rownbm -m)*NB+colj +1;
                        ed = min(st+NB-1, N-1);
                        if (st > ed) break;
                        if ((st == ed) && (colj != N-2)) break;
                        vlen=ed-fst+1;
                        vnb=vnb+1;
                    }
                    findVTpos(N, NB, Vblksiz, colst, fst, &vpos, &taupos, &tpos, &blkid);
                    //printf("voici bg %d m %d  vlen %d  vnb %d fcolj %d vpos %d taupos %d \n", bg, m, vlen, vnb, colst+1, vpos+1, taupos+1);
                    if ((vlen > 0) && (vnb > 0)) {
                        #if defined(USESTREAM)
                        magmablasSetKernelStream(stream[0]);
                        magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, vlen, N1, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,0), LDE, dwork, N1);
                        magmablasSetKernelStream(stream[1]);
                        magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, vlen, N2, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,N1), LDE, &dwork[N1*Vblksiz], N2);
                        #else
                        magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, vlen, NE, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,0), LDE, dwork, NE);
                        #endif
                    }
                }
            }
        }
    } else if (SIDE == MagmaRight) {
        if (versionR == 91) {
            // Walk the column blocks from the first one to the last one.
            for (bg =1; bg <= nbGblk; bg++) {
                firstcolj = (bg-1)*Vblksiz + 1;
                rownbm    = plasma_ceildiv((N-(firstcolj+1)), NB);
                if (bg == nbGblk) rownbm    = plasma_ceildiv((N-(firstcolj)), NB);  // last blk has size=1 used for complex to handle A(N,N-1)
                for (m = 1; m <= rownbm; m++) {
                    vlen = 0;
                    vnb  = 0;
                    // for k=0; I compute the fst and then can remove it from the loop
                    colj = (bg-1)*Vblksiz;
                    fst  = (rownbm -m)*NB+colj +1;
                    for (k=0; k < Vblksiz; k++) {
                        colj = (bg-1)*Vblksiz + k;
                        st   = (rownbm -m)*NB+colj +1;
                        ed   = min(st+NB-1, N-1);
                        if (st > ed) break;
                        if ((st == ed) && (colj != N-2)) break;
                        vlen=ed-fst+1;
                        vnb=k+1;
                    }
                    colj     = (bg-1)*Vblksiz;
                    findVTpos(N, NB, Vblksiz, colj, fst, &vpos, &taupos, &tpos, &blkid);
                    //printf("voici bg %d m %d  vlen %d  vnb %d fcolj %d vpos %d taupos %d \n", bg, m, vlen, vnb, colj, vpos, taupos);
                    if ((vlen > 0) && (vnb > 0)) {
                        #if defined(USESTREAM)
                        magmablasSetKernelStream(stream[0]);
                        magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, N1, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(0, fst), LDE, dwork, N1);
                        magmablasSetKernelStream(stream[1]);
                        magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, N2, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(N1, fst), LDE, &dwork[N1*Vblksiz], N2);
                        #else
                        magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, NE, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(0, fst), LDE, dwork, NE);
                        #endif
                    }
                }
            }
        } else if (versionR == 92) {
            rownbm = plasma_ceildiv((N-1), NB);
            for (m = 1; m <= rownbm; m++) {
                ncolinvolvd = min(N-1, m*NB);
                avai_blksiz=min(Vblksiz, ncolinvolvd);
                nbgr = plasma_ceildiv(ncolinvolvd, avai_blksiz);
                for (n = 1; n <= nbgr; n++) {
                    vlen = 0;
                    vnb  = 0;
                    cur_blksiz = min(ncolinvolvd-(n-1)*avai_blksiz, avai_blksiz);
                    colst = (n-1)*avai_blksiz;
                    coled = colst + cur_blksiz -1;
                    fst   = (rownbm -m)*NB+colst +1;
                    for (colj=colst; colj <= coled; colj++) {
                        st = (rownbm -m)*NB+colj +1;
                        ed = min(st+NB-1, N-1);
                        if (st > ed) break;
                        if ((st == ed) && (colj != N-2)) break;
                        vlen=ed-fst+1;
                        vnb=vnb+1;
                    }
                    findVTpos(N, NB, Vblksiz, colst, fst, &vpos, &taupos, &tpos, &blkid);
                    if ((vlen > 0) && (vnb > 0)) {
                        #if defined(USESTREAM)
                        magmablasSetKernelStream(stream[0]);
                        magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, N1, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(0, fst), LDE, dwork, N1);
                        magmablasSetKernelStream(stream[1]);
                        magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, N2, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(N1, fst), LDE, &dwork[N1*Vblksiz], N2);
                        #else
                        magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, NE, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(0, fst), LDE, dwork, NE);
                        #endif
                    }
                }
            }
        }
    } else {
            printf("ERROR SIDE %d\n", SIDE);
    }

#if defined(USESTREAM)
    magma_queue_destroy( stream[0] );
    magma_queue_destroy( stream[1] );
#endif
    // Release the device workspace; it was previously leaked on every call.
    magma_free( dwork );
    magmablasSetKernelStream( orig_stream );
}
Beispiel #4
0
/*
 * plasma_pzlarft_blgtrd -- build the T factors of the block reflectors
 * stored in V, one LAPACKE_zlarft_work() call per (column block, tile) pair.
 *
 * The arguments N, NB, Vblksiz, V, T, TAU, sequence and request are unpacked
 * from the PLASMA context.  Each core handles only the blocks whose owner
 * index, (blkid / blocks-per-core) % cores_num, matches its own rank.
 * The routine returns immediately when the sequence status is not
 * PLASMA_SUCCESS, or when N == 0, NB == 0 or NB == 1.
 */
void plasma_pzlarft_blgtrd(plasma_context_t *plasma)
{
    int my_core_id = PLASMA_RANK;
    int cores_num  = plasma->world_size;
    /* ---- arguments unpacked from the context ---- */
    int N, NB,Vblksiz;
    PLASMA_Complex64_t *V;
    PLASMA_Complex64_t *T;
    PLASMA_Complex64_t *TAU;
    PLASMA_sequence *sequence;
    PLASMA_request *request;
    /* ---- local variables ---- */
    int LDT, LDV;
    int vrows, vcols, vcap;
    int ntiles, ncolblk;
    int col_blk, tile;
    int top_row, row0, col0;
    int blkid, vpos, taupos, tpos;
    int per_core, blkcnt, owner;

    plasma_unpack_args_8(N, NB, Vblksiz, V, T, TAU, sequence, request);
    if (sequence->status != PLASMA_SUCCESS)
        return;

    /* Quick return: nothing to do for an empty matrix or a band of 0 or 1. */
    if (N == 0 || NB == 0 || NB == 1)
        return;

    /* Total block count decides how many consecutive blocks go to each core. */
    findVTsiz(N, NB, Vblksiz, &blkcnt, &LDV);
    per_core = blkcnt/cores_num;
    if (per_core == 0)
        per_core = 1;
    LDT = Vblksiz;
    LDV = NB+Vblksiz-1;   /* replaces the value returned by findVTsiz */

    /*
     * The T factors are independent of each other, so the traversal order
     * does not matter; the column blocks are visited from last to first.
     */
    ncolblk = plasma_ceildiv((N-1),Vblksiz);
    col_blk = ncolblk;
    while (col_blk-- > 0) {
        /* index of the first row at the top of this column block */
        top_row = col_blk * Vblksiz + 1;

        /* number of tiles in this column block */
        if (col_blk == ncolblk-1)
            ntiles = plasma_ceildiv( N -  top_row,    NB);
        else
            ntiles = plasma_ceildiv( N - (top_row+1), NB);

        for (tile = 0; tile < ntiles; tile++) {
            /* size (vrows x vcols) of the block of reflectors */
            row0  = top_row + tile*NB;
            col0  = col_blk*Vblksiz;
            vrows = min( NB+Vblksiz-1, N-row0);

            /* the first tile of the last column block keeps one extra column */
            vcap = vrows - 1;
            if ((col_blk == ncolblk-1) && (tile == 0))
                vcap = vrows;
            vcols = min (Vblksiz, vcap);

            /* locate this block inside the packed V / TAU / T storage */
            findVTpos(N,NB,Vblksiz,col0,row0, &vpos, &taupos, &tpos, &blkid);

            owner = blkid/per_core;
            if ((my_core_id == (owner%cores_num)) && (vrows > 0) && (vcols > 0)) {
                LAPACKE_zlarft_work(LAPACK_COL_MAJOR,
                              lapack_const(PlasmaForward),
                              lapack_const(PlasmaColumnwise),
                              vrows, vcols, V(vpos), LDV, TAU(taupos), T(tpos), LDT);
            }
        }
    }
}