Exemplo n.º 1
0
static void magma_stile_bulge_computeT_parallel(magma_int_t my_core_id, magma_int_t cores_num, float *V, magma_int_t ldv, float *TAU,
                                                float *T, magma_int_t ldt, magma_int_t n, magma_int_t nb, magma_int_t Vblksiz)
{
    //%===========================
    //%   local variables
    //%===========================
    magma_int_t firstcolj;
    magma_int_t rownbm;
    magma_int_t st,ed,fst,vlen,vnb,colj;
    magma_int_t blkid,vpos,taupos,tpos;
    magma_int_t blkpercore, myid;
    
    if(n<=0)
        return ;
    
    magma_int_t blkcnt = magma_bulge_get_blkcnt(n, nb, Vblksiz);
    
    blkpercore = blkcnt/cores_num;
    
    magma_int_t nbGblk  = magma_ceildiv(n-1, Vblksiz);
    
    if(my_core_id==0) printf("  COMPUTE T parallel threads %d with  N %d   NB %d   Vblksiz %d \n",cores_num,n,nb,Vblksiz);
    
    for (magma_int_t bg = nbGblk; bg>0; bg--)
    {
        firstcolj = (bg-1)*Vblksiz + 1;
        rownbm    = magma_ceildiv(n-(firstcolj+1), nb);
        if(bg==nbGblk) 
            rownbm    = magma_ceildiv(n-firstcolj ,nb);  // last blk has size=1 used for real to handle A(N,N-1)

        for (magma_int_t m = rownbm; m>0; m--)
        {
            vlen = 0;
            vnb  = 0;
            colj      = (bg-1)*Vblksiz; // for k=0;I compute the fst and then can remove it from the loop
            fst       = (rownbm -m)*nb+colj +1;
            for (magma_int_t k=0; k<Vblksiz; k++)
            {
                colj     = (bg-1)*Vblksiz + k;
                st       = (rownbm -m)*nb+colj +1;
                ed       = min(st+nb-1,n-1);
                if(st>ed)
                    break;
                if((st==ed)&&(colj!=n-2))
                    break;
                
                vlen=ed-fst+1;
                vnb=k+1;
            }        
            colj     = (bg-1)*Vblksiz;
            magma_bulge_findVTAUTpos(n, nb, Vblksiz, colj, fst, ldv, ldt, &vpos, &taupos, &tpos, &blkid);
            myid = blkid/blkpercore;
            if(my_core_id==(myid%cores_num)){
                if((vlen>0)&&(vnb>0))
                    lapackf77_slarft( "F", "C", &vlen, &vnb, V(vpos), &ldv, TAU(taupos), T(tpos), &ldt);
            }
        }
    }
}
Exemplo n.º 2
0
    void magma_bulge_findpos113(magma_int_t n, magma_int_t nb, magma_int_t Vblksiz, magma_int_t sweep, magma_int_t st, magma_int_t *myblkid)
    {
        magma_int_t prevGblkid, mastersweep;
        magma_int_t locblknb   = 0;
        magma_int_t prevblkcnt = 0;
        magma_int_t myblknb    = 0;
        magma_int_t lstGblkid  = magma_ceildiv((n-1),Vblksiz)-1;//nbGblk-1;
        magma_int_t myGblkid   = sweep/Vblksiz;

        // go forward and for each Gblk(a column of blocks) before my Gblk compute its number of blk.

        //===================================================
        for (prevGblkid = lstGblkid; prevGblkid > myGblkid; prevGblkid--)
        {
            mastersweep  = prevGblkid * Vblksiz;
            if(prevGblkid==lstGblkid)
                locblknb = magma_ceildiv((n-(mastersweep+1)),nb);
            else
                locblknb = magma_ceildiv((n-(mastersweep+2)),nb);
            prevblkcnt   = prevblkcnt + locblknb;
        }
        //===================================================
        /*
        // for best performance, the if condiiton inside the loop
        // is only for prevblkid==lastblkid so I can unroll this
        // out of the loop and so remove the if condition.
        //===================================================
        // for prevGblkid==lstGblkid
        mastersweep  = lstGblkid * Vblksiz;
        locblknb     = magma_ceildiv((n-(mastersweep+1)),nb);
        prevblkcnt   = prevblkcnt + locblknb;
        // the remaining of the loop
        for (prevGblkid = lstGblkid-1; prevGblkid > myGblkid; prevGblkid--)
        {
            mastersweep  = prevGblkid * Vblksiz;
            locblknb     = magma_ceildiv((n-(mastersweep+2)),nb);
            prevblkcnt   = prevblkcnt + locblknb;
        }
        //===================================================
        */
        myblknb = magma_ceildiv((st-sweep),nb);
        *myblkid    = prevblkcnt + myblknb -1;
        //printf("voici sweep %d  lstGblkid %d  myGblkid %d  prevcnt %d  myblknb %d  myblkid %d\n", sweep, lstGblkid,myGblkid,prevblkcnt,myblknb,*myblkid );
    }
Exemplo n.º 3
0
    void magma_bulge_findpos(magma_int_t n, magma_int_t nb, magma_int_t Vblksiz, magma_int_t sweep, magma_int_t st, magma_int_t *myblkid)
    {
        magma_int_t locblknb, prevblkcnt, prevGblkid;
        magma_int_t myblknb, nbprevGblk, mastersweep;

        locblknb = 0;
        prevblkcnt   = 0;
        myblknb  = 0;

        nbprevGblk = sweep/Vblksiz;
        for (prevGblkid = 0; prevGblkid < nbprevGblk; prevGblkid++)
        {
            mastersweep  = prevGblkid * Vblksiz;
            locblknb = magma_ceildiv((n-(mastersweep+2)),nb);
            prevblkcnt   = prevblkcnt + locblknb;
        }
        myblknb = magma_ceildiv((st-sweep),nb);
        *myblkid    = prevblkcnt + myblknb -1;
    }
Exemplo n.º 4
0
    magma_int_t magma_bulge_get_blkcnt(magma_int_t n, magma_int_t nb, magma_int_t Vblksiz)
    {
        magma_int_t colblk, nbcolblk;
        magma_int_t myblknb, mastersweep;

        magma_int_t blkcnt = 0;
        nbcolblk = magma_ceildiv((n-1),Vblksiz);
        for (colblk = 0; colblk<nbcolblk; colblk++)
        {
            mastersweep = colblk * Vblksiz;
            if(colblk == (nbcolblk-1))
                myblknb = magma_ceildiv((n-(mastersweep+1)),nb);
            else
                myblknb = magma_ceildiv((n-(mastersweep+2)),nb);
            blkcnt      = blkcnt + myblknb;
            //printf("voici  nbcolblk %d    master sweep %d     blkcnt %d \n",nbcolblk, mastersweep,*blkcnt);
        }
        return blkcnt;
    }
Exemplo n.º 5
0
    void findVTsiz(magma_int_t N, magma_int_t NB, magma_int_t Vblksiz, magma_int_t *blkcnt, magma_int_t *LDV)
    {
        magma_int_t colblk, nbcolblk;
        magma_int_t myblknb, mastersweep;

        *blkcnt   = 0;
        nbcolblk = plasma_ceildiv((N-1),Vblksiz);
        for (colblk = 0; colblk<nbcolblk; colblk++)
        {
            mastersweep = colblk * Vblksiz;
            if(colblk == (nbcolblk-1))
                myblknb = magma_ceildiv((N-(mastersweep+1)),NB);
            else
                myblknb = magma_ceildiv((N-(mastersweep+2)),NB);

            *blkcnt      = *blkcnt + myblknb;
            //printf("voici  nbcolblk %d    master sweep %d     blkcnt %d \n",nbcolblk, mastersweep,*blkcnt);
        }
        *LDV= NB+Vblksiz;
    }
Exemplo n.º 6
0
// --------------------
int main(int argc, char **argv)
{
    TESTING_INIT();

    real_Double_t gflops, cpu_time=0, cpu_perf=0, gpu_time, gpu_perf, mgpu_time, mgpu_perf, cuda_time, cuda_perf;
    double      Ynorm, error=0, error2=0, work[1];
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    magma_int_t n_local[MagmaMaxGPUs];

    magma_int_t N, Noffset, lda, ldda, blocks, lhwork, ldwork, matsize, vecsize;
    magma_int_t incx = 1;

    magmaDoubleComplex alpha = MAGMA_Z_MAKE(  1.5, -2.3 );
    magmaDoubleComplex beta  = MAGMA_Z_MAKE( -0.6,  0.8 );
    magmaDoubleComplex *A, *X, *Y, *Ylapack, *Ycublas, *Ymagma, *Ymagma1, *hwork;
    magmaDoubleComplex_ptr dA, dX, dY;
    magmaDoubleComplex_ptr d_lA[MagmaMaxGPUs], dwork[MagmaMaxGPUs];

    magma_device_t dev;
    magma_queue_t queues[MagmaMaxGPUs];
    magma_int_t     status = 0;
    
    magma_opts opts;
    opts.parse_opts( argc, argv );
    opts.ngpu = abs( opts.ngpu );  // always uses multi-GPU code
    
    double tol = opts.tolerance * lapackf77_dlamch("E");

    magma_int_t nb = 64;  // required by magmablas_zhemv_mgpu implementation

    for( dev=0; dev < opts.ngpu; ++dev ) {
        magma_queue_create( dev, &queues[dev] );
    }
    
    // currently, tests all offsets in the offsets array;
    // comment out loop below to test a specific offset.
    magma_int_t offset = opts.offset;
    magma_int_t offsets[] = { 0, 1, 31, 32, 33, 63, 64, 65, 100, 200 };
    magma_int_t noffsets = sizeof(offsets) / sizeof(*offsets);
    
    printf("%% uplo = %s, ngpu %d, block size = %d, offset %d\n",
            lapack_uplo_const(opts.uplo), (int) opts.ngpu, (int) nb, (int) offset );
    printf( "%%                 BLAS                CUBLAS              MAGMA 1 GPU         MAGMA MGPU       Error rel  Error rel\n"
            "%%   N  offset     Gflop/s (msec)      Gflop/s (msec)      Gflop/s (msec)      Gflop/s (msec)   to CUBLAS  to LAPACK\n"
            "%%==================================================================================================================\n" );
    for( int itest = 0; itest < opts.ntest; ++itest ) {
      
      // comment out these two lines & end of loop to test a specific offset
      for( int ioffset=0; ioffset < noffsets; ioffset += 1 ) {
        offset = offsets[ioffset];
        
        for( int iter = 0; iter < opts.niter; ++iter ) {
            N       = opts.nsize[itest];
            Noffset = N + offset;
            lda     = Noffset;
            ldda    = magma_roundup( Noffset, opts.align );  // multiple of 32 by default
            matsize = Noffset*ldda;
            vecsize = (Noffset-1)*incx + 1;
            gflops  = FLOPS_ZHEMV( N ) / 1e9;
            
            blocks = magma_ceildiv( N + (offset % nb), nb );
            lhwork = N*opts.ngpu;
            ldwork = ldda*(blocks + 1);

            TESTING_MALLOC_CPU( A,       magmaDoubleComplex, matsize );
            TESTING_MALLOC_CPU( Y,       magmaDoubleComplex, vecsize );
            TESTING_MALLOC_CPU( Ycublas, magmaDoubleComplex, vecsize );
            TESTING_MALLOC_CPU( Ymagma,  magmaDoubleComplex, vecsize );
            TESTING_MALLOC_CPU( Ymagma1, magmaDoubleComplex, vecsize );
            TESTING_MALLOC_CPU( Ylapack, magmaDoubleComplex, vecsize );

            TESTING_MALLOC_PIN( X,       magmaDoubleComplex, vecsize );
            TESTING_MALLOC_PIN( hwork,   magmaDoubleComplex, lhwork  );
            
            magma_setdevice( opts.device );
            TESTING_MALLOC_DEV( dA, magmaDoubleComplex, matsize );
            TESTING_MALLOC_DEV( dX, magmaDoubleComplex, vecsize );
            TESTING_MALLOC_DEV( dY, magmaDoubleComplex, vecsize );
            
            // TODO make magma_zmalloc_bcyclic helper function?
            for( dev=0; dev < opts.ngpu; dev++ ) {
                n_local[dev] = ((Noffset/nb)/opts.ngpu)*nb;
                if (dev < (Noffset/nb) % opts.ngpu)
                    n_local[dev] += nb;
                else if (dev == (Noffset/nb) % opts.ngpu)
                    n_local[dev] += Noffset % nb;
                
                magma_setdevice( dev );
                TESTING_MALLOC_DEV( d_lA[dev],  magmaDoubleComplex, ldda*n_local[dev] );
                TESTING_MALLOC_DEV( dwork[dev], magmaDoubleComplex, ldwork );
            }
            
            //////////////////////////////////////////////////////////////////////////
            
            /* Initialize the matrix */
            lapackf77_zlarnv( &ione, ISEED, &matsize, A );
            magma_zmake_hermitian( Noffset, A, lda );
            
            lapackf77_zlarnv( &ione, ISEED, &vecsize, X );
            lapackf77_zlarnv( &ione, ISEED, &vecsize, Y );
            
            /* =====================================================================
               Performs operation using CUBLAS
               =================================================================== */
            magma_setdevice( opts.device );
            magma_zsetmatrix( Noffset, Noffset, A, lda, dA, ldda, opts.queue );
            magma_zsetvector( Noffset, X, incx, dX, incx, opts.queue );
            magma_zsetvector( Noffset, Y, incx, dY, incx, opts.queue );
            
            cuda_time = magma_sync_wtime(0);
            cublasZhemv( opts.handle, cublas_uplo_const(opts.uplo), N,
                         &alpha, dA + offset + offset*ldda, ldda,
                                 dX + offset, incx,
                         &beta,  dY + offset, incx );
            cuda_time = magma_sync_wtime(0) - cuda_time;
            cuda_perf = gflops / cuda_time;
            
            magma_zgetvector( Noffset, dY, incx, Ycublas, incx, opts.queue );
            
            /* =====================================================================
               Performs operation using MAGMABLAS (1 GPU)
               =================================================================== */
            magma_setdevice( opts.device );
            magma_zsetvector( Noffset, Y, incx, dY, incx, opts.queue );
            
            gpu_time = magma_sync_wtime( opts.queue );
            
            magmablas_zhemv_work( opts.uplo, N,
                                  alpha, dA + offset + offset*ldda, ldda,
                                         dX + offset, incx,
                                  beta,  dY + offset, incx, dwork[ opts.device ], ldwork,
                                  opts.queue );
            
            gpu_time = magma_sync_wtime( opts.queue ) - gpu_time;
            gpu_perf = gflops / gpu_time;
            magma_zgetvector( Noffset, dY, incx, Ymagma1, incx, opts.queue );
            
            /* =====================================================================
               Performs operation using MAGMABLAS (multi-GPU)
               =================================================================== */
            magma_zsetmatrix_1D_col_bcyclic( Noffset, Noffset, A, lda, d_lA, ldda, opts.ngpu, nb, queues );
            blasf77_zcopy( &Noffset, Y, &incx, Ymagma, &incx );
            
            // workspaces do NOT need to be zero -- set to NAN to prove
            for( dev=0; dev < opts.ngpu; ++dev ) {
                magma_setdevice( dev );
                magmablas_zlaset( MagmaFull, ldwork, 1, MAGMA_Z_NAN, MAGMA_Z_NAN, dwork[dev], ldwork, opts.queue );
            }
            lapackf77_zlaset( "Full", &lhwork, &ione, &MAGMA_Z_NAN, &MAGMA_Z_NAN, hwork, &lhwork );
            
            mgpu_time = magma_sync_wtime(0);
            
            magma_int_t info;
            info = magmablas_zhemv_mgpu(
                opts.uplo, N,
                alpha,
                d_lA, ldda, offset,
                X + offset, incx,
                beta,
                Ymagma + offset, incx,
                hwork, lhwork,
                dwork, ldwork,
                opts.ngpu, nb, queues );
            if (info != 0) {
                printf("magmablas_zhemv_mgpu returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            }
            
            info = magmablas_zhemv_mgpu_sync(
                opts.uplo, N,
                alpha,
                d_lA, ldda, offset,
                X + offset, incx,
                beta,
                Ymagma + offset, incx,
                hwork, lhwork,
                dwork, ldwork,
                opts.ngpu, nb, queues );
            if (info != 0) {
                printf("magmablas_zhemv_sync returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            }
            
            mgpu_time = magma_sync_wtime(0) - mgpu_time;
            mgpu_perf = gflops / mgpu_time;
            
            /* =====================================================================
               Performs operation using LAPACK
               =================================================================== */
            if ( opts.lapack ) {
                blasf77_zcopy( &Noffset, Y, &incx, Ylapack, &incx );
                
                cpu_time = magma_wtime();
                blasf77_zhemv( lapack_uplo_const(opts.uplo), &N,
                               &alpha, A + offset + offset*lda, &lda,
                                       X + offset, &incx,
                               &beta,  Ylapack + offset, &incx );
                cpu_time = magma_wtime() - cpu_time;
                cpu_perf = gflops / cpu_time;
    
                /* =====================================================================
                   Compute the Difference LAPACK vs. Magma
                   =================================================================== */
                Ynorm  = lapackf77_zlange( "F", &Noffset, &ione, Ylapack, &Noffset, work );
                blasf77_zaxpy( &Noffset, &c_neg_one, Ymagma, &incx, Ylapack, &incx );
                error2 = lapackf77_zlange( "F", &Noffset, &ione, Ylapack, &Noffset, work ) / Ynorm;
            }
            
            /* =====================================================================
               Compute the Difference Cublas vs. Magma
               =================================================================== */
            Ynorm = lapackf77_zlange( "F", &Noffset, &ione, Ycublas, &Noffset, work );
            blasf77_zaxpy( &Noffset, &c_neg_one, Ymagma, &incx, Ycublas, &incx );
            error = lapackf77_zlange( "F", &Noffset, &ione, Ycublas, &Noffset, work ) / Ynorm;
            
            bool okay = (error < tol && error2 < tol);
            status += ! okay;
            if ( opts.lapack ) {
                printf( "%5d  %5d   %7.2f (%7.2f)   %7.2f (%7.2f)   %7.2f (%7.2f)   %7.2f (%7.2f)   %8.2e   %8.2e   %s\n",
                        (int) N, (int) offset,
                         cpu_perf,  cpu_time*1000.,
                        cuda_perf, cuda_time*1000.,
                         gpu_perf,  gpu_time*1000.,
                        mgpu_perf, mgpu_time*1000.,
                        error, error2, (okay ? "ok" : "failed") );
            }
            else {
                printf( "%5d  %5d     ---   (  ---  )   %7.2f (%7.2f)   %7.2f (%7.2f)   %7.2f (%7.2f)   %8.2e     ---      %s\n",
                        (int) N, (int) offset,
                        cuda_perf, cuda_time*1000.,
                         gpu_perf,  gpu_time*1000.,
                        mgpu_perf, mgpu_time*1000.,
                        error, (okay ? "ok" : "failed") );
            }
            
            /* Free Memory */
            TESTING_FREE_CPU( A );
            TESTING_FREE_CPU( Y );
            TESTING_FREE_CPU( Ycublas );
            TESTING_FREE_CPU( Ymagma  );
            TESTING_FREE_CPU( Ymagma1 );
            TESTING_FREE_CPU( Ylapack );

            TESTING_FREE_PIN( X );
            TESTING_FREE_PIN( hwork   );
            
            magma_setdevice( opts.device );
            TESTING_FREE_DEV( dA );
            TESTING_FREE_DEV( dX );
            TESTING_FREE_DEV( dY );
            
            for( dev=0; dev < opts.ngpu; dev++ ) {
                magma_setdevice( dev );
                TESTING_FREE_DEV( d_lA[dev]  );
                TESTING_FREE_DEV( dwork[dev] );
            }
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
        
      // comment out these two lines line & top of loop test a specific offset
      }  // end for ioffset
      printf( "\n" );
    }
    
    for( dev=0; dev < opts.ngpu; ++dev ) {
        magma_queue_destroy( queues[dev] );
    }
    
    opts.cleanup();
    TESTING_FINALIZE();
    return status;
}
Exemplo n.º 7
0
//##################################################################################################
static void *magma_zapplyQ_m_parallel_section(void *arg)
{
    magma_int_t my_core_id     = ((magma_zapplyQ_m_id_data*)arg) -> id;
    magma_zapplyQ_m_data* data = ((magma_zapplyQ_m_id_data*)arg) -> data;

    magma_int_t ngpu          = data -> ngpu;
    magma_int_t allcores_num   = data -> threads_num;
    magma_int_t n              = data -> n;
    magma_int_t ne             = data -> ne;
    magma_int_t n_gpu          = data -> n_gpu;
    magma_int_t nb             = data -> nb;
    magma_int_t Vblksiz        = data -> Vblksiz;
    magmaDoubleComplex *E         = data -> E;
    magma_int_t lde            = data -> lde;
    magmaDoubleComplex *V         = data -> V;
    magma_int_t ldv            = data -> ldv;
    magmaDoubleComplex *TAU       = data -> TAU;
    magmaDoubleComplex *T         = data -> T;
    magma_int_t ldt            = data -> ldt;
    pthread_barrier_t* barrier = &(data -> barrier);

    magma_int_t info;

    #ifdef ENABLE_TIMER
    real_Double_t timeQcpu=0.0, timeQgpu=0.0;
    #endif

    magma_int_t n_cpu = ne - n_gpu;

    // with MKL and when using omp_set_num_threads instead of mkl_set_num_threads
    // it need that all threads setting it to 1.
    magma_set_lapack_numthreads(1);

#ifdef MAGMA_SETAFFINITY
    //#define PRINTAFFINITY
#ifdef PRINTAFFINITY
    affinity_set print_set;
    print_set.print_affinity(my_core_id, "starting affinity");
#endif
    affinity_set original_set;
    affinity_set new_set(my_core_id);
    int check  = 0;
    int check2 = 0;
    // bind threads
    check = original_set.get_affinity();
    if (check == 0) {
        check2 = new_set.set_affinity();
        if (check2 != 0)
            printf("Error in sched_setaffinity (single cpu)\n");
    }
    else {
        printf("Error in sched_getaffinity\n");
    }
#ifdef PRINTAFFINITY
    print_set.print_affinity(my_core_id, "set affinity");
#endif
#endif

    if (my_core_id == 0) {
        //=============================================
        //   on GPU on thread 0:
        //    - apply V2*Z(:,1:N_GPU)
        //=============================================
        #ifdef ENABLE_TIMER
        timeQgpu = magma_wtime();
        #endif

        magma_zbulge_applyQ_v2_m(ngpu, MagmaLeft, n_gpu, n, nb, Vblksiz, E, lde, V, ldv, T, ldt, &info);
        magma_device_sync();

        #ifdef ENABLE_TIMER
        timeQgpu = magma_wtime()-timeQgpu;
        printf("  Finish Q2_GPU GGG timing= %f\n", timeQgpu);
        #endif
    } else {
        //=============================================
        //   on CPU on threads 1:allcores_num-1:
        //    - apply V2*Z(:,N_GPU+1:NE)
        //=============================================
        #ifdef ENABLE_TIMER
        if (my_core_id == 1)
            timeQcpu = magma_wtime();
        #endif

        magma_int_t n_loc = magma_ceildiv(n_cpu, allcores_num-1);
        magmaDoubleComplex* E_loc = E + (n_gpu+ n_loc * (my_core_id-1))*lde;
        n_loc = min(n_loc,n_cpu - n_loc * (my_core_id-1));

        magma_ztile_bulge_applyQ(my_core_id, MagmaLeft, n_loc, n, nb, Vblksiz, E_loc, lde, V, ldv, TAU, T, ldt);
        pthread_barrier_wait(barrier);

        #ifdef ENABLE_TIMER
        if (my_core_id == 1) {
            timeQcpu = magma_wtime()-timeQcpu;
            printf("  Finish Q2_CPU CCC timing= %f\n", timeQcpu);
        }
        #endif
    } // END if my_core_id

#ifdef MAGMA_SETAFFINITY
    // unbind threads
    if (check == 0) {
        check2 = original_set.set_affinity();
        if (check2 != 0)
            printf("Error in sched_setaffinity (restore cpu list)\n");
    }
#ifdef PRINTAFFINITY
    print_set.print_affinity(my_core_id, "restored_affinity");
#endif
#endif

    return 0;
}
Exemplo n.º 8
0
static void magma_stile_bulge_computeT_parallel(magma_int_t my_core_id, magma_int_t cores_num, float *V, magma_int_t ldv, float *TAU,
                                                float *T, magma_int_t ldt, magma_int_t n, magma_int_t nb, magma_int_t Vblksiz)
{
    //%===========================
    //%   local variables
    //%===========================
    magma_int_t Vm, Vn, mt, nt;
    magma_int_t myrow, mycol, blkj, blki, firstrow;
    magma_int_t blkid,vpos,taupos,tpos;
    magma_int_t blkpercore, myid;

    if(n<=0)
        return ;

    magma_int_t blkcnt = magma_bulge_get_blkcnt(n, nb, Vblksiz);
    blkpercore = blkcnt/cores_num;
    blkpercore = blkpercore==0 ? 1:blkpercore;
    //magma_int_t nbGblk  = magma_ceildiv(n-1, Vblksiz);

    #ifdef ENABLE_DEBUG
    if(my_core_id==0) 
        printf("  COMPUTE T parallel threads %d with  N %d   NB %d   Vblksiz %d \n",cores_num,n,nb,Vblksiz);
    #endif



    /*========================================
     * compute the T's in parallel.
     * The Ts are independent so each core pick
     * a T and compute it. The loop is based on 
     * the version 113 of the applyQ
     * which go over the losange block_column 
     * by block column. but it is not important 
     * here the order because Ts are independent.
     * ========================================
    */ 
    nt  = magma_ceildiv((n-1),Vblksiz);
    for (blkj=nt-1; blkj>=0; blkj--) {
        /* the index of the first row on the top of block (blkj) */ 
        firstrow = blkj * Vblksiz + 1;
        /*find the number of tile for this block */
        if( blkj == nt-1 )
            mt = magma_ceildiv( n -  firstrow,    nb);
        else
            mt = magma_ceildiv( n - (firstrow+1), nb);
        /*loop over the tiles find the size of the Vs and apply it */
        for (blki=mt; blki>0; blki--) {
            /*calculate the size of each losange of Vs= (Vm,Vn)*/
            myrow     = firstrow + (mt-blki)*nb;
            mycol     = blkj*Vblksiz;
            Vm = min( nb+Vblksiz-1, n-myrow);
            if( ( blkj == nt-1 ) && ( blki == mt ) ){
                Vn = min (Vblksiz, Vm);
            } else {
                Vn = min (Vblksiz, Vm-1);
            }
            /*calculate the pointer to the Vs and the Ts.
             * Note that Vs and Ts have special storage done
             * by the bulgechasing function*/
            magma_bulge_findVTAUTpos(n, nb, Vblksiz, mycol, myrow, ldv, ldt, &vpos, &taupos, &tpos, &blkid);
            myid = blkid/blkpercore;
            if( my_core_id==(myid%cores_num) ){
                if( ( Vm > 0 ) && ( Vn > 0 ) ){
                    lapackf77_slarft( "F", "C", &Vm, &Vn, V(vpos), &ldv, TAU(taupos), T(tpos), &ldt);
                }
            }
        }
    }
}
Exemplo n.º 9
0
//##################################################################################################
static void *magma_sapplyQ_parallel_section(void *arg)
{

    magma_int_t my_core_id   = ((magma_sapplyQ_id_data*)arg) -> id;
    magma_sapplyQ_data* data = ((magma_sapplyQ_id_data*)arg) -> data;
    
    magma_int_t allcores_num   = data -> threads_num;
    magma_int_t n              = data -> n;
    magma_int_t ne             = data -> ne;
    magma_int_t n_gpu          = data -> n_gpu;
    magma_int_t nb             = data -> nb;
    magma_int_t Vblksiz        = data -> Vblksiz;
    float *E         = data -> E;
    magma_int_t lde            = data -> lde;
    float *V         = data -> V;
    magma_int_t ldv            = data -> ldv;
    float *TAU       = data -> TAU;
    float *T         = data -> T;
    magma_int_t ldt            = data -> ldt;
    float *dE        = data -> dE;
    magma_int_t ldde           = data -> ldde;
    pthread_barrier_t* barrier = &(data -> barrier);
 
    magma_int_t info;
    
    real_Double_t timeQcpu=0.0, timeQgpu=0.0;
    
    magma_int_t n_cpu = ne - n_gpu;

#if defined(SETAFFINITY)    
    cpu_set_t set;
    CPU_ZERO( &set );
    CPU_SET( my_core_id, &set );
    sched_setaffinity( 0, sizeof(set), &set) ;
#endif
   
            if(my_core_id==0)
            {
                //=============================================
                //   on GPU on thread 0:
                //    - apply V2*Z(:,1:N_GPU)
                //=============================================
                timeQgpu = magma_wtime();
                
                magma_ssetmatrix(n, n_gpu, E, lde, dE, ldde);
                magma_sbulge_applyQ_v2('L', n_gpu, n, nb, Vblksiz, dE, ldde, V, ldv, T, ldt, &info);
                
                magma_device_sync();
                timeQgpu = magma_wtime()-timeQgpu;
                printf("  Finish Q2_GPU GGG timing= %f \n" ,timeQgpu);

            }else{
                //=============================================
                //   on CPU on threads 1:allcores_num-1:
                //    - apply V2*Z(:,N_GPU+1:NE)
                //=============================================
                if(my_core_id == 1)
                    timeQcpu = magma_wtime();
                
                magma_int_t n_loc = magma_ceildiv(n_cpu, allcores_num-1);
                float* E_loc = E + (n_gpu+ n_loc * (my_core_id-1))*lde;
                n_loc = min(n_loc,n_cpu - n_loc * (my_core_id-1));
                
                magma_stile_bulge_applyQ('L', n_loc, n, nb, Vblksiz, E_loc, lde, V, ldv, TAU, T, ldt);
                pthread_barrier_wait(barrier);
                if(my_core_id == 1){
                    timeQcpu = magma_wtime()-timeQcpu;
                    printf("  Finish Q2_CPU CCC timing= %f \n" ,timeQcpu);
                }
                
            } // END if my_core_id
        
    
#if defined(SETAFFINITY)    
    // unbind threads 
    magma_int_t sys_corenbr = 1;
    sys_corenbr = sysconf(_SC_NPROCESSORS_ONLN);
    CPU_ZERO( &set );
    for(magma_int_t i=0; i<sys_corenbr; i++)
        CPU_SET( i, &set );
    sched_setaffinity( 0, sizeof(set), &set) ;
#endif
    
    return 0;
}
Exemplo n.º 10
0
 magma_int_t plasma_ceildiv(magma_int_t a, magma_int_t b)
 {
     return magma_ceildiv(a,b);
 }
Exemplo n.º 11
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing strtri
*/
int main( int argc, char** argv )
{
    TESTING_INIT();

    real_Double_t   gflops, magma_perf, magma_time=0;  //, cpu_perf=0, cpu_time=0;
    float          magma_error, norm_invA, work[1];
    magma_int_t i, j, N, lda, ldda, info;
    magma_int_t jb, nb, nblock, sizeA, size_inv;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t *ipiv;

    float *h_A, *h_dinvA;
    magmaFloat_ptr d_A, d_dinvA;
    float c_neg_one = MAGMA_S_NEG_ONE;
    magma_int_t status = 0;
    
    magma_opts opts;
    opts.parse_opts( argc, argv );
    opts.lapack |= opts.check;  // check (-c) implies lapack (-l)
    
    float tol = opts.tolerance * lapackf77_slamch("E");
    const char *uplo_ = lapack_uplo_const(opts.uplo);

    // this is the NB hard coded into strtri_diag.
    nb = 128;
    
    printf("%% uplo = %s, diag = %s\n",
           lapack_uplo_const(opts.uplo), lapack_diag_const(opts.diag) );
    printf("%%   N  MAGMA Gflop/s (ms)   MAGMA error\n");
    printf("%%======================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            N = opts.nsize[itest];
            lda = N;
            ldda = magma_roundup( lda, opts.align );  // multiple of 32 by default
            nblock = magma_ceildiv( N, nb );
            gflops = nblock * FLOPS_STRTRI( nb ) / 1e9;
            
            TESTING_MALLOC_CPU( h_A,    float, lda*N );
            TESTING_MALLOC_CPU( ipiv,   magma_int_t,        N     );
            
            size_inv = nblock*nb*nb;
            TESTING_MALLOC_DEV( d_A,    float, ldda*N );
            TESTING_MALLOC_DEV( d_dinvA, float, size_inv );
            TESTING_MALLOC_CPU( h_dinvA, float, size_inv );
            
            /* Initialize the matrices */
            /* Factor A into LU to get well-conditioned triangular matrix.
             * Copy L to U, since L seems okay when used with non-unit diagonal
             * (i.e., from U), while U fails when used with unit diagonal. */
            sizeA = lda*N;
            lapackf77_slarnv( &ione, ISEED, &sizeA, h_A );
            lapackf77_sgetrf( &N, &N, h_A, &lda, ipiv, &info );
            for( j = 0; j < N; ++j ) {
                for( i = 0; i < j; ++i ) {
                    *h_A(i,j) = *h_A(j,i);
                }
            }
            
            /* =====================================================================
               Performs operation using MAGMABLAS
               =================================================================== */
            magma_ssetmatrix( N, N, h_A, lda, d_A, ldda, opts.queue );
            
            magma_time = magma_sync_wtime( opts.queue );
            magmablas_strtri_diag( opts.uplo, opts.diag, N, d_A, ldda, d_dinvA, opts.queue );
            magma_time = magma_sync_wtime( opts.queue ) - magma_time;
            magma_perf = gflops / magma_time;
            
            magma_sgetvector( size_inv, d_dinvA, 1, h_dinvA, 1, opts.queue );
            
            if ( opts.verbose ) {
                printf( "A%d=", (int) N );
                magma_sprint( N, N, h_A, lda );
                printf( "d_dinvA%d=", (int) N );
                magma_sprint( min(N+4, nb), min(N+4, nblock*nb), h_dinvA, nb );
            }
            
            /* =====================================================================
               Performs operation using LAPACK
               =================================================================== */
            if ( opts.lapack ) {
                //cpu_time = magma_wtime();
                lapackf77_strtri(
                    lapack_uplo_const(opts.uplo), lapack_diag_const(opts.diag),
                    &N, h_A, &lda, &info );
                //cpu_time = magma_wtime() - cpu_time;
                //cpu_perf = gflops / cpu_time;
            }
            
            /* =====================================================================
               Check the result
               =================================================================== */
            if ( opts.check ) {
                // |invA - invA_magma| / |invA|, accumulated over all diagonal blocks
                magma_error = 0;
                norm_invA   = 0;
                for( i=0; i < N; i += nb ) {
                    jb = min( nb, N-i );
                    sgeadd( jb, jb, c_neg_one, h_A(i, i), lda, h_dinvA(0, i), nb );
                    magma_error = max( magma_error, lapackf77_slantr( "M", uplo_, MagmaNonUnitStr, &jb, &jb, h_dinvA(0, i), &nb,  work ));
                    norm_invA   = max( norm_invA,   lapackf77_slantr( "M", uplo_, MagmaNonUnitStr, &jb, &jb, h_A(i, i),     &lda, work ));
                }
                magma_error /= norm_invA;
                
                // CPU is doing N-by-N inverse, while GPU is doing (N/NB) NB-by-NB inverses.
                // So don't compare performance.
                printf("%5d   %7.2f (%7.2f)   %8.2e   %s\n",
                        (int) N,
                        magma_perf,  1000.*magma_time,
                        //cpu_perf,    1000.*cpu_time,
                        magma_error,
                        (magma_error < tol ? "ok" : "failed"));
                status += ! (magma_error < tol);
            }
            else {
                printf("%5d   %7.2f (%7.2f)      ---\n",
                        (int) N,
                        magma_perf,  1000.*magma_time );
            }
            
            TESTING_FREE_CPU( h_A     );
            TESTING_FREE_CPU( ipiv    );
            
            TESTING_FREE_DEV( d_A     );
            TESTING_FREE_DEV( d_dinvA );
            TESTING_FREE_CPU( h_dinvA );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    opts.cleanup();
    TESTING_FINALIZE();
    return status;
}
Exemplo n.º 12
0
/**
    Purpose
    -------


    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangles of A is stored;
      -     = MagmaLower:  Lower triangles of A is stored.

    @param[in]
    n       INTEGER
            The order of the matrix A.  N >= 0.

    @param[in]
    nb      INTEGER
            The order of the band matrix A.  N >= NB >= 0.

    @param[in]
    Vblksiz INTEGER
            The size of the block of householder vectors applied at once.

    @param[in]
    A       (workspace) COMPLEX_16 array, dimension (LDA, N)
            On entry the band matrix stored in the following way:

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= 2*NB.

    @param[out]
    d       DOUBLE array, dimension (N)
            The diagonal elements of the tridiagonal matrix T:
            D(i) = A(i,i).

    @param[out]
    e       DOUBLE array, dimension (N-1)
            The off-diagonal elements of the tridiagonal matrix T:
            E(i) = A(i,i+1) if UPLO = MagmaUpper, E(i) = A(i+1,i) if UPLO = MagmaLower.

    @param[out]
    V       COMPLEX_16 array, dimension (BLKCNT, LDV, VBLKSIZ)
            On exit it contains the blocks of householder reflectors
            BLKCNT is the number of block and it is returned by the funtion MAGMA_BULGE_GET_BLKCNT.

    @param[in]
    ldv     INTEGER
            The leading dimension of V.
            LDV > NB + VBLKSIZ + 1

    @param[out]
    TAU     COMPLEX_16 dimension(BLKCNT, VBLKSIZ)
            ???

    @param[in]
    compT   INTEGER
            if COMPT = 0 T is not computed
            if COMPT = 1 T is computed

    @param[out]
    T       COMPLEX_16 dimension(LDT *)
            if COMPT = 1 on exit contains the matrices T needed for Q2
            if COMPT = 0 T is not referenced

    @param[in]
    ldt     INTEGER
            The leading dimension of T.
            LDT > Vblksiz

    @ingroup magma_zheev_2stage
    ********************************************************************/
extern "C" magma_int_t
magma_zhetrd_hb2st(
    magma_uplo_t uplo, magma_int_t n, magma_int_t nb, magma_int_t Vblksiz,
    magmaDoubleComplex *A, magma_int_t lda, double *d, double *e,
    magmaDoubleComplex *V, magma_int_t ldv, magmaDoubleComplex *TAU,
    magma_int_t compT, magmaDoubleComplex *T, magma_int_t ldt)
{
    #ifdef ENABLE_TIMER
    real_Double_t timeblg=0.0;
    #endif

    magma_int_t threads = magma_get_parallel_numthreads();
    magma_int_t mklth   = magma_get_lapack_numthreads();
    magma_set_lapack_numthreads(1);

    //const char* uplo_ = lapack_uplo_const( uplo );
    magma_int_t INgrsiz=1;
    magma_int_t blkcnt = magma_bulge_get_blkcnt(n, nb, Vblksiz);
    magma_int_t nbtiles = magma_ceildiv(n, nb);

    memset(T,   0, blkcnt*ldt*Vblksiz*sizeof(magmaDoubleComplex));
    memset(TAU, 0, blkcnt*Vblksiz*sizeof(magmaDoubleComplex));
    memset(V,   0, blkcnt*ldv*Vblksiz*sizeof(magmaDoubleComplex));

    magma_int_t* prog;
    magma_malloc_cpu((void**) &prog, (2*nbtiles+threads+10)*sizeof(magma_int_t));
    memset(prog, 0, (2*nbtiles+threads+10)*sizeof(magma_int_t));

    magma_zbulge_id_data* arg;
    magma_malloc_cpu((void**) &arg, threads*sizeof(magma_zbulge_id_data));

    pthread_t* thread_id;
    magma_malloc_cpu((void**) &thread_id, threads*sizeof(pthread_t));
    pthread_attr_t thread_attr;

    magma_zbulge_data data_bulge(threads, n, nb, nbtiles, INgrsiz, Vblksiz, compT,
                                 A, lda, V, ldv, TAU, T, ldt, prog);

    // Set one thread per core
    pthread_attr_init(&thread_attr);
    pthread_attr_setscope(&thread_attr, PTHREAD_SCOPE_SYSTEM);
    pthread_setconcurrency(threads);

    //timing
    #ifdef ENABLE_TIMER
    timeblg = magma_wtime();
    #endif

    // Launch threads
    for (magma_int_t thread = 1; thread < threads; thread++) {
        arg[thread] = magma_zbulge_id_data(thread, &data_bulge);
        pthread_create(&thread_id[thread], &thread_attr, magma_zhetrd_hb2st_parallel_section, &arg[thread]);
    }
    arg[0] = magma_zbulge_id_data(0, &data_bulge);
    magma_zhetrd_hb2st_parallel_section(&arg[0]);

    // Wait for completion
    for (magma_int_t thread = 1; thread < threads; thread++) {
        void *exitcodep;
        pthread_join(thread_id[thread], &exitcodep);
    }

    // timing
    #ifdef ENABLE_TIMER
    timeblg = magma_wtime()-timeblg;
    printf("  time BULGE+T = %f\n", timeblg);
    #endif

    magma_free_cpu(thread_id);
    magma_free_cpu(arg);
    magma_free_cpu(prog);

    magma_set_lapack_numthreads(mklth);
    /*================================================
     *  store resulting diag and lower diag d and e
     *  note that d and e are always real
     *================================================*/

    /* Make diagonal and superdiagonal elements real,
     * storing them in d and e
     */
    /* In complex case, the off diagonal element are
     * not necessary real. we have to make off-diagonal
     * elements real and copy them to e.
     * When using HouseHolder elimination,
     * the ZLARFG give us a real as output so, all the
     * diagonal/off-diagonal element except the last one are already
     * real and thus we need only to take the abs of the last
     * one.
     *  */

#if defined(PRECISION_z) || defined(PRECISION_c)
    if (uplo == MagmaLower) {
        for (magma_int_t i=0; i < n-1; i++) {
            d[i] = MAGMA_Z_REAL( A[i*lda  ] );
            e[i] = MAGMA_Z_REAL( A[i*lda+1] );
        }
        d[n-1] = MAGMA_Z_REAL(A[(n-1)*lda]);
    } else { /* MagmaUpper not tested yet */
        for (magma_int_t i=0; i < n-1; i++) {
            d[i] = MAGMA_Z_REAL( A[i*lda+nb]   );
            e[i] = MAGMA_Z_REAL( A[i*lda+nb-1] );
        }
        d[n-1] = MAGMA_Z_REAL(A[(n-1)*lda+nb]);
    } /* end MagmaUpper */
#else
    if ( uplo == MagmaLower ) {
        for (magma_int_t i=0; i < n-1; i++) {
            d[i] = A[i*lda];   // diag
            e[i] = A[i*lda+1]; // lower diag
        }
        d[n-1] = A[(n-1)*lda];
    } else {
        for (magma_int_t i=0; i < n-1; i++) {
            d[i] = A[i*lda+nb];   // diag
            e[i] = A[i*lda+nb-1]; // lower diag
        }
        d[n-1] = A[(n-1)*lda+nb];
    }
#endif
    return MAGMA_SUCCESS;
}
Exemplo n.º 13
0
/**
    Purpose
    -------
    SGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.

    The factorization has the form
        A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.

    Arguments
    ---------
    @param[in]
    ngpu    INTEGER
            Number of GPUs to use. ngpu > 0.

    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    d_lA    REAL array of pointers on the GPU, dimension (ngpu).
            On entry, the M-by-N matrix A distributed over GPUs
            (d_lA[d] points to the local matrix on d-th GPU).
            It uses 1D block column cyclic format with the block size of nb,
            and each local matrix is stored by column.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    @param[in]
    ldda     INTEGER
            The leading dimension of the array d_lA.  LDDA >= max(1,M).

    @param[out]
    ipiv    INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.
      -     > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.

    @ingroup magma_sgesv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_sgetrf_mgpu(
    magma_int_t ngpu,
    magma_int_t m, magma_int_t n,
    magmaFloat_ptr d_lA[], magma_int_t ldda, magma_int_t *ipiv,
    magma_int_t *info)
{
    magma_int_t nb, n_local[MagmaMaxGPUs];
    magma_int_t maxm;
    magma_int_t i, j, d, lddat, lddwork;
    float *d_lAT[MagmaMaxGPUs];
    float *d_panel[MagmaMaxGPUs], *work;
    magma_queue_t queues[MagmaMaxGPUs][2];

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -2;
    else if (n < 0)
        *info = -3;
    else if (ldda < max(1,m))
        *info = -5;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* create the queues */
    for( d=0; d < ngpu; d++ ) {
        magma_queue_create( d, &queues[d][0] );
        magma_queue_create( d, &queues[d][1] );
    }

    /* Function Body */
    nb = magma_get_sgetrf_nb( m, n );

    if (nb <= 1 || nb >= n) {
        /* Use CPU code. */
        magma_smalloc_cpu( &work, m * n );
        if ( work == NULL ) {
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }
        magma_sgetmatrix( m, n, d_lA[0], ldda, work, m, queues[0][0] );
        lapackf77_sgetrf(&m, &n, work, &m, ipiv, info);
        magma_ssetmatrix( m, n, work, m, d_lA[0], ldda, queues[0][0] );
        magma_free_cpu(work);
    } else {
        /* Use hybrid blocked code. */
        magma_device_t orig_dev;
        magma_getdevice( &orig_dev );
        
        maxm = magma_roundup( m, 32 );
        if ( ngpu > ceil((float)n/nb) ) {
            printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu );
            *info = -1;
            return *info;
        }

        /* allocate workspace for each GPU */
        lddat = magma_roundup( ((magma_ceildiv( n, nb )/ngpu)*nb), 32 );
        lddat = magma_ceildiv( n, nb );        /* number of block columns         */
        lddat = magma_ceildiv( lddat, ngpu );  /* number of block columns per GPU */
        lddat = nb*lddat;                      /* number of columns per GPU       */
        lddat = magma_roundup( lddat, 32 );    /* make it a multiple of 32        */
        for (i=0; i < ngpu; i++) {
            magma_setdevice(i);
            
            /* local-n and local-ld */
            n_local[i] = ((n/nb)/ngpu)*nb;
            if (i < (n/nb)%ngpu)
                n_local[i] += nb;
            else if (i == (n/nb)%ngpu)
                n_local[i] += n%nb;
            
            /* workspaces */
            if (MAGMA_SUCCESS != magma_smalloc( &d_panel[i], (3+ngpu)*nb*maxm )) {
                for( j=0; j <= i; j++ ) {
                    magma_setdevice(j);
                }
                for( j=0; j < i; j++ ) {
                    magma_setdevice(j);
                    magma_free( d_panel[j] );
                    magma_free( d_lAT[j]   );
                }
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }
            
            /* local-matrix storage */
            if (MAGMA_SUCCESS != magma_smalloc( &d_lAT[i], lddat*maxm )) {
                for( j=0; j <= i; j++ ) {
                    magma_setdevice(j);
                    magma_free( d_panel[j] );
                }
                for( j=0; j < i; j++ ) {
                    magma_setdevice(j);
                    magma_free( d_lAT[j] );
                }
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }
            
            magmablas_stranspose( m, n_local[i], d_lA[i], ldda, d_lAT[i], lddat, queues[i][1] );
        }
        for (i=0; i < ngpu; i++) {
            magma_setdevice(i);
            magma_queue_sync(queues[i][0]);
        }
        magma_setdevice(0);

        /* cpu workspace */
        lddwork = maxm;
        if (MAGMA_SUCCESS != magma_smalloc_pinned( &work, lddwork*nb*ngpu )) {
            for (i=0; i < ngpu; i++ ) {
                magma_setdevice(i);
                magma_free( d_panel[i] );
                magma_free( d_lAT[i]   );
            }
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }

        /* calling multi-gpu interface with allocated workspaces and queues */
        magma_sgetrf2_mgpu(ngpu, m, n, nb, 0, d_lAT, lddat, ipiv, d_panel, work, maxm,
                           queues, info);

        /* clean up */
        for( d=0; d < ngpu; d++ ) {
            magma_setdevice(d);
            
            /* save on output */
            magmablas_stranspose( n_local[d], m, d_lAT[d], lddat, d_lA[d], ldda, queues[d][0] );
            magma_queue_sync(queues[d][0]);
            magma_queue_sync(queues[d][1]);

            magma_free( d_lAT[d]   );
            magma_free( d_panel[d] );
        } /* end of for d=1,..,ngpu */
        magma_setdevice( orig_dev );
        magma_free_pinned( work );
    }

    /* clean up */
    for( d=0; d < ngpu; d++ ) {
        magma_setdevice(d);
        magma_queue_destroy( queues[d][0] );
        magma_queue_destroy( queues[d][1] );
    }

    return *info;
}
Exemplo n.º 14
0
extern "C" magma_int_t
magma_zmslice(
    magma_int_t num_slices,
    magma_int_t slice,
    magma_z_matrix A, 
    magma_z_matrix *B,
    magma_z_matrix *ALOC,
    magma_z_matrix *ANLOC,
    magma_index_t *comm_i,
    magmaDoubleComplex *comm_v,
    magma_int_t *start,
    magma_int_t *end,
    magma_queue_t queue )
{
    magma_int_t info = 0;
    
    if( A.num_rows != A.num_cols ){
        printf("%%  error: only supported for square matrices.\n");
        info = MAGMA_ERR_NOT_SUPPORTED;
        goto cleanup;
    }
    
    if ( A.memory_location == Magma_CPU
            && A.storage_type == Magma_CSR ){
        CHECK( magma_zmconvert( A, B, Magma_CSR, Magma_CSR, queue ) );
        magma_free_cpu( B->col );
        magma_free_cpu( B->val );
        CHECK( magma_zmconvert( A, ALOC, Magma_CSR, Magma_CSR, queue ) );
        magma_free_cpu( ALOC->col );
        magma_free_cpu( ALOC->row );
        magma_free_cpu( ALOC->val );
        CHECK( magma_zmconvert( A, ANLOC, Magma_CSR, Magma_CSR, queue ) );
        magma_free_cpu( ANLOC->col );
        magma_free_cpu( ANLOC->row );
        magma_free_cpu( ANLOC->val );
        
        magma_int_t i,j,k, nnz, nnz_loc=0, loc_row = 0, nnz_nloc = 0;
        magma_index_t col;
        magma_int_t size = magma_ceildiv( A.num_rows, num_slices ); 
        magma_int_t lstart = slice*size;
        magma_int_t lend = min( (slice+1)*size, A.num_rows );
        // correct size for last slice
        size = lend-lstart;
        CHECK( magma_index_malloc_cpu( &ALOC->row, size+1 ) );
        CHECK( magma_index_malloc_cpu( &ANLOC->row, size+1 ) );
        
        // count elements for slice - identity for rest
        nnz = A.row[ lend ] - A.row[ lstart ] + ( A.num_rows - size );
        CHECK( magma_index_malloc_cpu( &B->col, nnz ) );
        CHECK( magma_zmalloc_cpu( &B->val, nnz ) );         
        
        // for the communication plan
        for( i=0; i<A.num_rows; i++ ) {
            comm_i[i] = 0;
            comm_v[i] = MAGMA_Z_ZERO;
        }
        
        k=0;
        B->row[i] = 0;
        ALOC->row[0] = 0;
        ANLOC->row[0] = 0;
        // identity above slice
        for( i=0; i<lstart; i++ ) {
            B->row[i+1]   = B->row[i]+1;
            B->val[k] = MAGMA_Z_ONE;
            B->col[k] = i;
            k++;
        }
        
        // slice        
        for( i=lstart; i<lend; i++ ) {
            B->row[i+1]   = B->row[i] + (A.row[i+1]-A.row[i]);
            for( j=A.row[i]; j<A.row[i+1]; j++ ){
                B->val[k] = A.val[j];
                col = A.col[j];
                B->col[k] = col;
                // communication plan
                if( col<lstart || col>=lend ){
                    comm_i[ col ] = 1;
                    comm_v[ col ] = comm_v[ col ] 
                            + MAGMA_Z_MAKE( MAGMA_Z_ABS( A.val[j] ), 0.0 );
                    nnz_nloc++;
                } else {
                    nnz_loc++;   
                }
                k++;
            }
            loc_row++;
            ALOC->row[ loc_row ] = nnz_loc;
            ANLOC->row[ loc_row ] = nnz_nloc;
        }
        CHECK( magma_index_malloc_cpu( &ALOC->col, nnz_loc ) );
        CHECK( magma_zmalloc_cpu( &ALOC->val, nnz_loc ) ); 
        ALOC->num_rows = size;
        ALOC->num_cols = size;
        ALOC->nnz = nnz_loc;
        
        CHECK( magma_index_malloc_cpu( &ANLOC->col, nnz_nloc ) );
        CHECK( magma_zmalloc_cpu( &ANLOC->val, nnz_nloc ) ); 
        ANLOC->num_rows = size;
        ANLOC->num_cols = A.num_cols;
        ANLOC->nnz = nnz_nloc;
        
        nnz_loc = 0;
        nnz_nloc = 0;
        // local/nonlocal matrix        
        for( i=lstart; i<lend; i++ ) {
            for( j=A.row[i]; j<A.row[i+1]; j++ ){
                col = A.col[j];
                // insert only in local part in ALOC, nonlocal in ANLOC
                if( col<lstart || col>=lend ){
                    ANLOC->val[ nnz_nloc ] = A.val[j];
                    ANLOC->col[ nnz_nloc ] = col;  
                    nnz_nloc++;
                } else {
                    ALOC->val[ nnz_loc ] = A.val[j];
                    ALOC->col[ nnz_loc ] = col-lstart;  
                    nnz_loc++;
                }
            }
        }
        
        // identity below slice
        for( i=lend; i<A.num_rows; i++ ) {
            B->row[i+1] = B->row[i]+1;
            B->val[k] = MAGMA_Z_ONE;
            B->col[k] = i;
            k++;
        }
        B->nnz = k;
        *start = lstart;
        *end = lend;
        
    }
    else {
        printf("error: mslice only supported for CSR matrices on the CPU: %d %d.\n", 
                int(A.memory_location), int(A.storage_type) );
        info = MAGMA_ERR_NOT_SUPPORTED;
    }
cleanup:
    return info;
}
Exemplo n.º 15
0
void magmablas_ssymm_mgpu_spec(
    magma_side_t side, magma_uplo_t uplo, magma_int_t m, magma_int_t n,
    float alpha,
    magmaFloat_ptr dA[],    magma_int_t ldda,  magma_int_t offset,
    magmaFloat_ptr dB[],    magma_int_t lddb,
    float beta,
    magmaFloat_ptr dC[],    magma_int_t lddc,
    magmaFloat_ptr dwork[], magma_int_t dworksiz,
    float *C,          magma_int_t ldc,
    float *work[],     magma_int_t worksiz,  // TODO unused
    magma_int_t ngpu, magma_int_t nb, 
    magma_queue_t queues[][20], magma_int_t nqueue, 
    magma_event_t redevents[][MagmaMaxGPUs*MagmaMaxGPUs+10], magma_int_t nbevents, 
    magma_int_t gnode[MagmaMaxGPUs][MagmaMaxGPUs+2], magma_int_t nbcmplx )
{
    #define dA(dev, i, j) (dA[dev] + (i) + (j)*ldda)
    #define dB(dev, i, j) (dB[dev] + (i) + (j)*lddb)
    #define dC(dev, i, j) (dC[dev] + (i) + (j)*lddc)
    #define dwork(dev, i, j) (dwork[dev] + (i) + (j)*lddwork)
    #define C(i, j) (C + (i) + (j)*ldc)
    
    if ( side != MagmaLeft || uplo != MagmaLower ) {
        fprintf( stderr, "%s: only Left Lower implemented\n", __func__ );
    }
    
    assert( ldda >= m );
    assert( lddb >= m );
    assert( lddc >= m );
    assert( nqueue >= ngpu );
    assert( nbevents >= ngpu*ngpu );
    
    magmaFloat_ptr dwork1[MagmaMaxGPUs];
    magmaFloat_ptr dwork2[MagmaMaxGPUs];


    magma_int_t lddwork = lddc;
    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        dwork1[dev] = dwork[dev];
        dwork2[dev] = dwork[dev]+n*lddwork;
    }
    assert( dworksiz >= (2*n*lddwork) );




        
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_t cstream;
    magmablasGetKernelStream(&cstream);


    magma_int_t dev, devperm, myblk, mycolsize, myblkoffst;
    magma_int_t gdev, gcolsize, gmaster, gngpu;
    magma_int_t masterdev, lcdev, lccolsize, myngpu;

    magma_int_t stdev       = (offset/nb)%ngpu;  
    magma_int_t blockoffset = offset % nb;  
    magma_int_t fstblksiz   = 0;
    if(blockoffset>0){
        fstblksiz   = min(m, (nb - blockoffset));
    }
    //magma_int_t nbblk       = magma_ceildiv(m, nb);
    magma_int_t nbblk       = magma_ceildiv((m+blockoffset), nb);
    magma_int_t maxgsize    = n*nb*magma_ceildiv(nbblk, ngpu);
    magma_int_t remm        = m- fstblksiz;
    magma_int_t nbblkoffst  = offset/nb;


    magma_int_t nblstblks = -1;
    magma_int_t devlstblk = -1;
    magma_int_t lstblksiz = remm%nb;
    if(lstblksiz>0){
        nblstblks = nbblk%ngpu;
        devlstblk = (nblstblks-1+ngpu)%ngpu;
    }

    magma_int_t nbcmplxactive =  0;
    magma_int_t cmplxisactive[MagmaMaxGPUs];
    magma_int_t gpuisactive[MagmaMaxGPUs];
    memset(gpuisactive, 0, MagmaMaxGPUs*sizeof(magma_int_t));
    memset(cmplxisactive, 0, MagmaMaxGPUs*sizeof(magma_int_t));


    //*******************************
    //  each GPU make a GEMM with the
    //  transpose of its blocks to compute
    //  a final portion of X=A*VT
    //*******************************
    /* dB = V*T already ==> dB**H = T**H * V**H
     * compute T**H * V**H * X is equal to compute locally (VT)**H_i*X_i 
     * then  each GPU broadcast its X_i to assemble the full X which is used
     * to compute W  =  X  - 0.5 * V * T**H * V**H * X  = X - 0.5 * V *dwork3
     */
    if(ngpu ==1){
        magma_setdevice( 0 );
        magmablasSetKernelStream( queues[ 0 ][ 0 ] );
        // compute X[me] = A*VT = A[me]^tr *VT;
        magma_sgemm( MagmaConjTrans, MagmaNoTrans, m, n, m,
                     alpha, dA(0, offset, offset), ldda,
                            dB[0],         lddb,
                     beta,  dC[0], lddc );
        return;
    }
    //ngpu>1
    for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
        masterdev     = -1;
        gnode[cmplxid][MagmaMaxGPUs+1] = -1;
        myngpu = gnode[cmplxid][MagmaMaxGPUs];
        for( magma_int_t idev = 0; idev < myngpu; ++idev ) {
            dev         = gnode[cmplxid][idev];
            devperm     = (dev-stdev+ngpu)%ngpu;
            myblk       = (nbblk/ngpu) + (nbblk%ngpu > devperm ?  1:0 );
            mycolsize   = myblk*nb;
            myblkoffst  = nb*((nbblkoffst/ngpu)+(nbblkoffst%ngpu > dev?1:0));            
            if(dev==stdev){
                mycolsize  -=  blockoffset;
                myblkoffst +=  blockoffset;     // local index in parent matrix
            }
            if((devperm==devlstblk)&&(lstblksiz>0)){
                mycolsize -=  (nb-(remm%nb));
            }
            mycolsize = min(mycolsize, m);

        
            if(mycolsize>0){
                if(masterdev==-1) masterdev     = dev;
                //printf("dev %d devperm %d on cmplx %d  master %d nbblk %d myblk %d m %d n %d mycolsize %d stdev %d fstblksize %d lastdev %d lastsize %d dA(%d, %d, %d) ==> dwork(%d, %d)\n", dev, devperm, cmplxid, masterdev, nbblk, myblk, m, n, mycolsize, stdev, fstblksiz, devlstblk, remm%nb, dev, offset, myblkoffst, dev, maxgsize*dev);
                gpuisactive[dev] = mycolsize;
                magma_setdevice( dev );
                magmablasSetKernelStream( queues[ dev ][ dev ] );    

                magma_sgemm( MagmaConjTrans, MagmaNoTrans, mycolsize, n, m,
                             alpha, dA(dev, offset, myblkoffst), ldda,
                                    dB(dev, 0, 0),    lddb,
                             beta,  &dwork[dev][maxgsize*dev], mycolsize );
                magma_event_record(redevents[dev][dev*ngpu+dev], queues[dev][dev]);
            }
            if(dev == masterdev){
                nbcmplxactive = nbcmplxactive +1;
                cmplxisactive[cmplxid] = 1;
                gnode[cmplxid][MagmaMaxGPUs+1] = masterdev;
            }
        }
    }



/*
    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_setdevice( dev );
        magma_queue_sync( queues[ dev ][ dev ] );
    }
*/


    //*******************************
    //  each Master GPU has the final
    //  result either by receiving 
    //  from CPU of by making the add
    //  by himself, so now it is time 
    //  to broadcast over the GPUs of 
    //  its board.
    //*******************************
    //printf("=======================================================================\n");
    //printf("                           sending                                     \n");
    //printf("=======================================================================\n");
    
    for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
        myngpu    = gnode[cmplxid][MagmaMaxGPUs];
        masterdev = gnode[cmplxid][MagmaMaxGPUs+1];
        for( magma_int_t idev = 0; idev < myngpu; ++idev ) {
            dev         = gnode[cmplxid][idev];
            mycolsize   = gpuisactive[dev];
            if(mycolsize>0){
                // I am an active GPU send my portion local 
                // to all active gpu of my cmplex and global to the 
                // active master of the other real and they should 
                // send it out to their actives slaves.
                magma_setdevice( dev );        
                //==============================================
                // sending to the master of the active real
                //==============================================
                //printf     ("\n\n**************GPU %d\n ", dev);
                //printf     ("             GPU %d sending to cmplx masters\n", dev);
                for( magma_int_t k = 0; k < nbcmplx; ++k ) {
                    if(k!=cmplxid){
                        gmaster = gnode[k][MagmaMaxGPUs+1];
                        if(gmaster!=-1){ //real is active
                            //printf     ("                    device %d from cmplx %d is sending to master %d on cmplx %d block of size %d event %d\n", dev, cmplxid, gmaster, k, mycolsize, redevents[dev][gmaster*ngpu+dev]);
                            magma_queue_wait_event(queues[ dev ][ gmaster ], redevents[dev][dev*ngpu+dev]);
                            magma_scopymatrix_async(
                                mycolsize, n,
                                &dwork[dev    ][maxgsize*dev], mycolsize,
                                &dwork[gmaster][maxgsize*dev], mycolsize, queues[dev][gmaster] );
                            magma_event_record(redevents[dev][gmaster*ngpu+dev], queues[dev][gmaster]);
                        }
                    }
                }
                //==============================================
                //
                //==============================================
                // sending to the active GPUs of my real
                //==============================================
                //printf     ("              GPU %d sending internal\n", dev);                
                for( magma_int_t l = 0; l < myngpu; ++l ) {
                    lcdev         = gnode[cmplxid][l];
                    lccolsize     = gpuisactive[lcdev];
                    if((lcdev!=dev)&&(lccolsize>0)){
                        //printf     ("                    device %d from cmplx %d is sending internal to dev %d block of size %d event %d\n", dev, cmplxid, lcdev, mycolsize, redevents[dev][lcdev*ngpu+dev]);
                        magma_queue_wait_event(queues[ dev ][ lcdev ], redevents[dev][dev*ngpu+dev]);
                        magma_scopymatrix_async(
                            mycolsize, n,
                            &dwork[dev  ][maxgsize*dev], mycolsize,
                            &dwork[lcdev][maxgsize*dev], mycolsize, queues[dev][lcdev] );
                        magma_event_record(redevents[dev][lcdev*ngpu+dev], queues[dev][lcdev]);
                    }
                }
                //==============================================
            }// end if mycolsize>0
        }// for idev
    }// for cmplxid


    //printf("=======================================================================\n");
    //printf("                master wait and resend internally                      \n");
    //printf("=======================================================================\n");
    
    for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
        myngpu    = gnode[cmplxid][MagmaMaxGPUs];
        masterdev = gnode[cmplxid][MagmaMaxGPUs+1];
        //==============================================
        // if I am active master so wait receiving contribution
        // of the GPUs of other real and send it locally
        //==============================================
        if(masterdev != -1){
            mycolsize   = gpuisactive[masterdev];
            magma_setdevice( masterdev );
            //printf("              GPU %d distributing internal\n", masterdev);
            for( magma_int_t k = 0; k < nbcmplx; ++k ) {
                if(k!=cmplxid){
                    gngpu   = gnode[k][MagmaMaxGPUs];
                    for( magma_int_t g = 0; g < gngpu; ++g ) {
                        gdev         = gnode[k][g];
                        gcolsize     = gpuisactive[gdev];
                        // check if I received from this GPU,
                        // if yes send it to my group
                        if(gcolsize>0){
                           magma_queue_wait_event(queues[ masterdev ][ gdev ], redevents[gdev][masterdev*ngpu+gdev]);
                           for( magma_int_t l = 0; l < myngpu; ++l ) {
                                lcdev         = gnode[cmplxid][l];
                                lccolsize     = gpuisactive[lcdev];
                                if((lcdev!=masterdev)&&(lccolsize>0)){
                                    //printf("                    Master %d on cmplx %d waiting on event %d is distributing internal results of %d to lcdev %d block of size %d event %d\n", masterdev, cmplxid, redevents[gdev][masterdev*ngpu+gdev], gdev, lcdev, gcolsize, redevents[masterdev][lcdev*ngpu+gdev]);
                                    magma_scopymatrix_async(
                                        gcolsize, n,
                                        &dwork[masterdev][maxgsize*gdev], gcolsize,
                                        &dwork[lcdev    ][maxgsize*gdev], gcolsize, queues[masterdev][gdev] );
                                    magma_event_record(redevents[masterdev][lcdev*ngpu+gdev], queues[masterdev][gdev]);
                                }
                            }
                        }
                    }
                }
            }
        }// if active master 
        //==============================================
    }// for cmplxid





/*

    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_setdevice( dev );
                magma_queue_sync( queues[ dev ][ 0 ] );
        for( magma_int_t s = 0; s < ngpu; ++s ) {
                magma_queue_sync( queues[ dev ][ s ] );
        }
    }
*/
    //printf("=======================================================================\n");
    //printf("                           distributing                                \n");
    //printf("=======================================================================\n");

    magma_int_t lcblki, gbblki, gblk, ib;
    
    for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
        myngpu    = gnode[cmplxid][MagmaMaxGPUs];
        masterdev = gnode[cmplxid][MagmaMaxGPUs+1];
        for( magma_int_t idev = 0; idev < myngpu; ++idev ) {
            dev         = gnode[cmplxid][idev];
            mycolsize   = gpuisactive[dev];
            if(mycolsize>0){ // I am an active GPU
                //printf("\n\n==============GPU %d collecting\n", dev);
                magma_setdevice( dev );        
                // collect my results first as tyhere is no need to wait to   
                // receive nothing, just wait that my gemm are done.
                // in theory this should be inside the loop but cuda was not 
                // able to run it first for all gpu and on gpu>0 it was waiting
                // however it was on different stream so it should run. but maybe
                // this is because there are too many function call and this make 
                // cuda not handleit so nice. anyway it coul dbe removed when cuda
                // is able to lunch it first without wait.
                gdev = dev;
                gcolsize     = gpuisactive[gdev];
                if(gcolsize>0){
                    devperm     = (gdev-stdev+ngpu)%ngpu;
                    gblk        = (nbblk/ngpu) + (nbblk%ngpu > devperm ?  1:0 );
                    magmablasSetKernelStream( queues[ dev ][ gdev ] );
                    magma_queue_wait_event(queues[ dev ][ gdev ], redevents[gdev][dev*ngpu+gdev]);
                    //printf     ("              GPU %d stream %d doing slacpy\n", dev, queues[ dev ][ gdev ]);
                    for( magma_int_t blki = 0; blki < gblk; ++blki){
                        gbblki = (blki*ngpu + devperm)*nb - blockoffset;
                        lcblki = blki*nb;
                        ib     = nb;//min(nb, m-gbblki);
                        if(gdev==stdev){
                            lcblki = blki*nb-blockoffset;
                            if(blki==0){
                                gbblki = 0;
                                lcblki = 0;
                                ib     = nb-blockoffset;
                            }
                        }
                        ib     = min(ib, m-gbblki);
                        //printf("                    blockoffset %d nbblk %d stdev %d  receiving from gdev %d gblk %d  gcolsize %d copying blki %d of size ibxn %dx%d from work[%d] to C[%d]\n", blockoffset, nbblk, stdev, gdev, gblk, gcolsize, blki, ib, n, lcblki, gbblki);
                        magmablas_slacpy( MagmaFull, ib, n, &dwork[dev][maxgsize*gdev+lcblki], gcolsize, &dC[dev][gbblki], lddc);
                    }// end blki
                }


                
                for( magma_int_t k = 0; k < nbcmplx; ++k ) {
                    gngpu   = gnode[k][MagmaMaxGPUs];
                    for( magma_int_t g = 0; g < gngpu; ++g ) {
                        gdev         = gnode[k][g];
                        gcolsize     = gpuisactive[gdev];
                        // if gcolsize>0, ==> gpu gdev was active and so 
                        // I received from him/computed a portion of dwork, 
                        // so go over its gblk and distribute it on dC.
                        if(gdev!=dev){
                            if(gcolsize>0){
                                devperm     = (gdev-stdev+ngpu)%ngpu;
                                gblk        = (nbblk/ngpu) + (nbblk%ngpu > devperm ?  1:0 );
                                magmablasSetKernelStream( queues[ dev ][ gdev ] );
                                if(k==cmplxid){
                                    //we are on the same group so wait on event issued by gdev for me citing his id
                                    magma_queue_wait_event(queues[ dev ][ gdev ], redevents[gdev][dev*ngpu+gdev]);
                                    //printf     ("              GPU %d queue %d waiting on event %d to collecte from %d the size of gcolsize %d\n", dev, queues[ dev ][ gdev ], redevents[gdev][dev*ngpu+gdev], gdev, gcolsize);
                                }else{
                                    //we are on different group so:
                                    //if I am the master wait on the event issued by gdev for me citing his id
                                    //else  wait event issued by my master for me on the behalf of gdev
                                    //printf     ("              GPU %d queue %d waiting on event %d to collecte from %d the size of gcolsize %d\n", dev, queues[ dev ][ gdev ], redevents[masterdev][dev*ngpu+gdev], gdev, gcolsize);
                                    if(dev==masterdev)
                                        magma_queue_wait_event(queues[ dev ][ gdev ], redevents[gdev][dev*ngpu+gdev]);
                                    else
                                        magma_queue_wait_event(queues[ dev ][ gdev ], redevents[masterdev][dev*ngpu+gdev]);
                                }
                                //printf     ("              GPU %d stream %d doing slacpy\n", dev, queues[ dev ][ gdev ]);
                                for( magma_int_t blki = 0; blki < gblk; ++blki){
                                    gbblki = (blki*ngpu + devperm)*nb - blockoffset;
                                    lcblki = blki*nb;
                                    ib     = nb;//min(nb, m-gbblki);
                                    if(gdev==stdev){
                                        lcblki = blki*nb-blockoffset;
                                        if(blki==0){
                                            gbblki = 0;
                                            lcblki = 0;
                                            ib     = nb-blockoffset;
                                        }
                                    }
                                    ib     = min(ib, m-gbblki);
                                    //printf("                    blockoffset %d nbblk %d stdev %d  receiving from gdev %d gblk %d  gcolsize %d copying blki %d of size ibxn %dx%d from work[%d] to C[%d]\n", blockoffset, nbblk, stdev, gdev, gblk, gcolsize, blki, ib, n, lcblki, gbblki);
                                    magmablas_slacpy( MagmaFull, ib, n, &dwork[dev][maxgsize*gdev+lcblki], gcolsize, &dC[dev][gbblki], lddc);
                                }// end blki
                            }// en gcolsize>0 meaning gdev is active
                        } // end if gdev != dev
                    }// end loop over the g gpus of the cmplx k
                }//end loop over the real k
            }// end mycolsize>0 meaning that I am active
        }// end loop over idev of cmplxid
    }// end loop of the cmplx







    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_setdevice( dev );
        magma_device_sync();
    }

    // put back the input gpu and its input stream 
    magma_setdevice( cdev );
    magmablasSetKernelStream( cstream );

}
Exemplo n.º 16
0
int main(int argc, char **argv)
{
    TESTING_INIT();

    const float c_neg_one = MAGMA_S_NEG_ONE;
    const magma_int_t        ione      = 1;
    
    real_Double_t   atomics_perf=0, atomics_time=0;
    real_Double_t   gflops, magma_perf=0, magma_time=0, cublas_perf, cublas_time, cpu_perf, cpu_time;
    float          magma_error=0, atomics_error=0, cublas_error, work[1];
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t N, lda, ldda, sizeA, sizeX, sizeY, blocks, ldwork;
    magma_int_t incx = 1;
    magma_int_t incy = 1;
    magma_int_t nb   = 64;
    float alpha = MAGMA_S_MAKE(  1.5, -2.3 );
    float beta  = MAGMA_S_MAKE( -0.6,  0.8 );
    float *A, *X, *Y, *Yatomics, *Ycublas, *Ymagma;
    magmaFloat_ptr dA, dX, dY, dwork;
    magma_int_t status = 0;
    
    magma_opts opts;
    opts.parse_opts( argc, argv );
    
    float tol = opts.tolerance * lapackf77_slamch("E");

    printf("%% uplo = %s\n", lapack_uplo_const(opts.uplo) );
    printf("%%   N   MAGMA Gflop/s (ms)    Atomics Gflop/s      CUBLAS Gflop/s       CPU Gflop/s   MAGMA error  Atomics    CUBLAS\n");
    printf("%%=====================================================================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            N = opts.nsize[itest];
            lda    = N;
            ldda   = magma_roundup( N, opts.align );  // multiple of 32 by default
            sizeA  = N*lda;
            sizeX  = N*incx;
            sizeY  = N*incy;
            gflops = FLOPS_SSYMV( N ) / 1e9;
            
            TESTING_MALLOC_CPU( A,        float, sizeA );
            TESTING_MALLOC_CPU( X,        float, sizeX );
            TESTING_MALLOC_CPU( Y,        float, sizeY );
            TESTING_MALLOC_CPU( Yatomics, float, sizeY );
            TESTING_MALLOC_CPU( Ycublas,  float, sizeY );
            TESTING_MALLOC_CPU( Ymagma,   float, sizeY );
            
            TESTING_MALLOC_DEV( dA, float, ldda*N );
            TESTING_MALLOC_DEV( dX, float, sizeX );
            TESTING_MALLOC_DEV( dY, float, sizeY );
            
            blocks = magma_ceildiv( N, nb );
            ldwork = ldda*blocks;
            TESTING_MALLOC_DEV( dwork, float, ldwork );
            
            magmablas_slaset( MagmaFull, ldwork, 1, MAGMA_S_NAN, MAGMA_S_NAN, dwork, ldwork, opts.queue );
            magmablas_slaset( MagmaFull, ldda,   N, MAGMA_S_NAN, MAGMA_S_NAN, dA,    ldda,   opts.queue );
            
            /* Initialize the matrix */
            lapackf77_slarnv( &ione, ISEED, &sizeA, A );
            magma_smake_symmetric( N, A, lda );
            
            // should not use data from the opposite triangle -- fill with NAN to check
            magma_int_t N1 = N-1;
            if ( opts.uplo == MagmaUpper ) {
                lapackf77_slaset( "Lower", &N1, &N1, &MAGMA_S_NAN, &MAGMA_S_NAN, &A[1], &lda );
            }
            else {
                lapackf77_slaset( "Upper", &N1, &N1, &MAGMA_S_NAN, &MAGMA_S_NAN, &A[lda], &lda );
            }
            
            lapackf77_slarnv( &ione, ISEED, &sizeX, X );
            lapackf77_slarnv( &ione, ISEED, &sizeY, Y );
            
            /* =====================================================================
               Performs operation using CUBLAS
               =================================================================== */
            magma_ssetmatrix( N, N, A, lda, dA, ldda, opts.queue );
            magma_ssetvector( N, X, incx, dX, incx, opts.queue );
            magma_ssetvector( N, Y, incy, dY, incy, opts.queue );
            
            cublas_time = magma_sync_wtime( opts.queue );
            #ifdef HAVE_CUBLAS
                cublasSsymv( opts.handle, cublas_uplo_const(opts.uplo),
                             N, &alpha, dA, ldda, dX, incx, &beta, dY, incy );
            #else
                magma_ssymv( opts.uplo, N, alpha, dA, 0, ldda, dX, 0, incx, beta, dY, 0, incy, opts.queue );
            #endif
            cublas_time = magma_sync_wtime( opts.queue ) - cublas_time;
            cublas_perf = gflops / cublas_time;
            
            magma_sgetvector( N, dY, incy, Ycublas, incy, opts.queue );
            
            /* =====================================================================
               Performs operation using CUBLAS - using atomics
               =================================================================== */
            #ifdef HAVE_CUBLAS
                cublasSetAtomicsMode( opts.handle, CUBLAS_ATOMICS_ALLOWED );
                magma_ssetvector( N, Y, incy, dY, incy, opts.queue );
                
                // sync on queue doesn't work -- need device sync or use NULL stream -- bug in CUBLAS?
                atomics_time = magma_sync_wtime( NULL /*opts.queue*/ );
                cublasSsymv( opts.handle, cublas_uplo_const(opts.uplo),
                             N, &alpha, dA, ldda, dX, incx, &beta, dY, incy );
                atomics_time = magma_sync_wtime( NULL /*opts.queue*/ ) - atomics_time;
                atomics_perf = gflops / atomics_time;
                
                magma_sgetvector( N, dY, incy, Yatomics, incy, opts.queue );
                cublasSetAtomicsMode( opts.handle, CUBLAS_ATOMICS_NOT_ALLOWED );
            #endif
            
            /* =====================================================================
               Performs operation using MAGMABLAS
               =================================================================== */
            #ifdef HAVE_CUBLAS
                magma_ssetvector( N, Y, incy, dY, incy, opts.queue );
                
                magma_time = magma_sync_wtime( opts.queue );
                if ( opts.version == 1 ) {
                    magmablas_ssymv_work( opts.uplo, N, alpha, dA, ldda, dX, incx, beta, dY, incy, dwork, ldwork, opts.queue );
                }
                else {
                    // non-work interface (has added overhead)
                    magmablas_ssymv( opts.uplo, N, alpha, dA, ldda, dX, incx, beta, dY, incy, opts.queue );
                }
                magma_time = magma_sync_wtime( opts.queue ) - magma_time;
                magma_perf = gflops / magma_time;
                
                magma_sgetvector( N, dY, incy, Ymagma, incy, opts.queue );
            #endif
            
            /* =====================================================================
               Performs operation using CPU BLAS
               =================================================================== */
            cpu_time = magma_wtime();
            blasf77_ssymv( lapack_uplo_const(opts.uplo), &N, &alpha, A, &lda, X, &incx, &beta, Y, &incy );
            cpu_time = magma_wtime() - cpu_time;
            cpu_perf = gflops / cpu_time;
            
            /* =====================================================================
               Check the result
               =================================================================== */
            blasf77_saxpy( &N, &c_neg_one, Y, &incy, Ycublas, &incy );
            cublas_error = lapackf77_slange( "M", &N, &ione, Ycublas, &N, work ) / N;
            
            #ifdef HAVE_CUBLAS
                blasf77_saxpy( &N, &c_neg_one, Y, &incy, Yatomics, &incy );
                atomics_error = lapackf77_slange( "M", &N, &ione, Yatomics, &N, work ) / N;
                
                blasf77_saxpy( &N, &c_neg_one, Y, &incy, Ymagma, &incy );
                magma_error = lapackf77_slange( "M", &N, &ione, Ymagma, &N, work ) / N;
            #endif
            
            bool okay = (magma_error < tol && cublas_error < tol && atomics_error < tol);
            status += ! okay;
            printf("%5d   %7.2f (%7.2f)   %7.2f (%7.2f)   %7.2f (%7.2f)   %7.2f (%7.2f)   %8.2e   %8.2e   %8.2e   %s\n",
                   (int) N,
                   magma_perf,   1000.*magma_time,
                   atomics_perf, 1000.*atomics_time,
                   cublas_perf,  1000.*cublas_time,
                   cpu_perf,     1000.*cpu_time,
                   magma_error, cublas_error, atomics_error,
                   (okay ? "ok" : "failed"));
            
            TESTING_FREE_CPU( A );
            TESTING_FREE_CPU( X );
            TESTING_FREE_CPU( Y );
            TESTING_FREE_CPU( Ycublas  );
            TESTING_FREE_CPU( Yatomics );
            TESTING_FREE_CPU( Ymagma   );
            
            TESTING_FREE_DEV( dA );
            TESTING_FREE_DEV( dX );
            TESTING_FREE_DEV( dY );
            TESTING_FREE_DEV( dwork );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    opts.cleanup();
    TESTING_FINALIZE();
    return status;
}
Exemplo n.º 17
0
/**
    Purpose
    -------


    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangles of A is stored;
      -     = MagmaLower:  Lower triangles of A is stored.

    @param[in]
    n       INTEGER
            The order of the matrix A.  n >= 0.

    @param[in]
    nb      INTEGER
            The order of the band matrix A.  n >= nb >= 0.

    @param[in]
    Vblksiz INTEGER
            The size of the block of householder vectors applied at once.

    @param[in]
    A       (workspace) DOUBLE PRECISION array, dimension (lda, n)
            On entry the band matrix stored in the following way:

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  lda >= 2*nb.

    @param[out]
    d       DOUBLE array, dimension (n)
            The diagonal elements of the tridiagonal matrix T:
            D(i) = A(i,i).

    @param[out]
    e       DOUBLE array, dimension (n-1)
            The off-diagonal elements of the tridiagonal matrix T:
            E(i) = A(i,i+1) if UPLO = MagmaUpper, E(i) = A(i+1,i) if UPLO = MagmaLower.

    @param[out]
    V       DOUBLE PRECISION array, dimension (BLKCNT, LDV, VBLKSIZ)
            On exit it contains the blocks of householder reflectors
            BLKCNT is the number of block and it is returned by the funtion MAGMA_BULGE_GET_BLKCNT.

    @param[in]
    ldv     INTEGER
            The leading dimension of V.
            LDV > nb + VBLKSIZ + 1

    @param[out]
    TAU     DOUBLE PRECISION dimension(BLKCNT, VBLKSIZ)
            ???

    @param[in]
    wantz   INTEGER
            if COMPT = 0 T is not computed
            if COMPT = 1 T is computed

    @param[out]
    T       DOUBLE PRECISION dimension(LDT *)
            if COMPT = 1 on exit contains the matrices T needed for Q2
            if COMPT = 0 T is not referenced

    @param[in]
    ldt     INTEGER
            The leading dimension of T.
            LDT > Vblksiz

    @ingroup magma_dsyev_2stage
    ********************************************************************/
extern "C" magma_int_t
magma_dsytrd_sb2st(
    magma_uplo_t uplo, magma_int_t n, magma_int_t nb, magma_int_t Vblksiz,
    double *A, magma_int_t lda, double *d, double *e,
    double *V, magma_int_t ldv, double *TAU,
    magma_int_t wantz, double *T, magma_int_t ldt)
{
    #ifdef ENABLE_TIMER
    real_Double_t timeblg=0.0;
    #endif

    magma_int_t parallel_threads = magma_get_parallel_numthreads();
    magma_int_t mklth   = magma_get_lapack_numthreads();
    magma_int_t ompth   = magma_get_omp_numthreads();

    //magma_set_omp_numthreads(1);
    //magma_set_lapack_numthreads(1);

    magma_int_t blkcnt, sizTAU2, sizT2, sizV2;
    magma_dbulge_getstg2size(n, nb, wantz, 
                          Vblksiz, ldv, ldt, &blkcnt, 
                          &sizTAU2, &sizT2, &sizV2);
    memset(T,   0, sizT2*sizeof(double));
    memset(TAU, 0, sizTAU2*sizeof(double));
    memset(V,   0, sizV2*sizeof(double));

    magma_int_t INgrsiz=1;
    magma_int_t nbtiles = magma_ceildiv(n, nb);
    volatile magma_int_t* prog;
    magma_malloc_cpu((void**) &prog, (2*nbtiles+parallel_threads+10)*sizeof(magma_int_t));
    memset((void *) prog, 0, (2*nbtiles+parallel_threads+10)*sizeof(magma_int_t));

    magma_dbulge_id_data* arg;
    magma_malloc_cpu((void**) &arg, parallel_threads*sizeof(magma_dbulge_id_data));

    pthread_t* thread_id;
    magma_malloc_cpu((void**) &thread_id, parallel_threads*sizeof(pthread_t));
    pthread_attr_t thread_attr;

    magma_dbulge_data data_bulge;
    magma_dbulge_data_init(&data_bulge, parallel_threads, n, nb, nbtiles, INgrsiz, Vblksiz, wantz,
                                 A, lda, V, ldv, TAU, T, ldt, prog);

    // Set one thread per core
    pthread_attr_init(&thread_attr);
    pthread_attr_setscope(&thread_attr, PTHREAD_SCOPE_SYSTEM);
    pthread_setconcurrency(parallel_threads);

    //timing
    #ifdef ENABLE_TIMER
    timeblg = magma_wtime();
    #endif

    // Launch threads
    for (magma_int_t thread = 1; thread < parallel_threads; thread++) {
        magma_dbulge_id_data_init(&(arg[thread]), thread, &data_bulge);
        pthread_create(&thread_id[thread], &thread_attr, magma_dsytrd_sb2st_parallel_section, &arg[thread]);
    }
    magma_dbulge_id_data_init(&(arg[0]), 0, &data_bulge);
    magma_dsytrd_sb2st_parallel_section(&arg[0]);

    // Wait for completion
    for (magma_int_t thread = 1; thread < parallel_threads; thread++) {
        void *exitcodep;
        pthread_join(thread_id[thread], &exitcodep);
    }

    // timing
    #ifdef ENABLE_TIMER
    timeblg = magma_wtime()-timeblg;
    printf("  time BULGE+T = %f\n", timeblg);
    #endif

    magma_free_cpu(thread_id);
    magma_free_cpu(arg);
    magma_free_cpu((void *) prog);
    magma_dbulge_data_destroy(&data_bulge);

    magma_set_omp_numthreads(ompth);
    magma_set_lapack_numthreads(mklth);
    /*================================================
     *  store resulting diag and lower diag d and e
     *  note that d and e are always real
     *================================================*/

    /* Make diagonal and superdiagonal elements real,
     * storing them in d and e
     */
    /* In real case, the off diagonal element are
     * not necessary real. we have to make off-diagonal
     * elements real and copy them to e.
     * When using HouseHolder elimination,
     * the DLARFG give us a real as output so, all the
     * diagonal/off-diagonal element except the last one are already
     * real and thus we need only to take the abs of the last
     * one.
     *  */

#ifdef COMPLEX
    if (uplo == MagmaLower) {
        for (magma_int_t i=0; i < n-1; i++) {
            d[i] = MAGMA_D_REAL( A[i*lda  ] );
            e[i] = MAGMA_D_REAL( A[i*lda+1] );
        }
        d[n-1] = MAGMA_D_REAL(A[(n-1)*lda]);
    } else { /* MagmaUpper not tested yet */
        for (magma_int_t i=0; i < n-1; i++) {
            d[i] = MAGMA_D_REAL( A[i*lda+nb]   );
            e[i] = MAGMA_D_REAL( A[i*lda+nb-1] );
        }
        d[n-1] = MAGMA_D_REAL(A[(n-1)*lda+nb]);
    } /* end MagmaUpper */
#else
    if ( uplo == MagmaLower ) {
        for (magma_int_t i=0; i < n-1; i++) {
            d[i] = A[i*lda];   // diag
            e[i] = A[i*lda+1]; // lower diag
        }
        d[n-1] = A[(n-1)*lda];
    } else {
        for (magma_int_t i=0; i < n-1; i++) {
            d[i] = A[i*lda+nb];   // diag
            e[i] = A[i*lda+nb-1]; // lower diag
        }
        d[n-1] = A[(n-1)*lda+nb];
    }
#endif
    return MAGMA_SUCCESS;
}
Exemplo n.º 18
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing dgeqrf_mgpu
*/
int main( int argc, char** argv )
{
    TESTING_INIT();

    real_Double_t    gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0;
    double           error, work[1];
    double c_neg_one = MAGMA_D_NEG_ONE;
    double *h_A, *h_R, *tau, *h_work, tmp[1];
    magmaDouble_ptr d_lA[ MagmaMaxGPUs ];
    magma_int_t M, N, n2, lda, ldda, n_local, ngpu;
    magma_int_t info, min_mn, nb, lhwork;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1}, ISEED2[4];
    
    magma_opts opts;
    opts.parse_opts( argc, argv );
    opts.ngpu = abs( opts.ngpu );  // always uses multi-GPU code
    opts.lapack |= (opts.check == 2);  // check (-c2) implies lapack (-l)
 
    magma_int_t status = 0;
    double eps = lapackf77_dlamch("E");
    double tol = opts.tolerance * lapackf77_dlamch("E");

    printf("%% ngpu %d\n", (int) opts.ngpu );
    if ( opts.check == 1 ) {
        printf("%%   M     N   CPU Gflop/s (sec)   GPU Gflop/s (sec)   ||R-Q'A||_1 / (M*||A||_1) ||I-Q'Q||_1 / M\n");
        printf("%%===============================================================================================\n");
    }
    else {
        printf("%%   M     N   CPU Gflop/s (sec)   GPU Gflop/s (sec)   ||R||_F /(M*||A||_F)\n");
        printf("%%=========================================================================\n");
    }
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            M = opts.msize[itest];
            N = opts.nsize[itest];
            min_mn = min(M, N);
            lda    = M;
            n2     = lda*N;
            ldda   = magma_roundup( M, opts.align );  // multiple of 32 by default
            nb     = magma_get_dgeqrf_nb( M, N );
            gflops = FLOPS_DGEQRF( M, N ) / 1e9;
            
            // ngpu must be at least the number of blocks
            ngpu = min( opts.ngpu, magma_ceildiv(N,nb) );
            if ( ngpu < opts.ngpu ) {
                printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu );
            }
            
            // query for workspace size
            lhwork = -1;
            lapackf77_dgeqrf( &M, &N, NULL, &M, NULL, tmp, &lhwork, &info );
            lhwork = (magma_int_t) MAGMA_D_REAL( tmp[0] );
            
            // Allocate host memory for the matrix
            TESTING_MALLOC_CPU( tau,    double, min_mn );
            TESTING_MALLOC_CPU( h_A,    double, n2     );
            TESTING_MALLOC_CPU( h_work, double, lhwork );
            
            TESTING_MALLOC_PIN( h_R,    double, n2     );
            
            // Allocate device memory
            for( int dev = 0; dev < ngpu; dev++ ) {
                n_local = ((N/nb)/ngpu)*nb;
                if (dev < (N/nb) % ngpu)
                    n_local += nb;
                else if (dev == (N/nb) % ngpu)
                    n_local += N % nb;
                magma_setdevice( dev );
                TESTING_MALLOC_DEV( d_lA[dev], double, ldda*n_local );
            }
            
            /* Initialize the matrix */
            for( int j=0; j < 4; j++ )
                ISEED2[j] = ISEED[j]; // save seeds
            lapackf77_dlarnv( &ione, ISEED, &n2, h_A );
            lapackf77_dlacpy( MagmaFullStr, &M, &N, h_A, &lda, h_R, &lda );
            
            /* =====================================================================
               Performs operation using LAPACK
               =================================================================== */
            if ( opts.lapack ) {
                double *tau2;
                TESTING_MALLOC_CPU( tau2, double, min_mn );
                cpu_time = magma_wtime();
                lapackf77_dgeqrf( &M, &N, h_A, &lda, tau2, h_work, &lhwork, &info );
                cpu_time = magma_wtime() - cpu_time;
                cpu_perf = gflops / cpu_time;
                if (info != 0) {
                    printf("lapack_dgeqrf returned error %d: %s.\n",
                           (int) info, magma_strerror( info ));
                }
                TESTING_FREE_CPU( tau2 );
            }
            
            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            magma_dsetmatrix_1D_col_bcyclic( M, N, h_R, lda, d_lA, ldda, ngpu, nb );

            gpu_time = magma_wtime();
            magma_dgeqrf2_mgpu( ngpu, M, N, d_lA, ldda, tau, &info );
            gpu_time = magma_wtime() - gpu_time;
            gpu_perf = gflops / gpu_time;
            if (info != 0) {
                printf("magma_dgeqrf2 returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            }
            
            magma_dgetmatrix_1D_col_bcyclic( M, N, d_lA, ldda, h_R, lda, ngpu, nb );
            
            if ( opts.check == 1 && M >= N ) {
                /* =====================================================================
                   Check the result -- dqrt02 requires M >= N
                   =================================================================== */
                magma_int_t lwork = n2+N;
                double *h_W1, *h_W2, *h_W3;
                double *h_RW, results[2];
            
                TESTING_MALLOC_CPU( h_W1, double, n2    ); // Q
                TESTING_MALLOC_CPU( h_W2, double, n2    ); // R
                TESTING_MALLOC_CPU( h_W3, double, lwork ); // WORK
                TESTING_MALLOC_CPU( h_RW, double, M );  // RWORK
                lapackf77_dlarnv( &ione, ISEED2, &n2, h_A );
                lapackf77_dqrt02( &M, &N, &min_mn, h_A, h_R, h_W1, h_W2, &lda, tau, h_W3, &lwork,
                                  h_RW, results );
                results[0] *= eps;
                results[1] *= eps;

                if ( opts.lapack ) {
                    printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)   %8.2e                 %8.2e",
                           (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, results[0], results[1] );
                } else {
                    printf("%5d %5d     ---   (  ---  )   %7.2f (%7.2f)    %8.2e                 %8.2e",
                           (int) M, (int) N, gpu_perf, gpu_time, results[0], results[1] );
                }
                // todo also check results[1] < tol?
                printf("   %s\n", (results[0] < tol ? "ok" : "failed"));
                status += ! (results[0] < tol);

                TESTING_FREE_CPU( h_W1 );
                TESTING_FREE_CPU( h_W2 );
                TESTING_FREE_CPU( h_W3 );
                TESTING_FREE_CPU( h_RW );
            }
            else if ( opts.check == 2 ) {
                /* =====================================================================
                   Check the result compared to LAPACK
                   =================================================================== */
                error = lapackf77_dlange("f", &M, &N, h_A, &lda, work );
                blasf77_daxpy( &n2, &c_neg_one, h_A, &ione, h_R, &ione );
                error = lapackf77_dlange("f", &M, &N, h_R, &lda, work ) / (min_mn*error);
                
                printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)   %8.2e   %s\n",
                       (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time,
                       error, (error < tol ? "ok" : "failed"));
                status += ! (error < tol);
            }
            else {
                if ( opts.lapack ) {
                    printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)   ---",
                           (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time );
                } else {
                    printf("%5d %5d     ---   (  ---  )   %7.2f (%7.2f)     ---",
                           (int) M, (int) N, gpu_perf, gpu_time);
                }
                printf("%s\n", (opts.check != 0 ? "  (error check only for M >= N)" : ""));
            }
            
            TESTING_FREE_CPU( tau    );
            TESTING_FREE_CPU( h_A    );
            TESTING_FREE_CPU( h_work );
            
            TESTING_FREE_PIN( h_R    );
            
            for( int dev=0; dev < ngpu; dev++ ) {
                magma_setdevice( dev );
                TESTING_FREE_DEV( d_lA[dev] );
            }
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }
 
    opts.cleanup();
    TESTING_FINALIZE();
    return status;
}
Exemplo n.º 19
0
void magmablas_ssymm_mgpu_com(
    magma_side_t side, magma_uplo_t uplo, magma_int_t m, magma_int_t n,
    float alpha,
    float *dA[],    magma_int_t ldda,  magma_int_t offset,
    float *dB[],    magma_int_t lddb,
    float beta,     float *dC[], magma_int_t lddc,
    float *dwork[], magma_int_t dworksiz,
    float *C,       magma_int_t ldc,
    float *work[],  magma_int_t worksiz,
    magma_int_t ngpu, magma_int_t nb, 
    magma_queue_t streams[][20], magma_int_t nstream, 
    magma_event_t redevents[][MagmaMaxGPUs*MagmaMaxGPUs+10], magma_int_t nbevents, 
    magma_int_t gnode[MagmaMaxGPUs][MagmaMaxGPUs+2], magma_int_t nbcmplx )
{
    #define dA(dev, i, j) (dA[dev] + (i) + (j)*ldda)
    #define dB(dev, i, j) (dB[dev] + (i) + (j)*lddb)
    #define dC(dev, i, j) (dC[dev] + (i) + (j)*lddc)
    #define dwork(dev, i, j) (dwork[dev] + (i) + (j)*lddwork)
    #define C(i, j) (C + (i) + (j)*ldc)
    //printf("####################################################\n");
    //printf("                      start ssymm                   \n");
    //printf("####################################################\n");
   
    if ( side != MagmaLeft || uplo != MagmaLower ) {
        fprintf( stderr, "%s: only Left Lower implemented\n", __func__ );
    }
    
    assert( ldda >= m );
    assert( lddb >= m );
    assert( lddc >= m );
    assert( nstream >= ngpu );
    assert( nbevents >= ngpu*ngpu );
   
    
    float c_one  = MAGMA_S_ONE;

    float *dwork1[MagmaMaxGPUs];
    float *dwork2[MagmaMaxGPUs];


    magma_int_t maxgsize    = n*m;
    magma_int_t lddwork = lddc;
    magma_int_t ldwork  = m;
    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        dwork1[dev] = dwork[dev];  // size of dwork1 is n*lddwork
        dwork2[dev] = dwork[dev]+n*lddwork;  // size of dwork2 is maxgsize*ngpu
    }
    assert( dworksiz >= (n*lddwork+maxgsize*ngpu) );
    assert( worksiz  >= (n*ldwork) );

        
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_t cstream;
    magmablasGetKernelStream(&cstream);


    magma_int_t dev, devperm, myblk, mycolsize, myblkoffst;
    magma_int_t gmaster;
    magma_int_t masterdev, lcdev, lccolsize, myngpu;

    magma_int_t stdev       = (offset/nb)%ngpu;  
    magma_int_t blockoffset = offset % nb;  
    magma_int_t fstblksiz   = 0;
    if(blockoffset>0){
        fstblksiz   = min(m, (nb - blockoffset));
    }
    //magma_int_t nbblk       = magma_ceildiv(m, nb);
    magma_int_t nbblk       = magma_ceildiv((m+blockoffset), nb);
    magma_int_t remm        = m- fstblksiz;
    magma_int_t nbblkoffst  = offset/nb;


    magma_int_t nblstblks = -1;
    magma_int_t devlstblk = -1;
    magma_int_t lstblksiz = remm%nb;
    if(lstblksiz>0){
        nblstblks = nbblk%ngpu;
        devlstblk = (nblstblks-1+ngpu)%ngpu;
    }

    magma_int_t nbcmplxactive =  0;
    magma_int_t cmplxisactive[MagmaMaxGPUs];
    magma_int_t gpuisactive[MagmaMaxGPUs];
    memset(gpuisactive, 0, MagmaMaxGPUs*sizeof(magma_int_t));
    memset(cmplxisactive, 0, MagmaMaxGPUs*sizeof(magma_int_t));


    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_setdevice( dev );
        magmablasSetKernelStream( streams[ dev ][ 0 ] );
        cudaMemset(dwork(dev,0,0), 0, (lddwork)*(n)*sizeof(float) );
        // put all dC on all dev to 0 except the one which
        // hold i==0 because this one has to multiply by beta.
        if(dev!=stdev){
           cudaMemset(dC(dev,0,0), 0, (lddc)*(n)*sizeof(float) );
        }
    }

    magma_int_t newoffset = offset;
    // 1. symmetrize
    if(blockoffset>0){
        newoffset  = offset+fstblksiz; // newoffset is adjusted over nb
        magma_int_t myblkoffst = (nbblkoffst/ngpu)+(nbblkoffst%ngpu > stdev?1:0);
        //printf("STDEV %d  voici offset %d remm %d   myblockoffset %d    siz %d \n", stdev, offset, remm, myblkoffst, fstblksiz);
        magma_setdevice( stdev );
        magmablasSetKernelStream( streams[ stdev ][ 0 ] );
        magmablas_ssymmetrize_tiles(  MagmaLower,  fstblksiz,  dA(stdev, offset, myblkoffst*nb+blockoffset),  ldda,  1,  ngpu*nb,  nb  );         
    }

    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_int_t newstdev      = (newoffset/nb)%ngpu;
        magma_int_t nbblk = remm/nb; // number of block of size nb. if m%nb>0 then a last block exist and is of size ib=m%nb
        magma_int_t myblk = (nbblk/ngpu) + (nbblk%ngpu > ((dev-newstdev+ngpu)%ngpu) ?  1:0 );
        magma_int_t devperm   = (dev-newstdev+ngpu)%ngpu;
        magma_int_t nbblkoffst = newoffset/nb;
        magma_int_t myblkoffst = (nbblkoffst/ngpu)+(nbblkoffst%ngpu > dev?1:0);
        //printf("dev %d  devperm %d   newoffset %d  rowoff %d    coloff %d    myblk %d  \n", dev, devperm, newoffset, newoffset+devperm*nb, myblkoffst*nb, myblk);
        magma_setdevice( dev );
        magmablasSetKernelStream( streams[ dev ][ 0 ] );
        magmablas_ssymmetrize_tiles(  MagmaLower,  nb,  dA(dev, newoffset+devperm*nb, myblkoffst*nb),  ldda,  myblk,  ngpu*nb,  nb  );
        if(remm%nb>0){
            magma_int_t nblstblks = (nbblk+1)%ngpu;
            magma_int_t devlstblk = (nblstblks-1+ngpu)%ngpu;
            //printf("==> siz %d devperm %d,    devlstblk %d,    newoffset+nbblk*nb %d,   myblkoffst*nb+ myblk*nb %d\n", remm % nb, devperm, devlstblk, newoffset+nbblk*nb, myblkoffst*nb+ myblk*nb);
            if(devperm==devlstblk)
                magmablas_ssymmetrize(  MagmaLower,  remm % nb,  dA(dev, newoffset+nbblk*nb, myblkoffst*nb+ myblk*nb),  ldda );  // last partial tile
        }
    }


    

/*
    magma_int_t siz = m+offset;
    float *R;
    magma_smalloc_cpu( &R, siz*siz );
    // collecte back A
    magmablas_sgetmatrix_1D_bcyclic( siz, siz, dA, ldda, R, siz, ngpu, nb );
    magma_setdevice( 0 );
    magmablasSetKernelStream( streams[ dev ][ 0 ] );
    //magma_sgetmatrix( siz, siz, dA[0], ldda, R, siz );
    FILE *trace_file;
    trace_file = fopen("AJETE/Aafter", "w");
    for (int j = 0; j < siz ; j++) 
          for (int i = 0; i < siz ; i++) 
                         fprintf(trace_file, "%10d%10d%40.30e\n", i+1, j+1, R[j*siz+i]);
    fclose(trace_file);
return;
*/
    

    // ROW GEMM transpose a row and make a gemm with a block
    // if only 1 GPU used the ROW GEMM is integrated with the 
    // COL GEMM (better accuracy observed) and better perf
    if(ngpu>1){
        for( magma_int_t i = fstblksiz; i < m; i += nb ) {
            magma_int_t ib     = min( nb, m-i );      // block size
            magma_int_t ioff   = i + offset;          // start global index in parent matrix
            //magma_int_t dev    = (ioff / nb) % ngpu;
            magma_int_t nbblkoffst = offset/nb;
            magma_int_t nbblk      = magma_ceildiv(i, nb);
            for( magma_int_t dev = 0; dev < ngpu; ++dev ) {


                magma_int_t myblk = (nbblk/ngpu) + (nbblk%ngpu > ((dev-stdev+ngpu)%ngpu) ?  1:0 );
                magma_int_t myblkoffst = (nbblkoffst/ngpu)+(nbblkoffst%ngpu > dev?1:0);

                magma_int_t myrowsize = myblk * nb;
                magma_int_t coloffset = myblkoffst*nb;
                if(dev==stdev) {
                    myrowsize = myrowsize -blockoffset;
                    coloffset = myblkoffst*nb+blockoffset;
                }
                //printf("ROW GEMM: voici i %d   ib %d    ioff %d   nbblkoffst %d stdev %d  dev %d myblk %d  myblkoffset %d  coloffset %d  rowsize %d\n", i, ib, ioff, nbblkoffst, stdev, dev, myblk, myblkoffst, coloffset, myrowsize);
                if(myrowsize>0){
                    magma_setdevice( dev );
                    magmablasSetKernelStream( streams[ dev ][ 1 ] );    
                    magma_sgemm( MagmaConjTrans, MagmaNoTrans, myrowsize, n, ib,
                                 alpha, dA(dev,ioff,coloffset), ldda,
                                        dB(dev,i,0),    lddb,
                                 c_one, dwork(dev,0,0), lddwork );
                }
            }
        }
        for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
            magma_setdevice( dev );
            magma_event_record(redevents[dev][1], streams[dev][1]);
        }
    }
    

    // COL GEMM
    // blockoffset is offset within first block; for subsequent blocks it is 0
    if(blockoffset>0){
        magma_int_t ib     = min( nb-blockoffset, m );  // block size
        magma_int_t iblock = (offset / nb) / ngpu;          // local block id
        magma_int_t di     = iblock*nb+blockoffset;       // local index in parent matrix
        magma_setdevice( stdev );
        magmablasSetKernelStream( streams[ stdev ][ 0 ] );        
        //printf("DEV %d COL GEMM first   ioff %d  di %d   m %d   n %d   ib %d \n", stdev, offset, di, m, n, ib);
        magma_sgemm( MagmaNoTrans, MagmaNoTrans, m, n, ib,
                        alpha, dA(stdev,offset,di), ldda,
                               dB(stdev,0,0),     lddb,
                        beta,  dC(stdev,0,0),     lddc );
    }
   


    // COL GEMM
    for( magma_int_t i = fstblksiz; i < m; i += nb ) {
        magma_int_t ib     = min( nb, m-i );      // block size
        magma_int_t ioff   = i + offset;          // start global index in parent matrix
        magma_int_t iblock = (ioff / nb) / ngpu;  // local block id
        magma_int_t dev    = (ioff / nb) % ngpu;
        magma_int_t di     = iblock*nb;           // local index in parent matrix
        
        //printf("DEV %d COL GEMM i %d      ioff %d  di %d m-i %d    n %d   ib %d \n", dev, i, ioff, di, m-i, n, ib);
        
        magma_setdevice( dev );
        magmablasSetKernelStream( streams[ dev ][ 0 ] );
        if(i==0){
           magma_sgemm( MagmaNoTrans, MagmaNoTrans, m-i, n, ib,
                        alpha, dA(dev,ioff,di), ldda,
                               dB(dev,i,0),     lddb,
                        beta,  dC(dev,i,0),     lddc );
        }else{
           magma_sgemm( MagmaNoTrans, MagmaNoTrans, m-i, n, ib,
                        alpha, dA(dev,ioff,di), ldda,
                               dB(dev,i,0),        lddb,
                        c_one, dC(dev,i,0),     lddc );
        }
        magma_event_record(redevents[dev][0], streams[dev][0]);
        // if only 1 GPU is used, do the ROW GEMM
        if(ngpu==1){
            // NOTE THAT because the COL gemm write dC below the diagonal (i) 
            // and the ROW GEMM write dC from 0 to diag-1, so they could 
            // run in parallel on different streams. 
            // 
            // NO NO NO because
            // it might happen that col finished i and strated i+1 while row still at i    
            // magmablasSetKernelStream( streams[ dev ][ 0 ] );
            magma_sgemm( MagmaConjTrans, MagmaNoTrans, i, n, ib,
                         alpha, dA(dev,ioff,offset), ldda,
                                dB(dev,i,0),    lddb,
                         c_one, dC(dev,0,0),    lddc );
        }
    }


    
    if(ngpu>1){
        for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
            magma_int_t nbblk    = magma_ceildiv((m+blockoffset), nb);
            magma_int_t nbblkrow = nbblk-1; 
            magma_int_t devperm  = (dev-stdev+ngpu)%ngpu;
            magma_int_t myblk = (nbblkrow/ngpu) + (nbblkrow%ngpu > devperm ?  1:0 );
            magma_int_t myrowsize = myblk * nb;
             if(dev==stdev) {
                myrowsize = myrowsize - blockoffset;
            }
      
            //printf("blockoffset %d nbblkrow %d devperm %d  DEV %d RECEIVING myblk %d  myrowsize %d\n", blockoffset, nbblkrow, devperm, dev, myblk, myrowsize);
            if(myrowsize>0){
                magma_setdevice( dev );
                magmablasSetKernelStream( streams[ dev ][ 0 ] );
                magma_queue_wait_event(streams[ dev ][ 0 ], redevents[dev][1]);
                //magma_queue_sync( streams[ dev ][ 1 ] );
                // for each dev add the computed ROW block each on its placment with dC
                for( magma_int_t blki = 0; blki < myblk; ++blki){
                    magma_int_t gbblki = (blki*ngpu + devperm)*nb - blockoffset;
                    magma_int_t lcblki = blki*nb;
                    magma_int_t ib     = nb;// min(nb, m-gbblki);
                    if(dev==stdev){
                        lcblki = blki*nb-blockoffset;
                        if(blki==0){
                            gbblki = 0;
                            lcblki = 0;
                            ib     = nb-blockoffset;
                        }
                    }
                    magmablas_sgeadd(ib, n, c_one, 
                                    &dwork[dev][lcblki], lddwork, 
                                    &dC[dev][gbblki]   , lddc   );
                }
                magma_event_record(redevents[dev][0], streams[dev][0]);                
            }
        }
    }




    // ===========================================================
    //             COMMUNICATION ALL_REDUCE_SUM 
    // ===========================================================
    if(ngpu==1){
        return;
    }
    // INITIALIZE COMM
    for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
        masterdev     = -1;
        gnode[cmplxid][MagmaMaxGPUs+1] = -1;
        myngpu = gnode[cmplxid][MagmaMaxGPUs];
        for( magma_int_t idev = 0; idev < myngpu; ++idev ) {
            dev         = gnode[cmplxid][idev];
            devperm     = (dev-stdev+ngpu)%ngpu;
            myblk       = (nbblk/ngpu) + (nbblk%ngpu > devperm ?  1:0 );
            mycolsize   = myblk*nb;
            myblkoffst  = nb*((nbblkoffst/ngpu)+(nbblkoffst%ngpu > dev?1:0));            
            if(dev==stdev){
                mycolsize  -=  blockoffset;
                myblkoffst +=  blockoffset;     // local index in parent matrix
            }
            if((devperm==devlstblk)&&(lstblksiz>0)){
                mycolsize -=  (nb-(remm%nb));
            }
            mycolsize = min(mycolsize, m);
            if(mycolsize>0){
                gpuisactive[dev] = mycolsize;
                if(masterdev==-1) {
                    masterdev     = dev;
                    nbcmplxactive = nbcmplxactive +1;
                    cmplxisactive[cmplxid] = 1;
                    gnode[cmplxid][MagmaMaxGPUs+1] = masterdev;
                }
            }
        }
    }
/*
    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_setdevice( dev );
        magma_device_sync();
    }
*/
    //*******************************
    //  each GPU send its result
    //  to its master. The master make
    //  the addition and then send to 
    //  to the masters of other real
    //  and receive from the masters of 
    //  other real make the addition 
    //  and broadcast locally the final 
    //  result.
    //*******************************
    //printf("=======================================================================\n");
    //printf("                     sending to my master                             \n");
    //printf("=======================================================================\n");
    for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
        myngpu    = gnode[cmplxid][MagmaMaxGPUs];
        masterdev = gnode[cmplxid][MagmaMaxGPUs+1];
        //check if real is active
        if(masterdev!=-1){ 
            for( magma_int_t idev = 0; idev < myngpu; ++idev ) {
                dev         = gnode[cmplxid][idev];
                mycolsize   = gpuisactive[dev];
                if(mycolsize>0){
                    // I am an active GPU. if I am not the master, then send my result to my master.
                    // store result on dwork[masterdev][dev*maxgsize]
                    if(dev!=masterdev){
                        magma_setdevice( dev );        
                        //printf("             GPU %d sending to my master %d\n", dev, masterdev);
                        // wait the geadd of my ROW and COL GEMM is done
                        magma_queue_wait_event(streams[ dev ][ 0 ], redevents[dev][0]);
                        // sending to the master of my real
                        magma_scopymatrix_async(
                            m, n,
                            &dC[dev][0], lddc,
                            &dwork2[masterdev][maxgsize*dev], m, streams[dev][0] );
                        magma_event_record(redevents[dev][masterdev], streams[dev][0]);
                    } // end I am not the masterdev
                }// end if mycolsize>0
            }// for idev
        }// end of if masterdev!=-1 maening real is active
    }// for cmplxid
/*
    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_setdevice( dev );
        magma_device_sync();
    }
*/

    //printf("=======================================================================\n");
    //printf(" each master do addition of local result and broadcast to other masters \n");
    //printf("=======================================================================\n");
    for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
        myngpu    = gnode[cmplxid][MagmaMaxGPUs];
        masterdev = gnode[cmplxid][MagmaMaxGPUs+1];
        //check if real is active
        if(masterdev!=-1){ 
            magma_setdevice( masterdev ); 
            // addition is done on stream 0 sequentially
            magmablasSetKernelStream( streams[ masterdev ][ 0 ] );
            // wait the geadd of my ROW and COL GEMM is done
            magma_queue_wait_event(streams[ masterdev ][ 0 ], redevents[masterdev][0]);
            // ========================================
            //     local addition
            // ========================================
            for( magma_int_t l = 0; l < myngpu; ++l ) {
                lcdev         = gnode[cmplxid][l];
                lccolsize     = gpuisactive[lcdev];
                if((lcdev!=masterdev)&&(lccolsize>0)){
                    //printf("             master %d receiving from %d and adding \n", masterdev, lcdev);
                    // this is an active GPU of my real. 
                    // wait I received what he send it to me and then do addition.
                    magma_queue_wait_event(streams[ masterdev ][ 0 ], redevents[lcdev][masterdev]);
                    magmablas_sgeadd(m, n, c_one, 
                                    &dwork2[masterdev][maxgsize*lcdev], m, 
                                    &dC[masterdev][0]   , lddc   );
                }
            }// for l=1:myngpu
            // because addition is done sequentially on stream 0, 
            // I have to record this to be able to synch using it 
            magma_event_record(redevents[masterdev][masterdev], streams[masterdev][0]);
            // ========================================
            //
            // ========================================
            //      send to other masters
            // ========================================
            for( magma_int_t k = 0; k < nbcmplx; ++k ) {
                if(k!=cmplxid){
                    gmaster = gnode[k][MagmaMaxGPUs+1];
                    if(gmaster!=-1){ //real is active
                         //Master has to  wait until finish the local addition then send using gmaster stream.
                         //use stream 0 to make it sequential or stream gmaster to make it parallel.
                         //Now both re the same.
                        //printf("             master %d from cmplx %d sending to other master %d on cmplx %d \n", masterdev, cmplxid, gmaster, k);
                        magma_queue_wait_event(streams[ masterdev ][ gmaster ], redevents[masterdev][masterdev]);
                        magma_scopymatrix_async(
                            m, n,
                            &dC[masterdev][0], lddc,
                            &dwork2[gmaster][maxgsize*masterdev], m, streams[masterdev][gmaster] );
                        magma_event_record(redevents[masterdev][gmaster], streams[masterdev][gmaster]);
                        magma_event_record(redevents[masterdev][masterdev], streams[masterdev][gmaster]);
                      } // end of gmaster!=-1
                } // end of k!=cmplxid
            }// for k = 0: nbcmplx
            // ========================================
        }// end of if masterdev!=-1 maening real is active
    }// for cmplxid
/*
    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_setdevice( dev );
        magma_device_sync();
    }
*/
    //printf("=======================================================================\n");
    //printf(" each master wait receiving other masters results, do the addition and broadcast locally \n");
    //printf("=======================================================================\n");
    for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
        myngpu    = gnode[cmplxid][MagmaMaxGPUs];
        masterdev = gnode[cmplxid][MagmaMaxGPUs+1];
        //check if real is active
        if(masterdev!=-1){ 
            magma_setdevice( masterdev ); 
            // addition is done on stream 0 sequentially
            magmablasSetKernelStream( streams[ masterdev ][ 0 ] );
            // master has to wait until finishing all the send to other masters.
            magma_queue_wait_event(streams[ masterdev ][ 0 ], redevents[masterdev][masterdev]);
            // ========================================
            //  addition of results from other masters
            // ========================================
            for( magma_int_t k = 0; k < nbcmplx; ++k ) {
                if(k!=cmplxid){
                    gmaster = gnode[k][MagmaMaxGPUs+1];
                    if(gmaster!=-1){ //real is active
                        //Master has to  wait until receiving from gmaster, then do addition using stream 0
                        //printf("             master %d from cmplx %d receiving from other master %d on cmplx %d and adding \n", masterdev, cmplxid, gmaster, k);
                        magma_queue_wait_event(streams[ masterdev ][ 0 ], redevents[gmaster][masterdev]);
                        magmablas_sgeadd(m, n, c_one, 
                                        &dwork2[masterdev][maxgsize*gmaster], m, 
                                        &dC[masterdev][0]   , lddc   );
                    } // end of gmaster!=-1
                } // end of k!=cmplxid
            }// for k = 0: nbcmplx
            // because addition is done sequentially on stream 0, 
            // I have to record this to be able to synch using it 
            magma_event_record(redevents[masterdev][masterdev], streams[masterdev][0]);
            // ========================================
            // ========================================
            //     local broadcast of final results
            // ========================================
            for( magma_int_t l = 0; l < myngpu; ++l ) {
                lcdev         = gnode[cmplxid][l];
                lccolsize     = gpuisactive[lcdev];
                if((lcdev!=masterdev)&&(lccolsize>0)){
                    // this is an active GPU of my real. 
                    // wait the previous addition is done maening stream 0 is finished and broadcast sequentially for now.
                    // to make it parallel put stream lcdev instead of stream 0
                    //printf("             master %d broadcasting local to %d  \n", masterdev, lcdev);
                    magma_queue_wait_event(streams[ masterdev ][ 0 ], redevents[masterdev][masterdev]);
                    magma_scopymatrix_async(
                        m, n,
                        &dC[masterdev][0], lddc,
                        &dC[lcdev][0],     lddc, streams[masterdev][0] );
                    magma_event_record(redevents[masterdev][lcdev], streams[masterdev][0]);
                }
            }// for l=1:myngpu
            // ========================================
        }// end of if masterdev!=-1 maening real is active
    }// for cmplxid
/*
    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_setdevice( dev );
        magma_device_sync();
    }
*/


    for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
        myngpu    = gnode[cmplxid][MagmaMaxGPUs];
        masterdev = gnode[cmplxid][MagmaMaxGPUs+1];
        //check if real is active
        if(masterdev!=-1){ 
            for( magma_int_t l = 0; l < myngpu; ++l ) {
                lcdev         = gnode[cmplxid][l];
                lccolsize     = gpuisactive[lcdev];
                if(lccolsize>0){
                    magma_setdevice( lcdev );
                    magma_queue_wait_event(streams[ lcdev ][ 0 ], redevents[lcdev][0]);
                    magma_queue_wait_event(streams[ lcdev ][ 0 ], redevents[masterdev][lcdev]);
                }
            }// for l=1:myngpu
        }// end of if masterdev!=-1 maening real is active
    }// for cmplxid


 
   //printf("****************************************************\n");
   //printf("                      finish ssymm                   \n");
   //printf("****************************************************\n");

    magma_setdevice( cdev );
    magmablasSetKernelStream( cstream );

}
Exemplo n.º 20
0
/**
    Purpose
    -------
    ZUNMQR overwrites the general complex M-by-N matrix C with

    @verbatim
                                SIDE = MagmaLeft    SIDE = MagmaRight
    TRANS = MagmaNoTrans:       Q * C               C * Q
    TRANS = Magma_ConjTrans:    Q**H * C            C * Q**H
    @endverbatim

    where Q is a complex unitary matrix defined as the product of k
    elementary reflectors

          Q = H(1) H(2) . . . H(k)

    as returned by ZGEQRF. Q is of order M if SIDE = MagmaLeft and of order N
    if SIDE = MagmaRight.

    Arguments
    ---------
    @param[in]
    ngpu    INTEGER
            Number of GPUs to use. ngpu > 0.

    @param[in]
    side    magma_side_t
      -     = MagmaLeft:      apply Q or Q**H from the Left;
      -     = MagmaRight:     apply Q or Q**H from the Right.

    @param[in]
    trans   magma_trans_t
      -     = MagmaNoTrans:    No transpose, apply Q;
      -     = Magma_ConjTrans: Conjugate transpose, apply Q**H.

    @param[in]
    m       INTEGER
            The number of rows of the matrix C. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix C. N >= 0.

    @param[in]
    k       INTEGER
            The number of elementary reflectors whose product defines
            the matrix Q.
            If SIDE = MagmaLeft,  M >= K >= 0;
            if SIDE = MagmaRight, N >= K >= 0.

    @param[in]
    A       COMPLEX_16 array, dimension (LDA,K)
            The i-th column must contain the vector which defines the
            elementary reflector H(i), for i = 1,2,...,k, as returned by
            ZGEQRF in the first k columns of its array argument A.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.
            If SIDE = MagmaLeft,  LDA >= max(1,M);
            if SIDE = MagmaRight, LDA >= max(1,N).

    @param[in]
    tau     COMPLEX_16 array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by ZGEQRF.

    @param[in,out]
    C       COMPLEX_16 array, dimension (LDC,N)
            On entry, the M-by-N matrix C.
            On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q.

    @param[in]
    ldc     INTEGER
            The leading dimension of the array C. LDC >= max(1,M).

    @param[out]
    work    (workspace) COMPLEX_16 array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK.
            If SIDE = MagmaLeft,  LWORK >= max(1,N);
            if SIDE = MagmaRight, LWORK >= max(1,M).
            For optimum performance LWORK >= N*NB if SIDE = MagmaLeft, and
            LWORK >= M*NB if SIDE = MagmaRight, where NB is the optimal
            blocksize.
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value

    @ingroup magma_zgeqrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_zunmqr_m(
    magma_int_t ngpu,
    magma_side_t side, magma_trans_t trans,
    magma_int_t m, magma_int_t n, magma_int_t k,
    magmaDoubleComplex *A,    magma_int_t lda,
    magmaDoubleComplex *tau,
    magmaDoubleComplex *C,    magma_int_t ldc,
    magmaDoubleComplex *work, magma_int_t lwork,
    magma_int_t *info)
{
#define  A(i, j) (A + (j)*lda  + (i))
#define  C(i, j) (C + (j)*ldc  + (i))

#define    dC(gpui,      i, j) (dw[gpui] + (j)*lddc + (i))
#define  dA_c(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddac)
#define  dA_r(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddar)
#define    dT(gpui, ind)       (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + (ind)*((nb+1)*nb))
#define dwork(gpui, ind)       (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + 2*((nb+1)*nb) + (ind)*(lddwork*nb))

    /* Constants */
    magmaDoubleComplex c_zero = MAGMA_Z_ZERO;
    magmaDoubleComplex c_one  = MAGMA_Z_ONE;

    /* Local variables */
    const char* side_  = lapack_side_const( side );
    const char* trans_ = lapack_trans_const( trans );

    magma_int_t nb = 128;
    magmaDoubleComplex *T = NULL;
    magmaDoubleComplex_ptr dw[MagmaMaxGPUs] = { NULL };
    magma_queue_t queues[MagmaMaxGPUs][2] = {{ NULL }};
    magma_event_t events[MagmaMaxGPUs][2] = {{ NULL }};

    magma_int_t ind_c;
    magma_device_t dev;
    
    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );

    *info = 0;

    magma_int_t left   = (side == MagmaLeft);
    magma_int_t notran = (trans == MagmaNoTrans);
    magma_int_t lquery = (lwork == -1);

    /* NQ is the order of Q and NW is the minimum dimension of WORK */
    magma_int_t nq, nw;
    if (left) {
        nq = m;
        nw = n;
    } else {
        nq = n;
        nw = m;
    }

    if (! left && side != MagmaRight) {
        *info = -1;
    } else if (! notran && trans != Magma_ConjTrans) {
        *info = -2;
    } else if (m < 0) {
        *info = -3;
    } else if (n < 0) {
        *info = -4;
    } else if (k < 0 || k > nq) {
        *info = -5;
    } else if (lda < max(1,nq)) {
        *info = -7;
    } else if (ldc < max(1,m)) {
        *info = -10;
    } else if (lwork < max(1,nw) && ! lquery) {
        *info = -12;
    }

    magma_int_t lwkopt = max(1,nw) * nb;
    if (*info == 0) {
        work[0] = magma_zmake_lwork( lwkopt );
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0 || k == 0) {
        work[0] = c_one;
        return *info;
    }

    if (nb >= k) {
        /* Use CPU code */
        lapackf77_zunmqr(side_, trans_, &m, &n, &k, A, &lda, tau,
                         C, &ldc, work, &lwork, info);
        return *info;
    }

    magma_int_t lddc = magma_roundup( m, 64 );  // TODO why 64 instead of 32 ?
    magma_int_t lddac = nq;
    magma_int_t lddar = nb;
    magma_int_t lddwork = nw;

    magma_int_t nlocal[ MagmaMaxGPUs ] = { 0 };

    magma_int_t nb_l=256;
    magma_int_t nbl = magma_ceildiv( n, nb_l ); // number of blocks
    magma_int_t maxnlocal = magma_ceildiv( nbl, ngpu )*nb_l;

    ngpu = min( ngpu, magma_ceildiv( n, nb_l )); // Don't use GPU that will not have data.

    magma_int_t ldw = maxnlocal*lddc // dC
                    + 2*lddac*lddar // 2*dA
                    + 2*(nb + 1 + lddwork)*nb; // 2*(dT and dwork)

    if (MAGMA_SUCCESS != magma_zmalloc_pinned( &T, nb*nb )) {
        *info = MAGMA_ERR_HOST_ALLOC;
        goto cleanup;
    }
    for (dev = 0; dev < ngpu; ++dev) {
        magma_setdevice( dev );
        if (MAGMA_SUCCESS != magma_zmalloc( &dw[dev], ldw )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            goto cleanup;
        }
        magma_queue_create( dev, &queues[dev][0] );
        magma_queue_create( dev, &queues[dev][1] );
        magma_event_create( &events[dev][0] );
        magma_event_create( &events[dev][1] );
    }

    /* Use hybrid CPU-MGPU code */
    if (left) {
        //copy C to mgpus
        for (magma_int_t i = 0; i < nbl; ++i) {
            dev = i % ngpu;
            magma_setdevice( dev );
            magma_int_t kb = min(nb_l, n-i*nb_l);
            magma_zsetmatrix_async( m, kb,
                                   C(0, i*nb_l), ldc,
                                   dC(dev, 0, i/ngpu*nb_l), lddc, queues[dev][0] );
            nlocal[dev] += kb;
        }

        magma_int_t i1, i2, i3;
        if ( !notran ) {
            i1 = 0;
            i2 = k;
            i3 = nb;
        } else {
            i1 = (k - 1) / nb * nb;
            i2 = 0;
            i3 = -nb;
        }

        ind_c = 0;

        for (magma_int_t i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) {
            // start the copy of A panel
            magma_int_t kb = min(nb, k - i);
            for (dev = 0; dev < ngpu; ++dev) {
                magma_setdevice( dev );
                magma_event_sync( events[dev][ind_c] ); // check if the new data can be copied
                magma_zsetmatrix_async(nq-i, kb,
                                       A(i, i),                 lda,
                                       dA_c(dev, ind_c, i, 0), lddac, queues[dev][0] );
                // set upper triangular part of dA to identity
                magmablas_zlaset_band( MagmaUpper, kb, kb, kb, c_zero, c_one, dA_c(dev, ind_c, i, 0), lddac, queues[dev][0] );
            }

            /* Form the triangular factor of the block reflector
             H = H(i) H(i+1) . . . H(i+ib-1) */
            magma_int_t nqi = nq - i;
            lapackf77_zlarft("F", "C", &nqi, &kb, A(i, i), &lda,
                             &tau[i], T, &kb);

            /* H or H' is applied to C(1:m,i:n) */

            /* Apply H or H'; First copy T to the GPU */
            for (dev = 0; dev < ngpu; ++dev) {
                magma_setdevice( dev );
                magma_zsetmatrix_async(kb, kb,
                                       T,               kb,
                                       dT(dev, ind_c), kb, queues[dev][0] );
            }

            for (dev = 0; dev < ngpu; ++dev) {
                magma_setdevice( dev );
                magma_queue_sync( queues[dev][0] ); // check if the data was copied
                magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise,
                                 m-i, nlocal[dev], kb,
                                 dA_c(dev, ind_c, i, 0), lddac, dT(dev, ind_c), kb,
                                 dC(dev, i, 0), lddc,
                                 dwork(dev, ind_c), lddwork, queues[dev][1] );
                magma_event_record(events[dev][ind_c], queues[dev][1] );
            }

            ind_c = (ind_c+1)%2;
        }

        for (dev = 0; dev < ngpu; ++dev) {
            magma_setdevice( dev );
            magma_queue_sync( queues[dev][1] );
        }

        //copy C from mgpus
        for (magma_int_t i = 0; i < nbl; ++i) {
            dev = i % ngpu;
            magma_setdevice( dev );
            magma_int_t kb = min(nb_l, n-i*nb_l);
            magma_zgetmatrix( m, kb,
                              dC(dev, 0, i/ngpu*nb_l), lddc,
                              C(0, i*nb_l), ldc, queues[dev][1] );
//            magma_zgetmatrix_async( m, kb,
//                                   dC(dev, 0, i/ngpu*nb_l), lddc,
//                                   C(0, i*nb_l), ldc, queues[dev][0] );
        }
    } else {
        *info = MAGMA_ERR_NOT_IMPLEMENTED;
        magma_xerbla( __func__, -(*info) );
        goto cleanup;
        
        /*
        if ( notran ) {
            i1 = 0;
            i2 = k;
            i3 = nb;
        } else {
            i1 = (k - 1) / nb * nb;
            i2 = 0;
            i3 = -nb;
        }

        mi = m;
        ic = 0;

        for (i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) {
            ib = min(nb, k - i);
            
            // Form the triangular factor of the block reflector
            // H = H(i) H(i+1) . . . H(i+ib-1)
            i__4 = nq - i;
            lapackf77_zlarft("F", "C", &i__4, &ib, A(i, i), &lda,
            &tau[i], T, &ib);
            
            // 1) copy the panel from A to the GPU, and
            // 2) set upper triangular part of dA to identity
            magma_zsetmatrix( i__4, ib, A(i, i), lda, dA(i, 0), ldda, queues[dev][1] );
            magmablas_zlaset_band( MagmaUpper, ib, ib, ib, c_zero, c_one, dA(i, 0), ldda, queues[dev][1] );
            
            // H or H' is applied to C(1:m,i:n)
            ni = n - i;
            jc = i;
            
            // Apply H or H'; First copy T to the GPU
            magma_zsetmatrix( ib, ib, T, ib, dT, ib, queues[dev][1] );
            magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise,
            mi, ni, ib,
            dA(i, 0), ldda, dT, ib,
            dC(ic, jc), lddc,
            dwork, lddwork, queues[dev][1] );
        }
        */
    }

cleanup:
    work[0] = magma_zmake_lwork( lwkopt );

    for (dev = 0; dev < ngpu; ++dev) {
        magma_setdevice( dev );
        magma_event_destroy( events[dev][0] );
        magma_event_destroy( events[dev][1] );
        magma_queue_destroy( queues[dev][0] );
        magma_queue_destroy( queues[dev][1] );
        magma_free( dw[dev] );
    }
    magma_setdevice( orig_dev );
    magma_free_pinned( T );

    return *info;
} /* magma_zunmqr */
Exemplo n.º 21
0
extern "C" magma_int_t
magma_d_spmv(
    double alpha,
    magma_d_matrix A,
    magma_d_matrix x,
    double beta,
    magma_d_matrix y,
    magma_queue_t queue )
{
    magma_int_t info = 0;

    magma_d_matrix x2={Magma_CSR};

    cusparseHandle_t cusparseHandle = 0;
    cusparseMatDescr_t descr = 0;
    // make sure RHS is a dense matrix
    if ( x.storage_type != Magma_DENSE ) {
         printf("error: only dense vectors are supported for SpMV.\n");
         info = MAGMA_ERR_NOT_SUPPORTED;
         goto cleanup;
    }

    if ( A.memory_location != x.memory_location ||
                            x.memory_location != y.memory_location ) {
        printf("error: linear algebra objects are not located in same memory!\n");
        printf("memory locations are: %d   %d   %d\n",
                        A.memory_location, x.memory_location, y.memory_location );
        info = MAGMA_ERR_INVALID_PTR;
        goto cleanup;
    }

    // DEV case
    if ( A.memory_location == Magma_DEV ) {
        if ( A.num_cols == x.num_rows && x.num_cols == 1 ) {
             if ( A.storage_type == Magma_CSR || A.storage_type == Magma_CUCSR
                            || A.storage_type == Magma_CSRL
                            || A.storage_type == Magma_CSRU ) {
              CHECK_CUSPARSE( cusparseCreate( &cusparseHandle ));
              CHECK_CUSPARSE( cusparseSetStream( cusparseHandle, queue->cuda_stream() ));
              CHECK_CUSPARSE( cusparseCreateMatDescr( &descr ));
            
              CHECK_CUSPARSE( cusparseSetMatType( descr, CUSPARSE_MATRIX_TYPE_GENERAL ));
              CHECK_CUSPARSE( cusparseSetMatIndexBase( descr, CUSPARSE_INDEX_BASE_ZERO ));
            
              cusparseDcsrmv( cusparseHandle,CUSPARSE_OPERATION_NON_TRANSPOSE,
                            A.num_rows, A.num_cols, A.nnz, &alpha, descr,
                            A.dval, A.drow, A.dcol, x.dval, &beta, y.dval );
             }
             else if ( A.storage_type == Magma_ELL ) {
                 //printf("using ELLPACKT kernel for SpMV: ");
                 CHECK( magma_dgeelltmv( MagmaNoTrans, A.num_rows, A.num_cols,
                    A.max_nnz_row, alpha, A.dval, A.dcol, x.dval, beta,
                    y.dval, queue ));
                 //printf("done.\n");
             }
             else if ( A.storage_type == Magma_ELLPACKT ) {
                 //printf("using ELL kernel for SpMV: ");
                 CHECK( magma_dgeellmv( MagmaNoTrans, A.num_rows, A.num_cols,
                    A.max_nnz_row, alpha, A.dval, A.dcol, x.dval, beta,
                    y.dval, queue ));
                 //printf("done.\n");
             }
             else if ( A.storage_type == Magma_ELLRT ) {
                 //printf("using ELLRT kernel for SpMV: ");
                 CHECK( magma_dgeellrtmv( MagmaNoTrans, A.num_rows, A.num_cols,
                            A.max_nnz_row, alpha, A.dval, A.dcol, A.drow, x.dval,
                         beta, y.dval, A.alignment, A.blocksize, queue ));
                 //printf("done.\n");
             }
             else if ( A.storage_type == Magma_SELLP ) {
                 //printf("using SELLP kernel for SpMV: ");
                 CHECK( magma_dgesellpmv( MagmaNoTrans, A.num_rows, A.num_cols,
                    A.blocksize, A.numblocks, A.alignment,
                    alpha, A.dval, A.dcol, A.drow, x.dval, beta, y.dval, queue ));

                 //printf("done.\n");
             }
             else if ( A.storage_type == Magma_DENSE ) {
                 //printf("using DENSE kernel for SpMV: ");
                 magmablas_dgemv( MagmaNoTrans, A.num_rows, A.num_cols, alpha,
                                A.dval, A.num_rows, x.dval, 1, beta,  y.dval,
                                1, queue );
                 //printf("done.\n");
             }
             else if ( A.storage_type == Magma_SPMVFUNCTION ) {
                 //printf("using DENSE kernel for SpMV: ");
                 CHECK( magma_dcustomspmv( alpha, x, beta, y, queue ));
                 //printf("done.\n");
             }
             else if ( A.storage_type == Magma_BCSR ) {
                 //printf("using CUSPARSE BCSR kernel for SpMV: ");
                // CUSPARSE context //
                cusparseDirection_t dirA = CUSPARSE_DIRECTION_ROW;
                int mb = magma_ceildiv( A.num_rows, A.blocksize );
                int nb = magma_ceildiv( A.num_cols, A.blocksize );
                CHECK_CUSPARSE( cusparseCreate( &cusparseHandle ));
                CHECK_CUSPARSE( cusparseSetStream( cusparseHandle, queue->cuda_stream() ));
                CHECK_CUSPARSE( cusparseCreateMatDescr( &descr ));
                cusparseDbsrmv( cusparseHandle, dirA,
                    CUSPARSE_OPERATION_NON_TRANSPOSE, mb, nb, A.numblocks,
                    &alpha, descr, A.dval, A.drow, A.dcol, A.blocksize, x.dval,
                    &beta, y.dval );
             }
             else {
                 printf("error: format not supported.\n");
                 info = MAGMA_ERR_NOT_SUPPORTED; 
             }
        }
        else if ( A.num_cols < x.num_rows || x.num_cols > 1 ) {
            magma_int_t num_vecs = x.num_rows / A.num_cols * x.num_cols;
            if ( A.storage_type == Magma_CSR ) {
                CHECK_CUSPARSE( cusparseCreate( &cusparseHandle ));
                CHECK_CUSPARSE( cusparseSetStream( cusparseHandle, queue->cuda_stream() ));
                CHECK_CUSPARSE( cusparseCreateMatDescr( &descr ));
                CHECK_CUSPARSE( cusparseSetMatType( descr, CUSPARSE_MATRIX_TYPE_GENERAL ));
                CHECK_CUSPARSE( cusparseSetMatIndexBase( descr, CUSPARSE_INDEX_BASE_ZERO ));

                if ( x.major == MagmaColMajor) {
                    cusparseDcsrmm(cusparseHandle,
                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                    A.num_rows,   num_vecs, A.num_cols, A.nnz,
                    &alpha, descr, A.dval, A.drow, A.dcol,
                    x.dval, A.num_cols, &beta, y.dval, A.num_cols);
                } else if ( x.major == MagmaRowMajor) {
                    /*cusparseDcsrmm2(cusparseHandle,
                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                    CUSPARSE_OPERATION_TRANSPOSE,
                    A.num_rows,   num_vecs, A.num_cols, A.nnz,
                    &alpha, descr, A.dval, A.drow, A.dcol,
                    x.dval, A.num_cols, &beta, y.dval, A.num_cols);
                    */
                }
             } else if ( A.storage_type == Magma_SELLP ) {
                if ( x.major == MagmaRowMajor) {
                 CHECK( magma_dmgesellpmv( MagmaNoTrans, A.num_rows, A.num_cols,
                    num_vecs, A.blocksize, A.numblocks, A.alignment,
                    alpha, A.dval, A.dcol, A.drow, x.dval, beta, y.dval, queue ));
                }
                else if ( x.major == MagmaColMajor) {
                    // transpose first to row major
                    CHECK( magma_dvtranspose( x, &x2, queue ));
                    CHECK( magma_dmgesellpmv( MagmaNoTrans, A.num_rows, A.num_cols,
                    num_vecs, A.blocksize, A.numblocks, A.alignment,
                    alpha, A.dval, A.dcol, A.drow, x2.dval, beta, y.dval, queue ));
                }
             }
             /*if ( A.storage_type == Magma_DENSE ) {
                 //printf("using DENSE kernel for SpMV: ");
                 magmablas_dmgemv( MagmaNoTrans, A.num_rows, A.num_cols,
                            num_vecs, alpha, A.dval, A.num_rows, x.dval, 1,
                            beta,  y.dval, 1 );
                 //printf("done.\n");
             }*/
             else {
                 printf("error: format not supported.\n");
                 info = MAGMA_ERR_NOT_SUPPORTED;
             }
        }
    }
    // CPU case missing!
    else {
        printf("error: CPU not yet supported.\n");
        info = MAGMA_ERR_NOT_SUPPORTED;
    }

cleanup:
    cusparseDestroyMatDescr( descr );
    cusparseDestroy( cusparseHandle );
    cusparseHandle = 0;
    descr = 0;
    magma_dmfree(&x2, queue );
    
    return info;
}
Exemplo n.º 22
0
//##################################################################################################
static void *magma_dapplyQ_parallel_section(void *arg)
{
    magma_int_t my_core_id   = ((magma_dapplyQ_id_data*)arg) -> id;
    magma_dapplyQ_data* data = ((magma_dapplyQ_id_data*)arg) -> data;

    magma_int_t allcores_num   = data -> threads_num;
    magma_int_t n              = data -> n;
    magma_int_t ne             = data -> ne;
    magma_int_t n_gpu          = data -> n_gpu;
    magma_int_t nb             = data -> nb;
    magma_int_t Vblksiz        = data -> Vblksiz;
    double *E         = data -> E;
    magma_int_t lde            = data -> lde;
    double *V         = data -> V;
    magma_int_t ldv            = data -> ldv;
    double *TAU       = data -> TAU;
    double *T         = data -> T;
    magma_int_t ldt            = data -> ldt;
    double *dE        = data -> dE;
    magma_int_t ldde           = data -> ldde;
    pthread_barrier_t* barrier = &(data -> barrier);

    magma_int_t info;

    #ifdef ENABLE_TIMER
    real_Double_t timeQcpu=0.0, timeQgpu=0.0;
    #endif

    magma_int_t n_cpu = ne - n_gpu;

    // with MKL and when using omp_set_num_threads instead of mkl_set_num_threads
    // it need that all threads setting it to 1.
    magma_set_lapack_numthreads(1);

#ifdef MAGMA_SETAFFINITY
    //#define PRINTAFFINITY
#ifdef PRINTAFFINITY
    affinity_set print_set;
    print_set.print_affinity(my_core_id, "starting affinity");
#endif
    cpu_set_t old_set, new_set;

    //store current affinity
    CPU_ZERO(&old_set);
    sched_getaffinity( 0, sizeof(old_set), &old_set);
    //set new affinity
    // bind threads
    CPU_ZERO(&new_set);
    CPU_SET(my_core_id, &new_set);
    sched_setaffinity( 0, sizeof(new_set), &new_set);
#ifdef PRINTAFFINITY
    print_set.print_affinity(my_core_id, "set affinity");
#endif
#endif

    if (my_core_id == 0) {
        //=============================================
        //   on GPU on thread 0:
        //    - apply V2*Z(:,1:N_GPU)
        //=============================================
        #ifdef ENABLE_TIMER
        timeQgpu = magma_wtime();
        #endif

        magma_dsetmatrix(n, n_gpu, E, lde, dE, ldde);
        magma_dbulge_applyQ_v2(MagmaLeft, n_gpu, n, nb, Vblksiz, dE, ldde, V, ldv, T, ldt, &info);
        magma_device_sync();

        #ifdef ENABLE_TIMER
        timeQgpu = magma_wtime()-timeQgpu;
        printf("  Finish Q2_GPU GGG timing= %f\n", timeQgpu);
        #endif
    } else {
        //=============================================
        //   on CPU on threads 1:allcores_num-1:
        //    - apply V2*Z(:,N_GPU+1:NE)
        //=============================================
        #ifdef ENABLE_TIMER
        if (my_core_id == 1)
            timeQcpu = magma_wtime();
        #endif

        magma_int_t n_loc = magma_ceildiv(n_cpu, allcores_num-1);
        double* E_loc = E + (n_gpu+ n_loc * (my_core_id-1))*lde;
        n_loc = min(n_loc,n_cpu - n_loc * (my_core_id-1));

        magma_dtile_bulge_applyQ(my_core_id, MagmaLeft, n_loc, n, nb, Vblksiz, E_loc, lde, V, ldv, TAU, T, ldt);
        pthread_barrier_wait(barrier);

        #ifdef ENABLE_TIMER
        if (my_core_id == 1) {
            timeQcpu = magma_wtime()-timeQcpu;
            printf("  Finish Q2_CPU CCC timing= %f\n", timeQcpu);
        }
        #endif
    } // END if my_core_id

#ifdef MAGMA_SETAFFINITY
    //restore old affinity
    sched_setaffinity(0, sizeof(old_set), &old_set);
#ifdef PRINTAFFINITY
    print_set.print_affinity(my_core_id, "restored_affinity");
#endif
#endif

    return 0;
}
Exemplo n.º 23
0
/***************************************************************************//**
    Purpose
    -------
    SGETRF_m computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.  This version does not
    require work space on the GPU passed as input. GPU memory is allocated
    in the routine. The matrix may exceed the GPU memory.

    The factorization has the form
       A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.

    Note: The factorization of big panel is done calling multiple-gpu-interface.
    Pivots are applied on GPU within the big panel.

    Arguments
    ---------
    @param[in]
    ngpu    INTEGER
            Number of GPUs to use. ngpu > 0.

    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    A       REAL array, dimension (LDA,N)
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.
    \n
            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    @param[out]
    ipiv    INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.
      -     > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.

    @ingroup magma_getrf
*******************************************************************************/
extern "C" magma_int_t
magma_sgetrf_m(
    magma_int_t ngpu,
    magma_int_t m, magma_int_t n,
    float *A, magma_int_t lda, magma_int_t *ipiv,
    magma_int_t *info)
{
#define     A(i,j) (A      + (j)*lda + (i))
#define dAT(d,i,j) (dAT[d] + (i)*nb*ldn_local + (j)*nb)
#define dPT(d,i,j) (dPT[d] + (i)*nb*nb + (j)*nb*maxm)

    magma_timer_t time=0, time_total=0, time_alloc=0, time_set=0, time_get=0, time_comp=0;
    timer_start( time_total );
    //real_Double_t flops;

    float c_one     = MAGMA_S_ONE;
    float c_neg_one = MAGMA_S_NEG_ONE;
    float *dAT[MagmaMaxGPUs], *dA[MagmaMaxGPUs], *dPT[MagmaMaxGPUs];
    magma_int_t        iinfo = 0, nb, nbi, maxm, n_local[MagmaMaxGPUs], ldn_local;
    magma_int_t        N, M, NB, NBk, I, d, ngpu0 = ngpu;
    magma_int_t        ii, jj, h, offset, ib, rows;
    
    magma_queue_t queues[MagmaMaxGPUs][2];
    magma_event_t  event[MagmaMaxGPUs][2];

    *info = 0;
    if (m < 0)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (lda < max(1,m))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );
    
    /* initialize nb */
    nb = magma_get_sgetrf_nb( m, n );
    maxm = magma_roundup( m, 32 );

    /* figure out NB */
    size_t freeMem, totalMem;
    cudaMemGetInfo( &freeMem, &totalMem );
    freeMem /= sizeof(float);
    
    /* number of columns in the big panel */
    h = 1+(2+ngpu0);
    NB = (magma_int_t)(0.8*freeMem/maxm-h*nb);
    const char* ngr_nb_char = getenv("MAGMA_NGR_NB");
    if ( ngr_nb_char != NULL )
        NB = max( nb, min( NB, atoi(ngr_nb_char) ) );
    //NB = 5*max(nb,32);

    if ( ngpu0 > magma_ceildiv( NB, nb )) {
        ngpu = magma_ceildiv( NB, nb );
        h = 1+(2+ngpu);
        NB = (magma_int_t)(0.8*freeMem/maxm-h*nb);
    } else {
        ngpu = ngpu0;
    }
    if ( ngpu*NB >= n ) {
        #ifdef CHECK_SGETRF_OOC
        printf( "      * still fit in GPU memory.\n" );
        #endif
        NB = n;
    } else {
        #ifdef CHECK_SGETRF_OOC
        printf( "      * don't fit in GPU memory.\n" );
        #endif
        NB = ngpu*NB;
        NB = max( nb, (NB / nb) * nb); /* making sure it's devisable by nb (x64) */
    }

    #ifdef CHECK_SGETRF_OO
    if ( NB != n ) printf( "      * running in out-core mode (n=%lld, NB=%lld, nb=%lld, freeMem=%.2e).\n", (long long) n, (long long) NB, (long long) nb, (float) freeMem );
    else           printf( "      * running in in-core mode  (n=%lld, NB=%lld, nb=%lld, freeMem=%.2e).\n", (long long) n, (long long) NB, (long long) nb, (float) freeMem );
    #endif

    if ( (nb <= 1) || (nb >= min(m,n)) ) {
        /* Use CPU code for scalar of one tile. */
        lapackf77_sgetrf(&m, &n, A, &lda, ipiv, info);
    } else {
        /* Use hybrid blocked code. */

        /* allocate memory on GPU to store the big panel */
        timer_start( time_alloc );
        n_local[0] = (NB/nb)/ngpu;
        if ( NB%(nb*ngpu) != 0 )
            n_local[0]++;
        n_local[0] *= nb;
        ldn_local = magma_roundup( n_local[0], 32 );
    
        for( d=0; d < ngpu; d++ ) {
            magma_setdevice(d);
            if (MAGMA_SUCCESS != magma_smalloc( &dA[d], (ldn_local+h*nb)*maxm )) {
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }
            dPT[d] = dA[d] + nb*maxm;      /* for storing the previous panel from CPU */
            dAT[d] = dA[d] + h*nb*maxm;    /* for storing the big panel               */
            magma_queue_create( d, &queues[d][0] );
            magma_queue_create( d, &queues[d][1] );
            magma_event_create( &event[d][0] );
            magma_event_create( &event[d][1] );
        }
        //magma_setdevice(0);
        timer_stop( time_alloc );
        
        for( I=0; I < n; I += NB ) {
            M = m;
            N = min( NB, n-I );       /* number of columns in this big panel             */
            //s = min( max(m-I,0), N )/nb; /* number of small block-columns in this big panel */
    
            maxm = magma_roundup( M, 32 );
            if ( ngpu0 > magma_ceildiv( NB, nb ) ) {
                ngpu = magma_ceildiv( NB, nb );
            } else {
                ngpu = ngpu0;
            }
    
            for( d=0; d < ngpu; d++ ) {
                n_local[d] = ((N/nb)/ngpu)*nb;
                if (d < (N/nb)%ngpu)
                    n_local[d] += nb;
                else if (d == (N/nb)%ngpu)
                    n_local[d] += N%nb;
            }
            ldn_local = magma_roundup( n_local[0], 32 );
            
            /* upload the next big panel into GPU, transpose (A->A'), and pivot it */
            timer_start( time );
            magmablas_ssetmatrix_transpose_mgpu(ngpu, M, N, nb, A(0,I), lda,
                                                dAT, ldn_local, dA, maxm, queues);
            for( d=0; d < ngpu; d++ ) {
                magma_setdevice(d);
                magma_queue_sync( queues[d][0] );
                magma_queue_sync( queues[d][1] );
            }
            time_set += timer_stop( time );
    
            timer_start( time );
            /* --------------------------------------------------------------- */
            /* loop around the previous big-panels to update the new big-panel */
            for( offset = 0; offset < min(m,I); offset += NB ) {
                NBk = min( m-offset, NB );
                /* start sending the first tile from the previous big-panels to gpus */
                for( d=0; d < ngpu; d++ ) {
                    magma_setdevice(d);
                    nbi  = min( nb, NBk );
                    magma_ssetmatrix_async( (M-offset), nbi,
                                            A(offset,offset), lda,
                                            dA[d],            (maxm-offset), queues[d][0] );
                    
                    /* make sure the previous update finished */
                    //magma_queue_sync( queues[d][1] );
                    magma_queue_wait_event( queues[d][0], event[d][0] );
                    
                    /* transpose */
                    magmablas_stranspose( M-offset, nbi, dA[d], maxm-offset, dPT(d,0,0), nb,
                                          queues[d][0]);
                }
                
                /* applying the pivot from the previous big-panel */
                for( d=0; d < ngpu; d++ ) {
                    magma_setdevice(d);
                    magmablas_slaswp_q( ldn_local, dAT(d,0,0), ldn_local, offset+1, offset+NBk, ipiv, 1, 
                                        queues[d][1] );
                }
                
                /* going through each block-column of previous big-panels */
                for( jj=0, ib=offset/nb; jj < NBk; jj += nb, ib++ ) {
                    ii   = offset+jj;
                    rows = maxm - ii;
                    nbi  = min( nb, NBk-jj );
                    for( d=0; d < ngpu; d++ ) {
                        magma_setdevice(d);
                        
                        /* wait for a block-column on GPU */
                        magma_queue_sync( queues[d][0] );
                        
                        /* start sending next column */
                        if ( jj+nb < NBk ) {
                            magma_ssetmatrix_async( (M-ii-nb), min(nb,NBk-jj-nb),
                                                    A(ii+nb,ii+nb), lda,
                                                    dA[d],          (rows-nb), 
                                                    queues[d][0] );
                            
                            /* make sure the previous update finished */
                            //magma_queue_sync( queues[d][1] );
                            magma_queue_wait_event( queues[d][0], event[d][(1+jj/nb)%2] );
                            
                            /* transpose next column */
                            magmablas_stranspose( M-ii-nb, nb, dA[d], rows-nb, dPT(d,0,(1+jj/nb)%2), nb,
                                                  queues[d][0]);
                        }
                        
                        /* update with the block column */
                        magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                                     n_local[d], nbi, c_one, dPT(d,0,(jj/nb)%2), nb, dAT(d,ib,0), ldn_local,
                                     queues[d][1]);
                        if ( M > ii+nb ) {
                            magma_sgemm( MagmaNoTrans, MagmaNoTrans,
                                         n_local[d], M-(ii+nb), nbi, c_neg_one, dAT(d,ib,0), ldn_local,
                                         dPT(d,1,(jj/nb)%2), nb, c_one, dAT(d,ib+1,0), ldn_local,
                                         queues[d][1]);
                        }
                        magma_event_record( event[d][(jj/nb)%2], queues[d][1] );
                    } /* end of for each block-columns in a big-panel */
                }
            } /* end of for each previous big-panels */
            for( d=0; d < ngpu; d++ ) {
                magma_setdevice(d);
                magma_queue_sync( queues[d][0] );
                magma_queue_sync( queues[d][1] );
            }
    
            /* calling magma-gpu interface to panel-factorize the big panel */
            if ( M > I ) {
                magma_sgetrf2_mgpu(ngpu, M-I, N, nb, I, dAT, ldn_local, ipiv+I, dA, A(0,I), lda,
                                   queues, &iinfo);
                if ( iinfo < 0 ) {
                    *info = iinfo;
                    break;
                } else if ( iinfo != 0 ) {
                    *info = iinfo + I * NB;
                    //break;
                }
                /* adjust pivots */
                for( ii=I; ii < min(I+N,m); ii++ )
                    ipiv[ii] += I;
            }
            time_comp += timer_stop( time );
    
            /* get the current big panel to CPU from devices */
            timer_start( time );
            magmablas_sgetmatrix_transpose_mgpu(ngpu, M, N, nb, dAT, ldn_local,
                                                A(0,I), lda, dA, maxm, queues);
            for( d=0; d < ngpu; d++ ) {
                magma_setdevice(d);
                magma_queue_sync( queues[d][0] );
                magma_queue_sync( queues[d][1] );
            }
            time_get += timer_stop( time );
        } /* end of for */
    
        //timer_stop( time_total );
        //flops = FLOPS_SGETRF( m, n ) / 1e9;
        //timer_printf(" memory-allocation time: %e\n", time_alloc );
        //timer_printf(" NB=%lld nb=%lld\n", (long long) NB, (long long) nb );
        //timer_printf(" memcopy and transpose %e seconds\n", time_set );
        //timer_printf(" total time %e seconds\n", time_total );
        //timer_printf(" Performance %f GFlop/s, %f seconds without htod and dtoh\n",     flops / (time_comp),               time_comp               );
        //timer_printf(" Performance %f GFlop/s, %f seconds with    htod\n",              flops / (time_comp + time_set),    time_comp + time_set    );
        //timer_printf(" Performance %f GFlop/s, %f seconds with    dtoh\n",              flops / (time_comp + time_get),    time_comp + time_get    );
        //timer_printf(" Performance %f GFlop/s, %f seconds without memory-allocation\n", flops / (time_total - time_alloc), time_total - time_alloc );
    
        for( d=0; d < ngpu0; d++ ) {
            magma_setdevice(d);
            magma_free( dA[d] );
            magma_event_destroy( event[d][0] );
            magma_event_destroy( event[d][1] );
            magma_queue_destroy( queues[d][0] );
            magma_queue_destroy( queues[d][1] );
        }
        magma_setdevice( orig_dev );
    }
    if ( *info >= 0 )
        magma_sgetrf_piv(m, n, NB, A, lda, ipiv, info);
    return *info;
} /* magma_sgetrf_m */
Exemplo n.º 24
0
extern "C" magma_int_t
magma_dmtransfer(
    magma_d_matrix A,
    magma_d_matrix *B,
    magma_location_t src,
    magma_location_t dst,
    magma_queue_t queue )
{
    magma_int_t info = 0;
    
    B->val = NULL;
    B->diag = NULL;
    B->row = NULL;
    B->rowidx = NULL;
    B->col = NULL;
    B->blockinfo = NULL;
    B->dval = NULL;
    B->ddiag = NULL;
    B->drow = NULL;
    B->drowidx = NULL;
    B->dcol = NULL;
    B->diag = NULL;
    B->ddiag = NULL;
    B->list = NULL;
    B->dlist = NULL;
    

    // first case: copy matrix from host to device
    if ( src == Magma_CPU && dst == Magma_DEV ) {
        //CSR-type
        if ( A.storage_type == Magma_CSR || A.storage_type == Magma_CUCSR
                                        || A.storage_type == Magma_CSC
                                        || A.storage_type == Magma_CSRD
                                        || A.storage_type == Magma_CSRL
                                        || A.storage_type == Magma_CSRU ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_DEV;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            // memory allocation
            CHECK( magma_dmalloc( &B->dval, A.nnz ));
            CHECK( magma_index_malloc( &B->drow, A.num_rows + 1 ));
            CHECK( magma_index_malloc( &B->dcol, A.nnz ));
            // data transfer
            magma_dsetvector( A.nnz, A.val, 1, B->dval, 1, queue );
            magma_index_setvector( A.num_rows + 1, A.row, 1, B->drow, 1, queue );
            magma_index_setvector( A.nnz, A.col, 1, B->dcol, 1, queue );
        }
        //COO-type
        else if ( A.storage_type == Magma_COO ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_DEV;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            // memory allocation
            CHECK( magma_dmalloc( &B->dval, A.nnz ));
            CHECK( magma_index_malloc( &B->dcol, A.nnz ));
            CHECK( magma_index_malloc( &B->drowidx, A.nnz ));
            // data transfer
            magma_dsetvector( A.nnz, A.val, 1, B->dval, 1, queue );
            magma_index_setvector( A.nnz, A.col, 1, B->dcol, 1, queue );
            magma_index_setvector( A.nnz, A.rowidx, 1, B->drowidx, 1, queue );
        }
        //CSRCOO-type
        else if ( A.storage_type == Magma_CSRCOO ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_DEV;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            // memory allocation
            CHECK( magma_dmalloc( &B->dval, A.nnz ));
            CHECK( magma_index_malloc( &B->drow, A.num_rows + 1 ));
            CHECK( magma_index_malloc( &B->dcol, A.nnz ));
            CHECK( magma_index_malloc( &B->drowidx, A.nnz ));
            // data transfer
            magma_dsetvector( A.nnz, A.val, 1, B->dval, 1, queue );
            magma_index_setvector( A.num_rows + 1, A.row, 1, B->drow, 1, queue );
            magma_index_setvector( A.nnz, A.col, 1, B->dcol, 1, queue );
            magma_index_setvector( A.nnz, A.rowidx, 1, B->drowidx, 1, queue );
        }
        //ELL/ELLPACKT-type
        else if ( A.storage_type == Magma_ELLPACKT || A.storage_type == Magma_ELL ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_DEV;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            // memory allocation
            CHECK( magma_dmalloc( &B->dval, A.num_rows * A.max_nnz_row ));
            CHECK( magma_index_malloc( &B->dcol, A.num_rows * A.max_nnz_row ));
            // data transfer
            magma_dsetvector( A.num_rows * A.max_nnz_row, A.val, 1, B->dval, 1, queue );
            magma_index_setvector( A.num_rows * A.max_nnz_row, A.col, 1, B->dcol, 1, queue );
        }
        //ELLD-type
        else if ( A.storage_type == Magma_ELLD ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_DEV;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            // memory allocation
            CHECK( magma_dmalloc( &B->dval, A.num_rows * A.max_nnz_row ));
            CHECK( magma_index_malloc( &B->dcol, A.num_rows * A.max_nnz_row ));
            // data transfer
            magma_dsetvector( A.num_rows * A.max_nnz_row, A.val, 1, B->dval, 1, queue );
            magma_index_setvector( A.num_rows * A.max_nnz_row, A.col, 1, B->dcol, 1, queue );
        }
        //ELLRT-type
        else if ( A.storage_type == Magma_ELLRT ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_DEV;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            B->blocksize = A.blocksize;
            B->alignment = A.alignment;
            magma_int_t rowlength = magma_roundup( A.max_nnz_row, A.alignment );
            // memory allocation
            CHECK( magma_dmalloc( &B->dval, A.num_rows * rowlength ));
            CHECK( magma_index_malloc( &B->dcol, A.num_rows * rowlength ));
            CHECK( magma_index_malloc( &B->drow, A.num_rows ));
            // data transfer
            magma_dsetvector( A.num_rows * rowlength, A.val, 1, B->dval, 1, queue );
            magma_index_setvector( A.num_rows * rowlength, A.col, 1, B->dcol, 1, queue );
            magma_index_setvector( A.num_rows, A.row, 1, B->drow, 1, queue );
        }
        //SELLP-type
        else if ( A.storage_type == Magma_SELLP ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_DEV;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            B->blocksize = A.blocksize;
            B->numblocks = A.numblocks;
            B->alignment = A.alignment;
            // memory allocation
            CHECK( magma_dmalloc( &B->dval, A.nnz ));
            CHECK( magma_index_malloc( &B->dcol, A.nnz ));
            CHECK( magma_index_malloc( &B->drow, A.numblocks + 1 ));
            // data transfer
            magma_dsetvector( A.nnz, A.val, 1, B->dval, 1, queue );
            magma_index_setvector( A.nnz, A.col, 1, B->dcol, 1, queue );
            magma_index_setvector( A.numblocks + 1, A.row, 1, B->drow, 1, queue );
        }
        //BCSR-type
        else if ( A.storage_type == Magma_BCSR ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_DEV;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            B->blocksize = A.blocksize;
            B->numblocks = A.numblocks;
            B->alignment = A.alignment;
            magma_int_t size_b = A.blocksize;
            //magma_int_t c_blocks = ceil( (float)A.num_cols / (float)size_b );
                    // max number of blocks per row
            //magma_int_t r_blocks = ceil( (float)A.num_rows / (float)size_b );
            magma_int_t r_blocks = magma_ceildiv( A.num_rows, size_b );
                    // max number of blocks per column
            // memory allocation
            CHECK( magma_dmalloc( &B->dval, size_b * size_b * A.numblocks ));
            CHECK( magma_index_malloc( &B->drow, r_blocks + 1 ));
            CHECK( magma_index_malloc( &B->dcol, A.numblocks ));
            // data transfer
            magma_dsetvector( size_b * size_b * A.numblocks, A.val, 1, B->dval, 1, queue );
            magma_index_setvector( r_blocks + 1, A.row, 1, B->drow, 1, queue );
            magma_index_setvector( A.numblocks, A.col, 1, B->dcol, 1, queue );
        }
        //DENSE-type
        else if ( A.storage_type == Magma_DENSE ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_DEV;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            B->major = A.major;
            B->ld = A.ld;
            // memory allocation
            CHECK( magma_dmalloc( &B->dval, A.num_rows * A.num_cols ));
            // data transfer
            magma_dsetvector( A.num_rows * A.num_cols, A.val, 1, B->dval, 1, queue );
        }
    }

    // second case: copy matrix from host to host
    else if ( src == Magma_CPU && dst == Magma_CPU ) {
        //CSR-type
        if ( A.storage_type == Magma_CSR || A.storage_type == Magma_CUCSR
                                        || A.storage_type == Magma_CSC
                                        || A.storage_type == Magma_CSRD
                                        || A.storage_type == Magma_CSRL
                                        || A.storage_type == Magma_CSRU ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_CPU;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            // memory allocation
            CHECK( magma_dmalloc_cpu( &B->val, A.nnz ));
            CHECK( magma_index_malloc_cpu( &B->row, A.num_rows + 1 ));
            CHECK( magma_index_malloc_cpu( &B->col, A.nnz ));
            // data transfer
            for( magma_int_t i=0; i<A.nnz; i++ ) {
                B->val[i] = A.val[i];
                B->col[i] = A.col[i];
            }
            for( magma_int_t i=0; i<A.num_rows+1; i++ ) {
                B->row[i] = A.row[i];
            }
        }
        //COO-type
        else if ( A.storage_type == Magma_COO ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_CPU;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            // memory allocation
            CHECK( magma_dmalloc_cpu( &B->val, A.nnz ));
            CHECK( magma_index_malloc_cpu( &B->col, A.nnz ));
            CHECK( magma_index_malloc_cpu( &B->rowidx, A.nnz ));
            // data transfer
            for( magma_int_t i=0; i<A.nnz; i++ ) {
                B->val[i] = A.val[i];
                B->col[i] = A.col[i];
                B->rowidx[i] = A.rowidx[i];
            }
        }
        //CSRCOO-type
        else if ( A.storage_type == Magma_CSRCOO ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_CPU;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            // memory allocation
            CHECK( magma_dmalloc_cpu( &B->val, A.nnz ));
            CHECK( magma_index_malloc_cpu( &B->row, A.num_rows + 1 ));
            CHECK( magma_index_malloc_cpu( &B->col, A.nnz ));
            CHECK( magma_index_malloc_cpu( &B->rowidx, A.nnz ));
            // data transfer
            for( magma_int_t i=0; i<A.nnz; i++ ) {
                B->val[i] = A.val[i];
                B->col[i] = A.col[i];
                B->rowidx[i] = A.rowidx[i];
            }
            for( magma_int_t i=0; i<A.num_rows+1; i++ ) {
                B->row[i] = A.row[i];
            }
        }
        //ELL/ELLPACKT-type
        else if ( A.storage_type == Magma_ELLPACKT || A.storage_type == Magma_ELL ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_CPU;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            // memory allocation
            CHECK( magma_dmalloc_cpu( &B->val, A.num_rows * A.max_nnz_row ));
            CHECK( magma_index_malloc_cpu( &B->col, A.num_rows * A.max_nnz_row ));
            // data transfer
            for( magma_int_t i=0; i<A.num_rows*A.max_nnz_row; i++ ) {
                B->val[i] = A.val[i];
                B->col[i] = A.col[i];
            }
        }
        //ELLD-type
        else if ( A.storage_type == Magma_ELLD ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_CPU;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            // memory allocation
            CHECK( magma_dmalloc_cpu( &B->val, A.num_rows * A.max_nnz_row ));
            CHECK( magma_index_malloc_cpu( &B->col, A.num_rows * A.max_nnz_row ));
            // data transfer
            for( magma_int_t i=0; i<A.num_rows*A.max_nnz_row; i++ ) {
                B->val[i] = A.val[i];
                B->col[i] = A.col[i];
            }
        }
        //ELLRT-type
        else if ( A.storage_type == Magma_ELLRT ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_CPU;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            B->blocksize = A.blocksize;
            B->alignment = A.alignment;
            //int threads_per_row = A.alignment;
            //int rowlength = magma_roundup( A.max_nnz_row, threads_per_row );
            magma_int_t rowlength = magma_roundup( A.max_nnz_row, A.alignment );
            // memory allocation
            CHECK( magma_dmalloc_cpu( &B->val, rowlength * A.num_rows ));
            CHECK( magma_index_malloc_cpu( &B->row, A.num_rows ));
            CHECK( magma_index_malloc_cpu( &B->col, rowlength * A.num_rows ));
            // data transfer
            for( magma_int_t i=0; i<A.num_rows*rowlength; i++ ) {
                B->val[i] = A.val[i];
                B->col[i] = A.col[i];
            }
            for( magma_int_t i=0; i<A.num_rows; i++ ) {
                B->row[i] = A.row[i];
            }
        }
        //SELLP-type
        else if (  A.storage_type == Magma_SELLP ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_CPU;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            B->blocksize = A.blocksize;
            B->alignment = A.alignment;
            B->numblocks = A.numblocks;
            // memory allocation
            CHECK( magma_dmalloc_cpu( &B->val, A.nnz ));
            CHECK( magma_index_malloc_cpu( &B->col, A.nnz ));
            CHECK( magma_index_malloc_cpu( &B->row, A.numblocks + 1 ));
            // data transfer
            for( magma_int_t i=0; i<A.nnz; i++ ) {
                B->val[i] = A.val[i];
                B->col[i] = A.col[i];
            }
            for( magma_int_t i=0; i<A.numblocks+1; i++ ) {
                B->row[i] = A.row[i];
            }
        }
        //BCSR-type
        else if ( A.storage_type == Magma_BCSR ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_CPU;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            B->blocksize = A.blocksize;
            B->numblocks = A.numblocks;
            B->alignment = A.alignment;
            magma_int_t size_b = A.blocksize;
            //magma_int_t c_blocks = ceil( (float)A.num_cols / (float)size_b );
                    // max number of blocks per row
            //magma_int_t r_blocks = ceil( (float)A.num_rows / (float)size_b );
            magma_int_t r_blocks = magma_ceildiv( A.num_rows, size_b );
                    // max number of blocks per column
            // memory allocation
            CHECK( magma_dmalloc_cpu( &B->val, size_b * size_b * A.numblocks ));
            CHECK( magma_index_malloc_cpu( &B->row, r_blocks + 1 ));
            CHECK( magma_index_malloc_cpu( &B->col, A.numblocks ));
            // data transfer
            //magma_dsetvector( size_b * size_b * A.numblocks, A.val, 1, B->dval, 1, queue );
            for( magma_int_t i=0; i<size_b*size_b*A.numblocks; i++ ) {
                B->dval[i] = A.val[i];
            }
            //magma_index_setvector( r_blocks + 1, A.row, 1, B->drow, 1, queue );
            for( magma_int_t i=0; i<r_blocks+1; i++ ) {
                B->drow[i] = A.row[i];
            }
            //magma_index_setvector( A.numblocks, A.col, 1, B->dcol, 1, queue );
            for( magma_int_t i=0; i<A.numblocks; i++ ) {
                B->dcol[i] = A.col[i];
            }
        }
        //DENSE-type
        else if ( A.storage_type == Magma_DENSE ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_CPU;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            B->major = A.major;
            B->ld = A.ld;
            // memory allocation
            CHECK( magma_dmalloc_cpu( &B->val, A.num_rows * A.num_cols ));
            // data transfer
            for( magma_int_t i=0; i<A.num_rows*A.num_cols; i++ ) {
                B->val[i] = A.val[i];
            }
        }
    }

    // third case: copy matrix from device to host
    else if ( src == Magma_DEV && dst == Magma_CPU ) {
        //CSR-type
        if ( A.storage_type == Magma_CSR || A.storage_type == Magma_CUCSR
                                        || A.storage_type == Magma_CSC
                                        || A.storage_type == Magma_CSRD
                                        || A.storage_type == Magma_CSRL
                                        || A.storage_type == Magma_CSRU ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_CPU;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            // memory allocation
            CHECK( magma_dmalloc_cpu( &B->val, A.nnz ));
            CHECK( magma_index_malloc_cpu( &B->row, A.num_rows + 1 ));
            CHECK( magma_index_malloc_cpu( &B->col, A.nnz ));
            // data transfer
            magma_dgetvector( A.nnz, A.dval, 1, B->val, 1, queue );
            magma_index_getvector( A.num_rows + 1, A.drow, 1, B->row, 1, queue );
            magma_index_getvector( A.nnz, A.dcol, 1, B->col, 1, queue );
        }
        //COO-type
        else if ( A.storage_type == Magma_COO ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_CPU;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            // memory allocation
            CHECK( magma_dmalloc_cpu( &B->val, A.nnz ));
            CHECK( magma_index_malloc_cpu( &B->col, A.nnz ));
            CHECK( magma_index_malloc_cpu( &B->rowidx, A.nnz ));
            // data transfer
            magma_dgetvector( A.nnz, A.dval, 1, B->val, 1, queue );
            magma_index_getvector( A.nnz, A.dcol, 1, B->col, 1, queue );
            magma_index_getvector( A.nnz, A.drowidx, 1, B->rowidx, 1, queue );
        }
        //CSRCOO-type
        else if ( A.storage_type == Magma_CSRCOO ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_CPU;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            // memory allocation
            CHECK( magma_dmalloc_cpu( &B->val, A.nnz ));
            CHECK( magma_index_malloc_cpu( &B->row, A.num_rows + 1 ));
            CHECK( magma_index_malloc_cpu( &B->col, A.nnz ));
            CHECK( magma_index_malloc_cpu( &B->rowidx, A.nnz ));
            // data transfer
            magma_dgetvector( A.nnz, A.dval, 1, B->val, 1, queue );
            magma_index_getvector( A.num_rows + 1, A.drow, 1, B->row, 1, queue );
            magma_index_getvector( A.nnz, A.dcol, 1, B->col, 1, queue );
            magma_index_getvector( A.nnz, A.drowidx, 1, B->rowidx, 1, queue );
        }
        //ELL/ELLPACKT-type
        else if ( A.storage_type == Magma_ELLPACKT || A.storage_type == Magma_ELL ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_CPU;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            // memory allocation
            CHECK( magma_dmalloc_cpu( &B->val, A.num_rows * A.max_nnz_row ));
            CHECK( magma_index_malloc_cpu( &B->col, A.num_rows * A.max_nnz_row ));
            // data transfer
            magma_dgetvector( A.num_rows * A.max_nnz_row, A.dval, 1, B->val, 1, queue );
            magma_index_getvector( A.num_rows * A.max_nnz_row, A.dcol, 1, B->col, 1, queue );
        }
        //ELLD-type
        else if (  A.storage_type == Magma_ELLD ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_CPU;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            // memory allocation
            CHECK( magma_dmalloc_cpu( &B->val, A.num_rows * A.max_nnz_row ));
            CHECK( magma_index_malloc_cpu( &B->col, A.num_rows * A.max_nnz_row ));
            // data transfer
            magma_dgetvector( A.num_rows * A.max_nnz_row, A.dval, 1, B->val, 1, queue );
            magma_index_getvector( A.num_rows * A.max_nnz_row, A.dcol, 1, B->col, 1, queue );
        }
        //ELLRT-type
        else if ( A.storage_type == Magma_ELLRT ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_CPU;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            B->blocksize = A.blocksize;
            B->alignment = A.alignment;
            //int threads_per_row = A.alignment;
            //int rowlength = magma_roundup( A.max_nnz_row, threads_per_row );
            magma_int_t rowlength = magma_roundup( A.max_nnz_row, A.alignment );
            // memory allocation
            CHECK( magma_dmalloc_cpu( &B->val, rowlength * A.num_rows ));
            CHECK( magma_index_malloc_cpu( &B->row, A.num_rows ));
            CHECK( magma_index_malloc_cpu( &B->col, rowlength * A.num_rows ));
            // data transfer
            magma_dgetvector( A.num_rows * rowlength, A.dval, 1, B->val, 1, queue );
            magma_index_getvector( A.num_rows * rowlength, A.dcol, 1, B->col, 1, queue );
            magma_index_getvector( A.num_rows, A.drow, 1, B->row, 1, queue );
        }
        //SELLP-type
        else if ( A.storage_type == Magma_SELLP ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_CPU;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            B->blocksize = A.blocksize;
            B->numblocks = A.numblocks;
            B->alignment = A.alignment;
            // memory allocation
            CHECK( magma_dmalloc_cpu( &B->val, A.nnz ));
            CHECK( magma_index_malloc_cpu( &B->col, A.nnz ));
            CHECK( magma_index_malloc_cpu( &B->row, A.numblocks + 1 ));
            // data transfer
            magma_dgetvector( A.nnz, A.dval, 1, B->val, 1, queue );
            magma_index_getvector( A.nnz, A.dcol, 1, B->col, 1, queue );
            magma_index_getvector( A.numblocks + 1, A.drow, 1, B->row, 1, queue );
        }
        //BCSR-type
        else if ( A.storage_type == Magma_BCSR ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_CPU;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            B->blocksize = A.blocksize;
            B->numblocks = A.numblocks;
            B->alignment = A.alignment;
            magma_int_t size_b = A.blocksize;
            //magma_int_t c_blocks = ceil( (float)A.num_cols / (float)size_b );
                    // max number of blocks per row
            //magma_int_t r_blocks = ceil( (float)A.num_rows / (float)size_b );
            magma_int_t r_blocks = magma_ceildiv( A.num_rows, size_b );
                    // max number of blocks per column
            // memory allocation
            CHECK( magma_dmalloc_cpu( &B->val, size_b * size_b * A.numblocks ));
            CHECK( magma_index_malloc_cpu( &B->row, r_blocks + 1 ));
            CHECK( magma_index_malloc_cpu( &B->col, A.numblocks ));
            // data transfer
            magma_dgetvector( size_b * size_b * A.numblocks, A.dval, 1, B->val, 1, queue );
            magma_index_getvector( r_blocks + 1, A.drow, 1, B->row, 1, queue );
            magma_index_getvector( A.numblocks, A.dcol, 1, B->col, 1, queue );
        }
        //DENSE-type
        else if ( A.storage_type == Magma_DENSE ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_CPU;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            B->major = A.major;
            B->ld = A.ld;
            // memory allocation
            CHECK( magma_dmalloc_cpu( &B->val, A.num_rows * A.num_cols ));
            // data transfer
            magma_dgetvector( A.num_rows * A.num_cols, A.dval, 1, B->val, 1, queue );
        }
    }

    // fourth case: copy matrix from device to device
    else if ( src == Magma_DEV && dst == Magma_DEV ) {
        //CSR-type
        if ( A.storage_type == Magma_CSR || A.storage_type == Magma_CUCSR
                                        || A.storage_type == Magma_CSC
                                        || A.storage_type == Magma_CSRD
                                        || A.storage_type == Magma_CSRL
                                        || A.storage_type == Magma_CSRU ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_DEV;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            // memory allocation
            CHECK( magma_dmalloc( &B->dval, A.nnz ));
            CHECK( magma_index_malloc( &B->drow, A.num_rows + 1 ));
            CHECK( magma_index_malloc( &B->dcol, A.nnz ));
            // data transfer
            magma_dcopyvector( A.nnz, A.dval, 1, B->dval, 1, queue );
            magma_index_copyvector( A.num_rows + 1, A.drow, 1, B->drow, 1, queue );
            magma_index_copyvector( A.nnz, A.dcol, 1, B->dcol, 1, queue );
        }
        //COO-type
        else if ( A.storage_type == Magma_COO ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_DEV;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            // memory allocation
            CHECK( magma_dmalloc( &B->dval, A.nnz ));
            CHECK( magma_index_malloc( &B->dcol, A.nnz ));
            CHECK( magma_index_malloc( &B->drowidx, A.nnz ));
            // data transfer
            magma_dcopyvector( A.nnz, A.dval, 1, B->dval, 1, queue );
            magma_index_copyvector( A.nnz, A.dcol, 1, B->dcol, 1, queue );
            magma_index_copyvector( A.nnz, A.drowidx, 1, B->drowidx, 1, queue );
        }
        //CSRCOO-type
        else if ( A.storage_type == Magma_CSRCOO ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_DEV;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            // memory allocation
            CHECK( magma_dmalloc( &B->dval, A.nnz ));
            CHECK( magma_index_malloc( &B->drow, A.num_rows + 1 ));
            CHECK( magma_index_malloc( &B->dcol, A.nnz ));
            CHECK( magma_index_malloc( &B->drowidx, A.nnz ));
            // data transfer
            magma_dcopyvector( A.nnz, A.dval, 1, B->dval, 1, queue );
            magma_index_copyvector( A.num_rows + 1, A.drow, 1, B->drow, 1, queue );
            magma_index_copyvector( A.nnz, A.dcol, 1, B->dcol, 1, queue );
            magma_index_copyvector( A.nnz, A.drowidx, 1, B->drowidx, 1, queue );
        }
        //ELL/ELLPACKT-type
        else if ( A.storage_type == Magma_ELLPACKT || A.storage_type == Magma_ELL ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_DEV;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            // memory allocation
            CHECK( magma_dmalloc( &B->dval, A.num_rows * A.max_nnz_row ));
            CHECK( magma_index_malloc( &B->dcol, A.num_rows * A.max_nnz_row ));
            // data transfer
            magma_dcopyvector( A.num_rows * A.max_nnz_row, A.dval, 1, B->dval, 1, queue );
            magma_index_copyvector( A.num_rows * A.max_nnz_row, A.dcol, 1, B->dcol, 1, queue );
        }
        //ELLD-type
        else if ( A.storage_type == Magma_ELLD ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_DEV;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            // memory allocation
            CHECK( magma_dmalloc( &B->dval, A.num_rows * A.max_nnz_row ));
            CHECK( magma_index_malloc( &B->dcol, A.num_rows * A.max_nnz_row ));
            // data transfer
            magma_dcopyvector( A.num_rows * A.max_nnz_row, A.dval, 1, B->dval, 1, queue );
            magma_index_copyvector( A.num_rows * A.max_nnz_row, A.dcol, 1, B->dcol, 1, queue );
        }
        //ELLRT-type
        else if ( A.storage_type == Magma_ELLRT ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_DEV;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            B->blocksize = A.blocksize;
            B->alignment = A.alignment;
            //int threads_per_row = A.alignment;
            //int rowlength = magma_roundup( A.max_nnz_row, threads_per_row );
            magma_int_t rowlength = magma_roundup( A.max_nnz_row, A.alignment );
            // memory allocation
            CHECK( magma_dmalloc( &B->dval, A.num_rows * rowlength ));
            CHECK( magma_index_malloc( &B->dcol, A.num_rows * rowlength ));
            CHECK( magma_index_malloc( &B->drow, A.num_rows ));
            // data transfer
            magma_dcopyvector( A.num_rows * rowlength, A.dval, 1, B->dval, 1, queue );
            magma_index_copyvector( A.num_rows * rowlength, A.dcol, 1, B->dcol, 1, queue );
            magma_index_copyvector( A.num_rows, A.drow, 1, B->drow, 1, queue );
        }
        //SELLP-type
        else if ( A.storage_type == Magma_SELLP ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_DEV;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            B->blocksize = A.blocksize;
            B->numblocks = A.numblocks;
            B->alignment = A.alignment;
            // memory allocation
            CHECK( magma_dmalloc( &B->dval, A.nnz ));
            CHECK( magma_index_malloc( &B->dcol, A.nnz ));
            CHECK( magma_index_malloc( &B->drow, A.numblocks + 1 ));
            // data transfer
            magma_dcopyvector( A.nnz, A.dval, 1, B->dval, 1, queue );
            magma_index_copyvector( A.nnz, A.dcol, 1, B->dcol, 1, queue );
            magma_index_copyvector( A.numblocks + 1, A.drow, 1, B->drow, 1, queue );
        }
        //BCSR-type
        else if ( A.storage_type == Magma_BCSR ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_DEV;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            B->blocksize = A.blocksize;
            B->numblocks = A.numblocks;
            B->alignment = A.alignment;
            magma_int_t size_b = A.blocksize;
            //magma_int_t c_blocks = ceil( (float)A.num_cols / (float)size_b );
                    // max number of blocks per row
            //magma_int_t r_blocks = ceil( (float)A.num_rows / (float)size_b );
            magma_int_t r_blocks = magma_ceildiv( A.num_rows, size_b );
                    // max number of blocks per column
            // memory allocation
            CHECK( magma_dmalloc( &B->dval, size_b * size_b * A.numblocks ));
            CHECK( magma_index_malloc( &B->drow, r_blocks + 1 ));
            CHECK( magma_index_malloc( &B->dcol, A.numblocks ));
            // data transfer
            magma_dcopyvector( size_b * size_b * A.numblocks, A.dval, 1, B->dval, 1, queue );
            magma_index_copyvector( r_blocks + 1, A.drow, 1, B->drow, 1, queue );
            magma_index_copyvector( A.numblocks, A.dcol, 1, B->dcol, 1, queue );
        }
        //DENSE-type
        else if ( A.storage_type == Magma_DENSE ) {
            // fill in information for B
            B->storage_type = A.storage_type;
            B->memory_location = Magma_DEV;
            B->sym = A.sym;
            B->diagorder_type = A.diagorder_type;
            B->fill_mode = A.fill_mode;
            B->num_rows = A.num_rows;
            B->num_cols = A.num_cols;
            B->nnz = A.nnz; B->true_nnz = A.true_nnz;
            B->max_nnz_row = A.max_nnz_row;
            B->diameter = A.diameter;
            B->major = A.major;
            B->ld = A.ld;
            // memory allocation
            CHECK( magma_dmalloc( &B->dval, A.num_rows * A.num_cols ));
            // data transfer
            magma_dcopyvector( A.num_rows * A.num_cols, A.dval, 1, B->dval, 1, queue );
        }
    }
    
    
cleanup:
    if( info != 0 ){
        magma_dmfree( B, queue );
    }
    return info;
}
Exemplo n.º 25
0
/**
    Purpose
    -------
    SSYEVD_GPU computes all eigenvalues and, optionally, eigenvectors of
    a real symmetric matrix A.  If eigenvectors are desired, it uses a
    divide and conquer algorithm.

    The divide and conquer algorithm makes very mild assumptions about
    floating point arithmetic. It will work on machines with a guard
    digit in add/subtract, or on those binary machines without guard
    digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
    Cray-2. It could conceivably fail on hexadecimal or decimal machines
    without guard digits, but we know of none.

    Arguments
    ---------
    @param[in]
    jobz    magma_vec_t
      -     = MagmaNoVec:  Compute eigenvalues only;
      -     = MagmaVec:    Compute eigenvalues and eigenvectors.

    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of A is stored;
      -     = MagmaLower:  Lower triangle of A is stored.

    @param[in]
    n       INTEGER
            The order of the matrix A.  N >= 0.

    @param[in,out]
    dA      REAL array on the GPU,
            dimension (LDDA, N).
            On entry, the symmetric matrix A.  If UPLO = MagmaUpper, the
            leading N-by-N upper triangular part of A contains the
            upper triangular part of the matrix A.  If UPLO = MagmaLower,
            the leading N-by-N lower triangular part of A contains
            the lower triangular part of the matrix A.
            On exit, if JOBZ = MagmaVec, then if INFO = 0, A contains the
            orthonormal eigenvectors of the matrix A.
            If JOBZ = MagmaNoVec, then on exit the lower triangle (if UPLO=MagmaLower)
            or the upper triangle (if UPLO=MagmaUpper) of A, including the
            diagonal, is destroyed.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array DA.  LDDA >= max(1,N).

    @param[out]
    w       REAL array, dimension (N)
            If INFO = 0, the eigenvalues in ascending order.

    @param
    wA      (workspace) REAL array, dimension (LDWA, N)

    @param[in]
    ldwa    INTEGER
            The leading dimension of the array wA.  LDWA >= max(1,N).

    @param[out]
    work    (workspace) REAL array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The length of the array WORK.
            If N <= 1,                      LWORK >= 1.
            If JOBZ = MagmaNoVec and N > 1, LWORK >= 2*N + N*NB.
            If JOBZ = MagmaVec   and N > 1, LWORK >= max( 2*N + N*NB, 1 + 6*N + 2*N**2 ).
            NB can be obtained through magma_get_ssytrd_nb(N).
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal sizes of the WORK and IWORK
            arrays, returns these values as the first entries of the WORK
            and IWORK arrays, and no error message related to LWORK or
            LIWORK is issued by XERBLA.

    @param[out]
    iwork   (workspace) INTEGER array, dimension (MAX(1,LIWORK))
            On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK.

    @param[in]
    liwork  INTEGER
            The dimension of the array IWORK.
            If N <= 1,                       LIWORK >= 1.
            If JOBZ = MagmaNoVec and N > 1, LIWORK >= 1.
            If JOBZ  = MagmaVec   and N > 1, LIWORK >= 3 + 5*N.
    \n
            If LIWORK = -1, then a workspace query is assumed; the
            routine only calculates the optimal sizes of the WORK and
            IWORK arrays, returns these values as the first entries of
            the WORK and IWORK arrays, and no error message related to
            LWORK or LIWORK is issued by XERBLA.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
      -     > 0:  if INFO = i and JOBZ = MagmaNoVec, then the algorithm failed
                  to converge; i off-diagonal elements of an intermediate
                  tridiagonal form did not converge to zero;
                  if INFO = i and JOBZ = MagmaVec, then the algorithm failed
                  to compute an eigenvalue while working on the submatrix
                  lying in rows and columns INFO/(N+1) through
                  mod(INFO,N+1).

    Further Details
    ---------------
    Based on contributions by
       Jeff Rutter, Computer Science Division, University of California
       at Berkeley, USA

    Modified description of INFO. Sven, 16 Feb 05.

    @ingroup magma_ssyev_driver
    ********************************************************************/
extern "C" magma_int_t
magma_ssyevd_gpu(
    magma_vec_t jobz, magma_uplo_t uplo,
    magma_int_t n,
    magmaFloat_ptr dA, magma_int_t ldda,
    float *w,
    float *wA,  magma_int_t ldwa,
    float *work, magma_int_t lwork,
    #ifdef COMPLEX
    float *rwork, magma_int_t lrwork,
    #endif
    magma_int_t *iwork, magma_int_t liwork,
    magma_int_t *info)
{
    magma_int_t ione = 1;

    float d__1;

    float eps;
    magma_int_t inde;
    float anrm;
    float rmin, rmax;
    float sigma;
    magma_int_t iinfo, lwmin;
    magma_int_t lower;
    magma_int_t wantz;
    magma_int_t indwk2, llwrk2;
    magma_int_t iscale;
    float safmin;
    float bignum;
    magma_int_t indtau;
    magma_int_t indwrk, liwmin;
    magma_int_t llwork;
    float smlnum;
    magma_int_t lquery;

    magmaFloat_ptr dwork;
    magma_int_t lddc = ldda;

    wantz = (jobz == MagmaVec);
    lower = (uplo == MagmaLower);
    lquery = (lwork == -1 || liwork == -1);

    *info = 0;
    if (! (wantz || (jobz == MagmaNoVec))) {
        *info = -1;
    } else if (! (lower || (uplo == MagmaUpper))) {
        *info = -2;
    } else if (n < 0) {
        *info = -3;
    } else if (ldda < max(1,n)) {
        *info = -5;
    }

    magma_int_t nb = magma_get_ssytrd_nb( n );
    if ( n <= 1 ) {
        lwmin  = 1;
        liwmin = 1;
    }
    else if ( wantz ) {
        lwmin  = max( 2*n + n*nb, 1 + 6*n + 2*n*n );
        liwmin = 3 + 5*n;
    }
    else {
        lwmin  = 2*n + n*nb;
        liwmin = 1;
    }
    
    work[0]  = magma_smake_lwork( lwmin );
    iwork[0] = liwmin;

    if ((lwork < lwmin) && !lquery) {
        *info = -10;
    } else if ((liwork < liwmin) && ! lquery) {
        *info = -12;
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    magma_queue_t queue;
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queue );

    /* If matrix is very small, then just call LAPACK on CPU, no need for GPU */
    if (n <= 128) {
        magma_int_t lda = n;
        float *A;
        magma_smalloc_cpu( &A, lda*n );
        magma_sgetmatrix( n, n, dA, ldda, A, lda, queue );
        lapackf77_ssyevd( lapack_vec_const(jobz), lapack_uplo_const(uplo),
                          &n, A, &lda,
                          w, work, &lwork,
                          iwork, &liwork, info );
        magma_ssetmatrix( n, n, A, lda, dA, ldda, queue );
        magma_free_cpu( A );
        magma_queue_destroy( queue );
        return *info;
    }

    // ssytrd2_gpu requires ldda*ceildiv(n,64) + 2*ldda*nb
    // sormtr_gpu  requires lddc*n
    // slansy      requires n
    magma_int_t ldwork = max( ldda*magma_ceildiv(n,64) + 2*ldda*nb, lddc*n );
    ldwork = max( ldwork, n );
    if ( wantz ) {
        // sstedx requires 3n^2/2
        ldwork = max( ldwork, 3*n*(n/2 + 1) );
    }
    if (MAGMA_SUCCESS != magma_smalloc( &dwork, ldwork )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    /* Get machine constants. */
    safmin = lapackf77_slamch("Safe minimum");
    eps    = lapackf77_slamch("Precision");
    smlnum = safmin / eps;
    bignum = 1. / smlnum;
    rmin = magma_ssqrt( smlnum );
    rmax = magma_ssqrt( bignum );

    /* Scale matrix to allowable range, if necessary. */
    anrm = magmablas_slansy( MagmaMaxNorm, uplo, n, dA, ldda, dwork, ldwork, queue );
    iscale = 0;
    sigma  = 1;
    if (anrm > 0. && anrm < rmin) {
        iscale = 1;
        sigma = rmin / anrm;
    } else if (anrm > rmax) {
        iscale = 1;
        sigma = rmax / anrm;
    }
    if (iscale == 1) {
        magmablas_slascl( uplo, 0, 0, 1., sigma, n, n, dA, ldda, queue, info );
    }

    /* Call SSYTRD to reduce symmetric matrix to tridiagonal form. */
    // ssytrd work: e (n) + tau (n) + llwork (n*nb)  ==>  2n + n*nb
    // sstedx work: e (n) + tau (n) + z (n*n) + llwrk2 (1 + 4*n + n^2)  ==>  1 + 6n + 2n^2
    inde   = 0;
    indtau = inde   + n;
    indwrk = indtau + n;
    indwk2 = indwrk + n*n;
    llwork = lwork - indwrk;
    llwrk2 = lwork - indwk2;

    magma_timer_t time=0;
    timer_start( time );

#ifdef FAST_SYMV
    magma_ssytrd2_gpu( uplo, n, dA, ldda, w, &work[inde],
                       &work[indtau], wA, ldwa, &work[indwrk], llwork,
                       dwork, ldwork, &iinfo );
#else
    magma_ssytrd_gpu(  uplo, n, dA, ldda, w, &work[inde],
                       &work[indtau], wA, ldwa, &work[indwrk], llwork,
                       &iinfo );
#endif

    timer_stop( time );
    #ifdef FAST_SYMV
    timer_printf( "time ssytrd2 = %6.2f\n", time );
    #else
    timer_printf( "time ssytrd = %6.2f\n", time );
    #endif

    /* For eigenvalues only, call SSTERF.  For eigenvectors, first call
       SSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the
       tridiagonal matrix, then call SORMTR to multiply it to the Householder
       transformations represented as Householder vectors in A. */
    if (! wantz) {
        lapackf77_ssterf( &n, w, &work[inde], info );
    }
    else {
        timer_start( time );

        magma_sstedx( MagmaRangeAll, n, 0., 0., 0, 0, w, &work[inde],
                      &work[indwrk], n, &work[indwk2],
                      llwrk2, iwork, liwork, dwork, info );

        timer_stop( time );
        timer_printf( "time sstedx = %6.2f\n", time );
        timer_start( time );

        magma_ssetmatrix( n, n, &work[indwrk], n, dwork, lddc, queue );

        magma_sormtr_gpu( MagmaLeft, uplo, MagmaNoTrans, n, n, dA, ldda, &work[indtau],
                          dwork, lddc, wA, ldwa, &iinfo );

        magma_scopymatrix( n, n, dwork, lddc, dA, ldda, queue );

        timer_stop( time );
        timer_printf( "time sormtr + copy = %6.2f\n", time );
    }

    /* If matrix was scaled, then rescale eigenvalues appropriately. */
    if (iscale == 1) {
        d__1 = 1. / sigma;
        blasf77_sscal( &n, &d__1, w, &ione );
    }

    work[0]  = magma_smake_lwork( lwmin );
    iwork[0] = liwmin;

    magma_queue_destroy( queue );
    magma_free( dwork );

    return *info;
} /* magma_ssyevd_gpu */
Exemplo n.º 26
0
/**
    Purpose
    -------
    CHETRD reduces a complex Hermitian matrix A to real symmetric
    tridiagonal form T by an orthogonal similarity transformation:
    Q**H * A * Q = T.

    Arguments
    ---------
    @param[in]
    ngpu    INTEGER
            Number of GPUs to use. ngpu > 0.

    @param[in]
    nqueue  INTEGER
            The number of GPU queues used for update.  10 >= nqueue > 0.

    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of A is stored;
      -     = MagmaLower:  Lower triangle of A is stored.

    @param[in]
    n       INTEGER
            The order of the matrix A.  N >= 0.

    @param[in,out]
    A       COMPLEX array, dimension (LDA,N)
            On entry, the Hermitian matrix A.  If UPLO = MagmaUpper, the leading
            N-by-N upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = MagmaLower, the
            leading N-by-N lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.
            On exit, if UPLO = MagmaUpper, the diagonal and first superdiagonal
            of A are overwritten by the corresponding elements of the
            tridiagonal matrix T, and the elements above the first
            superdiagonal, with the array TAU, represent the orthogonal
            matrix Q as a product of elementary reflectors; if UPLO
            = MagmaLower, the diagonal and first subdiagonal of A are over-
            written by the corresponding elements of the tridiagonal
            matrix T, and the elements below the first subdiagonal, with
            the array TAU, represent the orthogonal matrix Q as a product
            of elementary reflectors. See Further Details.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[out]
    d       COMPLEX array, dimension (N)
            The diagonal elements of the tridiagonal matrix T:
            D(i) = A(i,i).
 
    @param[out]
    e       COMPLEX array, dimension (N-1)
            The off-diagonal elements of the tridiagonal matrix T:
            E(i) = A(i,i+1) if UPLO = MagmaUpper, E(i) = A(i+1,i) if UPLO = MagmaLower.

    @param[out]
    tau     COMPLEX array, dimension (N-1)
            The scalar factors of the elementary reflectors (see Further
            Details).

    @param[out]
    work    (workspace) COMPLEX array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK.  LWORK >= N*NB, where NB is the
            optimal blocksize given by magma_get_chetrd_nb().
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value

    Further Details
    ---------------
    If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary
    reflectors

        Q = H(n-1) . . . H(2) H(1).

    Each H(i) has the form

        H(i) = I - tau * v * v'

    where tau is a complex scalar, and v is a complex vector with
    v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in
    A(1:i-1,i+1), and tau in TAU(i).

    If UPLO = MagmaLower, the matrix Q is represented as a product of elementary
    reflectors

        Q = H(1) H(2) . . . H(n-1).

    Each H(i) has the form

        H(i) = I - tau * v * v'

    where tau is a complex scalar, and v is a complex vector with
    v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i),
    and tau in TAU(i).

    The contents of A on exit are illustrated by the following examples
    with n = 5:

    if UPLO = MagmaUpper:                if UPLO = MagmaLower:

        (  d   e   v2  v3  v4 )              (  d                  )
        (      d   e   v3  v4 )              (  e   d              )
        (          d   e   v4 )              (  v1  e   d          )
        (              d   e  )              (  v1  v2  e   d      )
        (                  d  )              (  v1  v2  v3  e   d  )

    where d and e denote diagonal and off-diagonal elements of T, and vi
    denotes an element of the vector defining H(i).

    @ingroup magma_cheev_comp
    ********************************************************************/
extern "C" magma_int_t
magma_chetrd_mgpu(
    magma_int_t ngpu,
    magma_int_t nqueue, magma_uplo_t uplo, magma_int_t n,
    magmaFloatComplex *A, magma_int_t lda,
    float *d, float *e, magmaFloatComplex *tau,
    magmaFloatComplex *work, magma_int_t lwork,
    magma_int_t *info)
{
#define  A(i, j)     (A           + (j)*lda  + (i))
#define dA(id, i, j) (dA[(id)]    + (j)*ldda + (i))
#define dW(id, i, j) (dW[(id)] + (j)*ldda + (i))

    /* Constants */
    const magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE;
    const magmaFloatComplex c_one     = MAGMA_C_ONE;
    const float             d_one     = MAGMA_D_ONE;
    
    /* Local variables */
    const char* uplo_ = lapack_uplo_const( uplo );
    
    magma_int_t nlocal, ldda;
    magma_int_t nb = magma_get_chetrd_nb(n), ib, ib2;

    #ifdef PROFILE_SY2RK
    float mv_time = 0.0;
    float up_time = 0.0;
    #endif

    magma_int_t kk, nx;
    magma_int_t i, ii, iii, j, dev, i_n;
    magma_int_t iinfo;
    magma_int_t ldwork, lddw, lwkopt, ldwork2, lhwork;
    
    // set pointers to NULL so it is safe to goto CLEANUP if any malloc fails.
    magma_queue_t queues[MagmaMaxGPUs][10] = { { NULL, NULL } };
    magma_queue_t queues0[MagmaMaxGPUs]    = { NULL };
    magmaFloatComplex *hwork = NULL;
    magmaFloatComplex_ptr dwork2[MagmaMaxGPUs] = { NULL };
    magmaFloatComplex_ptr dA[MagmaMaxGPUs]     = { NULL };
    magmaFloatComplex_ptr dW[MagmaMaxGPUs]     = { NULL };

    *info = 0;
    bool upper = (uplo == MagmaUpper);
    bool lquery = (lwork == -1);
    if (! upper && uplo != MagmaLower) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,n)) {
        *info = -4;
    } else if (lwork < nb*n && ! lquery) {
        *info = -9;
    } else if ( nqueue > 2 ) {
        *info = 2;  // TODO fix
    }

    /* Determine the block size. */
    ldwork = n;
    lwkopt = n * nb;
    if (*info == 0) {
        work[0] = magma_cmake_lwork( lwkopt );
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (n == 0) {
        work[0] = c_one;
        return *info;
    }

    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );

    //#define PROFILE_SY2RK
    #ifdef PROFILE_SY2RK
    float times[11] = { 0 };
    magma_event_t start, stop;
    float etime;
    magma_setdevice( 0 );
    magma_event_create( &start );
    magma_event_create( &stop  );
    #endif

    ldda = magma_roundup( lda, 32 );
    lddw = ldda;
    nlocal = nb*(1 + n/(nb*ngpu));
    ldwork2 = ldda*( magma_ceildiv( n, nb ) + 1);  // i.e., ldda*(blocks + 1)
    for( dev=0; dev < ngpu; dev++ ) {
        magma_setdevice( dev );
        // TODO fix memory leak
        if ( MAGMA_SUCCESS != magma_cmalloc( &dA[dev],     nlocal*ldda + 3*lddw*nb ) ||
             MAGMA_SUCCESS != magma_cmalloc( &dwork2[dev], ldwork2 ) ) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            goto CLEANUP;
        }
        dW[dev] = dA[dev] + nlocal*ldda;
        
        for( kk=0; kk < nqueue; kk++ ) {
            magma_device_t cdev;
            magma_getdevice( &cdev );
            magma_queue_create( cdev, &queues[dev][kk] );
        }
        queues0[dev] = queues[dev][0];
    }
    
    lhwork = nqueue*ngpu*n;
    if ( MAGMA_SUCCESS != magma_cmalloc_pinned( &hwork, lhwork ) ) {
        *info = MAGMA_ERR_HOST_ALLOC;
        goto CLEANUP;
    }

    // nx <= n is required
    // use LAPACK for n < 3000, otherwise switch at 512
    if (n < 3000)
        nx = n;
    else
        nx = 512;

    if (upper) {
        /* Copy the matrix to the GPU */
        if (1 <= n-nx) {
            magma_chtodhe( ngpu, uplo, n, nb, A, lda, dA, ldda, queues, &iinfo );
        }

        /*  Reduce the upper triangle of A.
            Columns 1:kk are handled by the unblocked method. */
        for (i = nb*((n-1)/nb); i >= nx; i -= nb) {
            ib = min(nb, n-i);

            ii  = nb*(i/(nb*ngpu));
            dev = (i/nb)%ngpu;

            /* wait for the next panel */
            if (i != nb*((n-1)/nb)) {
                magma_setdevice( dev );
                magma_queue_sync( queues[dev][0] );
            }

            magma_clatrd_mgpu( ngpu, uplo, i+ib, ib, nb,
                               A(0, 0), lda, e, tau,
                               work, ldwork,
                               dA, ldda, 0,
                               dW, i+ib,
                               hwork,  lhwork,
                               dwork2, ldwork2,
                               queues0 );

            magma_cher2k_mgpu( ngpu, MagmaUpper, MagmaNoTrans, nb, i, ib,
                               c_neg_one, dW, i+ib, 0,
                               d_one,     dA, ldda, 0,
                               nqueue, queues );

            /* get the next panel */
            if (i-nb >= nx ) {
                ib2 = min(nb, n-(i-nb));
                
                ii  = nb*((i-nb)/(nb*ngpu));
                dev = ((i-nb)/nb)%ngpu;
                magma_setdevice( dev );
                
                magma_cgetmatrix_async( (i-nb)+ib2, ib2,
                                        dA(dev, 0, ii), ldda,
                                        A(0, i-nb),     lda,
                                        queues[dev][0] );
            }

            /* Copy superdiagonal elements back into A, and diagonal
               elements into D */
            for (j = i; j < i+ib; ++j) {
                if ( j > 0 ) {
                    *A(j-1,j) = MAGMA_C_MAKE( e[j - 1], 0 );
                }
                d[j] = MAGMA_C_REAL( *A(j, j) );
            }
        } /* end of for i=... */
      
        if ( nx > 0 ) {
            if (1 <= n-nx) { /* else A is already on CPU */
                for (i=0; i < nx; i += nb) {
                    ib = min(nb, n-i);
                    ii  = nb*(i/(nb*ngpu));
                    dev = (i/nb)%ngpu;
                
                    magma_setdevice( dev );
                    magma_cgetmatrix_async( nx, ib,
                                            dA(dev, 0, ii), ldda,
                                            A(0, i),        lda,
                                            queues[dev][0] );
                }
            }
            
            for( dev=0; dev < ngpu; dev++ ) {
                magma_setdevice( dev );
                magma_queue_sync( queues[dev][0] );
            }
            /* Use CPU code to reduce the last or only block */
            lapackf77_chetrd( uplo_, &nx, A(0, 0), &lda, d, e, tau,
                              work, &lwork, &iinfo );
        }
    }
    else {
        trace_init( 1, ngpu, nqueue, queues );
        /* Copy the matrix to the GPU */
        if (1 <= n-nx) {
            magma_chtodhe( ngpu, uplo, n, nb, A, lda, dA, ldda, queues, &iinfo );
        }

        /* Reduce the lower triangle of A */
        for (i = 0; i < n-nx; i += nb) {
            ib = min(nb, n-i);

            ii  = nb*(i/(nb*ngpu));
            dev = (i/nb)%ngpu;
            /* Reduce columns i:i+ib-1 to tridiagonal form and form the
               matrix W which is needed to update the unreduced part of
               the matrix */

            /*   Get the current panel (no need for the 1st iteration) */
            if (i != 0) {
                magma_setdevice( dev );
                trace_gpu_start( dev, 0, "comm", "get" );
                magma_cgetmatrix_async( n-i, ib,
                                        dA(dev, i, ii), ldda,
                                        A(i,i),         lda,
                                        queues[dev][0] );
                trace_gpu_end( dev, 0 );
                magma_queue_sync( queues[dev][0] );
                magma_setdevice( 0 );
            }
            
            magma_clatrd_mgpu( ngpu, uplo, n-i, ib, nb,
                               A(i, i), lda, &e[i], &tau[i],
                               work, ldwork,
                               dA, ldda, i,
                               dW, n-i,
                               hwork,  lhwork,
                               dwork2, ldwork2,
                               queues0 );

            #ifdef PROFILE_SY2RK
            magma_setdevice( 0 );
            if ( i > 0 ) {
                cudaEventElapsedTime( &etime, start, stop );
                up_time += (etime/1000.0);
            }
            magma_event_record( start, 0 );
            #endif
            
            magma_cher2k_mgpu( ngpu, MagmaLower, MagmaNoTrans, nb, n-i-ib, ib,
                               c_neg_one, dW, n-i, ib,
                               d_one, dA, ldda, i+ib, nqueue, queues );
            
            #ifdef PROFILE_SY2RK
            magma_setdevice( 0 );
            magma_event_record( stop, 0 );
            #endif

            /* Copy subdiagonal elements back into A, and diagonal
               elements into D */
            for (j = i; j < i+ib; ++j) {
                if ( j+1 < n ) {
                    *A(j+1,j) = MAGMA_C_MAKE( e[j], 0 );
                }
                d[j] = MAGMA_C_REAL( *A(j, j) );
            }
        } /* for i=... */

        /* Use CPU code to reduce the last or only block */
        if ( i < n ) {
            iii = i;
            i_n = n-i;
            if ( i > 0 ) {
                for (; i < n; i += nb) {
                    ib = min(nb, n-i);
                    ii  = nb*(i/(nb*ngpu));
                    dev = (i/nb)%ngpu;
                
                    magma_setdevice( dev );
                    magma_cgetmatrix_async( i_n, ib,
                                            dA(dev, iii, ii), ldda,
                                            A(iii, i),        lda,
                                            queues[dev][0] );
                }
                for( dev=0; dev < ngpu; dev++ ) {
                    magma_setdevice( dev );
                    magma_queue_sync( queues[dev][0] );
                }
            }
            lapackf77_chetrd( uplo_, &i_n, A(iii, iii), &lda, &d[iii], &e[iii],
                              &tau[iii], work, &lwork, &iinfo );
        }
    }
    
    for( dev=0; dev < ngpu; dev++ ) {
        magma_setdevice( dev );
        for( kk=0; kk < nqueue; kk++ ) {
            magma_queue_sync( queues[dev][kk] );
        }
    }
    
    #ifdef PROFILE_SY2RK
    magma_setdevice( 0 );
    if ( n > nx ) {
        cudaEventElapsedTime( &etime, start, stop );
        up_time += (etime/1000.0);
    }
    magma_event_destroy( start );
    magma_event_destroy( stop  );
    #endif

    trace_finalize( "chetrd.svg", "trace.css" );
    
    #ifdef PROFILE_SY2RK
    printf( " n=%d nb=%d\n", n, nb );
    printf( " Time in CLARFG: %.2e seconds\n", times[0] );
    //printf( " Time in CHEMV : %.2e seconds\n", mv_time );
    printf( " Time in CHER2K: %.2e seconds\n", up_time );
    #endif
    
CLEANUP:
    for( dev=0; dev < ngpu; dev++ ) {
        magma_setdevice( dev );
        for( kk=0; kk < nqueue; kk++ ) {
            magma_queue_destroy( queues[dev][kk] );
        }
        magma_free( dA[dev] );
        magma_free( dwork2[dev] );
    }
    magma_free_pinned( hwork );
    
    magma_setdevice( orig_dev );
    
    work[0] = magma_cmake_lwork( lwkopt );
    
    return *info;
} /* magma_chetrd */
Exemplo n.º 27
0
extern "C" magma_int_t
magma_zbulge_applyQ_v2_m(
    magma_int_t ngpu,
    magma_side_t side,
    magma_int_t NE, magma_int_t N,
    magma_int_t NB, magma_int_t Vblksiz,
    magmaDoubleComplex *E, magma_int_t lde,
    magmaDoubleComplex *V, magma_int_t ldv,
    magmaDoubleComplex *T, magma_int_t ldt,
    magma_int_t *info)
{
    //%===========================
    //%   local variables
    //%===========================
    magma_int_t Vm, Vn, mt, nt;
    magma_int_t myrow, mycol, blkj, blki;
    magma_int_t blkid,vpos,tpos;
    magma_int_t firstrow, nbcolinvolvd;
    magma_int_t versionL  = 113;
    magma_int_t versionR  = 92;
    magma_int_t Vchunksiz = 10;
    *info=0;

    /* Quick return */
    if ( NE == 0 ) {
        return *info;
    }
    if ( N == 0 ) {
        return *info;
    }
    if ( NB == 0 ) {
        return *info;
    }
    /* ==========================================
     * some infos for developer
     * Initialisation and checking nb of cores
     * ==========================================*/
    /* we have 2 algo for left (113 114) and 2 algo for right (91 92)
     * which correspond to versionL versionR.
     * They are very similar (detail explained in tech report and matlab code)
     * however version 114 and 92 improve locality.
     * while version 113 is used in case WNATZ=1 (construct Q2) which allow
     * the construction to be done in an optimized way taking into
     * consideration that the matrix is Identity so making less flops.
     *
    */

    // Initialize streaming and events
    magma_device_sync();
    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );
    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );

    magma_int_t  nbevents =2, nstream=2;
    magma_queue_t streams[MagmaMaxGPUs][20];
    magma_event_t  myevent[MagmaMaxGPUs][20];
    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_setdevice( dev );
        for( magma_int_t i = 0; i < nstream; ++i ) {
            magma_queue_create( &streams[dev][i] );
        }
        for( magma_int_t i = 0; i < nbevents; ++i ) {
            cudaEventCreateWithFlags(&myevent[dev][i],cudaEventDisableTiming);
        }
    }



    // Azzam 21/11/2012
    // NOTE THAT dwork was of size 2*NE*Vblksiz+...
    // but I am thinking why not modifing it to NE*Vblksiz+...
    // BUT NO because the 2* is used because of making 2 streams working and so
    // they might be using dwork in parallel
    magmaDoubleComplex *dE[MagmaMaxGPUs];
    magmaDoubleComplex *dwork[MagmaMaxGPUs], *dwork0[MagmaMaxGPUs], *dwork1[MagmaMaxGPUs];
    //magmaDoubleComplex *dwvt[MagmaMaxGPUs];
    magmaDoubleComplex *dwvt0[MagmaMaxGPUs], *dwvt1[MagmaMaxGPUs];
    magmaDoubleComplex *dT0[MagmaMaxGPUs], *dV0[MagmaMaxGPUs], *dT1[MagmaMaxGPUs], *dV1[MagmaMaxGPUs];
    magma_int_t dev;
    magma_int_t ldde = N;
    magma_int_t lddv = ldv;
    magma_int_t lddt = ldt;
    magma_int_t ne_loc = magma_ceildiv(NE, ngpu);
    if (ne_loc < 256)
       ne_loc=256;
    magma_int_t dwVTsiz  = lddv*Vblksiz;  // lddv*lddv + lddv*NE; // lddv*Vblksiz;
    magma_int_t dworksiz = ne_loc*Vblksiz;  // lddv*Vblksiz;      // NE*Vblksiz;

    ngpu = min(ngpu, magma_ceildiv(NE,ne_loc)); // Don't use GPU that will not have data.
    // copy dE to GPUs
    for (dev=0; dev < ngpu; ++dev) {
        magma_setdevice( dev );
        if (MAGMA_SUCCESS != magma_zmalloc( &dE[dev], ldde * ne_loc)) {
            printf ("!!!!  magma_zbulge_applyQ magma_alloc failed for: dE\n" );
            exit(-1);
        }
        if (MAGMA_SUCCESS != magma_zmalloc( &dwork[dev], 2*dworksiz + 2*dwVTsiz + 2*Vchunksiz* (Vblksiz* (lddv+lddt)) )) {
            printf ("!!!!  magma_zbulge_applyQ magma_alloc failed for: dwork\n" );
            exit(-1);
        }

        dwork0[dev] = dwork[dev];               // size = dworksiz;
        dwork1[dev] = dwork0[dev] + dworksiz;   // size = dworksiz;
        dwvt0[dev]  = dwork[dev] + 2*dworksiz;  // size = dwVTsiz;
        dwvt1[dev]  = dwvt0[dev] + dwVTsiz;     // size = dwVTsiz;
        dV0[dev]    = dwork[dev] + 2*dworksiz + 2*dwVTsiz;
        dT0[dev]    = dV0[dev]   + Vchunksiz*Vblksiz*lddv;
        dV1[dev]    = dT0[dev]   + Vchunksiz*Vblksiz*lddt;
        dT1[dev]    = dV1[dev]   + Vchunksiz*Vblksiz*lddv;

        magma_int_t ie_loc = min(ne_loc, NE - ne_loc*dev);
        magma_zsetmatrix_async( N, ie_loc, E+lde*ne_loc*dev, lde, dE(dev, 0, 0), ldde, streams[dev][1] );
    }


    // make overlapped copy
    magma_int_t ncpy = 0;
    magma_int_t copyed=0, copyst=0;
    magma_int_t blkcnt,nothing, mysiz, flip, vld,tld, locpos;
    findVTsiz(N, NB, Vblksiz, &blkcnt, &nothing);
    
    flip = 0;


    /* SIDE LEFT  meaning apply E = Q*E = (q_1*q_2*.....*q_n) * E ==> so traverse Vs in reverse order (forward) from q_n to q_1
     *            Also E is splitten by row meaning each apply consist in a block of row (horizontal block) */
    /* SIDE RIGHT meaning apply E = E*Q = E * (q_1*q_2*.....*q_n) ==> so tarverse Vs in normal  order (forward) from q_1 to q_n
     *            Also E is splitten by col meaning each apply consist in a block of col (vertical block) */

    #ifdef ENABLE_DEBUG
    printf("  APPLY Q_v22_m GPU with NGPU %d  N %d, NE %d,  NB %d, Vblksiz %d, versionL %d versionR %d  SIDE %c \n",
          ngpu, N, NE, NB, Vblksiz, versionL, versionR, side);
    #endif


    /*
     * MagmamaLeft
     */
    if (side == MagmaLeft) {
        /*
         * Version 113:
         * loop over the block_col (nt) and for each find the
         * number of tiles (mt) in this block_col. then loop over mt, find
         * the size of the V's(Vm,Vn) and apply it to the corresponding
         * portion of E.
         */
        if ( versionL == 113 ) {
            nt  = magma_ceildiv((N-1),Vblksiz);
            for (blkj=nt-1; blkj >= 0; blkj--) {
                /* the index of the first row on the top of block (blkj) */
                firstrow = blkj * Vblksiz + 1;
                /*find the number of tile for this block */
                if ( blkj == nt-1 )
                    mt = magma_ceildiv( N -  firstrow,    NB);
                else
                    mt = magma_ceildiv( N - (firstrow+1), NB);
                /*loop over the tiles find the size of the Vs and apply it */
                for (blki=mt; blki > 0; blki--) {
                    /*calculate the size of each losange of Vs= (Vm,Vn)*/
                    myrow     = firstrow + (mt-blki)*NB;
                    mycol     = blkj*Vblksiz;
                    Vm = min( NB+Vblksiz-1, N-myrow);
                    if ( ( blkj == nt-1 ) && ( blki == mt ) ) {
                        Vn = min (Vblksiz, Vm);
                    } else {
                        Vn = min (Vblksiz, Vm-1);
                    }
                    /*calculate the pointer to the Vs and the Ts.
                     * Note that Vs and Ts have special storage done
                     * by the bulgechasing function*/
                    //printf("voici blkj %d blki %d  Vm %d  Vn %d mycol %d vpos %d \n",blkj,blki,Vm, Vn,mycol,vpos);
                    magma_bulge_findpos113(N, NB, Vblksiz, mycol, myrow, &blkid);
               
                    // COPY Vchunksiz Vs and Vchunksiz Ts to GPU and store it in dV0/dV1 and dT0/dT1
                    if (ncpy == 0) {
                        // flip = 1 for this.
                        copyst = 0;                             // meaning that copy will start copying from blkid =copyst
                        copyed = min(copyst+Vchunksiz, blkcnt); // meaning that copy will end copying at blkid =copyed-1==> next copy had to start at copyed
                        mysiz  = copyed-copyst;                 // the size of the chunk to be copied
                        if (mysiz > 0) {
                            ncpy = 1;
                            flip = 1;
                            vpos = copyst*Vblksiz*ldv;
                            tpos = copyst*Vblksiz*ldt;
                            vld  = mysiz * ldv;
                            tld  = mysiz * ldt;
                            for( dev = 0; dev < ngpu; ++dev ) {
                                magma_setdevice( dev );
                                magmablasSetKernelStream( streams[ dev ][ 1 ] );
                                magma_zsetmatrix_async(vld, Vblksiz, V(vpos), vld, dV1[dev], vld, streams[dev][1]);
                                magma_zsetmatrix_async(tld, Vblksiz, T(tpos), tld, dT1[dev], tld, streams[dev][1]);
                            }
                            //printf("doing the first copy   of mysiz %2d copyst %2d copyed %2d vpos %8d tpos %8d into dV1 dT1\n",mysiz,copyst,copyed,vpos,tpos);
                        }
                    }
                   
                    if (blkid == copyst) {
                        flip   = ncpy % 2;
                        copyst = copyed;                             // meaning that copy will start copying from blkid =copyst
                        copyed = min(copyst+Vchunksiz, blkcnt); // meaning that copy will end copying at blkid =copyed-1==> next copy had to start at copyed
                        mysiz  = copyed-copyst;                 // the size of the chunk to be copied
                        //printf(" get to copy blkid %d blkid+(2*Vchunksiz) %d copyst %d copyed %d\n",blkid,blkid+(Vchunksiz),copyst,copyed);
                        if (mysiz > 0) {
                            ncpy = ncpy + 1;
                            vpos = copyst*Vblksiz*ldv;
                            tpos = copyst*Vblksiz*ldt;
                            vld  = mysiz * ldv;
                            tld  = mysiz * ldt;
                            if (flip == 0) { // now I am working on dV0 so copy the next and put it on dV1
                                //printf("doing overlapping copy of mysiz %2d copyst %2d copyed %2d vpos %8d tpos %8d into dV1 dT1\n",mysiz,copyst,copyed,vpos,tpos);
                                for( dev = 0; dev < ngpu; ++dev ) {
                                    magma_setdevice( dev );
                                    magmablasSetKernelStream( streams[ dev ][ 1 ] );
                                    magma_zsetmatrix_async(vld, Vblksiz, V(vpos), vld, dV1[dev], vld, streams[dev][1]);
                                    magma_zsetmatrix_async(tld, Vblksiz, T(tpos), tld, dT1[dev], tld, streams[dev][1]);
                                }
                            } else { // now I am working on dV1 so copy the next and put it on dV0
                                //printf("doing overlapping copy of mysiz %2d copyst %2d copyed %2d vpos %8d tpos %8d into dV0 dT0\n",mysiz,copyst,copyed,vpos,tpos);
                                for( dev = 0; dev < ngpu; ++dev ) {
                                    magma_setdevice( dev );
                                    magmablasSetKernelStream( streams[ dev ][ 0 ] );
                                    magma_zsetmatrix_async(vld, Vblksiz, V(vpos), vld, dV0[dev], vld, streams[dev][0]);
                                    magma_zsetmatrix_async(tld, Vblksiz, T(tpos), tld, dT0[dev], tld, streams[dev][0]);
                                }
                            }
                        }
                    }

                    if ((Vm > 0) && (Vn > 0)) {
                        locpos = blkid%Vchunksiz;
                        magma_int_t lcvpos   = locpos*Vblksiz*lddv;
                        magma_int_t lctpos   = locpos*Vblksiz*lddt;
                        //printf("voici blkj %d blki %d  Vm %d  Vn %d mycol %d locvpos %5d loctpos %5d  blkid %2d  using data in dV%1d dT%1d \n",blkj,blki,Vm, Vn,mycol,lcvpos,lctpos, blkid,flip,flip);
                        if (flip == 0) {
                            for( dev = 0; dev < ngpu; ++dev ) {
                                magma_int_t ie_loc   = min(ne_loc, NE - ne_loc*dev);
                                magma_int_t nr_bl = magma_ceildiv(ie_loc,10000);       //nr of blocks
                                magma_int_t sz_bl = magma_ceildiv(ie_loc,nr_bl*64)*64; //maximum size of blocks (to have blocks of around the same size and multiple of 64)
                                magma_int_t ib;                                        //size of current block
                                magma_setdevice( dev );
                                magmablasSetKernelStream(streams[dev][0]);
                                magma_queue_wait_event( streams[dev][0], myevent[dev][1] );
                                for (magma_int_t i=0; i < ie_loc; i += sz_bl) {
                                    ib = min(sz_bl, ie_loc-i);
                                    //magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, Vm, ib, Vn, dV0[dev]+lcvpos, lddv, dT0[dev]+lctpos, lddt, dE(dev,myrow,i), ldde, dwork0[dev], ib);
                                    magma_zlarfb_gpu_gemm( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, Vm, ib, Vn, dV0[dev]+lcvpos, lddv, dT0[dev]+lctpos, lddt, dE(dev,myrow,i), ldde, dwork0[dev], ib, dwvt0[dev], Vm);
                                }
                                magma_event_record( myevent[dev][0], streams[dev][0] );
                            }
                        } else {
                            for( dev = 0; dev < ngpu; ++dev ) {
                                magma_int_t ie_loc   = min(ne_loc, NE - ne_loc*dev);
                                magma_int_t nr_bl = magma_ceildiv(ie_loc,10000);       //nr of blocks
                                magma_int_t sz_bl = magma_ceildiv(ie_loc,nr_bl*64)*64; //maximum size of blocks (to have blocks of around the same size and multiple of 64)
                                magma_int_t ib;                                        //size of current block
                                magma_setdevice( dev );
                                magmablasSetKernelStream(streams[dev][1]);
                                magma_queue_wait_event( streams[dev][1], myevent[dev][0] );
                                for (magma_int_t i=0; i < ie_loc; i += sz_bl) {
                                    ib = min(sz_bl, ie_loc-i);
                                    //magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, Vm, ib, Vn, dV1[dev]+lcvpos, lddv, dT1[dev]+lctpos, lddt, dE(dev,myrow,i), ldde, dwork1[dev], ib);
                                    magma_zlarfb_gpu_gemm( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, Vm, ib, Vn, dV1[dev]+lcvpos, lddv, dT1[dev]+lctpos, lddt, dE(dev,myrow,i), ldde, dwork1[dev], ib, dwvt1[dev], Vm);
                                }
                                magma_event_record( myevent[dev][1], streams[dev][1] );
                            }
                        }
                    }  // end for (Vm &Vn) > 0
                } // end for blki
            } // end for blkj
        } // end if version=113
        /*
         * Version 114:
         * loop over the block_row (mt) and for each find diagonally the
         * number of tiles (nt) in this block_row. then loop over nt, find
         * the size of the V's(Vm,Vn) and apply it to the corresponding
         * portion of E.
         */
        else {
            printf("versionL 114 not implemented in zbulge_applyQ_v2_m\n");
            exit(-1);
            mt    = magma_ceildiv((N-1),NB);
            for (blki = mt; blki > 0; blki--) {
                /* nbcolinvolvd = number of column corresponding to this block_row (blki) */
                nbcolinvolvd = min(N-1, blki*NB);
                /*find the number of tile for this block (diagonal row of tiles) */
                nt = magma_ceildiv(nbcolinvolvd,Vblksiz);
                /*loop over the tiles find the size of the Vs and apply it */
                for (blkj = nt-1; blkj >= 0; blkj--) {
                    /* the index of the first row of the first col meaning
                     * the block on the top left (blki) */
                    firstrow = (mt-blki)*NB+1;
                    /*calculate the size of each losange of Vs= (Vm,Vn)*/
                    myrow    = firstrow + blkj*Vblksiz;
                    mycol    = blkj*Vblksiz;
                    Vm = min( NB+Vblksiz-1, N-myrow);
                    if ( ( blkj == nt-1 ) && ( blki == mt ) ) {
                        Vn = min (Vblksiz, Vm);
                    } else {
                        Vn = min (Vblksiz, Vm-1);
                    }

                    if ((Vm > 0) && (Vn > 0)) {
                        /*calculate the pointer to the Vs and the Ts.
                         * Note that Vs and Ts have special storage done
                         * by the bulgechasing function*/
                        /*
                        magma_bulge_findVTpos(N, NB, Vblksiz, mycol, myrow, ldv, ldt, &vpos, &tpos);
                        magma_zsetmatrix_async(Vm, Vn, V(vpos), ldv, dV0, lddv, NULL);
                        magma_zsetmatrix_async(Vn,  Vn, T(tpos), ldt, dT0, lddt, NULL);
                        //printf("voici blki %d  rownbm %d mycol %d  coled %d  blkid %d vpos %d  tpos %d\n", blki, rownbm, mycol, coled, blkid, vpos, tpos);
                        for (magma_int_t i=0; i < NE; i += sz_bl) {
                            ib = min(sz_bl, NE-i);
                            magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, Vm, ib, Vn, dV0, lddv, dT0, lddt, dE(myrow,i), ldde, dwork, NE);
                        }
                        */
                    } // end for (Vm &Vn) > 0
                } // end for blkj
            } // end for blki
        } // end version 114
    } // end LEFT
    /*
     * MagmaRight
     */
    else {
        printf("Side 'R' not implemented in zbulge_applyQ_v2_m\n");
        exit(-1);
        /*
         * Version 91:
         */
        if ( versionR == 91 ) {
            nt  = magma_ceildiv((N-1),Vblksiz);
            for (blkj=0; blkj < nt; blkj++) {
                /* the index of the first myrow on the top of block (blkj) */
                firstrow = blkj * Vblksiz + 1;
                /*find the number of tile for this block */
                if ( blkj == nt-1 )
                    mt = magma_ceildiv( N -  firstrow,    NB);
                else
                    mt = magma_ceildiv( N - (firstrow+1), NB);
                /*loop over the tiles find the size of the Vs and apply it */
                for (blki=1; blki <= mt; blki++) {
                    /*calculate the size of each losange of Vs= (Vm,Vn)*/
                    myrow  = firstrow + (mt-blki)*NB;
                    Vm = min( NB+Vblksiz-1, N-myrow);
                    if ( (blkj == nt-1) && (blki == mt) ) {
                        Vn = min (Vblksiz, Vm);
                    } else {
                        Vn = min (Vblksiz, Vm-1);
                    }
                    mycol     = blkj*Vblksiz;
                    if ((Vm > 0) && (Vn > 0)) {
                        /*calculate the pointer to the Vs and the Ts.
                         * Note that Vs and Ts have special storage done
                         * by the bulgechasing function*/
                        /*
                        magma_bulge_findVTpos(N, NB, Vblksiz, mycol, myrow, ldv, ldt, &vpos, &tpos);
                        magma_zsetmatrix_async(Vm, Vn, V(vpos), ldv, dV0, lddv, NULL);
                        magma_zsetmatrix_async(Vn,  Vn, T(tpos), ldt, dT0, lddt, NULL);
                        magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, NE, Vm, Vn, dV0, lddv, dT0, lddt, dE(0, myrow), ldde, dwork, NE);
                        */
                    } // end for (Vm &Vn) > 0
                } // end for blki
            } // end fo blkj
        } // end of version 91
        /*
         * Version 92:
         */
        else {
            mt    = magma_ceildiv((N-1),NB);
            for (blki = 1; blki <= mt; blki++) {
                /* nbcolinvolvd = number of column corresponding to this block_row (blki) */
                nbcolinvolvd = min(N-1, blki*NB);
                /*find the number of tile for this block (diagonal row of tiles) */
                nt = magma_ceildiv(nbcolinvolvd,Vblksiz);
                /*loop over the tiles find the size of the Vs and apply it */
                for (blkj = 0; blkj < nt; blkj++) {
                    /* the index of the first row of the first col meaning
                     * the block on the top left (blki) */
                    firstrow = (mt-blki)*NB+1;
                    /*calculate the size of each losange of Vs= (Vm,Vn)*/
                    myrow    = firstrow + blkj*Vblksiz;
                    mycol    = blkj*Vblksiz;
                    Vm = min( NB+Vblksiz-1, N-myrow);
                    if ( ( blkj == nt-1 ) && ( blki == mt ) ) {
                        Vn = min (Vblksiz, Vm);
                    } else {
                        Vn = min (Vblksiz, Vm-1);
                    }
                    if ((Vm > 0) && (Vn > 0)) {
                        /*calculate the pointer to the Vs and the Ts.
                         * Note that Vs and Ts have special storage done
                         * by the bulgechasing function*/
                        /*
                        magma_bulge_findVTpos(N, NB, Vblksiz, mycol, myrow, ldv, ldt, &vpos, &tpos);
                        magma_zsetmatrix_async(Vm, Vn, V(vpos), ldv, dV0, lddv, NULL);
                        magma_zsetmatrix_async(Vn,  Vn, T(tpos), ldt, dT0, lddt, NULL);
                        magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, NE, Vm, Vn, dV0, lddv, dT0, lddt, dE(0, myrow), ldde, dwork, NE);
                        */
                    } // end for (Vm &Vn) > 0
                } //end for blkj
            } // end for blki
        } //end of version 92
    } // end RIGHT

    // copy back the dE form each GPU
    for( dev = 0; dev < ngpu; ++dev ) {
        magma_setdevice( dev );
        magmablasSetKernelStream(streams[dev][0]);
        magma_queue_wait_event( streams[dev][0], myevent[dev][1] );
        magma_queue_wait_event( streams[dev][0], myevent[dev][0] );
        magma_int_t ie_loc   = min(ne_loc, NE - ne_loc*dev);
        magma_zgetmatrix_async( N, ie_loc, dE(dev, 0, 0), ldde, E+lde*ne_loc*dev, lde, streams[dev][0] );
        magma_event_record( myevent[dev][0], streams[dev][0] );
    }


    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_setdevice( dev );
        magmablasSetKernelStream(streams[dev][0]);
        magma_queue_wait_event( streams[dev][0], myevent[dev][0] );
        magma_device_sync(); // no need for synchronize
        magma_free(dwork[dev]);
        magma_free(dE[dev]);
        for( magma_int_t i = 0; i < nbevents; ++i ) {
            magma_event_destroy( myevent[dev][i] );
        }
        for( magma_int_t i = 0; i < nstream; ++i ) {
            magma_queue_destroy( streams[dev][i] );
        }
    }

    magma_setdevice( orig_dev );
    magmablasSetKernelStream( orig_stream );
    return *info;
}
Exemplo n.º 28
0
extern "C" magma_int_t magma_ssytrd_sb2st(magma_int_t threads, char uplo, magma_int_t n, magma_int_t nb, magma_int_t Vblksiz,
                                          float *A, magma_int_t lda, float *D, float *E,
                                          float *V, magma_int_t ldv, float *TAU, magma_int_t compT, float *T, magma_int_t ldt)
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======


    Arguments
    =========
    THREADS (input) INTEGER
            Specifies the number of pthreads used.
            THREADS > 0

    UPLO    (input) CHARACTER*1
            = 'U':  Upper triangles of A is stored;
            = 'L':  Lower triangles of A is stored.

    N       (input) INTEGER
            The order of the matrix A.  N >= 0.

    NB      (input) INTEGER
            The order of the band matrix A.  N >= NB >= 0.

    VBLKSIZ (input) INTEGER
            The size of the block of householder vectors applied at once.

    A       (input/workspace) REAL array, dimension (LDA, N)
            On entry the band matrix stored in the following way:

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= 2*NB.

    D       (output) DOUBLE array, dimension (N)
            The diagonal elements of the tridiagonal matrix T:
            D(i) = A(i,i).

    E       (output) DOUBLE array, dimension (N-1)
            The off-diagonal elements of the tridiagonal matrix T:
            E(i) = A(i,i+1) if UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'.

    V       (output) REAL array, dimension (BLKCNT, LDV, VBLKSIZ)
            On exit it contains the blocks of householder reflectors
            BLKCNT is the number of block and it is returned by the funtion MAGMA_BULGE_GET_BLKCNT.

    LDV     (input) INTEGER
            The leading dimension of V.
            LDV > NB + VBLKSIZ + 1

    TAU     (output) REAL dimension(BLKCNT, VBLKSIZ)
            ???

    COMPT   (input) INTEGER
            if COMPT = 0 T is not computed
            if COMPT = 1 T is computed

    T       (output) REAL dimension(LDT *)
            if COMPT = 1 on exit contains the matrices T needed for Q2
            if COMPT = 0 T is not referenced

    LDT     (input) INTEGER
            The leading dimension of T.
            LDT > Vblksiz

    INFO    (output) INTEGER ????????????????????????????????????????????????????????????????????????????????????
            = 0:  successful exit


    =====================================================================  */

    #ifdef ENABLE_TIMER
    real_Double_t timeblg=0.0;
    #endif

    //char uplo_[2] = {uplo, 0};
    magma_int_t mklth = threads;
    magma_int_t INgrsiz=1;
    magma_int_t blkcnt = magma_bulge_get_blkcnt(n, nb, Vblksiz);
    magma_int_t nbtiles = magma_ceildiv(n, nb);

    memset(T,   0, blkcnt*ldt*Vblksiz*sizeof(float));
    memset(TAU, 0, blkcnt*Vblksiz*sizeof(float));
    memset(V,   0, blkcnt*ldv*Vblksiz*sizeof(float));

    magma_int_t* prog;
    magma_malloc_cpu((void**) &prog, (2*nbtiles+threads+10)*sizeof(magma_int_t));
    memset(prog, 0, (2*nbtiles+threads+10)*sizeof(magma_int_t));

    magma_sbulge_id_data* arg;
    magma_malloc_cpu((void**) &arg, threads*sizeof(magma_sbulge_id_data));

    pthread_t* thread_id;
    magma_malloc_cpu((void**) &thread_id, threads*sizeof(pthread_t));
    pthread_attr_t thread_attr;

    magma_setlapack_numthreads(1);
    magma_sbulge_data data_bulge(threads, n, nb, nbtiles, INgrsiz, Vblksiz, compT,
                                 A, lda, V, ldv, TAU, T, ldt, prog);

    // Set one thread per core
    pthread_attr_init(&thread_attr);
    pthread_attr_setscope(&thread_attr, PTHREAD_SCOPE_SYSTEM);
    pthread_setconcurrency(threads);

    //timing
    #ifdef ENABLE_TIMER
    timeblg = magma_wtime();
    #endif

    // Launch threads
    for (magma_int_t thread = 1; thread < threads; thread++)
    {
        arg[thread] = magma_sbulge_id_data(thread, &data_bulge);
        pthread_create(&thread_id[thread], &thread_attr, magma_ssytrd_sb2st_parallel_section, &arg[thread]);
    }
    arg[0] = magma_sbulge_id_data(0, &data_bulge);
    magma_ssytrd_sb2st_parallel_section(&arg[0]);

    // Wait for completion
    for (magma_int_t thread = 1; thread < threads; thread++)
    {
        void *exitcodep;
        pthread_join(thread_id[thread], &exitcodep);
    }

    // timing
    #ifdef ENABLE_TIMER
    timeblg = magma_wtime()-timeblg;
    printf("  time BULGE+T = %f \n" ,timeblg);
    #endif

    magma_free_cpu(thread_id);
    magma_free_cpu(arg);
    magma_free_cpu(prog);

    magma_setlapack_numthreads(mklth);
    /*================================================
     *  store resulting diag and lower diag D and E
     *  note that D and E are always real
     *================================================*/

    /* Make diagonal and superdiagonal elements real,
     * storing them in D and E
     */
    /* In real case, the off diagonal element are
     * not necessary real. we have to make off-diagonal
     * elements real and copy them to E.
     * When using HouseHolder elimination,
     * the SLARFG give us a real as output so, all the
     * diagonal/off-diagonal element except the last one are already
     * real and thus we need only to take the abs of the last
     * one.
     *  */

#if defined(PRECISION_z) || defined(PRECISION_c)
    if(uplo==MagmaLower){
        for (magma_int_t i=0; i < n-1 ; i++)
        {
            D[i] = MAGMA_S_REAL(A[i*lda  ]);
            E[i] = MAGMA_S_REAL(A[i*lda+1]);
        }
        D[n-1] = MAGMA_S_REAL(A[(n-1)*lda]);
    } else { /* MagmaUpper not tested yet */
        for (magma_int_t i=0; i<n-1; i++)
        {
            D[i]  =  MAGMA_S_REAL(A[i*lda+nb]);
            E[i] = MAGMA_S_REAL(A[i*lda+nb-1]);
        }
        D[n-1] = MAGMA_S_REAL(A[(n-1)*lda+nb]);
    } /* end MagmaUpper */
#else
    if( uplo == MagmaLower ){
        for (magma_int_t i=0; i < n-1; i++) {
            D[i] = A[i*lda];   // diag
            E[i] = A[i*lda+1]; //lower diag
        }
        D[n-1] = A[(n-1)*lda];
    } else {
        for (magma_int_t i=0; i < n-1; i++) {
            D[i] = A[i*lda+nb];   // diag
            E[i] = A[i*lda+nb-1]; //lower diag
        }
        D[n-1] = A[(n-1)*lda+nb];
    }
#endif
    return MAGMA_SUCCESS;

}
Exemplo n.º 29
0
static void magma_dtile_bulge_applyQ(
    magma_int_t core_id, magma_side_t side, magma_int_t n_loc, magma_int_t n, magma_int_t nb, magma_int_t Vblksiz,
    double *E, magma_int_t lde,
    double *V, magma_int_t ldv,
    double *TAU, double *T, magma_int_t ldt)
    //, magma_int_t* info)
{
    //%===========================
    //%   local variables
    //%===========================
    magma_int_t firstcolj;
    magma_int_t bg, rownbm;
    magma_int_t st,ed,fst,vlen,vnb,colj;
    magma_int_t vpos,tpos;
    magma_int_t cur_blksiz,avai_blksiz, ncolinvolvd;
    magma_int_t nbgr, colst, coled;

    if (n <= 0)
        return;
    if (n_loc <= 0)
        return;

    //info = 0;
    magma_int_t INFO=0;

    magma_int_t nbGblk  = magma_ceildiv(n-1, Vblksiz);

    /*
     * version v1: for each chunck it apply all the V's then move to
     *                    the other chunck. the locality here inside each
     *                    chunck meaning that thread t apply V_k then move
     *                    to V_k+1 which overlap with V_k meaning that the
     *                    E_k+1 overlap with E_k. so here is the
     *                    locality however thread t had to read V_k+1 and
     *                    T_k+1 at each apply. note that all thread if they
     *                    run at same speed they might reading the same V_k
     *                    and T_k at the same time.
     * */

    magma_int_t nb_loc = 128; //$$$$$$$$

    magma_int_t     lwork = 2*nb_loc*max(Vblksiz,64);
    double *work, *work2;

    magma_dmalloc_cpu(&work, lwork);
    magma_dmalloc_cpu(&work2, lwork);

    magma_int_t nbchunk =  magma_ceildiv(n_loc, nb_loc);

    /* SIDE LEFT  meaning apply E = Q*E = (q_1*q_2*.....*q_n) * E ==> so traverse Vs in reverse order (forward) from q_n to q_1
     *            each q_i consist of applying V to a block of row E(row_i,:) and applies are overlapped meaning
     *            that q_i+1 overlap a portion of the E(row_i, :).
     *            IN parallel E is splitten in vertical block over the threads  */
    /* SIDE RIGHT meaning apply E = E*Q = E * (q_1*q_2*.....*q_n) ==> so tarverse Vs in normal  order (forward) from q_1 to q_n
     *            each q_i consist of applying V to a block of col E(:, col_i,:) and the applies are overlapped meaning
     *            that q_i+1 overlap a portion of the E(:, col_i).
     *            IN parallel E is splitten in horizontal block over the threads  */
    #ifdef ENABLE_DEBUG
    if ((core_id == 0) || (core_id == 1))
        printf("  APPLY Q2_cpu dbulge_back   N %d  N_loc %d  nbchunk %d  NB %d  Vblksiz %d  SIDE %c \n", n, n_loc, nbchunk, nb, Vblksiz, side);
    #endif
    for (magma_int_t i = 0; i < nbchunk; i++) {
        magma_int_t ib_loc = min(nb_loc, (n_loc - i*nb_loc));

        if (side == MagmaLeft) {
            for (bg = nbGblk; bg > 0; bg--) {
                firstcolj = (bg-1)*Vblksiz + 1;
                rownbm    = magma_ceildiv((n-(firstcolj+1)),nb);
                if (bg == nbGblk) rownbm = magma_ceildiv((n-(firstcolj)),nb);  // last blk has size=1 used for real to handle A(N,N-1)
                for (magma_int_t j = rownbm; j > 0; j--) {
                    vlen = 0;
                    vnb  = 0;
                    colj = (bg-1)*Vblksiz; // for k=0; I compute the fst and then can remove it from the loop
                    fst  = (rownbm -j)*nb+colj +1;
                    for (magma_int_t k=0; k < Vblksiz; k++) {
                        colj = (bg-1)*Vblksiz + k;
                        st   = (rownbm -j)*nb+colj +1;
                        ed   = min(st+nb-1,n-1);
                        if (st > ed)
                            break;
                        if ((st == ed) && (colj != n-2))
                            break;
                        vlen = ed-fst+1;
                        vnb = k+1;
                    }
                    colst     = (bg-1)*Vblksiz;
                    magma_bulge_findVTpos(n, nb, Vblksiz, colst, fst, ldv, ldt, &vpos, &tpos);

                    if ((vlen > 0) && (vnb > 0)) {
                        lapackf77_dlarfb( "L", "N", "F", "C", &vlen, &ib_loc, &vnb, V(vpos), &ldv, T(tpos), &ldt, E(fst,i*nb_loc), &lde, work, &ib_loc);
                    }
                    if (INFO != 0)
                        printf("ERROR DORMQR INFO %d \n", (int) INFO);
                }
            }
        } else if (side == MagmaRight) {
            rownbm    = magma_ceildiv((n-1),nb);
            for (magma_int_t k = 1; k <= rownbm; k++) {
                ncolinvolvd = min(n-1, k*nb);
                avai_blksiz = min(Vblksiz,ncolinvolvd);
                nbgr = magma_ceildiv(ncolinvolvd,avai_blksiz);
                for (magma_int_t j = 1; j <= nbgr; j++) {
                    vlen = 0;
                    vnb  = 0;
                    cur_blksiz = min(ncolinvolvd-(j-1)*avai_blksiz, avai_blksiz);
                    colst = (j-1)*avai_blksiz;
                    coled = colst + cur_blksiz -1;
                    fst   = (rownbm -k)*nb+colst +1;
                    for (colj=colst; colj <= coled; colj++) {
                        st = (rownbm -k)*nb+colj +1;
                        ed = min(st+nb-1,n-1);
                        if (st > ed)
                            break;
                        if ((st == ed) && (colj != n-2))
                            break;
                        vlen = ed-fst+1;
                        vnb = vnb+1;
                    }
                    magma_bulge_findVTpos(n, nb, Vblksiz, colst, fst, ldv, ldt, &vpos, &tpos);
                    if ((vlen > 0) && (vnb > 0)) {
                        lapackf77_dlarfb( "R", "N", "F", "C", &ib_loc, &vlen, &vnb, V(vpos), &ldv, T(tpos), &ldt, E(i*nb_loc,fst), &lde, work, &ib_loc);
                    }
                }
            }
        } else {
            printf("ERROR SIDE %d \n",side);
        }
    } // END loop over the chunks

    magma_free_cpu(work);
    magma_free_cpu(work2);
}
Exemplo n.º 30
0
static void magma_stile_bulge_parallel(magma_int_t my_core_id, magma_int_t cores_num, float *A, magma_int_t lda,
                                       float *V, magma_int_t ldv, float *TAU, magma_int_t n, magma_int_t nb, magma_int_t nbtiles,
                                       magma_int_t grsiz, magma_int_t Vblksiz, volatile magma_int_t *prog)
{
    magma_int_t sweepid, myid, shift, stt, st, ed, stind, edind;
    magma_int_t blklastind, colpt;
    magma_int_t stepercol;
    magma_int_t i,j,m,k;
    magma_int_t thgrsiz, thgrnb, thgrid, thed;
    magma_int_t coreid;
    magma_int_t colblktile,maxrequiredcores,colpercore,mycoresnb;
    magma_int_t fin;
    float *work;

    if(n<=0)
        return ;
    if(grsiz<=0)
        return ;

    //printf("=================> my core id %d of %d \n",my_core_id, cores_num);

    /* As I store V in the V vector there are overlap between
     * tasks so shift is now 4 where group need to be always
     * multiple of 2, because as example if grs=1 task 2 from
     * sweep 2 can run with task 6 sweep 1., but task 2 sweep 2
     * will overwrite the V of tasks 5 sweep 1 which are used by
     * task 6, so keep in mind that group need to be multiple of 2,
     * and thus tasks 2 sweep 2 will never run with task 6 sweep 1.
     * However, when storing V in A, shift could be back to 3.
     * */

    magma_smalloc_cpu(&work, n);
    mycoresnb = cores_num;

    shift   = 5;
    if(grsiz==1)
        colblktile=1;
    else
        colblktile=grsiz/2;

    maxrequiredcores = nbtiles/colblktile;
    if(maxrequiredcores<1)maxrequiredcores=1;
    colpercore  = colblktile*nb;
    if(mycoresnb > maxrequiredcores)
        mycoresnb = maxrequiredcores;
    thgrsiz = n;
    stepercol = magma_ceildiv(shift, grsiz);
    thgrnb  = magma_ceildiv(n-1, thgrsiz);

    #ifdef ENABLE_DEBUG
    if(my_core_id==0){
        if(cores_num > maxrequiredcores)    {
           printf("==================================================================================\n");
           printf("  WARNING only %3d threads are required to run this test optimizing cache reuse\n",maxrequiredcores);
           printf("==================================================================================\n");
        }
        printf("  Static bulgechasing version v9_9col threads  %4d      N %5d      NB %5d    grs %4d thgrsiz %4d \n",cores_num, n, nb, grsiz,thgrsiz);
    }
    #endif

    for (thgrid = 1; thgrid<=thgrnb; thgrid++){
        stt  = (thgrid-1)*thgrsiz+1;
        thed = min( (stt + thgrsiz -1), (n-1));
        for (i = stt; i <= n-1; i++){
            ed=min(i,thed);
            if(stt>ed)break;
            for (m = 1; m <=stepercol; m++){
                st=stt;
                for (sweepid = st; sweepid <=ed; sweepid++){

                    for (k = 1; k <=grsiz; k++){
                        myid = (i-sweepid)*(stepercol*grsiz) +(m-1)*grsiz + k;
                        if(myid%2 ==0){
                            colpt      = (myid/2)*nb+1+sweepid-1;
                            stind      = colpt-nb+1;
                            edind      = min(colpt,n);
                            blklastind = colpt;
                            if(stind>=edind){
                                printf("ERROR---------> st>=ed  %d  %d \n\n", (int) stind, (int) edind);
                                exit(-10);
                            }
                        }else{
                            colpt      = ((myid+1)/2)*nb + 1 +sweepid -1 ;
                            stind      = colpt-nb+1;
                            edind      = min(colpt,n);
                            if( (stind>=edind-1) && (edind==n) )
                                blklastind=n;
                            else
                                blklastind=0;
                            if(stind>edind){
                                printf("ERROR---------> st>=ed  %d  %d \n\n", (int) stind, (int) edind);
                                exit(-10);
                            }
                        }

                        coreid = (stind/colpercore)%mycoresnb;

                        if(my_core_id==coreid)
                        {
                            fin=0;
                            while(fin==0)
                            {
                                if(myid==1)
                                {
                                    if( (prog[myid+shift-1]== (sweepid-1)) )
                                    {
                                        magma_strdtype1cbHLsym_withQ_v2(n, nb, A, lda, V, ldv, TAU, stind, edind, sweepid, Vblksiz, work);

                                        fin=1;
                                        prog[myid]= sweepid;
                                        if(blklastind >= (n-1))
                                        {
                                            for (j = 1; j <= shift; j++)
                                                prog[myid+j]=sweepid;
                                        }
                                    } // END progress condition

                                }else{
                                    if( (prog[myid-1]==sweepid) && (prog[myid+shift-1]== (sweepid-1)) )
                                    {
                                        if(myid%2 == 0)
                                            magma_strdtype2cbHLsym_withQ_v2(n, nb, A, lda, V, ldv, TAU, stind, edind, sweepid, Vblksiz, work);
                                        else
                                            magma_strdtype3cbHLsym_withQ_v2(n, nb, A, lda, V, ldv, TAU, stind, edind, sweepid, Vblksiz, work);

                                        fin=1;
                                        prog[myid]= sweepid;
                                        if(blklastind >= (n-1))
                                        {
                                            for (j = 1; j <= shift+mycoresnb; j++)
                                                prog[myid+j]=sweepid;
                                        }
                                    } // END progress condition
                                } // END if myid==1
                            } // END while loop

                        } // END if my_core_id==coreid

                        if(blklastind >= (n-1))
                        {
                            stt=stt+1;
                            break;
                        }
                    }   // END for k=1:grsiz
                } // END for sweepid=st:ed
            } // END for m=1:stepercol
        } // END for i=1:n-1
    } // END for thgrid=1:thgrnb

    magma_free_cpu(work);

} // END FUNCTION