static void magma_stile_bulge_computeT_parallel(magma_int_t my_core_id, magma_int_t cores_num, float *V, magma_int_t ldv, float *TAU, float *T, magma_int_t ldt, magma_int_t n, magma_int_t nb, magma_int_t Vblksiz) { //%=========================== //% local variables //%=========================== magma_int_t firstcolj; magma_int_t rownbm; magma_int_t st,ed,fst,vlen,vnb,colj; magma_int_t blkid,vpos,taupos,tpos; magma_int_t blkpercore, myid; if(n<=0) return ; magma_int_t blkcnt = magma_bulge_get_blkcnt(n, nb, Vblksiz); blkpercore = blkcnt/cores_num; magma_int_t nbGblk = magma_ceildiv(n-1, Vblksiz); if(my_core_id==0) printf(" COMPUTE T parallel threads %d with N %d NB %d Vblksiz %d \n",cores_num,n,nb,Vblksiz); for (magma_int_t bg = nbGblk; bg>0; bg--) { firstcolj = (bg-1)*Vblksiz + 1; rownbm = magma_ceildiv(n-(firstcolj+1), nb); if(bg==nbGblk) rownbm = magma_ceildiv(n-firstcolj ,nb); // last blk has size=1 used for real to handle A(N,N-1) for (magma_int_t m = rownbm; m>0; m--) { vlen = 0; vnb = 0; colj = (bg-1)*Vblksiz; // for k=0;I compute the fst and then can remove it from the loop fst = (rownbm -m)*nb+colj +1; for (magma_int_t k=0; k<Vblksiz; k++) { colj = (bg-1)*Vblksiz + k; st = (rownbm -m)*nb+colj +1; ed = min(st+nb-1,n-1); if(st>ed) break; if((st==ed)&&(colj!=n-2)) break; vlen=ed-fst+1; vnb=k+1; } colj = (bg-1)*Vblksiz; magma_bulge_findVTAUTpos(n, nb, Vblksiz, colj, fst, ldv, ldt, &vpos, &taupos, &tpos, &blkid); myid = blkid/blkpercore; if(my_core_id==(myid%cores_num)){ if((vlen>0)&&(vnb>0)) lapackf77_slarft( "F", "C", &vlen, &vnb, V(vpos), &ldv, TAU(taupos), T(tpos), &ldt); } } } }
void magma_bulge_findpos113(magma_int_t n, magma_int_t nb, magma_int_t Vblksiz, magma_int_t sweep, magma_int_t st, magma_int_t *myblkid) { magma_int_t prevGblkid, mastersweep; magma_int_t locblknb = 0; magma_int_t prevblkcnt = 0; magma_int_t myblknb = 0; magma_int_t lstGblkid = magma_ceildiv((n-1),Vblksiz)-1;//nbGblk-1; magma_int_t myGblkid = sweep/Vblksiz; // go forward and for each Gblk(a column of blocks) before my Gblk compute its number of blk. //=================================================== for (prevGblkid = lstGblkid; prevGblkid > myGblkid; prevGblkid--) { mastersweep = prevGblkid * Vblksiz; if(prevGblkid==lstGblkid) locblknb = magma_ceildiv((n-(mastersweep+1)),nb); else locblknb = magma_ceildiv((n-(mastersweep+2)),nb); prevblkcnt = prevblkcnt + locblknb; } //=================================================== /* // for best performance, the if condiiton inside the loop // is only for prevblkid==lastblkid so I can unroll this // out of the loop and so remove the if condition. //=================================================== // for prevGblkid==lstGblkid mastersweep = lstGblkid * Vblksiz; locblknb = magma_ceildiv((n-(mastersweep+1)),nb); prevblkcnt = prevblkcnt + locblknb; // the remaining of the loop for (prevGblkid = lstGblkid-1; prevGblkid > myGblkid; prevGblkid--) { mastersweep = prevGblkid * Vblksiz; locblknb = magma_ceildiv((n-(mastersweep+2)),nb); prevblkcnt = prevblkcnt + locblknb; } //=================================================== */ myblknb = magma_ceildiv((st-sweep),nb); *myblkid = prevblkcnt + myblknb -1; //printf("voici sweep %d lstGblkid %d myGblkid %d prevcnt %d myblknb %d myblkid %d\n", sweep, lstGblkid,myGblkid,prevblkcnt,myblknb,*myblkid ); }
void magma_bulge_findpos(magma_int_t n, magma_int_t nb, magma_int_t Vblksiz, magma_int_t sweep, magma_int_t st, magma_int_t *myblkid) { magma_int_t locblknb, prevblkcnt, prevGblkid; magma_int_t myblknb, nbprevGblk, mastersweep; locblknb = 0; prevblkcnt = 0; myblknb = 0; nbprevGblk = sweep/Vblksiz; for (prevGblkid = 0; prevGblkid < nbprevGblk; prevGblkid++) { mastersweep = prevGblkid * Vblksiz; locblknb = magma_ceildiv((n-(mastersweep+2)),nb); prevblkcnt = prevblkcnt + locblknb; } myblknb = magma_ceildiv((st-sweep),nb); *myblkid = prevblkcnt + myblknb -1; }
magma_int_t magma_bulge_get_blkcnt(magma_int_t n, magma_int_t nb, magma_int_t Vblksiz) { magma_int_t colblk, nbcolblk; magma_int_t myblknb, mastersweep; magma_int_t blkcnt = 0; nbcolblk = magma_ceildiv((n-1),Vblksiz); for (colblk = 0; colblk<nbcolblk; colblk++) { mastersweep = colblk * Vblksiz; if(colblk == (nbcolblk-1)) myblknb = magma_ceildiv((n-(mastersweep+1)),nb); else myblknb = magma_ceildiv((n-(mastersweep+2)),nb); blkcnt = blkcnt + myblknb; //printf("voici nbcolblk %d master sweep %d blkcnt %d \n",nbcolblk, mastersweep,*blkcnt); } return blkcnt; }
void findVTsiz(magma_int_t N, magma_int_t NB, magma_int_t Vblksiz, magma_int_t *blkcnt, magma_int_t *LDV) { magma_int_t colblk, nbcolblk; magma_int_t myblknb, mastersweep; *blkcnt = 0; nbcolblk = plasma_ceildiv((N-1),Vblksiz); for (colblk = 0; colblk<nbcolblk; colblk++) { mastersweep = colblk * Vblksiz; if(colblk == (nbcolblk-1)) myblknb = magma_ceildiv((N-(mastersweep+1)),NB); else myblknb = magma_ceildiv((N-(mastersweep+2)),NB); *blkcnt = *blkcnt + myblknb; //printf("voici nbcolblk %d master sweep %d blkcnt %d \n",nbcolblk, mastersweep,*blkcnt); } *LDV= NB+Vblksiz; }
// -------------------- int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, cpu_time=0, cpu_perf=0, gpu_time, gpu_perf, mgpu_time, mgpu_perf, cuda_time, cuda_perf; double Ynorm, error=0, error2=0, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t n_local[MagmaMaxGPUs]; magma_int_t N, Noffset, lda, ldda, blocks, lhwork, ldwork, matsize, vecsize; magma_int_t incx = 1; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 1.5, -2.3 ); magmaDoubleComplex beta = MAGMA_Z_MAKE( -0.6, 0.8 ); magmaDoubleComplex *A, *X, *Y, *Ylapack, *Ycublas, *Ymagma, *Ymagma1, *hwork; magmaDoubleComplex_ptr dA, dX, dY; magmaDoubleComplex_ptr d_lA[MagmaMaxGPUs], dwork[MagmaMaxGPUs]; magma_device_t dev; magma_queue_t queues[MagmaMaxGPUs]; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); opts.ngpu = abs( opts.ngpu ); // always uses multi-GPU code double tol = opts.tolerance * lapackf77_dlamch("E"); magma_int_t nb = 64; // required by magmablas_zhemv_mgpu implementation for( dev=0; dev < opts.ngpu; ++dev ) { magma_queue_create( dev, &queues[dev] ); } // currently, tests all offsets in the offsets array; // comment out loop below to test a specific offset. magma_int_t offset = opts.offset; magma_int_t offsets[] = { 0, 1, 31, 32, 33, 63, 64, 65, 100, 200 }; magma_int_t noffsets = sizeof(offsets) / sizeof(*offsets); printf("%% uplo = %s, ngpu %d, block size = %d, offset %d\n", lapack_uplo_const(opts.uplo), (int) opts.ngpu, (int) nb, (int) offset ); printf( "%% BLAS CUBLAS MAGMA 1 GPU MAGMA MGPU Error rel Error rel\n" "%% N offset Gflop/s (msec) Gflop/s (msec) Gflop/s (msec) Gflop/s (msec) to CUBLAS to LAPACK\n" "%%==================================================================================================================\n" ); for( int itest = 0; itest < opts.ntest; ++itest ) { // comment out these two lines & end of loop to test a specific offset for( int ioffset=0; ioffset < noffsets; ioffset += 1 ) { offset = offsets[ioffset]; for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; Noffset = N + offset; lda = Noffset; ldda = magma_roundup( Noffset, opts.align ); // multiple of 32 by default matsize = Noffset*ldda; vecsize = (Noffset-1)*incx + 1; gflops = FLOPS_ZHEMV( N ) / 1e9; blocks = magma_ceildiv( N + (offset % nb), nb ); lhwork = N*opts.ngpu; ldwork = ldda*(blocks + 1); TESTING_MALLOC_CPU( A, magmaDoubleComplex, matsize ); TESTING_MALLOC_CPU( Y, magmaDoubleComplex, vecsize ); TESTING_MALLOC_CPU( Ycublas, magmaDoubleComplex, vecsize ); TESTING_MALLOC_CPU( Ymagma, magmaDoubleComplex, vecsize ); TESTING_MALLOC_CPU( Ymagma1, magmaDoubleComplex, vecsize ); TESTING_MALLOC_CPU( Ylapack, magmaDoubleComplex, vecsize ); TESTING_MALLOC_PIN( X, magmaDoubleComplex, vecsize ); TESTING_MALLOC_PIN( hwork, magmaDoubleComplex, lhwork ); magma_setdevice( opts.device ); TESTING_MALLOC_DEV( dA, magmaDoubleComplex, matsize ); TESTING_MALLOC_DEV( dX, magmaDoubleComplex, vecsize ); TESTING_MALLOC_DEV( dY, magmaDoubleComplex, vecsize ); // TODO make magma_zmalloc_bcyclic helper function? for( dev=0; dev < opts.ngpu; dev++ ) { n_local[dev] = ((Noffset/nb)/opts.ngpu)*nb; if (dev < (Noffset/nb) % opts.ngpu) n_local[dev] += nb; else if (dev == (Noffset/nb) % opts.ngpu) n_local[dev] += Noffset % nb; magma_setdevice( dev ); TESTING_MALLOC_DEV( d_lA[dev], magmaDoubleComplex, ldda*n_local[dev] ); TESTING_MALLOC_DEV( dwork[dev], magmaDoubleComplex, ldwork ); } ////////////////////////////////////////////////////////////////////////// /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &matsize, A ); magma_zmake_hermitian( Noffset, A, lda ); lapackf77_zlarnv( &ione, ISEED, &vecsize, X ); lapackf77_zlarnv( &ione, ISEED, &vecsize, Y ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_setdevice( opts.device ); magma_zsetmatrix( Noffset, Noffset, A, lda, dA, ldda, opts.queue ); magma_zsetvector( Noffset, X, incx, dX, incx, opts.queue ); magma_zsetvector( Noffset, Y, incx, dY, incx, opts.queue ); cuda_time = magma_sync_wtime(0); cublasZhemv( opts.handle, cublas_uplo_const(opts.uplo), N, &alpha, dA + offset + offset*ldda, ldda, dX + offset, incx, &beta, dY + offset, incx ); cuda_time = magma_sync_wtime(0) - cuda_time; cuda_perf = gflops / cuda_time; magma_zgetvector( Noffset, dY, incx, Ycublas, incx, opts.queue ); /* ===================================================================== Performs operation using MAGMABLAS (1 GPU) =================================================================== */ magma_setdevice( opts.device ); magma_zsetvector( Noffset, Y, incx, dY, incx, opts.queue ); gpu_time = magma_sync_wtime( opts.queue ); magmablas_zhemv_work( opts.uplo, N, alpha, dA + offset + offset*ldda, ldda, dX + offset, incx, beta, dY + offset, incx, dwork[ opts.device ], ldwork, opts.queue ); gpu_time = magma_sync_wtime( opts.queue ) - gpu_time; gpu_perf = gflops / gpu_time; magma_zgetvector( Noffset, dY, incx, Ymagma1, incx, opts.queue ); /* ===================================================================== Performs operation using MAGMABLAS (multi-GPU) =================================================================== */ magma_zsetmatrix_1D_col_bcyclic( Noffset, Noffset, A, lda, d_lA, ldda, opts.ngpu, nb, queues ); blasf77_zcopy( &Noffset, Y, &incx, Ymagma, &incx ); // workspaces do NOT need to be zero -- set to NAN to prove for( dev=0; dev < opts.ngpu; ++dev ) { magma_setdevice( dev ); magmablas_zlaset( MagmaFull, ldwork, 1, MAGMA_Z_NAN, MAGMA_Z_NAN, dwork[dev], ldwork, opts.queue ); } lapackf77_zlaset( "Full", &lhwork, &ione, &MAGMA_Z_NAN, &MAGMA_Z_NAN, hwork, &lhwork ); mgpu_time = magma_sync_wtime(0); magma_int_t info; info = magmablas_zhemv_mgpu( opts.uplo, N, alpha, d_lA, ldda, offset, X + offset, incx, beta, Ymagma + offset, incx, hwork, lhwork, dwork, ldwork, opts.ngpu, nb, queues ); if (info != 0) { printf("magmablas_zhemv_mgpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } info = magmablas_zhemv_mgpu_sync( opts.uplo, N, alpha, d_lA, ldda, offset, X + offset, incx, beta, Ymagma + offset, incx, hwork, lhwork, dwork, ldwork, opts.ngpu, nb, queues ); if (info != 0) { printf("magmablas_zhemv_sync returned error %d: %s.\n", (int) info, magma_strerror( info )); } mgpu_time = magma_sync_wtime(0) - mgpu_time; mgpu_perf = gflops / mgpu_time; /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { blasf77_zcopy( &Noffset, Y, &incx, Ylapack, &incx ); cpu_time = magma_wtime(); blasf77_zhemv( lapack_uplo_const(opts.uplo), &N, &alpha, A + offset + offset*lda, &lda, X + offset, &incx, &beta, Ylapack + offset, &incx ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Compute the Difference LAPACK vs. Magma =================================================================== */ Ynorm = lapackf77_zlange( "F", &Noffset, &ione, Ylapack, &Noffset, work ); blasf77_zaxpy( &Noffset, &c_neg_one, Ymagma, &incx, Ylapack, &incx ); error2 = lapackf77_zlange( "F", &Noffset, &ione, Ylapack, &Noffset, work ) / Ynorm; } /* ===================================================================== Compute the Difference Cublas vs. Magma =================================================================== */ Ynorm = lapackf77_zlange( "F", &Noffset, &ione, Ycublas, &Noffset, work ); blasf77_zaxpy( &Noffset, &c_neg_one, Ymagma, &incx, Ycublas, &incx ); error = lapackf77_zlange( "F", &Noffset, &ione, Ycublas, &Noffset, work ) / Ynorm; bool okay = (error < tol && error2 < tol); status += ! okay; if ( opts.lapack ) { printf( "%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) N, (int) offset, cpu_perf, cpu_time*1000., cuda_perf, cuda_time*1000., gpu_perf, gpu_time*1000., mgpu_perf, mgpu_time*1000., error, error2, (okay ? "ok" : "failed") ); } else { printf( "%5d %5d --- ( --- ) %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e --- %s\n", (int) N, (int) offset, cuda_perf, cuda_time*1000., gpu_perf, gpu_time*1000., mgpu_perf, mgpu_time*1000., error, (okay ? "ok" : "failed") ); } /* Free Memory */ TESTING_FREE_CPU( A ); TESTING_FREE_CPU( Y ); TESTING_FREE_CPU( Ycublas ); TESTING_FREE_CPU( Ymagma ); TESTING_FREE_CPU( Ymagma1 ); TESTING_FREE_CPU( Ylapack ); TESTING_FREE_PIN( X ); TESTING_FREE_PIN( hwork ); magma_setdevice( opts.device ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dX ); TESTING_FREE_DEV( dY ); for( dev=0; dev < opts.ngpu; dev++ ) { magma_setdevice( dev ); TESTING_FREE_DEV( d_lA[dev] ); TESTING_FREE_DEV( dwork[dev] ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } // comment out these two lines line & top of loop test a specific offset } // end for ioffset printf( "\n" ); } for( dev=0; dev < opts.ngpu; ++dev ) { magma_queue_destroy( queues[dev] ); } opts.cleanup(); TESTING_FINALIZE(); return status; }
//################################################################################################## static void *magma_zapplyQ_m_parallel_section(void *arg) { magma_int_t my_core_id = ((magma_zapplyQ_m_id_data*)arg) -> id; magma_zapplyQ_m_data* data = ((magma_zapplyQ_m_id_data*)arg) -> data; magma_int_t ngpu = data -> ngpu; magma_int_t allcores_num = data -> threads_num; magma_int_t n = data -> n; magma_int_t ne = data -> ne; magma_int_t n_gpu = data -> n_gpu; magma_int_t nb = data -> nb; magma_int_t Vblksiz = data -> Vblksiz; magmaDoubleComplex *E = data -> E; magma_int_t lde = data -> lde; magmaDoubleComplex *V = data -> V; magma_int_t ldv = data -> ldv; magmaDoubleComplex *TAU = data -> TAU; magmaDoubleComplex *T = data -> T; magma_int_t ldt = data -> ldt; pthread_barrier_t* barrier = &(data -> barrier); magma_int_t info; #ifdef ENABLE_TIMER real_Double_t timeQcpu=0.0, timeQgpu=0.0; #endif magma_int_t n_cpu = ne - n_gpu; // with MKL and when using omp_set_num_threads instead of mkl_set_num_threads // it need that all threads setting it to 1. magma_set_lapack_numthreads(1); #ifdef MAGMA_SETAFFINITY //#define PRINTAFFINITY #ifdef PRINTAFFINITY affinity_set print_set; print_set.print_affinity(my_core_id, "starting affinity"); #endif affinity_set original_set; affinity_set new_set(my_core_id); int check = 0; int check2 = 0; // bind threads check = original_set.get_affinity(); if (check == 0) { check2 = new_set.set_affinity(); if (check2 != 0) printf("Error in sched_setaffinity (single cpu)\n"); } else { printf("Error in sched_getaffinity\n"); } #ifdef PRINTAFFINITY print_set.print_affinity(my_core_id, "set affinity"); #endif #endif if (my_core_id == 0) { //============================================= // on GPU on thread 0: // - apply V2*Z(:,1:N_GPU) //============================================= #ifdef ENABLE_TIMER timeQgpu = magma_wtime(); #endif magma_zbulge_applyQ_v2_m(ngpu, MagmaLeft, n_gpu, n, nb, Vblksiz, E, lde, V, ldv, T, ldt, &info); magma_device_sync(); #ifdef ENABLE_TIMER timeQgpu = magma_wtime()-timeQgpu; printf(" Finish Q2_GPU GGG timing= %f\n", timeQgpu); #endif } else { //============================================= // on CPU on threads 1:allcores_num-1: // - apply V2*Z(:,N_GPU+1:NE) //============================================= #ifdef ENABLE_TIMER if (my_core_id == 1) timeQcpu = magma_wtime(); #endif magma_int_t n_loc = magma_ceildiv(n_cpu, allcores_num-1); magmaDoubleComplex* E_loc = E + (n_gpu+ n_loc * (my_core_id-1))*lde; n_loc = min(n_loc,n_cpu - n_loc * (my_core_id-1)); magma_ztile_bulge_applyQ(my_core_id, MagmaLeft, n_loc, n, nb, Vblksiz, E_loc, lde, V, ldv, TAU, T, ldt); pthread_barrier_wait(barrier); #ifdef ENABLE_TIMER if (my_core_id == 1) { timeQcpu = magma_wtime()-timeQcpu; printf(" Finish Q2_CPU CCC timing= %f\n", timeQcpu); } #endif } // END if my_core_id #ifdef MAGMA_SETAFFINITY // unbind threads if (check == 0) { check2 = original_set.set_affinity(); if (check2 != 0) printf("Error in sched_setaffinity (restore cpu list)\n"); } #ifdef PRINTAFFINITY print_set.print_affinity(my_core_id, "restored_affinity"); #endif #endif return 0; }
static void magma_stile_bulge_computeT_parallel(magma_int_t my_core_id, magma_int_t cores_num, float *V, magma_int_t ldv, float *TAU, float *T, magma_int_t ldt, magma_int_t n, magma_int_t nb, magma_int_t Vblksiz) { //%=========================== //% local variables //%=========================== magma_int_t Vm, Vn, mt, nt; magma_int_t myrow, mycol, blkj, blki, firstrow; magma_int_t blkid,vpos,taupos,tpos; magma_int_t blkpercore, myid; if(n<=0) return ; magma_int_t blkcnt = magma_bulge_get_blkcnt(n, nb, Vblksiz); blkpercore = blkcnt/cores_num; blkpercore = blkpercore==0 ? 1:blkpercore; //magma_int_t nbGblk = magma_ceildiv(n-1, Vblksiz); #ifdef ENABLE_DEBUG if(my_core_id==0) printf(" COMPUTE T parallel threads %d with N %d NB %d Vblksiz %d \n",cores_num,n,nb,Vblksiz); #endif /*======================================== * compute the T's in parallel. * The Ts are independent so each core pick * a T and compute it. The loop is based on * the version 113 of the applyQ * which go over the losange block_column * by block column. but it is not important * here the order because Ts are independent. * ======================================== */ nt = magma_ceildiv((n-1),Vblksiz); for (blkj=nt-1; blkj>=0; blkj--) { /* the index of the first row on the top of block (blkj) */ firstrow = blkj * Vblksiz + 1; /*find the number of tile for this block */ if( blkj == nt-1 ) mt = magma_ceildiv( n - firstrow, nb); else mt = magma_ceildiv( n - (firstrow+1), nb); /*loop over the tiles find the size of the Vs and apply it */ for (blki=mt; blki>0; blki--) { /*calculate the size of each losange of Vs= (Vm,Vn)*/ myrow = firstrow + (mt-blki)*nb; mycol = blkj*Vblksiz; Vm = min( nb+Vblksiz-1, n-myrow); if( ( blkj == nt-1 ) && ( blki == mt ) ){ Vn = min (Vblksiz, Vm); } else { Vn = min (Vblksiz, Vm-1); } /*calculate the pointer to the Vs and the Ts. * Note that Vs and Ts have special storage done * by the bulgechasing function*/ magma_bulge_findVTAUTpos(n, nb, Vblksiz, mycol, myrow, ldv, ldt, &vpos, &taupos, &tpos, &blkid); myid = blkid/blkpercore; if( my_core_id==(myid%cores_num) ){ if( ( Vm > 0 ) && ( Vn > 0 ) ){ lapackf77_slarft( "F", "C", &Vm, &Vn, V(vpos), &ldv, TAU(taupos), T(tpos), &ldt); } } } } }
//################################################################################################## static void *magma_sapplyQ_parallel_section(void *arg) { magma_int_t my_core_id = ((magma_sapplyQ_id_data*)arg) -> id; magma_sapplyQ_data* data = ((magma_sapplyQ_id_data*)arg) -> data; magma_int_t allcores_num = data -> threads_num; magma_int_t n = data -> n; magma_int_t ne = data -> ne; magma_int_t n_gpu = data -> n_gpu; magma_int_t nb = data -> nb; magma_int_t Vblksiz = data -> Vblksiz; float *E = data -> E; magma_int_t lde = data -> lde; float *V = data -> V; magma_int_t ldv = data -> ldv; float *TAU = data -> TAU; float *T = data -> T; magma_int_t ldt = data -> ldt; float *dE = data -> dE; magma_int_t ldde = data -> ldde; pthread_barrier_t* barrier = &(data -> barrier); magma_int_t info; real_Double_t timeQcpu=0.0, timeQgpu=0.0; magma_int_t n_cpu = ne - n_gpu; #if defined(SETAFFINITY) cpu_set_t set; CPU_ZERO( &set ); CPU_SET( my_core_id, &set ); sched_setaffinity( 0, sizeof(set), &set) ; #endif if(my_core_id==0) { //============================================= // on GPU on thread 0: // - apply V2*Z(:,1:N_GPU) //============================================= timeQgpu = magma_wtime(); magma_ssetmatrix(n, n_gpu, E, lde, dE, ldde); magma_sbulge_applyQ_v2('L', n_gpu, n, nb, Vblksiz, dE, ldde, V, ldv, T, ldt, &info); magma_device_sync(); timeQgpu = magma_wtime()-timeQgpu; printf(" Finish Q2_GPU GGG timing= %f \n" ,timeQgpu); }else{ //============================================= // on CPU on threads 1:allcores_num-1: // - apply V2*Z(:,N_GPU+1:NE) //============================================= if(my_core_id == 1) timeQcpu = magma_wtime(); magma_int_t n_loc = magma_ceildiv(n_cpu, allcores_num-1); float* E_loc = E + (n_gpu+ n_loc * (my_core_id-1))*lde; n_loc = min(n_loc,n_cpu - n_loc * (my_core_id-1)); magma_stile_bulge_applyQ('L', n_loc, n, nb, Vblksiz, E_loc, lde, V, ldv, TAU, T, ldt); pthread_barrier_wait(barrier); if(my_core_id == 1){ timeQcpu = magma_wtime()-timeQcpu; printf(" Finish Q2_CPU CCC timing= %f \n" ,timeQcpu); } } // END if my_core_id #if defined(SETAFFINITY) // unbind threads magma_int_t sys_corenbr = 1; sys_corenbr = sysconf(_SC_NPROCESSORS_ONLN); CPU_ZERO( &set ); for(magma_int_t i=0; i<sys_corenbr; i++) CPU_SET( i, &set ); sched_setaffinity( 0, sizeof(set), &set) ; #endif return 0; }
magma_int_t plasma_ceildiv(magma_int_t a, magma_int_t b) { return magma_ceildiv(a,b); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing strtri */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time=0; //, cpu_perf=0, cpu_time=0; float magma_error, norm_invA, work[1]; magma_int_t i, j, N, lda, ldda, info; magma_int_t jb, nb, nblock, sizeA, size_inv; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t *ipiv; float *h_A, *h_dinvA; magmaFloat_ptr d_A, d_dinvA; float c_neg_one = MAGMA_S_NEG_ONE; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) float tol = opts.tolerance * lapackf77_slamch("E"); const char *uplo_ = lapack_uplo_const(opts.uplo); // this is the NB hard coded into strtri_diag. nb = 128; printf("%% uplo = %s, diag = %s\n", lapack_uplo_const(opts.uplo), lapack_diag_const(opts.diag) ); printf("%% N MAGMA Gflop/s (ms) MAGMA error\n"); printf("%%======================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldda = magma_roundup( lda, opts.align ); // multiple of 32 by default nblock = magma_ceildiv( N, nb ); gflops = nblock * FLOPS_STRTRI( nb ) / 1e9; TESTING_MALLOC_CPU( h_A, float, lda*N ); TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); size_inv = nblock*nb*nb; TESTING_MALLOC_DEV( d_A, float, ldda*N ); TESTING_MALLOC_DEV( d_dinvA, float, size_inv ); TESTING_MALLOC_CPU( h_dinvA, float, size_inv ); /* Initialize the matrices */ /* Factor A into LU to get well-conditioned triangular matrix. * Copy L to U, since L seems okay when used with non-unit diagonal * (i.e., from U), while U fails when used with unit diagonal. */ sizeA = lda*N; lapackf77_slarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_sgetrf( &N, &N, h_A, &lda, ipiv, &info ); for( j = 0; j < N; ++j ) { for( i = 0; i < j; ++i ) { *h_A(i,j) = *h_A(j,i); } } /* ===================================================================== Performs operation using MAGMABLAS =================================================================== */ magma_ssetmatrix( N, N, h_A, lda, d_A, ldda, opts.queue ); magma_time = magma_sync_wtime( opts.queue ); magmablas_strtri_diag( opts.uplo, opts.diag, N, d_A, ldda, d_dinvA, opts.queue ); magma_time = magma_sync_wtime( opts.queue ) - magma_time; magma_perf = gflops / magma_time; magma_sgetvector( size_inv, d_dinvA, 1, h_dinvA, 1, opts.queue ); if ( opts.verbose ) { printf( "A%d=", (int) N ); magma_sprint( N, N, h_A, lda ); printf( "d_dinvA%d=", (int) N ); magma_sprint( min(N+4, nb), min(N+4, nblock*nb), h_dinvA, nb ); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { //cpu_time = magma_wtime(); lapackf77_strtri( lapack_uplo_const(opts.uplo), lapack_diag_const(opts.diag), &N, h_A, &lda, &info ); //cpu_time = magma_wtime() - cpu_time; //cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ if ( opts.check ) { // |invA - invA_magma| / |invA|, accumulated over all diagonal blocks magma_error = 0; norm_invA = 0; for( i=0; i < N; i += nb ) { jb = min( nb, N-i ); sgeadd( jb, jb, c_neg_one, h_A(i, i), lda, h_dinvA(0, i), nb ); magma_error = max( magma_error, lapackf77_slantr( "M", uplo_, MagmaNonUnitStr, &jb, &jb, h_dinvA(0, i), &nb, work )); norm_invA = max( norm_invA, lapackf77_slantr( "M", uplo_, MagmaNonUnitStr, &jb, &jb, h_A(i, i), &lda, work )); } magma_error /= norm_invA; // CPU is doing N-by-N inverse, while GPU is doing (N/NB) NB-by-NB inverses. // So don't compare performance. printf("%5d %7.2f (%7.2f) %8.2e %s\n", (int) N, magma_perf, 1000.*magma_time, //cpu_perf, 1000.*cpu_time, magma_error, (magma_error < tol ? "ok" : "failed")); status += ! (magma_error < tol); } else { printf("%5d %7.2f (%7.2f) ---\n", (int) N, magma_perf, 1000.*magma_time ); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( ipiv ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_dinvA ); TESTING_FREE_CPU( h_dinvA ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/** Purpose ------- Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangles of A is stored; - = MagmaLower: Lower triangles of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] nb INTEGER The order of the band matrix A. N >= NB >= 0. @param[in] Vblksiz INTEGER The size of the block of householder vectors applied at once. @param[in] A (workspace) COMPLEX_16 array, dimension (LDA, N) On entry the band matrix stored in the following way: @param[in] lda INTEGER The leading dimension of the array A. LDA >= 2*NB. @param[out] d DOUBLE array, dimension (N) The diagonal elements of the tridiagonal matrix T: D(i) = A(i,i). @param[out] e DOUBLE array, dimension (N-1) The off-diagonal elements of the tridiagonal matrix T: E(i) = A(i,i+1) if UPLO = MagmaUpper, E(i) = A(i+1,i) if UPLO = MagmaLower. @param[out] V COMPLEX_16 array, dimension (BLKCNT, LDV, VBLKSIZ) On exit it contains the blocks of householder reflectors BLKCNT is the number of block and it is returned by the funtion MAGMA_BULGE_GET_BLKCNT. @param[in] ldv INTEGER The leading dimension of V. LDV > NB + VBLKSIZ + 1 @param[out] TAU COMPLEX_16 dimension(BLKCNT, VBLKSIZ) ??? @param[in] compT INTEGER if COMPT = 0 T is not computed if COMPT = 1 T is computed @param[out] T COMPLEX_16 dimension(LDT *) if COMPT = 1 on exit contains the matrices T needed for Q2 if COMPT = 0 T is not referenced @param[in] ldt INTEGER The leading dimension of T. LDT > Vblksiz @ingroup magma_zheev_2stage ********************************************************************/ extern "C" magma_int_t magma_zhetrd_hb2st( magma_uplo_t uplo, magma_int_t n, magma_int_t nb, magma_int_t Vblksiz, magmaDoubleComplex *A, magma_int_t lda, double *d, double *e, magmaDoubleComplex *V, magma_int_t ldv, magmaDoubleComplex *TAU, magma_int_t compT, magmaDoubleComplex *T, magma_int_t ldt) { #ifdef ENABLE_TIMER real_Double_t timeblg=0.0; #endif magma_int_t threads = magma_get_parallel_numthreads(); magma_int_t mklth = magma_get_lapack_numthreads(); magma_set_lapack_numthreads(1); //const char* uplo_ = lapack_uplo_const( uplo ); magma_int_t INgrsiz=1; magma_int_t blkcnt = magma_bulge_get_blkcnt(n, nb, Vblksiz); magma_int_t nbtiles = magma_ceildiv(n, nb); memset(T, 0, blkcnt*ldt*Vblksiz*sizeof(magmaDoubleComplex)); memset(TAU, 0, blkcnt*Vblksiz*sizeof(magmaDoubleComplex)); memset(V, 0, blkcnt*ldv*Vblksiz*sizeof(magmaDoubleComplex)); magma_int_t* prog; magma_malloc_cpu((void**) &prog, (2*nbtiles+threads+10)*sizeof(magma_int_t)); memset(prog, 0, (2*nbtiles+threads+10)*sizeof(magma_int_t)); magma_zbulge_id_data* arg; magma_malloc_cpu((void**) &arg, threads*sizeof(magma_zbulge_id_data)); pthread_t* thread_id; magma_malloc_cpu((void**) &thread_id, threads*sizeof(pthread_t)); pthread_attr_t thread_attr; magma_zbulge_data data_bulge(threads, n, nb, nbtiles, INgrsiz, Vblksiz, compT, A, lda, V, ldv, TAU, T, ldt, prog); // Set one thread per core pthread_attr_init(&thread_attr); pthread_attr_setscope(&thread_attr, PTHREAD_SCOPE_SYSTEM); pthread_setconcurrency(threads); //timing #ifdef ENABLE_TIMER timeblg = magma_wtime(); #endif // Launch threads for (magma_int_t thread = 1; thread < threads; thread++) { arg[thread] = magma_zbulge_id_data(thread, &data_bulge); pthread_create(&thread_id[thread], &thread_attr, magma_zhetrd_hb2st_parallel_section, &arg[thread]); } arg[0] = magma_zbulge_id_data(0, &data_bulge); magma_zhetrd_hb2st_parallel_section(&arg[0]); // Wait for completion for (magma_int_t thread = 1; thread < threads; thread++) { void *exitcodep; pthread_join(thread_id[thread], &exitcodep); } // timing #ifdef ENABLE_TIMER timeblg = magma_wtime()-timeblg; printf(" time BULGE+T = %f\n", timeblg); #endif magma_free_cpu(thread_id); magma_free_cpu(arg); magma_free_cpu(prog); magma_set_lapack_numthreads(mklth); /*================================================ * store resulting diag and lower diag d and e * note that d and e are always real *================================================*/ /* Make diagonal and superdiagonal elements real, * storing them in d and e */ /* In complex case, the off diagonal element are * not necessary real. we have to make off-diagonal * elements real and copy them to e. * When using HouseHolder elimination, * the ZLARFG give us a real as output so, all the * diagonal/off-diagonal element except the last one are already * real and thus we need only to take the abs of the last * one. * */ #if defined(PRECISION_z) || defined(PRECISION_c) if (uplo == MagmaLower) { for (magma_int_t i=0; i < n-1; i++) { d[i] = MAGMA_Z_REAL( A[i*lda ] ); e[i] = MAGMA_Z_REAL( A[i*lda+1] ); } d[n-1] = MAGMA_Z_REAL(A[(n-1)*lda]); } else { /* MagmaUpper not tested yet */ for (magma_int_t i=0; i < n-1; i++) { d[i] = MAGMA_Z_REAL( A[i*lda+nb] ); e[i] = MAGMA_Z_REAL( A[i*lda+nb-1] ); } d[n-1] = MAGMA_Z_REAL(A[(n-1)*lda+nb]); } /* end MagmaUpper */ #else if ( uplo == MagmaLower ) { for (magma_int_t i=0; i < n-1; i++) { d[i] = A[i*lda]; // diag e[i] = A[i*lda+1]; // lower diag } d[n-1] = A[(n-1)*lda]; } else { for (magma_int_t i=0; i < n-1; i++) { d[i] = A[i*lda+nb]; // diag e[i] = A[i*lda+nb-1]; // lower diag } d[n-1] = A[(n-1)*lda+nb]; } #endif return MAGMA_SUCCESS; }
/** Purpose ------- SGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] d_lA REAL array of pointers on the GPU, dimension (ngpu). On entry, the M-by-N matrix A distributed over GPUs (d_lA[d] points to the local matrix on d-th GPU). It uses 1D block column cyclic format with the block size of nb, and each local matrix is stored by column. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. @param[in] ldda INTEGER The leading dimension of the array d_lA. LDDA >= max(1,M). @param[out] ipiv INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. - > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_sgesv_comp ********************************************************************/ extern "C" magma_int_t magma_sgetrf_mgpu( magma_int_t ngpu, magma_int_t m, magma_int_t n, magmaFloat_ptr d_lA[], magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info) { magma_int_t nb, n_local[MagmaMaxGPUs]; magma_int_t maxm; magma_int_t i, j, d, lddat, lddwork; float *d_lAT[MagmaMaxGPUs]; float *d_panel[MagmaMaxGPUs], *work; magma_queue_t queues[MagmaMaxGPUs][2]; /* Check arguments */ *info = 0; if (m < 0) *info = -2; else if (n < 0) *info = -3; else if (ldda < max(1,m)) *info = -5; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* create the queues */ for( d=0; d < ngpu; d++ ) { magma_queue_create( d, &queues[d][0] ); magma_queue_create( d, &queues[d][1] ); } /* Function Body */ nb = magma_get_sgetrf_nb( m, n ); if (nb <= 1 || nb >= n) { /* Use CPU code. */ magma_smalloc_cpu( &work, m * n ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_sgetmatrix( m, n, d_lA[0], ldda, work, m, queues[0][0] ); lapackf77_sgetrf(&m, &n, work, &m, ipiv, info); magma_ssetmatrix( m, n, work, m, d_lA[0], ldda, queues[0][0] ); magma_free_cpu(work); } else { /* Use hybrid blocked code. */ magma_device_t orig_dev; magma_getdevice( &orig_dev ); maxm = magma_roundup( m, 32 ); if ( ngpu > ceil((float)n/nb) ) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu ); *info = -1; return *info; } /* allocate workspace for each GPU */ lddat = magma_roundup( ((magma_ceildiv( n, nb )/ngpu)*nb), 32 ); lddat = magma_ceildiv( n, nb ); /* number of block columns */ lddat = magma_ceildiv( lddat, ngpu ); /* number of block columns per GPU */ lddat = nb*lddat; /* number of columns per GPU */ lddat = magma_roundup( lddat, 32 ); /* make it a multiple of 32 */ for (i=0; i < ngpu; i++) { magma_setdevice(i); /* local-n and local-ld */ n_local[i] = ((n/nb)/ngpu)*nb; if (i < (n/nb)%ngpu) n_local[i] += nb; else if (i == (n/nb)%ngpu) n_local[i] += n%nb; /* workspaces */ if (MAGMA_SUCCESS != magma_smalloc( &d_panel[i], (3+ngpu)*nb*maxm )) { for( j=0; j <= i; j++ ) { magma_setdevice(j); } for( j=0; j < i; j++ ) { magma_setdevice(j); magma_free( d_panel[j] ); magma_free( d_lAT[j] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* local-matrix storage */ if (MAGMA_SUCCESS != magma_smalloc( &d_lAT[i], lddat*maxm )) { for( j=0; j <= i; j++ ) { magma_setdevice(j); magma_free( d_panel[j] ); } for( j=0; j < i; j++ ) { magma_setdevice(j); magma_free( d_lAT[j] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magmablas_stranspose( m, n_local[i], d_lA[i], ldda, d_lAT[i], lddat, queues[i][1] ); } for (i=0; i < ngpu; i++) { magma_setdevice(i); magma_queue_sync(queues[i][0]); } magma_setdevice(0); /* cpu workspace */ lddwork = maxm; if (MAGMA_SUCCESS != magma_smalloc_pinned( &work, lddwork*nb*ngpu )) { for (i=0; i < ngpu; i++ ) { magma_setdevice(i); magma_free( d_panel[i] ); magma_free( d_lAT[i] ); } *info = MAGMA_ERR_HOST_ALLOC; return *info; } /* calling multi-gpu interface with allocated workspaces and queues */ magma_sgetrf2_mgpu(ngpu, m, n, nb, 0, d_lAT, lddat, ipiv, d_panel, work, maxm, queues, info); /* clean up */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); /* save on output */ magmablas_stranspose( n_local[d], m, d_lAT[d], lddat, d_lA[d], ldda, queues[d][0] ); magma_queue_sync(queues[d][0]); magma_queue_sync(queues[d][1]); magma_free( d_lAT[d] ); magma_free( d_panel[d] ); } /* end of for d=1,..,ngpu */ magma_setdevice( orig_dev ); magma_free_pinned( work ); } /* clean up */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_queue_destroy( queues[d][0] ); magma_queue_destroy( queues[d][1] ); } return *info; }
extern "C" magma_int_t magma_zmslice( magma_int_t num_slices, magma_int_t slice, magma_z_matrix A, magma_z_matrix *B, magma_z_matrix *ALOC, magma_z_matrix *ANLOC, magma_index_t *comm_i, magmaDoubleComplex *comm_v, magma_int_t *start, magma_int_t *end, magma_queue_t queue ) { magma_int_t info = 0; if( A.num_rows != A.num_cols ){ printf("%% error: only supported for square matrices.\n"); info = MAGMA_ERR_NOT_SUPPORTED; goto cleanup; } if ( A.memory_location == Magma_CPU && A.storage_type == Magma_CSR ){ CHECK( magma_zmconvert( A, B, Magma_CSR, Magma_CSR, queue ) ); magma_free_cpu( B->col ); magma_free_cpu( B->val ); CHECK( magma_zmconvert( A, ALOC, Magma_CSR, Magma_CSR, queue ) ); magma_free_cpu( ALOC->col ); magma_free_cpu( ALOC->row ); magma_free_cpu( ALOC->val ); CHECK( magma_zmconvert( A, ANLOC, Magma_CSR, Magma_CSR, queue ) ); magma_free_cpu( ANLOC->col ); magma_free_cpu( ANLOC->row ); magma_free_cpu( ANLOC->val ); magma_int_t i,j,k, nnz, nnz_loc=0, loc_row = 0, nnz_nloc = 0; magma_index_t col; magma_int_t size = magma_ceildiv( A.num_rows, num_slices ); magma_int_t lstart = slice*size; magma_int_t lend = min( (slice+1)*size, A.num_rows ); // correct size for last slice size = lend-lstart; CHECK( magma_index_malloc_cpu( &ALOC->row, size+1 ) ); CHECK( magma_index_malloc_cpu( &ANLOC->row, size+1 ) ); // count elements for slice - identity for rest nnz = A.row[ lend ] - A.row[ lstart ] + ( A.num_rows - size ); CHECK( magma_index_malloc_cpu( &B->col, nnz ) ); CHECK( magma_zmalloc_cpu( &B->val, nnz ) ); // for the communication plan for( i=0; i<A.num_rows; i++ ) { comm_i[i] = 0; comm_v[i] = MAGMA_Z_ZERO; } k=0; B->row[i] = 0; ALOC->row[0] = 0; ANLOC->row[0] = 0; // identity above slice for( i=0; i<lstart; i++ ) { B->row[i+1] = B->row[i]+1; B->val[k] = MAGMA_Z_ONE; B->col[k] = i; k++; } // slice for( i=lstart; i<lend; i++ ) { B->row[i+1] = B->row[i] + (A.row[i+1]-A.row[i]); for( j=A.row[i]; j<A.row[i+1]; j++ ){ B->val[k] = A.val[j]; col = A.col[j]; B->col[k] = col; // communication plan if( col<lstart || col>=lend ){ comm_i[ col ] = 1; comm_v[ col ] = comm_v[ col ] + MAGMA_Z_MAKE( MAGMA_Z_ABS( A.val[j] ), 0.0 ); nnz_nloc++; } else { nnz_loc++; } k++; } loc_row++; ALOC->row[ loc_row ] = nnz_loc; ANLOC->row[ loc_row ] = nnz_nloc; } CHECK( magma_index_malloc_cpu( &ALOC->col, nnz_loc ) ); CHECK( magma_zmalloc_cpu( &ALOC->val, nnz_loc ) ); ALOC->num_rows = size; ALOC->num_cols = size; ALOC->nnz = nnz_loc; CHECK( magma_index_malloc_cpu( &ANLOC->col, nnz_nloc ) ); CHECK( magma_zmalloc_cpu( &ANLOC->val, nnz_nloc ) ); ANLOC->num_rows = size; ANLOC->num_cols = A.num_cols; ANLOC->nnz = nnz_nloc; nnz_loc = 0; nnz_nloc = 0; // local/nonlocal matrix for( i=lstart; i<lend; i++ ) { for( j=A.row[i]; j<A.row[i+1]; j++ ){ col = A.col[j]; // insert only in local part in ALOC, nonlocal in ANLOC if( col<lstart || col>=lend ){ ANLOC->val[ nnz_nloc ] = A.val[j]; ANLOC->col[ nnz_nloc ] = col; nnz_nloc++; } else { ALOC->val[ nnz_loc ] = A.val[j]; ALOC->col[ nnz_loc ] = col-lstart; nnz_loc++; } } } // identity below slice for( i=lend; i<A.num_rows; i++ ) { B->row[i+1] = B->row[i]+1; B->val[k] = MAGMA_Z_ONE; B->col[k] = i; k++; } B->nnz = k; *start = lstart; *end = lend; } else { printf("error: mslice only supported for CSR matrices on the CPU: %d %d.\n", int(A.memory_location), int(A.storage_type) ); info = MAGMA_ERR_NOT_SUPPORTED; } cleanup: return info; }
void magmablas_ssymm_mgpu_spec( magma_side_t side, magma_uplo_t uplo, magma_int_t m, magma_int_t n, float alpha, magmaFloat_ptr dA[], magma_int_t ldda, magma_int_t offset, magmaFloat_ptr dB[], magma_int_t lddb, float beta, magmaFloat_ptr dC[], magma_int_t lddc, magmaFloat_ptr dwork[], magma_int_t dworksiz, float *C, magma_int_t ldc, float *work[], magma_int_t worksiz, // TODO unused magma_int_t ngpu, magma_int_t nb, magma_queue_t queues[][20], magma_int_t nqueue, magma_event_t redevents[][MagmaMaxGPUs*MagmaMaxGPUs+10], magma_int_t nbevents, magma_int_t gnode[MagmaMaxGPUs][MagmaMaxGPUs+2], magma_int_t nbcmplx ) { #define dA(dev, i, j) (dA[dev] + (i) + (j)*ldda) #define dB(dev, i, j) (dB[dev] + (i) + (j)*lddb) #define dC(dev, i, j) (dC[dev] + (i) + (j)*lddc) #define dwork(dev, i, j) (dwork[dev] + (i) + (j)*lddwork) #define C(i, j) (C + (i) + (j)*ldc) if ( side != MagmaLeft || uplo != MagmaLower ) { fprintf( stderr, "%s: only Left Lower implemented\n", __func__ ); } assert( ldda >= m ); assert( lddb >= m ); assert( lddc >= m ); assert( nqueue >= ngpu ); assert( nbevents >= ngpu*ngpu ); magmaFloat_ptr dwork1[MagmaMaxGPUs]; magmaFloat_ptr dwork2[MagmaMaxGPUs]; magma_int_t lddwork = lddc; for( magma_int_t dev = 0; dev < ngpu; ++dev ) { dwork1[dev] = dwork[dev]; dwork2[dev] = dwork[dev]+n*lddwork; } assert( dworksiz >= (2*n*lddwork) ); magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_t cstream; magmablasGetKernelStream(&cstream); magma_int_t dev, devperm, myblk, mycolsize, myblkoffst; magma_int_t gdev, gcolsize, gmaster, gngpu; magma_int_t masterdev, lcdev, lccolsize, myngpu; magma_int_t stdev = (offset/nb)%ngpu; magma_int_t blockoffset = offset % nb; magma_int_t fstblksiz = 0; if(blockoffset>0){ fstblksiz = min(m, (nb - blockoffset)); } //magma_int_t nbblk = magma_ceildiv(m, nb); magma_int_t nbblk = magma_ceildiv((m+blockoffset), nb); magma_int_t maxgsize = n*nb*magma_ceildiv(nbblk, ngpu); magma_int_t remm = m- fstblksiz; magma_int_t nbblkoffst = offset/nb; magma_int_t nblstblks = -1; magma_int_t devlstblk = -1; magma_int_t lstblksiz = remm%nb; if(lstblksiz>0){ nblstblks = nbblk%ngpu; devlstblk = (nblstblks-1+ngpu)%ngpu; } magma_int_t nbcmplxactive = 0; magma_int_t cmplxisactive[MagmaMaxGPUs]; magma_int_t gpuisactive[MagmaMaxGPUs]; memset(gpuisactive, 0, MagmaMaxGPUs*sizeof(magma_int_t)); memset(cmplxisactive, 0, MagmaMaxGPUs*sizeof(magma_int_t)); //******************************* // each GPU make a GEMM with the // transpose of its blocks to compute // a final portion of X=A*VT //******************************* /* dB = V*T already ==> dB**H = T**H * V**H * compute T**H * V**H * X is equal to compute locally (VT)**H_i*X_i * then each GPU broadcast its X_i to assemble the full X which is used * to compute W = X - 0.5 * V * T**H * V**H * X = X - 0.5 * V *dwork3 */ if(ngpu ==1){ magma_setdevice( 0 ); magmablasSetKernelStream( queues[ 0 ][ 0 ] ); // compute X[me] = A*VT = A[me]^tr *VT; magma_sgemm( MagmaConjTrans, MagmaNoTrans, m, n, m, alpha, dA(0, offset, offset), ldda, dB[0], lddb, beta, dC[0], lddc ); return; } //ngpu>1 for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) { masterdev = -1; gnode[cmplxid][MagmaMaxGPUs+1] = -1; myngpu = gnode[cmplxid][MagmaMaxGPUs]; for( magma_int_t idev = 0; idev < myngpu; ++idev ) { dev = gnode[cmplxid][idev]; devperm = (dev-stdev+ngpu)%ngpu; myblk = (nbblk/ngpu) + (nbblk%ngpu > devperm ? 1:0 ); mycolsize = myblk*nb; myblkoffst = nb*((nbblkoffst/ngpu)+(nbblkoffst%ngpu > dev?1:0)); if(dev==stdev){ mycolsize -= blockoffset; myblkoffst += blockoffset; // local index in parent matrix } if((devperm==devlstblk)&&(lstblksiz>0)){ mycolsize -= (nb-(remm%nb)); } mycolsize = min(mycolsize, m); if(mycolsize>0){ if(masterdev==-1) masterdev = dev; //printf("dev %d devperm %d on cmplx %d master %d nbblk %d myblk %d m %d n %d mycolsize %d stdev %d fstblksize %d lastdev %d lastsize %d dA(%d, %d, %d) ==> dwork(%d, %d)\n", dev, devperm, cmplxid, masterdev, nbblk, myblk, m, n, mycolsize, stdev, fstblksiz, devlstblk, remm%nb, dev, offset, myblkoffst, dev, maxgsize*dev); gpuisactive[dev] = mycolsize; magma_setdevice( dev ); magmablasSetKernelStream( queues[ dev ][ dev ] ); magma_sgemm( MagmaConjTrans, MagmaNoTrans, mycolsize, n, m, alpha, dA(dev, offset, myblkoffst), ldda, dB(dev, 0, 0), lddb, beta, &dwork[dev][maxgsize*dev], mycolsize ); magma_event_record(redevents[dev][dev*ngpu+dev], queues[dev][dev]); } if(dev == masterdev){ nbcmplxactive = nbcmplxactive +1; cmplxisactive[cmplxid] = 1; gnode[cmplxid][MagmaMaxGPUs+1] = masterdev; } } } /* for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_queue_sync( queues[ dev ][ dev ] ); } */ //******************************* // each Master GPU has the final // result either by receiving // from CPU of by making the add // by himself, so now it is time // to broadcast over the GPUs of // its board. //******************************* //printf("=======================================================================\n"); //printf(" sending \n"); //printf("=======================================================================\n"); for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) { myngpu = gnode[cmplxid][MagmaMaxGPUs]; masterdev = gnode[cmplxid][MagmaMaxGPUs+1]; for( magma_int_t idev = 0; idev < myngpu; ++idev ) { dev = gnode[cmplxid][idev]; mycolsize = gpuisactive[dev]; if(mycolsize>0){ // I am an active GPU send my portion local // to all active gpu of my cmplex and global to the // active master of the other real and they should // send it out to their actives slaves. magma_setdevice( dev ); //============================================== // sending to the master of the active real //============================================== //printf ("\n\n**************GPU %d\n ", dev); //printf (" GPU %d sending to cmplx masters\n", dev); for( magma_int_t k = 0; k < nbcmplx; ++k ) { if(k!=cmplxid){ gmaster = gnode[k][MagmaMaxGPUs+1]; if(gmaster!=-1){ //real is active //printf (" device %d from cmplx %d is sending to master %d on cmplx %d block of size %d event %d\n", dev, cmplxid, gmaster, k, mycolsize, redevents[dev][gmaster*ngpu+dev]); magma_queue_wait_event(queues[ dev ][ gmaster ], redevents[dev][dev*ngpu+dev]); magma_scopymatrix_async( mycolsize, n, &dwork[dev ][maxgsize*dev], mycolsize, &dwork[gmaster][maxgsize*dev], mycolsize, queues[dev][gmaster] ); magma_event_record(redevents[dev][gmaster*ngpu+dev], queues[dev][gmaster]); } } } //============================================== // //============================================== // sending to the active GPUs of my real //============================================== //printf (" GPU %d sending internal\n", dev); for( magma_int_t l = 0; l < myngpu; ++l ) { lcdev = gnode[cmplxid][l]; lccolsize = gpuisactive[lcdev]; if((lcdev!=dev)&&(lccolsize>0)){ //printf (" device %d from cmplx %d is sending internal to dev %d block of size %d event %d\n", dev, cmplxid, lcdev, mycolsize, redevents[dev][lcdev*ngpu+dev]); magma_queue_wait_event(queues[ dev ][ lcdev ], redevents[dev][dev*ngpu+dev]); magma_scopymatrix_async( mycolsize, n, &dwork[dev ][maxgsize*dev], mycolsize, &dwork[lcdev][maxgsize*dev], mycolsize, queues[dev][lcdev] ); magma_event_record(redevents[dev][lcdev*ngpu+dev], queues[dev][lcdev]); } } //============================================== }// end if mycolsize>0 }// for idev }// for cmplxid //printf("=======================================================================\n"); //printf(" master wait and resend internally \n"); //printf("=======================================================================\n"); for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) { myngpu = gnode[cmplxid][MagmaMaxGPUs]; masterdev = gnode[cmplxid][MagmaMaxGPUs+1]; //============================================== // if I am active master so wait receiving contribution // of the GPUs of other real and send it locally //============================================== if(masterdev != -1){ mycolsize = gpuisactive[masterdev]; magma_setdevice( masterdev ); //printf(" GPU %d distributing internal\n", masterdev); for( magma_int_t k = 0; k < nbcmplx; ++k ) { if(k!=cmplxid){ gngpu = gnode[k][MagmaMaxGPUs]; for( magma_int_t g = 0; g < gngpu; ++g ) { gdev = gnode[k][g]; gcolsize = gpuisactive[gdev]; // check if I received from this GPU, // if yes send it to my group if(gcolsize>0){ magma_queue_wait_event(queues[ masterdev ][ gdev ], redevents[gdev][masterdev*ngpu+gdev]); for( magma_int_t l = 0; l < myngpu; ++l ) { lcdev = gnode[cmplxid][l]; lccolsize = gpuisactive[lcdev]; if((lcdev!=masterdev)&&(lccolsize>0)){ //printf(" Master %d on cmplx %d waiting on event %d is distributing internal results of %d to lcdev %d block of size %d event %d\n", masterdev, cmplxid, redevents[gdev][masterdev*ngpu+gdev], gdev, lcdev, gcolsize, redevents[masterdev][lcdev*ngpu+gdev]); magma_scopymatrix_async( gcolsize, n, &dwork[masterdev][maxgsize*gdev], gcolsize, &dwork[lcdev ][maxgsize*gdev], gcolsize, queues[masterdev][gdev] ); magma_event_record(redevents[masterdev][lcdev*ngpu+gdev], queues[masterdev][gdev]); } } } } } } }// if active master //============================================== }// for cmplxid /* for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_queue_sync( queues[ dev ][ 0 ] ); for( magma_int_t s = 0; s < ngpu; ++s ) { magma_queue_sync( queues[ dev ][ s ] ); } } */ //printf("=======================================================================\n"); //printf(" distributing \n"); //printf("=======================================================================\n"); magma_int_t lcblki, gbblki, gblk, ib; for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) { myngpu = gnode[cmplxid][MagmaMaxGPUs]; masterdev = gnode[cmplxid][MagmaMaxGPUs+1]; for( magma_int_t idev = 0; idev < myngpu; ++idev ) { dev = gnode[cmplxid][idev]; mycolsize = gpuisactive[dev]; if(mycolsize>0){ // I am an active GPU //printf("\n\n==============GPU %d collecting\n", dev); magma_setdevice( dev ); // collect my results first as tyhere is no need to wait to // receive nothing, just wait that my gemm are done. // in theory this should be inside the loop but cuda was not // able to run it first for all gpu and on gpu>0 it was waiting // however it was on different stream so it should run. but maybe // this is because there are too many function call and this make // cuda not handleit so nice. anyway it coul dbe removed when cuda // is able to lunch it first without wait. gdev = dev; gcolsize = gpuisactive[gdev]; if(gcolsize>0){ devperm = (gdev-stdev+ngpu)%ngpu; gblk = (nbblk/ngpu) + (nbblk%ngpu > devperm ? 1:0 ); magmablasSetKernelStream( queues[ dev ][ gdev ] ); magma_queue_wait_event(queues[ dev ][ gdev ], redevents[gdev][dev*ngpu+gdev]); //printf (" GPU %d stream %d doing slacpy\n", dev, queues[ dev ][ gdev ]); for( magma_int_t blki = 0; blki < gblk; ++blki){ gbblki = (blki*ngpu + devperm)*nb - blockoffset; lcblki = blki*nb; ib = nb;//min(nb, m-gbblki); if(gdev==stdev){ lcblki = blki*nb-blockoffset; if(blki==0){ gbblki = 0; lcblki = 0; ib = nb-blockoffset; } } ib = min(ib, m-gbblki); //printf(" blockoffset %d nbblk %d stdev %d receiving from gdev %d gblk %d gcolsize %d copying blki %d of size ibxn %dx%d from work[%d] to C[%d]\n", blockoffset, nbblk, stdev, gdev, gblk, gcolsize, blki, ib, n, lcblki, gbblki); magmablas_slacpy( MagmaFull, ib, n, &dwork[dev][maxgsize*gdev+lcblki], gcolsize, &dC[dev][gbblki], lddc); }// end blki } for( magma_int_t k = 0; k < nbcmplx; ++k ) { gngpu = gnode[k][MagmaMaxGPUs]; for( magma_int_t g = 0; g < gngpu; ++g ) { gdev = gnode[k][g]; gcolsize = gpuisactive[gdev]; // if gcolsize>0, ==> gpu gdev was active and so // I received from him/computed a portion of dwork, // so go over its gblk and distribute it on dC. if(gdev!=dev){ if(gcolsize>0){ devperm = (gdev-stdev+ngpu)%ngpu; gblk = (nbblk/ngpu) + (nbblk%ngpu > devperm ? 1:0 ); magmablasSetKernelStream( queues[ dev ][ gdev ] ); if(k==cmplxid){ //we are on the same group so wait on event issued by gdev for me citing his id magma_queue_wait_event(queues[ dev ][ gdev ], redevents[gdev][dev*ngpu+gdev]); //printf (" GPU %d queue %d waiting on event %d to collecte from %d the size of gcolsize %d\n", dev, queues[ dev ][ gdev ], redevents[gdev][dev*ngpu+gdev], gdev, gcolsize); }else{ //we are on different group so: //if I am the master wait on the event issued by gdev for me citing his id //else wait event issued by my master for me on the behalf of gdev //printf (" GPU %d queue %d waiting on event %d to collecte from %d the size of gcolsize %d\n", dev, queues[ dev ][ gdev ], redevents[masterdev][dev*ngpu+gdev], gdev, gcolsize); if(dev==masterdev) magma_queue_wait_event(queues[ dev ][ gdev ], redevents[gdev][dev*ngpu+gdev]); else magma_queue_wait_event(queues[ dev ][ gdev ], redevents[masterdev][dev*ngpu+gdev]); } //printf (" GPU %d stream %d doing slacpy\n", dev, queues[ dev ][ gdev ]); for( magma_int_t blki = 0; blki < gblk; ++blki){ gbblki = (blki*ngpu + devperm)*nb - blockoffset; lcblki = blki*nb; ib = nb;//min(nb, m-gbblki); if(gdev==stdev){ lcblki = blki*nb-blockoffset; if(blki==0){ gbblki = 0; lcblki = 0; ib = nb-blockoffset; } } ib = min(ib, m-gbblki); //printf(" blockoffset %d nbblk %d stdev %d receiving from gdev %d gblk %d gcolsize %d copying blki %d of size ibxn %dx%d from work[%d] to C[%d]\n", blockoffset, nbblk, stdev, gdev, gblk, gcolsize, blki, ib, n, lcblki, gbblki); magmablas_slacpy( MagmaFull, ib, n, &dwork[dev][maxgsize*gdev+lcblki], gcolsize, &dC[dev][gbblki], lddc); }// end blki }// en gcolsize>0 meaning gdev is active } // end if gdev != dev }// end loop over the g gpus of the cmplx k }//end loop over the real k }// end mycolsize>0 meaning that I am active }// end loop over idev of cmplxid }// end loop of the cmplx for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_device_sync(); } // put back the input gpu and its input stream magma_setdevice( cdev ); magmablasSetKernelStream( cstream ); }
int main(int argc, char **argv) { TESTING_INIT(); const float c_neg_one = MAGMA_S_NEG_ONE; const magma_int_t ione = 1; real_Double_t atomics_perf=0, atomics_time=0; real_Double_t gflops, magma_perf=0, magma_time=0, cublas_perf, cublas_time, cpu_perf, cpu_time; float magma_error=0, atomics_error=0, cublas_error, work[1]; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t N, lda, ldda, sizeA, sizeX, sizeY, blocks, ldwork; magma_int_t incx = 1; magma_int_t incy = 1; magma_int_t nb = 64; float alpha = MAGMA_S_MAKE( 1.5, -2.3 ); float beta = MAGMA_S_MAKE( -0.6, 0.8 ); float *A, *X, *Y, *Yatomics, *Ycublas, *Ymagma; magmaFloat_ptr dA, dX, dY, dwork; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); float tol = opts.tolerance * lapackf77_slamch("E"); printf("%% uplo = %s\n", lapack_uplo_const(opts.uplo) ); printf("%% N MAGMA Gflop/s (ms) Atomics Gflop/s CUBLAS Gflop/s CPU Gflop/s MAGMA error Atomics CUBLAS\n"); printf("%%=====================================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldda = magma_roundup( N, opts.align ); // multiple of 32 by default sizeA = N*lda; sizeX = N*incx; sizeY = N*incy; gflops = FLOPS_SSYMV( N ) / 1e9; TESTING_MALLOC_CPU( A, float, sizeA ); TESTING_MALLOC_CPU( X, float, sizeX ); TESTING_MALLOC_CPU( Y, float, sizeY ); TESTING_MALLOC_CPU( Yatomics, float, sizeY ); TESTING_MALLOC_CPU( Ycublas, float, sizeY ); TESTING_MALLOC_CPU( Ymagma, float, sizeY ); TESTING_MALLOC_DEV( dA, float, ldda*N ); TESTING_MALLOC_DEV( dX, float, sizeX ); TESTING_MALLOC_DEV( dY, float, sizeY ); blocks = magma_ceildiv( N, nb ); ldwork = ldda*blocks; TESTING_MALLOC_DEV( dwork, float, ldwork ); magmablas_slaset( MagmaFull, ldwork, 1, MAGMA_S_NAN, MAGMA_S_NAN, dwork, ldwork, opts.queue ); magmablas_slaset( MagmaFull, ldda, N, MAGMA_S_NAN, MAGMA_S_NAN, dA, ldda, opts.queue ); /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &sizeA, A ); magma_smake_symmetric( N, A, lda ); // should not use data from the opposite triangle -- fill with NAN to check magma_int_t N1 = N-1; if ( opts.uplo == MagmaUpper ) { lapackf77_slaset( "Lower", &N1, &N1, &MAGMA_S_NAN, &MAGMA_S_NAN, &A[1], &lda ); } else { lapackf77_slaset( "Upper", &N1, &N1, &MAGMA_S_NAN, &MAGMA_S_NAN, &A[lda], &lda ); } lapackf77_slarnv( &ione, ISEED, &sizeX, X ); lapackf77_slarnv( &ione, ISEED, &sizeY, Y ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_ssetmatrix( N, N, A, lda, dA, ldda, opts.queue ); magma_ssetvector( N, X, incx, dX, incx, opts.queue ); magma_ssetvector( N, Y, incy, dY, incy, opts.queue ); cublas_time = magma_sync_wtime( opts.queue ); #ifdef HAVE_CUBLAS cublasSsymv( opts.handle, cublas_uplo_const(opts.uplo), N, &alpha, dA, ldda, dX, incx, &beta, dY, incy ); #else magma_ssymv( opts.uplo, N, alpha, dA, 0, ldda, dX, 0, incx, beta, dY, 0, incy, opts.queue ); #endif cublas_time = magma_sync_wtime( opts.queue ) - cublas_time; cublas_perf = gflops / cublas_time; magma_sgetvector( N, dY, incy, Ycublas, incy, opts.queue ); /* ===================================================================== Performs operation using CUBLAS - using atomics =================================================================== */ #ifdef HAVE_CUBLAS cublasSetAtomicsMode( opts.handle, CUBLAS_ATOMICS_ALLOWED ); magma_ssetvector( N, Y, incy, dY, incy, opts.queue ); // sync on queue doesn't work -- need device sync or use NULL stream -- bug in CUBLAS? atomics_time = magma_sync_wtime( NULL /*opts.queue*/ ); cublasSsymv( opts.handle, cublas_uplo_const(opts.uplo), N, &alpha, dA, ldda, dX, incx, &beta, dY, incy ); atomics_time = magma_sync_wtime( NULL /*opts.queue*/ ) - atomics_time; atomics_perf = gflops / atomics_time; magma_sgetvector( N, dY, incy, Yatomics, incy, opts.queue ); cublasSetAtomicsMode( opts.handle, CUBLAS_ATOMICS_NOT_ALLOWED ); #endif /* ===================================================================== Performs operation using MAGMABLAS =================================================================== */ #ifdef HAVE_CUBLAS magma_ssetvector( N, Y, incy, dY, incy, opts.queue ); magma_time = magma_sync_wtime( opts.queue ); if ( opts.version == 1 ) { magmablas_ssymv_work( opts.uplo, N, alpha, dA, ldda, dX, incx, beta, dY, incy, dwork, ldwork, opts.queue ); } else { // non-work interface (has added overhead) magmablas_ssymv( opts.uplo, N, alpha, dA, ldda, dX, incx, beta, dY, incy, opts.queue ); } magma_time = magma_sync_wtime( opts.queue ) - magma_time; magma_perf = gflops / magma_time; magma_sgetvector( N, dY, incy, Ymagma, incy, opts.queue ); #endif /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ cpu_time = magma_wtime(); blasf77_ssymv( lapack_uplo_const(opts.uplo), &N, &alpha, A, &lda, X, &incx, &beta, Y, &incy ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result =================================================================== */ blasf77_saxpy( &N, &c_neg_one, Y, &incy, Ycublas, &incy ); cublas_error = lapackf77_slange( "M", &N, &ione, Ycublas, &N, work ) / N; #ifdef HAVE_CUBLAS blasf77_saxpy( &N, &c_neg_one, Y, &incy, Yatomics, &incy ); atomics_error = lapackf77_slange( "M", &N, &ione, Yatomics, &N, work ) / N; blasf77_saxpy( &N, &c_neg_one, Y, &incy, Ymagma, &incy ); magma_error = lapackf77_slange( "M", &N, &ione, Ymagma, &N, work ) / N; #endif bool okay = (magma_error < tol && cublas_error < tol && atomics_error < tol); status += ! okay; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %8.2e %s\n", (int) N, magma_perf, 1000.*magma_time, atomics_perf, 1000.*atomics_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error, atomics_error, (okay ? "ok" : "failed")); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( X ); TESTING_FREE_CPU( Y ); TESTING_FREE_CPU( Ycublas ); TESTING_FREE_CPU( Yatomics ); TESTING_FREE_CPU( Ymagma ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dX ); TESTING_FREE_DEV( dY ); TESTING_FREE_DEV( dwork ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/** Purpose ------- Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangles of A is stored; - = MagmaLower: Lower triangles of A is stored. @param[in] n INTEGER The order of the matrix A. n >= 0. @param[in] nb INTEGER The order of the band matrix A. n >= nb >= 0. @param[in] Vblksiz INTEGER The size of the block of householder vectors applied at once. @param[in] A (workspace) DOUBLE PRECISION array, dimension (lda, n) On entry the band matrix stored in the following way: @param[in] lda INTEGER The leading dimension of the array A. lda >= 2*nb. @param[out] d DOUBLE array, dimension (n) The diagonal elements of the tridiagonal matrix T: D(i) = A(i,i). @param[out] e DOUBLE array, dimension (n-1) The off-diagonal elements of the tridiagonal matrix T: E(i) = A(i,i+1) if UPLO = MagmaUpper, E(i) = A(i+1,i) if UPLO = MagmaLower. @param[out] V DOUBLE PRECISION array, dimension (BLKCNT, LDV, VBLKSIZ) On exit it contains the blocks of householder reflectors BLKCNT is the number of block and it is returned by the funtion MAGMA_BULGE_GET_BLKCNT. @param[in] ldv INTEGER The leading dimension of V. LDV > nb + VBLKSIZ + 1 @param[out] TAU DOUBLE PRECISION dimension(BLKCNT, VBLKSIZ) ??? @param[in] wantz INTEGER if COMPT = 0 T is not computed if COMPT = 1 T is computed @param[out] T DOUBLE PRECISION dimension(LDT *) if COMPT = 1 on exit contains the matrices T needed for Q2 if COMPT = 0 T is not referenced @param[in] ldt INTEGER The leading dimension of T. LDT > Vblksiz @ingroup magma_dsyev_2stage ********************************************************************/ extern "C" magma_int_t magma_dsytrd_sb2st( magma_uplo_t uplo, magma_int_t n, magma_int_t nb, magma_int_t Vblksiz, double *A, magma_int_t lda, double *d, double *e, double *V, magma_int_t ldv, double *TAU, magma_int_t wantz, double *T, magma_int_t ldt) { #ifdef ENABLE_TIMER real_Double_t timeblg=0.0; #endif magma_int_t parallel_threads = magma_get_parallel_numthreads(); magma_int_t mklth = magma_get_lapack_numthreads(); magma_int_t ompth = magma_get_omp_numthreads(); //magma_set_omp_numthreads(1); //magma_set_lapack_numthreads(1); magma_int_t blkcnt, sizTAU2, sizT2, sizV2; magma_dbulge_getstg2size(n, nb, wantz, Vblksiz, ldv, ldt, &blkcnt, &sizTAU2, &sizT2, &sizV2); memset(T, 0, sizT2*sizeof(double)); memset(TAU, 0, sizTAU2*sizeof(double)); memset(V, 0, sizV2*sizeof(double)); magma_int_t INgrsiz=1; magma_int_t nbtiles = magma_ceildiv(n, nb); volatile magma_int_t* prog; magma_malloc_cpu((void**) &prog, (2*nbtiles+parallel_threads+10)*sizeof(magma_int_t)); memset((void *) prog, 0, (2*nbtiles+parallel_threads+10)*sizeof(magma_int_t)); magma_dbulge_id_data* arg; magma_malloc_cpu((void**) &arg, parallel_threads*sizeof(magma_dbulge_id_data)); pthread_t* thread_id; magma_malloc_cpu((void**) &thread_id, parallel_threads*sizeof(pthread_t)); pthread_attr_t thread_attr; magma_dbulge_data data_bulge; magma_dbulge_data_init(&data_bulge, parallel_threads, n, nb, nbtiles, INgrsiz, Vblksiz, wantz, A, lda, V, ldv, TAU, T, ldt, prog); // Set one thread per core pthread_attr_init(&thread_attr); pthread_attr_setscope(&thread_attr, PTHREAD_SCOPE_SYSTEM); pthread_setconcurrency(parallel_threads); //timing #ifdef ENABLE_TIMER timeblg = magma_wtime(); #endif // Launch threads for (magma_int_t thread = 1; thread < parallel_threads; thread++) { magma_dbulge_id_data_init(&(arg[thread]), thread, &data_bulge); pthread_create(&thread_id[thread], &thread_attr, magma_dsytrd_sb2st_parallel_section, &arg[thread]); } magma_dbulge_id_data_init(&(arg[0]), 0, &data_bulge); magma_dsytrd_sb2st_parallel_section(&arg[0]); // Wait for completion for (magma_int_t thread = 1; thread < parallel_threads; thread++) { void *exitcodep; pthread_join(thread_id[thread], &exitcodep); } // timing #ifdef ENABLE_TIMER timeblg = magma_wtime()-timeblg; printf(" time BULGE+T = %f\n", timeblg); #endif magma_free_cpu(thread_id); magma_free_cpu(arg); magma_free_cpu((void *) prog); magma_dbulge_data_destroy(&data_bulge); magma_set_omp_numthreads(ompth); magma_set_lapack_numthreads(mklth); /*================================================ * store resulting diag and lower diag d and e * note that d and e are always real *================================================*/ /* Make diagonal and superdiagonal elements real, * storing them in d and e */ /* In real case, the off diagonal element are * not necessary real. we have to make off-diagonal * elements real and copy them to e. * When using HouseHolder elimination, * the DLARFG give us a real as output so, all the * diagonal/off-diagonal element except the last one are already * real and thus we need only to take the abs of the last * one. * */ #ifdef COMPLEX if (uplo == MagmaLower) { for (magma_int_t i=0; i < n-1; i++) { d[i] = MAGMA_D_REAL( A[i*lda ] ); e[i] = MAGMA_D_REAL( A[i*lda+1] ); } d[n-1] = MAGMA_D_REAL(A[(n-1)*lda]); } else { /* MagmaUpper not tested yet */ for (magma_int_t i=0; i < n-1; i++) { d[i] = MAGMA_D_REAL( A[i*lda+nb] ); e[i] = MAGMA_D_REAL( A[i*lda+nb-1] ); } d[n-1] = MAGMA_D_REAL(A[(n-1)*lda+nb]); } /* end MagmaUpper */ #else if ( uplo == MagmaLower ) { for (magma_int_t i=0; i < n-1; i++) { d[i] = A[i*lda]; // diag e[i] = A[i*lda+1]; // lower diag } d[n-1] = A[(n-1)*lda]; } else { for (magma_int_t i=0; i < n-1; i++) { d[i] = A[i*lda+nb]; // diag e[i] = A[i*lda+nb-1]; // lower diag } d[n-1] = A[(n-1)*lda+nb]; } #endif return MAGMA_SUCCESS; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgeqrf_mgpu */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; double error, work[1]; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_R, *tau, *h_work, tmp[1]; magmaDouble_ptr d_lA[ MagmaMaxGPUs ]; magma_int_t M, N, n2, lda, ldda, n_local, ngpu; magma_int_t info, min_mn, nb, lhwork; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}, ISEED2[4]; magma_opts opts; opts.parse_opts( argc, argv ); opts.ngpu = abs( opts.ngpu ); // always uses multi-GPU code opts.lapack |= (opts.check == 2); // check (-c2) implies lapack (-l) magma_int_t status = 0; double eps = lapackf77_dlamch("E"); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("%% ngpu %d\n", (int) opts.ngpu ); if ( opts.check == 1 ) { printf("%% M N CPU Gflop/s (sec) GPU Gflop/s (sec) ||R-Q'A||_1 / (M*||A||_1) ||I-Q'Q||_1 / M\n"); printf("%%===============================================================================================\n"); } else { printf("%% M N CPU Gflop/s (sec) GPU Gflop/s (sec) ||R||_F /(M*||A||_F)\n"); printf("%%=========================================================================\n"); } for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = magma_roundup( M, opts.align ); // multiple of 32 by default nb = magma_get_dgeqrf_nb( M, N ); gflops = FLOPS_DGEQRF( M, N ) / 1e9; // ngpu must be at least the number of blocks ngpu = min( opts.ngpu, magma_ceildiv(N,nb) ); if ( ngpu < opts.ngpu ) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu ); } // query for workspace size lhwork = -1; lapackf77_dgeqrf( &M, &N, NULL, &M, NULL, tmp, &lhwork, &info ); lhwork = (magma_int_t) MAGMA_D_REAL( tmp[0] ); // Allocate host memory for the matrix TESTING_MALLOC_CPU( tau, double, min_mn ); TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_CPU( h_work, double, lhwork ); TESTING_MALLOC_PIN( h_R, double, n2 ); // Allocate device memory for( int dev = 0; dev < ngpu; dev++ ) { n_local = ((N/nb)/ngpu)*nb; if (dev < (N/nb) % ngpu) n_local += nb; else if (dev == (N/nb) % ngpu) n_local += N % nb; magma_setdevice( dev ); TESTING_MALLOC_DEV( d_lA[dev], double, ldda*n_local ); } /* Initialize the matrix */ for( int j=0; j < 4; j++ ) ISEED2[j] = ISEED[j]; // save seeds lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaFullStr, &M, &N, h_A, &lda, h_R, &lda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { double *tau2; TESTING_MALLOC_CPU( tau2, double, min_mn ); cpu_time = magma_wtime(); lapackf77_dgeqrf( &M, &N, h_A, &lda, tau2, h_work, &lhwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapack_dgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } TESTING_FREE_CPU( tau2 ); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_dsetmatrix_1D_col_bcyclic( M, N, h_R, lda, d_lA, ldda, ngpu, nb ); gpu_time = magma_wtime(); magma_dgeqrf2_mgpu( ngpu, M, N, d_lA, ldda, tau, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_dgeqrf2 returned error %d: %s.\n", (int) info, magma_strerror( info )); } magma_dgetmatrix_1D_col_bcyclic( M, N, d_lA, ldda, h_R, lda, ngpu, nb ); if ( opts.check == 1 && M >= N ) { /* ===================================================================== Check the result -- dqrt02 requires M >= N =================================================================== */ magma_int_t lwork = n2+N; double *h_W1, *h_W2, *h_W3; double *h_RW, results[2]; TESTING_MALLOC_CPU( h_W1, double, n2 ); // Q TESTING_MALLOC_CPU( h_W2, double, n2 ); // R TESTING_MALLOC_CPU( h_W3, double, lwork ); // WORK TESTING_MALLOC_CPU( h_RW, double, M ); // RWORK lapackf77_dlarnv( &ione, ISEED2, &n2, h_A ); lapackf77_dqrt02( &M, &N, &min_mn, h_A, h_R, h_W1, h_W2, &lda, tau, h_W3, &lwork, h_RW, results ); results[0] *= eps; results[1] *= eps; if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, results[0], results[1] ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, gpu_perf, gpu_time, results[0], results[1] ); } // todo also check results[1] < tol? printf(" %s\n", (results[0] < tol ? "ok" : "failed")); status += ! (results[0] < tol); TESTING_FREE_CPU( h_W1 ); TESTING_FREE_CPU( h_W2 ); TESTING_FREE_CPU( h_W3 ); TESTING_FREE_CPU( h_RW ); } else if ( opts.check == 2 ) { /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = lapackf77_dlange("f", &M, &N, h_A, &lda, work ); blasf77_daxpy( &n2, &c_neg_one, h_A, &ione, h_R, &ione ); error = lapackf77_dlange("f", &M, &N, h_R, &lda, work ) / (min_mn*error); printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) ---", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) ---", (int) M, (int) N, gpu_perf, gpu_time); } printf("%s\n", (opts.check != 0 ? " (error check only for M >= N)" : "")); } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); for( int dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); TESTING_FREE_DEV( d_lA[dev] ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
void magmablas_ssymm_mgpu_com( magma_side_t side, magma_uplo_t uplo, magma_int_t m, magma_int_t n, float alpha, float *dA[], magma_int_t ldda, magma_int_t offset, float *dB[], magma_int_t lddb, float beta, float *dC[], magma_int_t lddc, float *dwork[], magma_int_t dworksiz, float *C, magma_int_t ldc, float *work[], magma_int_t worksiz, magma_int_t ngpu, magma_int_t nb, magma_queue_t streams[][20], magma_int_t nstream, magma_event_t redevents[][MagmaMaxGPUs*MagmaMaxGPUs+10], magma_int_t nbevents, magma_int_t gnode[MagmaMaxGPUs][MagmaMaxGPUs+2], magma_int_t nbcmplx ) { #define dA(dev, i, j) (dA[dev] + (i) + (j)*ldda) #define dB(dev, i, j) (dB[dev] + (i) + (j)*lddb) #define dC(dev, i, j) (dC[dev] + (i) + (j)*lddc) #define dwork(dev, i, j) (dwork[dev] + (i) + (j)*lddwork) #define C(i, j) (C + (i) + (j)*ldc) //printf("####################################################\n"); //printf(" start ssymm \n"); //printf("####################################################\n"); if ( side != MagmaLeft || uplo != MagmaLower ) { fprintf( stderr, "%s: only Left Lower implemented\n", __func__ ); } assert( ldda >= m ); assert( lddb >= m ); assert( lddc >= m ); assert( nstream >= ngpu ); assert( nbevents >= ngpu*ngpu ); float c_one = MAGMA_S_ONE; float *dwork1[MagmaMaxGPUs]; float *dwork2[MagmaMaxGPUs]; magma_int_t maxgsize = n*m; magma_int_t lddwork = lddc; magma_int_t ldwork = m; for( magma_int_t dev = 0; dev < ngpu; ++dev ) { dwork1[dev] = dwork[dev]; // size of dwork1 is n*lddwork dwork2[dev] = dwork[dev]+n*lddwork; // size of dwork2 is maxgsize*ngpu } assert( dworksiz >= (n*lddwork+maxgsize*ngpu) ); assert( worksiz >= (n*ldwork) ); magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_t cstream; magmablasGetKernelStream(&cstream); magma_int_t dev, devperm, myblk, mycolsize, myblkoffst; magma_int_t gmaster; magma_int_t masterdev, lcdev, lccolsize, myngpu; magma_int_t stdev = (offset/nb)%ngpu; magma_int_t blockoffset = offset % nb; magma_int_t fstblksiz = 0; if(blockoffset>0){ fstblksiz = min(m, (nb - blockoffset)); } //magma_int_t nbblk = magma_ceildiv(m, nb); magma_int_t nbblk = magma_ceildiv((m+blockoffset), nb); magma_int_t remm = m- fstblksiz; magma_int_t nbblkoffst = offset/nb; magma_int_t nblstblks = -1; magma_int_t devlstblk = -1; magma_int_t lstblksiz = remm%nb; if(lstblksiz>0){ nblstblks = nbblk%ngpu; devlstblk = (nblstblks-1+ngpu)%ngpu; } magma_int_t nbcmplxactive = 0; magma_int_t cmplxisactive[MagmaMaxGPUs]; magma_int_t gpuisactive[MagmaMaxGPUs]; memset(gpuisactive, 0, MagmaMaxGPUs*sizeof(magma_int_t)); memset(cmplxisactive, 0, MagmaMaxGPUs*sizeof(magma_int_t)); for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magmablasSetKernelStream( streams[ dev ][ 0 ] ); cudaMemset(dwork(dev,0,0), 0, (lddwork)*(n)*sizeof(float) ); // put all dC on all dev to 0 except the one which // hold i==0 because this one has to multiply by beta. if(dev!=stdev){ cudaMemset(dC(dev,0,0), 0, (lddc)*(n)*sizeof(float) ); } } magma_int_t newoffset = offset; // 1. symmetrize if(blockoffset>0){ newoffset = offset+fstblksiz; // newoffset is adjusted over nb magma_int_t myblkoffst = (nbblkoffst/ngpu)+(nbblkoffst%ngpu > stdev?1:0); //printf("STDEV %d voici offset %d remm %d myblockoffset %d siz %d \n", stdev, offset, remm, myblkoffst, fstblksiz); magma_setdevice( stdev ); magmablasSetKernelStream( streams[ stdev ][ 0 ] ); magmablas_ssymmetrize_tiles( MagmaLower, fstblksiz, dA(stdev, offset, myblkoffst*nb+blockoffset), ldda, 1, ngpu*nb, nb ); } for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_int_t newstdev = (newoffset/nb)%ngpu; magma_int_t nbblk = remm/nb; // number of block of size nb. if m%nb>0 then a last block exist and is of size ib=m%nb magma_int_t myblk = (nbblk/ngpu) + (nbblk%ngpu > ((dev-newstdev+ngpu)%ngpu) ? 1:0 ); magma_int_t devperm = (dev-newstdev+ngpu)%ngpu; magma_int_t nbblkoffst = newoffset/nb; magma_int_t myblkoffst = (nbblkoffst/ngpu)+(nbblkoffst%ngpu > dev?1:0); //printf("dev %d devperm %d newoffset %d rowoff %d coloff %d myblk %d \n", dev, devperm, newoffset, newoffset+devperm*nb, myblkoffst*nb, myblk); magma_setdevice( dev ); magmablasSetKernelStream( streams[ dev ][ 0 ] ); magmablas_ssymmetrize_tiles( MagmaLower, nb, dA(dev, newoffset+devperm*nb, myblkoffst*nb), ldda, myblk, ngpu*nb, nb ); if(remm%nb>0){ magma_int_t nblstblks = (nbblk+1)%ngpu; magma_int_t devlstblk = (nblstblks-1+ngpu)%ngpu; //printf("==> siz %d devperm %d, devlstblk %d, newoffset+nbblk*nb %d, myblkoffst*nb+ myblk*nb %d\n", remm % nb, devperm, devlstblk, newoffset+nbblk*nb, myblkoffst*nb+ myblk*nb); if(devperm==devlstblk) magmablas_ssymmetrize( MagmaLower, remm % nb, dA(dev, newoffset+nbblk*nb, myblkoffst*nb+ myblk*nb), ldda ); // last partial tile } } /* magma_int_t siz = m+offset; float *R; magma_smalloc_cpu( &R, siz*siz ); // collecte back A magmablas_sgetmatrix_1D_bcyclic( siz, siz, dA, ldda, R, siz, ngpu, nb ); magma_setdevice( 0 ); magmablasSetKernelStream( streams[ dev ][ 0 ] ); //magma_sgetmatrix( siz, siz, dA[0], ldda, R, siz ); FILE *trace_file; trace_file = fopen("AJETE/Aafter", "w"); for (int j = 0; j < siz ; j++) for (int i = 0; i < siz ; i++) fprintf(trace_file, "%10d%10d%40.30e\n", i+1, j+1, R[j*siz+i]); fclose(trace_file); return; */ // ROW GEMM transpose a row and make a gemm with a block // if only 1 GPU used the ROW GEMM is integrated with the // COL GEMM (better accuracy observed) and better perf if(ngpu>1){ for( magma_int_t i = fstblksiz; i < m; i += nb ) { magma_int_t ib = min( nb, m-i ); // block size magma_int_t ioff = i + offset; // start global index in parent matrix //magma_int_t dev = (ioff / nb) % ngpu; magma_int_t nbblkoffst = offset/nb; magma_int_t nbblk = magma_ceildiv(i, nb); for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_int_t myblk = (nbblk/ngpu) + (nbblk%ngpu > ((dev-stdev+ngpu)%ngpu) ? 1:0 ); magma_int_t myblkoffst = (nbblkoffst/ngpu)+(nbblkoffst%ngpu > dev?1:0); magma_int_t myrowsize = myblk * nb; magma_int_t coloffset = myblkoffst*nb; if(dev==stdev) { myrowsize = myrowsize -blockoffset; coloffset = myblkoffst*nb+blockoffset; } //printf("ROW GEMM: voici i %d ib %d ioff %d nbblkoffst %d stdev %d dev %d myblk %d myblkoffset %d coloffset %d rowsize %d\n", i, ib, ioff, nbblkoffst, stdev, dev, myblk, myblkoffst, coloffset, myrowsize); if(myrowsize>0){ magma_setdevice( dev ); magmablasSetKernelStream( streams[ dev ][ 1 ] ); magma_sgemm( MagmaConjTrans, MagmaNoTrans, myrowsize, n, ib, alpha, dA(dev,ioff,coloffset), ldda, dB(dev,i,0), lddb, c_one, dwork(dev,0,0), lddwork ); } } } for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_event_record(redevents[dev][1], streams[dev][1]); } } // COL GEMM // blockoffset is offset within first block; for subsequent blocks it is 0 if(blockoffset>0){ magma_int_t ib = min( nb-blockoffset, m ); // block size magma_int_t iblock = (offset / nb) / ngpu; // local block id magma_int_t di = iblock*nb+blockoffset; // local index in parent matrix magma_setdevice( stdev ); magmablasSetKernelStream( streams[ stdev ][ 0 ] ); //printf("DEV %d COL GEMM first ioff %d di %d m %d n %d ib %d \n", stdev, offset, di, m, n, ib); magma_sgemm( MagmaNoTrans, MagmaNoTrans, m, n, ib, alpha, dA(stdev,offset,di), ldda, dB(stdev,0,0), lddb, beta, dC(stdev,0,0), lddc ); } // COL GEMM for( magma_int_t i = fstblksiz; i < m; i += nb ) { magma_int_t ib = min( nb, m-i ); // block size magma_int_t ioff = i + offset; // start global index in parent matrix magma_int_t iblock = (ioff / nb) / ngpu; // local block id magma_int_t dev = (ioff / nb) % ngpu; magma_int_t di = iblock*nb; // local index in parent matrix //printf("DEV %d COL GEMM i %d ioff %d di %d m-i %d n %d ib %d \n", dev, i, ioff, di, m-i, n, ib); magma_setdevice( dev ); magmablasSetKernelStream( streams[ dev ][ 0 ] ); if(i==0){ magma_sgemm( MagmaNoTrans, MagmaNoTrans, m-i, n, ib, alpha, dA(dev,ioff,di), ldda, dB(dev,i,0), lddb, beta, dC(dev,i,0), lddc ); }else{ magma_sgemm( MagmaNoTrans, MagmaNoTrans, m-i, n, ib, alpha, dA(dev,ioff,di), ldda, dB(dev,i,0), lddb, c_one, dC(dev,i,0), lddc ); } magma_event_record(redevents[dev][0], streams[dev][0]); // if only 1 GPU is used, do the ROW GEMM if(ngpu==1){ // NOTE THAT because the COL gemm write dC below the diagonal (i) // and the ROW GEMM write dC from 0 to diag-1, so they could // run in parallel on different streams. // // NO NO NO because // it might happen that col finished i and strated i+1 while row still at i // magmablasSetKernelStream( streams[ dev ][ 0 ] ); magma_sgemm( MagmaConjTrans, MagmaNoTrans, i, n, ib, alpha, dA(dev,ioff,offset), ldda, dB(dev,i,0), lddb, c_one, dC(dev,0,0), lddc ); } } if(ngpu>1){ for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_int_t nbblk = magma_ceildiv((m+blockoffset), nb); magma_int_t nbblkrow = nbblk-1; magma_int_t devperm = (dev-stdev+ngpu)%ngpu; magma_int_t myblk = (nbblkrow/ngpu) + (nbblkrow%ngpu > devperm ? 1:0 ); magma_int_t myrowsize = myblk * nb; if(dev==stdev) { myrowsize = myrowsize - blockoffset; } //printf("blockoffset %d nbblkrow %d devperm %d DEV %d RECEIVING myblk %d myrowsize %d\n", blockoffset, nbblkrow, devperm, dev, myblk, myrowsize); if(myrowsize>0){ magma_setdevice( dev ); magmablasSetKernelStream( streams[ dev ][ 0 ] ); magma_queue_wait_event(streams[ dev ][ 0 ], redevents[dev][1]); //magma_queue_sync( streams[ dev ][ 1 ] ); // for each dev add the computed ROW block each on its placment with dC for( magma_int_t blki = 0; blki < myblk; ++blki){ magma_int_t gbblki = (blki*ngpu + devperm)*nb - blockoffset; magma_int_t lcblki = blki*nb; magma_int_t ib = nb;// min(nb, m-gbblki); if(dev==stdev){ lcblki = blki*nb-blockoffset; if(blki==0){ gbblki = 0; lcblki = 0; ib = nb-blockoffset; } } magmablas_sgeadd(ib, n, c_one, &dwork[dev][lcblki], lddwork, &dC[dev][gbblki] , lddc ); } magma_event_record(redevents[dev][0], streams[dev][0]); } } } // =========================================================== // COMMUNICATION ALL_REDUCE_SUM // =========================================================== if(ngpu==1){ return; } // INITIALIZE COMM for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) { masterdev = -1; gnode[cmplxid][MagmaMaxGPUs+1] = -1; myngpu = gnode[cmplxid][MagmaMaxGPUs]; for( magma_int_t idev = 0; idev < myngpu; ++idev ) { dev = gnode[cmplxid][idev]; devperm = (dev-stdev+ngpu)%ngpu; myblk = (nbblk/ngpu) + (nbblk%ngpu > devperm ? 1:0 ); mycolsize = myblk*nb; myblkoffst = nb*((nbblkoffst/ngpu)+(nbblkoffst%ngpu > dev?1:0)); if(dev==stdev){ mycolsize -= blockoffset; myblkoffst += blockoffset; // local index in parent matrix } if((devperm==devlstblk)&&(lstblksiz>0)){ mycolsize -= (nb-(remm%nb)); } mycolsize = min(mycolsize, m); if(mycolsize>0){ gpuisactive[dev] = mycolsize; if(masterdev==-1) { masterdev = dev; nbcmplxactive = nbcmplxactive +1; cmplxisactive[cmplxid] = 1; gnode[cmplxid][MagmaMaxGPUs+1] = masterdev; } } } } /* for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_device_sync(); } */ //******************************* // each GPU send its result // to its master. The master make // the addition and then send to // to the masters of other real // and receive from the masters of // other real make the addition // and broadcast locally the final // result. //******************************* //printf("=======================================================================\n"); //printf(" sending to my master \n"); //printf("=======================================================================\n"); for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) { myngpu = gnode[cmplxid][MagmaMaxGPUs]; masterdev = gnode[cmplxid][MagmaMaxGPUs+1]; //check if real is active if(masterdev!=-1){ for( magma_int_t idev = 0; idev < myngpu; ++idev ) { dev = gnode[cmplxid][idev]; mycolsize = gpuisactive[dev]; if(mycolsize>0){ // I am an active GPU. if I am not the master, then send my result to my master. // store result on dwork[masterdev][dev*maxgsize] if(dev!=masterdev){ magma_setdevice( dev ); //printf(" GPU %d sending to my master %d\n", dev, masterdev); // wait the geadd of my ROW and COL GEMM is done magma_queue_wait_event(streams[ dev ][ 0 ], redevents[dev][0]); // sending to the master of my real magma_scopymatrix_async( m, n, &dC[dev][0], lddc, &dwork2[masterdev][maxgsize*dev], m, streams[dev][0] ); magma_event_record(redevents[dev][masterdev], streams[dev][0]); } // end I am not the masterdev }// end if mycolsize>0 }// for idev }// end of if masterdev!=-1 maening real is active }// for cmplxid /* for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_device_sync(); } */ //printf("=======================================================================\n"); //printf(" each master do addition of local result and broadcast to other masters \n"); //printf("=======================================================================\n"); for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) { myngpu = gnode[cmplxid][MagmaMaxGPUs]; masterdev = gnode[cmplxid][MagmaMaxGPUs+1]; //check if real is active if(masterdev!=-1){ magma_setdevice( masterdev ); // addition is done on stream 0 sequentially magmablasSetKernelStream( streams[ masterdev ][ 0 ] ); // wait the geadd of my ROW and COL GEMM is done magma_queue_wait_event(streams[ masterdev ][ 0 ], redevents[masterdev][0]); // ======================================== // local addition // ======================================== for( magma_int_t l = 0; l < myngpu; ++l ) { lcdev = gnode[cmplxid][l]; lccolsize = gpuisactive[lcdev]; if((lcdev!=masterdev)&&(lccolsize>0)){ //printf(" master %d receiving from %d and adding \n", masterdev, lcdev); // this is an active GPU of my real. // wait I received what he send it to me and then do addition. magma_queue_wait_event(streams[ masterdev ][ 0 ], redevents[lcdev][masterdev]); magmablas_sgeadd(m, n, c_one, &dwork2[masterdev][maxgsize*lcdev], m, &dC[masterdev][0] , lddc ); } }// for l=1:myngpu // because addition is done sequentially on stream 0, // I have to record this to be able to synch using it magma_event_record(redevents[masterdev][masterdev], streams[masterdev][0]); // ======================================== // // ======================================== // send to other masters // ======================================== for( magma_int_t k = 0; k < nbcmplx; ++k ) { if(k!=cmplxid){ gmaster = gnode[k][MagmaMaxGPUs+1]; if(gmaster!=-1){ //real is active //Master has to wait until finish the local addition then send using gmaster stream. //use stream 0 to make it sequential or stream gmaster to make it parallel. //Now both re the same. //printf(" master %d from cmplx %d sending to other master %d on cmplx %d \n", masterdev, cmplxid, gmaster, k); magma_queue_wait_event(streams[ masterdev ][ gmaster ], redevents[masterdev][masterdev]); magma_scopymatrix_async( m, n, &dC[masterdev][0], lddc, &dwork2[gmaster][maxgsize*masterdev], m, streams[masterdev][gmaster] ); magma_event_record(redevents[masterdev][gmaster], streams[masterdev][gmaster]); magma_event_record(redevents[masterdev][masterdev], streams[masterdev][gmaster]); } // end of gmaster!=-1 } // end of k!=cmplxid }// for k = 0: nbcmplx // ======================================== }// end of if masterdev!=-1 maening real is active }// for cmplxid /* for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_device_sync(); } */ //printf("=======================================================================\n"); //printf(" each master wait receiving other masters results, do the addition and broadcast locally \n"); //printf("=======================================================================\n"); for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) { myngpu = gnode[cmplxid][MagmaMaxGPUs]; masterdev = gnode[cmplxid][MagmaMaxGPUs+1]; //check if real is active if(masterdev!=-1){ magma_setdevice( masterdev ); // addition is done on stream 0 sequentially magmablasSetKernelStream( streams[ masterdev ][ 0 ] ); // master has to wait until finishing all the send to other masters. magma_queue_wait_event(streams[ masterdev ][ 0 ], redevents[masterdev][masterdev]); // ======================================== // addition of results from other masters // ======================================== for( magma_int_t k = 0; k < nbcmplx; ++k ) { if(k!=cmplxid){ gmaster = gnode[k][MagmaMaxGPUs+1]; if(gmaster!=-1){ //real is active //Master has to wait until receiving from gmaster, then do addition using stream 0 //printf(" master %d from cmplx %d receiving from other master %d on cmplx %d and adding \n", masterdev, cmplxid, gmaster, k); magma_queue_wait_event(streams[ masterdev ][ 0 ], redevents[gmaster][masterdev]); magmablas_sgeadd(m, n, c_one, &dwork2[masterdev][maxgsize*gmaster], m, &dC[masterdev][0] , lddc ); } // end of gmaster!=-1 } // end of k!=cmplxid }// for k = 0: nbcmplx // because addition is done sequentially on stream 0, // I have to record this to be able to synch using it magma_event_record(redevents[masterdev][masterdev], streams[masterdev][0]); // ======================================== // ======================================== // local broadcast of final results // ======================================== for( magma_int_t l = 0; l < myngpu; ++l ) { lcdev = gnode[cmplxid][l]; lccolsize = gpuisactive[lcdev]; if((lcdev!=masterdev)&&(lccolsize>0)){ // this is an active GPU of my real. // wait the previous addition is done maening stream 0 is finished and broadcast sequentially for now. // to make it parallel put stream lcdev instead of stream 0 //printf(" master %d broadcasting local to %d \n", masterdev, lcdev); magma_queue_wait_event(streams[ masterdev ][ 0 ], redevents[masterdev][masterdev]); magma_scopymatrix_async( m, n, &dC[masterdev][0], lddc, &dC[lcdev][0], lddc, streams[masterdev][0] ); magma_event_record(redevents[masterdev][lcdev], streams[masterdev][0]); } }// for l=1:myngpu // ======================================== }// end of if masterdev!=-1 maening real is active }// for cmplxid /* for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_device_sync(); } */ for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) { myngpu = gnode[cmplxid][MagmaMaxGPUs]; masterdev = gnode[cmplxid][MagmaMaxGPUs+1]; //check if real is active if(masterdev!=-1){ for( magma_int_t l = 0; l < myngpu; ++l ) { lcdev = gnode[cmplxid][l]; lccolsize = gpuisactive[lcdev]; if(lccolsize>0){ magma_setdevice( lcdev ); magma_queue_wait_event(streams[ lcdev ][ 0 ], redevents[lcdev][0]); magma_queue_wait_event(streams[ lcdev ][ 0 ], redevents[masterdev][lcdev]); } }// for l=1:myngpu }// end of if masterdev!=-1 maening real is active }// for cmplxid //printf("****************************************************\n"); //printf(" finish ssymm \n"); //printf("****************************************************\n"); magma_setdevice( cdev ); magmablasSetKernelStream( cstream ); }
/** Purpose ------- ZUNMQR overwrites the general complex M-by-N matrix C with @verbatim SIDE = MagmaLeft SIDE = MagmaRight TRANS = MagmaNoTrans: Q * C C * Q TRANS = Magma_ConjTrans: Q**H * C C * Q**H @endverbatim where Q is a complex unitary matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by ZGEQRF. Q is of order M if SIDE = MagmaLeft and of order N if SIDE = MagmaRight. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] side magma_side_t - = MagmaLeft: apply Q or Q**H from the Left; - = MagmaRight: apply Q or Q**H from the Right. @param[in] trans magma_trans_t - = MagmaNoTrans: No transpose, apply Q; - = Magma_ConjTrans: Conjugate transpose, apply Q**H. @param[in] m INTEGER The number of rows of the matrix C. M >= 0. @param[in] n INTEGER The number of columns of the matrix C. N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = MagmaLeft, M >= K >= 0; if SIDE = MagmaRight, N >= K >= 0. @param[in] A COMPLEX_16 array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by ZGEQRF in the first k columns of its array argument A. @param[in] lda INTEGER The leading dimension of the array A. If SIDE = MagmaLeft, LDA >= max(1,M); if SIDE = MagmaRight, LDA >= max(1,N). @param[in] tau COMPLEX_16 array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by ZGEQRF. @param[in,out] C COMPLEX_16 array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q. @param[in] ldc INTEGER The leading dimension of the array C. LDC >= max(1,M). @param[out] work (workspace) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. If SIDE = MagmaLeft, LWORK >= max(1,N); if SIDE = MagmaRight, LWORK >= max(1,M). For optimum performance LWORK >= N*NB if SIDE = MagmaLeft, and LWORK >= M*NB if SIDE = MagmaRight, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_zgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_zunmqr_m( magma_int_t ngpu, magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *tau, magmaDoubleComplex *C, magma_int_t ldc, magmaDoubleComplex *work, magma_int_t lwork, magma_int_t *info) { #define A(i, j) (A + (j)*lda + (i)) #define C(i, j) (C + (j)*ldc + (i)) #define dC(gpui, i, j) (dw[gpui] + (j)*lddc + (i)) #define dA_c(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddac) #define dA_r(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddar) #define dT(gpui, ind) (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + (ind)*((nb+1)*nb)) #define dwork(gpui, ind) (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + 2*((nb+1)*nb) + (ind)*(lddwork*nb)) /* Constants */ magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; /* Local variables */ const char* side_ = lapack_side_const( side ); const char* trans_ = lapack_trans_const( trans ); magma_int_t nb = 128; magmaDoubleComplex *T = NULL; magmaDoubleComplex_ptr dw[MagmaMaxGPUs] = { NULL }; magma_queue_t queues[MagmaMaxGPUs][2] = {{ NULL }}; magma_event_t events[MagmaMaxGPUs][2] = {{ NULL }}; magma_int_t ind_c; magma_device_t dev; magma_device_t orig_dev; magma_getdevice( &orig_dev ); *info = 0; magma_int_t left = (side == MagmaLeft); magma_int_t notran = (trans == MagmaNoTrans); magma_int_t lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ magma_int_t nq, nw; if (left) { nq = m; nw = n; } else { nq = n; nw = m; } if (! left && side != MagmaRight) { *info = -1; } else if (! notran && trans != Magma_ConjTrans) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (lda < max(1,nq)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } magma_int_t lwkopt = max(1,nw) * nb; if (*info == 0) { work[0] = magma_zmake_lwork( lwkopt ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { work[0] = c_one; return *info; } if (nb >= k) { /* Use CPU code */ lapackf77_zunmqr(side_, trans_, &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork, info); return *info; } magma_int_t lddc = magma_roundup( m, 64 ); // TODO why 64 instead of 32 ? magma_int_t lddac = nq; magma_int_t lddar = nb; magma_int_t lddwork = nw; magma_int_t nlocal[ MagmaMaxGPUs ] = { 0 }; magma_int_t nb_l=256; magma_int_t nbl = magma_ceildiv( n, nb_l ); // number of blocks magma_int_t maxnlocal = magma_ceildiv( nbl, ngpu )*nb_l; ngpu = min( ngpu, magma_ceildiv( n, nb_l )); // Don't use GPU that will not have data. magma_int_t ldw = maxnlocal*lddc // dC + 2*lddac*lddar // 2*dA + 2*(nb + 1 + lddwork)*nb; // 2*(dT and dwork) if (MAGMA_SUCCESS != magma_zmalloc_pinned( &T, nb*nb )) { *info = MAGMA_ERR_HOST_ALLOC; goto cleanup; } for (dev = 0; dev < ngpu; ++dev) { magma_setdevice( dev ); if (MAGMA_SUCCESS != magma_zmalloc( &dw[dev], ldw )) { *info = MAGMA_ERR_DEVICE_ALLOC; goto cleanup; } magma_queue_create( dev, &queues[dev][0] ); magma_queue_create( dev, &queues[dev][1] ); magma_event_create( &events[dev][0] ); magma_event_create( &events[dev][1] ); } /* Use hybrid CPU-MGPU code */ if (left) { //copy C to mgpus for (magma_int_t i = 0; i < nbl; ++i) { dev = i % ngpu; magma_setdevice( dev ); magma_int_t kb = min(nb_l, n-i*nb_l); magma_zsetmatrix_async( m, kb, C(0, i*nb_l), ldc, dC(dev, 0, i/ngpu*nb_l), lddc, queues[dev][0] ); nlocal[dev] += kb; } magma_int_t i1, i2, i3; if ( !notran ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } ind_c = 0; for (magma_int_t i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) { // start the copy of A panel magma_int_t kb = min(nb, k - i); for (dev = 0; dev < ngpu; ++dev) { magma_setdevice( dev ); magma_event_sync( events[dev][ind_c] ); // check if the new data can be copied magma_zsetmatrix_async(nq-i, kb, A(i, i), lda, dA_c(dev, ind_c, i, 0), lddac, queues[dev][0] ); // set upper triangular part of dA to identity magmablas_zlaset_band( MagmaUpper, kb, kb, kb, c_zero, c_one, dA_c(dev, ind_c, i, 0), lddac, queues[dev][0] ); } /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ magma_int_t nqi = nq - i; lapackf77_zlarft("F", "C", &nqi, &kb, A(i, i), &lda, &tau[i], T, &kb); /* H or H' is applied to C(1:m,i:n) */ /* Apply H or H'; First copy T to the GPU */ for (dev = 0; dev < ngpu; ++dev) { magma_setdevice( dev ); magma_zsetmatrix_async(kb, kb, T, kb, dT(dev, ind_c), kb, queues[dev][0] ); } for (dev = 0; dev < ngpu; ++dev) { magma_setdevice( dev ); magma_queue_sync( queues[dev][0] ); // check if the data was copied magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, m-i, nlocal[dev], kb, dA_c(dev, ind_c, i, 0), lddac, dT(dev, ind_c), kb, dC(dev, i, 0), lddc, dwork(dev, ind_c), lddwork, queues[dev][1] ); magma_event_record(events[dev][ind_c], queues[dev][1] ); } ind_c = (ind_c+1)%2; } for (dev = 0; dev < ngpu; ++dev) { magma_setdevice( dev ); magma_queue_sync( queues[dev][1] ); } //copy C from mgpus for (magma_int_t i = 0; i < nbl; ++i) { dev = i % ngpu; magma_setdevice( dev ); magma_int_t kb = min(nb_l, n-i*nb_l); magma_zgetmatrix( m, kb, dC(dev, 0, i/ngpu*nb_l), lddc, C(0, i*nb_l), ldc, queues[dev][1] ); // magma_zgetmatrix_async( m, kb, // dC(dev, 0, i/ngpu*nb_l), lddc, // C(0, i*nb_l), ldc, queues[dev][0] ); } } else { *info = MAGMA_ERR_NOT_IMPLEMENTED; magma_xerbla( __func__, -(*info) ); goto cleanup; /* if ( notran ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } mi = m; ic = 0; for (i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) { ib = min(nb, k - i); // Form the triangular factor of the block reflector // H = H(i) H(i+1) . . . H(i+ib-1) i__4 = nq - i; lapackf77_zlarft("F", "C", &i__4, &ib, A(i, i), &lda, &tau[i], T, &ib); // 1) copy the panel from A to the GPU, and // 2) set upper triangular part of dA to identity magma_zsetmatrix( i__4, ib, A(i, i), lda, dA(i, 0), ldda, queues[dev][1] ); magmablas_zlaset_band( MagmaUpper, ib, ib, ib, c_zero, c_one, dA(i, 0), ldda, queues[dev][1] ); // H or H' is applied to C(1:m,i:n) ni = n - i; jc = i; // Apply H or H'; First copy T to the GPU magma_zsetmatrix( ib, ib, T, ib, dT, ib, queues[dev][1] ); magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, mi, ni, ib, dA(i, 0), ldda, dT, ib, dC(ic, jc), lddc, dwork, lddwork, queues[dev][1] ); } */ } cleanup: work[0] = magma_zmake_lwork( lwkopt ); for (dev = 0; dev < ngpu; ++dev) { magma_setdevice( dev ); magma_event_destroy( events[dev][0] ); magma_event_destroy( events[dev][1] ); magma_queue_destroy( queues[dev][0] ); magma_queue_destroy( queues[dev][1] ); magma_free( dw[dev] ); } magma_setdevice( orig_dev ); magma_free_pinned( T ); return *info; } /* magma_zunmqr */
extern "C" magma_int_t magma_d_spmv( double alpha, magma_d_matrix A, magma_d_matrix x, double beta, magma_d_matrix y, magma_queue_t queue ) { magma_int_t info = 0; magma_d_matrix x2={Magma_CSR}; cusparseHandle_t cusparseHandle = 0; cusparseMatDescr_t descr = 0; // make sure RHS is a dense matrix if ( x.storage_type != Magma_DENSE ) { printf("error: only dense vectors are supported for SpMV.\n"); info = MAGMA_ERR_NOT_SUPPORTED; goto cleanup; } if ( A.memory_location != x.memory_location || x.memory_location != y.memory_location ) { printf("error: linear algebra objects are not located in same memory!\n"); printf("memory locations are: %d %d %d\n", A.memory_location, x.memory_location, y.memory_location ); info = MAGMA_ERR_INVALID_PTR; goto cleanup; } // DEV case if ( A.memory_location == Magma_DEV ) { if ( A.num_cols == x.num_rows && x.num_cols == 1 ) { if ( A.storage_type == Magma_CSR || A.storage_type == Magma_CUCSR || A.storage_type == Magma_CSRL || A.storage_type == Magma_CSRU ) { CHECK_CUSPARSE( cusparseCreate( &cusparseHandle )); CHECK_CUSPARSE( cusparseSetStream( cusparseHandle, queue->cuda_stream() )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descr )); CHECK_CUSPARSE( cusparseSetMatType( descr, CUSPARSE_MATRIX_TYPE_GENERAL )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descr, CUSPARSE_INDEX_BASE_ZERO )); cusparseDcsrmv( cusparseHandle,CUSPARSE_OPERATION_NON_TRANSPOSE, A.num_rows, A.num_cols, A.nnz, &alpha, descr, A.dval, A.drow, A.dcol, x.dval, &beta, y.dval ); } else if ( A.storage_type == Magma_ELL ) { //printf("using ELLPACKT kernel for SpMV: "); CHECK( magma_dgeelltmv( MagmaNoTrans, A.num_rows, A.num_cols, A.max_nnz_row, alpha, A.dval, A.dcol, x.dval, beta, y.dval, queue )); //printf("done.\n"); } else if ( A.storage_type == Magma_ELLPACKT ) { //printf("using ELL kernel for SpMV: "); CHECK( magma_dgeellmv( MagmaNoTrans, A.num_rows, A.num_cols, A.max_nnz_row, alpha, A.dval, A.dcol, x.dval, beta, y.dval, queue )); //printf("done.\n"); } else if ( A.storage_type == Magma_ELLRT ) { //printf("using ELLRT kernel for SpMV: "); CHECK( magma_dgeellrtmv( MagmaNoTrans, A.num_rows, A.num_cols, A.max_nnz_row, alpha, A.dval, A.dcol, A.drow, x.dval, beta, y.dval, A.alignment, A.blocksize, queue )); //printf("done.\n"); } else if ( A.storage_type == Magma_SELLP ) { //printf("using SELLP kernel for SpMV: "); CHECK( magma_dgesellpmv( MagmaNoTrans, A.num_rows, A.num_cols, A.blocksize, A.numblocks, A.alignment, alpha, A.dval, A.dcol, A.drow, x.dval, beta, y.dval, queue )); //printf("done.\n"); } else if ( A.storage_type == Magma_DENSE ) { //printf("using DENSE kernel for SpMV: "); magmablas_dgemv( MagmaNoTrans, A.num_rows, A.num_cols, alpha, A.dval, A.num_rows, x.dval, 1, beta, y.dval, 1, queue ); //printf("done.\n"); } else if ( A.storage_type == Magma_SPMVFUNCTION ) { //printf("using DENSE kernel for SpMV: "); CHECK( magma_dcustomspmv( alpha, x, beta, y, queue )); //printf("done.\n"); } else if ( A.storage_type == Magma_BCSR ) { //printf("using CUSPARSE BCSR kernel for SpMV: "); // CUSPARSE context // cusparseDirection_t dirA = CUSPARSE_DIRECTION_ROW; int mb = magma_ceildiv( A.num_rows, A.blocksize ); int nb = magma_ceildiv( A.num_cols, A.blocksize ); CHECK_CUSPARSE( cusparseCreate( &cusparseHandle )); CHECK_CUSPARSE( cusparseSetStream( cusparseHandle, queue->cuda_stream() )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descr )); cusparseDbsrmv( cusparseHandle, dirA, CUSPARSE_OPERATION_NON_TRANSPOSE, mb, nb, A.numblocks, &alpha, descr, A.dval, A.drow, A.dcol, A.blocksize, x.dval, &beta, y.dval ); } else { printf("error: format not supported.\n"); info = MAGMA_ERR_NOT_SUPPORTED; } } else if ( A.num_cols < x.num_rows || x.num_cols > 1 ) { magma_int_t num_vecs = x.num_rows / A.num_cols * x.num_cols; if ( A.storage_type == Magma_CSR ) { CHECK_CUSPARSE( cusparseCreate( &cusparseHandle )); CHECK_CUSPARSE( cusparseSetStream( cusparseHandle, queue->cuda_stream() )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descr )); CHECK_CUSPARSE( cusparseSetMatType( descr, CUSPARSE_MATRIX_TYPE_GENERAL )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descr, CUSPARSE_INDEX_BASE_ZERO )); if ( x.major == MagmaColMajor) { cusparseDcsrmm(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, A.num_rows, num_vecs, A.num_cols, A.nnz, &alpha, descr, A.dval, A.drow, A.dcol, x.dval, A.num_cols, &beta, y.dval, A.num_cols); } else if ( x.major == MagmaRowMajor) { /*cusparseDcsrmm2(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, A.num_rows, num_vecs, A.num_cols, A.nnz, &alpha, descr, A.dval, A.drow, A.dcol, x.dval, A.num_cols, &beta, y.dval, A.num_cols); */ } } else if ( A.storage_type == Magma_SELLP ) { if ( x.major == MagmaRowMajor) { CHECK( magma_dmgesellpmv( MagmaNoTrans, A.num_rows, A.num_cols, num_vecs, A.blocksize, A.numblocks, A.alignment, alpha, A.dval, A.dcol, A.drow, x.dval, beta, y.dval, queue )); } else if ( x.major == MagmaColMajor) { // transpose first to row major CHECK( magma_dvtranspose( x, &x2, queue )); CHECK( magma_dmgesellpmv( MagmaNoTrans, A.num_rows, A.num_cols, num_vecs, A.blocksize, A.numblocks, A.alignment, alpha, A.dval, A.dcol, A.drow, x2.dval, beta, y.dval, queue )); } } /*if ( A.storage_type == Magma_DENSE ) { //printf("using DENSE kernel for SpMV: "); magmablas_dmgemv( MagmaNoTrans, A.num_rows, A.num_cols, num_vecs, alpha, A.dval, A.num_rows, x.dval, 1, beta, y.dval, 1 ); //printf("done.\n"); }*/ else { printf("error: format not supported.\n"); info = MAGMA_ERR_NOT_SUPPORTED; } } } // CPU case missing! else { printf("error: CPU not yet supported.\n"); info = MAGMA_ERR_NOT_SUPPORTED; } cleanup: cusparseDestroyMatDescr( descr ); cusparseDestroy( cusparseHandle ); cusparseHandle = 0; descr = 0; magma_dmfree(&x2, queue ); return info; }
//################################################################################################## static void *magma_dapplyQ_parallel_section(void *arg) { magma_int_t my_core_id = ((magma_dapplyQ_id_data*)arg) -> id; magma_dapplyQ_data* data = ((magma_dapplyQ_id_data*)arg) -> data; magma_int_t allcores_num = data -> threads_num; magma_int_t n = data -> n; magma_int_t ne = data -> ne; magma_int_t n_gpu = data -> n_gpu; magma_int_t nb = data -> nb; magma_int_t Vblksiz = data -> Vblksiz; double *E = data -> E; magma_int_t lde = data -> lde; double *V = data -> V; magma_int_t ldv = data -> ldv; double *TAU = data -> TAU; double *T = data -> T; magma_int_t ldt = data -> ldt; double *dE = data -> dE; magma_int_t ldde = data -> ldde; pthread_barrier_t* barrier = &(data -> barrier); magma_int_t info; #ifdef ENABLE_TIMER real_Double_t timeQcpu=0.0, timeQgpu=0.0; #endif magma_int_t n_cpu = ne - n_gpu; // with MKL and when using omp_set_num_threads instead of mkl_set_num_threads // it need that all threads setting it to 1. magma_set_lapack_numthreads(1); #ifdef MAGMA_SETAFFINITY //#define PRINTAFFINITY #ifdef PRINTAFFINITY affinity_set print_set; print_set.print_affinity(my_core_id, "starting affinity"); #endif cpu_set_t old_set, new_set; //store current affinity CPU_ZERO(&old_set); sched_getaffinity( 0, sizeof(old_set), &old_set); //set new affinity // bind threads CPU_ZERO(&new_set); CPU_SET(my_core_id, &new_set); sched_setaffinity( 0, sizeof(new_set), &new_set); #ifdef PRINTAFFINITY print_set.print_affinity(my_core_id, "set affinity"); #endif #endif if (my_core_id == 0) { //============================================= // on GPU on thread 0: // - apply V2*Z(:,1:N_GPU) //============================================= #ifdef ENABLE_TIMER timeQgpu = magma_wtime(); #endif magma_dsetmatrix(n, n_gpu, E, lde, dE, ldde); magma_dbulge_applyQ_v2(MagmaLeft, n_gpu, n, nb, Vblksiz, dE, ldde, V, ldv, T, ldt, &info); magma_device_sync(); #ifdef ENABLE_TIMER timeQgpu = magma_wtime()-timeQgpu; printf(" Finish Q2_GPU GGG timing= %f\n", timeQgpu); #endif } else { //============================================= // on CPU on threads 1:allcores_num-1: // - apply V2*Z(:,N_GPU+1:NE) //============================================= #ifdef ENABLE_TIMER if (my_core_id == 1) timeQcpu = magma_wtime(); #endif magma_int_t n_loc = magma_ceildiv(n_cpu, allcores_num-1); double* E_loc = E + (n_gpu+ n_loc * (my_core_id-1))*lde; n_loc = min(n_loc,n_cpu - n_loc * (my_core_id-1)); magma_dtile_bulge_applyQ(my_core_id, MagmaLeft, n_loc, n, nb, Vblksiz, E_loc, lde, V, ldv, TAU, T, ldt); pthread_barrier_wait(barrier); #ifdef ENABLE_TIMER if (my_core_id == 1) { timeQcpu = magma_wtime()-timeQcpu; printf(" Finish Q2_CPU CCC timing= %f\n", timeQcpu); } #endif } // END if my_core_id #ifdef MAGMA_SETAFFINITY //restore old affinity sched_setaffinity(0, sizeof(old_set), &old_set); #ifdef PRINTAFFINITY print_set.print_affinity(my_core_id, "restored_affinity"); #endif #endif return 0; }
/***************************************************************************//** Purpose ------- SGETRF_m computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The matrix may exceed the GPU memory. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Note: The factorization of big panel is done calling multiple-gpu-interface. Pivots are applied on GPU within the big panel. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A REAL array, dimension (LDA,N) On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] ipiv INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. - > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_getrf *******************************************************************************/ extern "C" magma_int_t magma_sgetrf_m( magma_int_t ngpu, magma_int_t m, magma_int_t n, float *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info) { #define A(i,j) (A + (j)*lda + (i)) #define dAT(d,i,j) (dAT[d] + (i)*nb*ldn_local + (j)*nb) #define dPT(d,i,j) (dPT[d] + (i)*nb*nb + (j)*nb*maxm) magma_timer_t time=0, time_total=0, time_alloc=0, time_set=0, time_get=0, time_comp=0; timer_start( time_total ); //real_Double_t flops; float c_one = MAGMA_S_ONE; float c_neg_one = MAGMA_S_NEG_ONE; float *dAT[MagmaMaxGPUs], *dA[MagmaMaxGPUs], *dPT[MagmaMaxGPUs]; magma_int_t iinfo = 0, nb, nbi, maxm, n_local[MagmaMaxGPUs], ldn_local; magma_int_t N, M, NB, NBk, I, d, ngpu0 = ngpu; magma_int_t ii, jj, h, offset, ib, rows; magma_queue_t queues[MagmaMaxGPUs][2]; magma_event_t event[MagmaMaxGPUs][2]; *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (lda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; magma_device_t orig_dev; magma_getdevice( &orig_dev ); /* initialize nb */ nb = magma_get_sgetrf_nb( m, n ); maxm = magma_roundup( m, 32 ); /* figure out NB */ size_t freeMem, totalMem; cudaMemGetInfo( &freeMem, &totalMem ); freeMem /= sizeof(float); /* number of columns in the big panel */ h = 1+(2+ngpu0); NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); const char* ngr_nb_char = getenv("MAGMA_NGR_NB"); if ( ngr_nb_char != NULL ) NB = max( nb, min( NB, atoi(ngr_nb_char) ) ); //NB = 5*max(nb,32); if ( ngpu0 > magma_ceildiv( NB, nb )) { ngpu = magma_ceildiv( NB, nb ); h = 1+(2+ngpu); NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); } else { ngpu = ngpu0; } if ( ngpu*NB >= n ) { #ifdef CHECK_SGETRF_OOC printf( " * still fit in GPU memory.\n" ); #endif NB = n; } else { #ifdef CHECK_SGETRF_OOC printf( " * don't fit in GPU memory.\n" ); #endif NB = ngpu*NB; NB = max( nb, (NB / nb) * nb); /* making sure it's devisable by nb (x64) */ } #ifdef CHECK_SGETRF_OO if ( NB != n ) printf( " * running in out-core mode (n=%lld, NB=%lld, nb=%lld, freeMem=%.2e).\n", (long long) n, (long long) NB, (long long) nb, (float) freeMem ); else printf( " * running in in-core mode (n=%lld, NB=%lld, nb=%lld, freeMem=%.2e).\n", (long long) n, (long long) NB, (long long) nb, (float) freeMem ); #endif if ( (nb <= 1) || (nb >= min(m,n)) ) { /* Use CPU code for scalar of one tile. */ lapackf77_sgetrf(&m, &n, A, &lda, ipiv, info); } else { /* Use hybrid blocked code. */ /* allocate memory on GPU to store the big panel */ timer_start( time_alloc ); n_local[0] = (NB/nb)/ngpu; if ( NB%(nb*ngpu) != 0 ) n_local[0]++; n_local[0] *= nb; ldn_local = magma_roundup( n_local[0], 32 ); for( d=0; d < ngpu; d++ ) { magma_setdevice(d); if (MAGMA_SUCCESS != magma_smalloc( &dA[d], (ldn_local+h*nb)*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dPT[d] = dA[d] + nb*maxm; /* for storing the previous panel from CPU */ dAT[d] = dA[d] + h*nb*maxm; /* for storing the big panel */ magma_queue_create( d, &queues[d][0] ); magma_queue_create( d, &queues[d][1] ); magma_event_create( &event[d][0] ); magma_event_create( &event[d][1] ); } //magma_setdevice(0); timer_stop( time_alloc ); for( I=0; I < n; I += NB ) { M = m; N = min( NB, n-I ); /* number of columns in this big panel */ //s = min( max(m-I,0), N )/nb; /* number of small block-columns in this big panel */ maxm = magma_roundup( M, 32 ); if ( ngpu0 > magma_ceildiv( NB, nb ) ) { ngpu = magma_ceildiv( NB, nb ); } else { ngpu = ngpu0; } for( d=0; d < ngpu; d++ ) { n_local[d] = ((N/nb)/ngpu)*nb; if (d < (N/nb)%ngpu) n_local[d] += nb; else if (d == (N/nb)%ngpu) n_local[d] += N%nb; } ldn_local = magma_roundup( n_local[0], 32 ); /* upload the next big panel into GPU, transpose (A->A'), and pivot it */ timer_start( time ); magmablas_ssetmatrix_transpose_mgpu(ngpu, M, N, nb, A(0,I), lda, dAT, ldn_local, dA, maxm, queues); for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_queue_sync( queues[d][0] ); magma_queue_sync( queues[d][1] ); } time_set += timer_stop( time ); timer_start( time ); /* --------------------------------------------------------------- */ /* loop around the previous big-panels to update the new big-panel */ for( offset = 0; offset < min(m,I); offset += NB ) { NBk = min( m-offset, NB ); /* start sending the first tile from the previous big-panels to gpus */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); nbi = min( nb, NBk ); magma_ssetmatrix_async( (M-offset), nbi, A(offset,offset), lda, dA[d], (maxm-offset), queues[d][0] ); /* make sure the previous update finished */ //magma_queue_sync( queues[d][1] ); magma_queue_wait_event( queues[d][0], event[d][0] ); /* transpose */ magmablas_stranspose( M-offset, nbi, dA[d], maxm-offset, dPT(d,0,0), nb, queues[d][0]); } /* applying the pivot from the previous big-panel */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magmablas_slaswp_q( ldn_local, dAT(d,0,0), ldn_local, offset+1, offset+NBk, ipiv, 1, queues[d][1] ); } /* going through each block-column of previous big-panels */ for( jj=0, ib=offset/nb; jj < NBk; jj += nb, ib++ ) { ii = offset+jj; rows = maxm - ii; nbi = min( nb, NBk-jj ); for( d=0; d < ngpu; d++ ) { magma_setdevice(d); /* wait for a block-column on GPU */ magma_queue_sync( queues[d][0] ); /* start sending next column */ if ( jj+nb < NBk ) { magma_ssetmatrix_async( (M-ii-nb), min(nb,NBk-jj-nb), A(ii+nb,ii+nb), lda, dA[d], (rows-nb), queues[d][0] ); /* make sure the previous update finished */ //magma_queue_sync( queues[d][1] ); magma_queue_wait_event( queues[d][0], event[d][(1+jj/nb)%2] ); /* transpose next column */ magmablas_stranspose( M-ii-nb, nb, dA[d], rows-nb, dPT(d,0,(1+jj/nb)%2), nb, queues[d][0]); } /* update with the block column */ magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n_local[d], nbi, c_one, dPT(d,0,(jj/nb)%2), nb, dAT(d,ib,0), ldn_local, queues[d][1]); if ( M > ii+nb ) { magma_sgemm( MagmaNoTrans, MagmaNoTrans, n_local[d], M-(ii+nb), nbi, c_neg_one, dAT(d,ib,0), ldn_local, dPT(d,1,(jj/nb)%2), nb, c_one, dAT(d,ib+1,0), ldn_local, queues[d][1]); } magma_event_record( event[d][(jj/nb)%2], queues[d][1] ); } /* end of for each block-columns in a big-panel */ } } /* end of for each previous big-panels */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_queue_sync( queues[d][0] ); magma_queue_sync( queues[d][1] ); } /* calling magma-gpu interface to panel-factorize the big panel */ if ( M > I ) { magma_sgetrf2_mgpu(ngpu, M-I, N, nb, I, dAT, ldn_local, ipiv+I, dA, A(0,I), lda, queues, &iinfo); if ( iinfo < 0 ) { *info = iinfo; break; } else if ( iinfo != 0 ) { *info = iinfo + I * NB; //break; } /* adjust pivots */ for( ii=I; ii < min(I+N,m); ii++ ) ipiv[ii] += I; } time_comp += timer_stop( time ); /* get the current big panel to CPU from devices */ timer_start( time ); magmablas_sgetmatrix_transpose_mgpu(ngpu, M, N, nb, dAT, ldn_local, A(0,I), lda, dA, maxm, queues); for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_queue_sync( queues[d][0] ); magma_queue_sync( queues[d][1] ); } time_get += timer_stop( time ); } /* end of for */ //timer_stop( time_total ); //flops = FLOPS_SGETRF( m, n ) / 1e9; //timer_printf(" memory-allocation time: %e\n", time_alloc ); //timer_printf(" NB=%lld nb=%lld\n", (long long) NB, (long long) nb ); //timer_printf(" memcopy and transpose %e seconds\n", time_set ); //timer_printf(" total time %e seconds\n", time_total ); //timer_printf(" Performance %f GFlop/s, %f seconds without htod and dtoh\n", flops / (time_comp), time_comp ); //timer_printf(" Performance %f GFlop/s, %f seconds with htod\n", flops / (time_comp + time_set), time_comp + time_set ); //timer_printf(" Performance %f GFlop/s, %f seconds with dtoh\n", flops / (time_comp + time_get), time_comp + time_get ); //timer_printf(" Performance %f GFlop/s, %f seconds without memory-allocation\n", flops / (time_total - time_alloc), time_total - time_alloc ); for( d=0; d < ngpu0; d++ ) { magma_setdevice(d); magma_free( dA[d] ); magma_event_destroy( event[d][0] ); magma_event_destroy( event[d][1] ); magma_queue_destroy( queues[d][0] ); magma_queue_destroy( queues[d][1] ); } magma_setdevice( orig_dev ); } if ( *info >= 0 ) magma_sgetrf_piv(m, n, NB, A, lda, ipiv, info); return *info; } /* magma_sgetrf_m */
extern "C" magma_int_t magma_dmtransfer( magma_d_matrix A, magma_d_matrix *B, magma_location_t src, magma_location_t dst, magma_queue_t queue ) { magma_int_t info = 0; B->val = NULL; B->diag = NULL; B->row = NULL; B->rowidx = NULL; B->col = NULL; B->blockinfo = NULL; B->dval = NULL; B->ddiag = NULL; B->drow = NULL; B->drowidx = NULL; B->dcol = NULL; B->diag = NULL; B->ddiag = NULL; B->list = NULL; B->dlist = NULL; // first case: copy matrix from host to device if ( src == Magma_CPU && dst == Magma_DEV ) { //CSR-type if ( A.storage_type == Magma_CSR || A.storage_type == Magma_CUCSR || A.storage_type == Magma_CSC || A.storage_type == Magma_CSRD || A.storage_type == Magma_CSRL || A.storage_type == Magma_CSRU ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_DEV; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; // memory allocation CHECK( magma_dmalloc( &B->dval, A.nnz )); CHECK( magma_index_malloc( &B->drow, A.num_rows + 1 )); CHECK( magma_index_malloc( &B->dcol, A.nnz )); // data transfer magma_dsetvector( A.nnz, A.val, 1, B->dval, 1, queue ); magma_index_setvector( A.num_rows + 1, A.row, 1, B->drow, 1, queue ); magma_index_setvector( A.nnz, A.col, 1, B->dcol, 1, queue ); } //COO-type else if ( A.storage_type == Magma_COO ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_DEV; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; // memory allocation CHECK( magma_dmalloc( &B->dval, A.nnz )); CHECK( magma_index_malloc( &B->dcol, A.nnz )); CHECK( magma_index_malloc( &B->drowidx, A.nnz )); // data transfer magma_dsetvector( A.nnz, A.val, 1, B->dval, 1, queue ); magma_index_setvector( A.nnz, A.col, 1, B->dcol, 1, queue ); magma_index_setvector( A.nnz, A.rowidx, 1, B->drowidx, 1, queue ); } //CSRCOO-type else if ( A.storage_type == Magma_CSRCOO ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_DEV; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; // memory allocation CHECK( magma_dmalloc( &B->dval, A.nnz )); CHECK( magma_index_malloc( &B->drow, A.num_rows + 1 )); CHECK( magma_index_malloc( &B->dcol, A.nnz )); CHECK( magma_index_malloc( &B->drowidx, A.nnz )); // data transfer magma_dsetvector( A.nnz, A.val, 1, B->dval, 1, queue ); magma_index_setvector( A.num_rows + 1, A.row, 1, B->drow, 1, queue ); magma_index_setvector( A.nnz, A.col, 1, B->dcol, 1, queue ); magma_index_setvector( A.nnz, A.rowidx, 1, B->drowidx, 1, queue ); } //ELL/ELLPACKT-type else if ( A.storage_type == Magma_ELLPACKT || A.storage_type == Magma_ELL ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_DEV; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; // memory allocation CHECK( magma_dmalloc( &B->dval, A.num_rows * A.max_nnz_row )); CHECK( magma_index_malloc( &B->dcol, A.num_rows * A.max_nnz_row )); // data transfer magma_dsetvector( A.num_rows * A.max_nnz_row, A.val, 1, B->dval, 1, queue ); magma_index_setvector( A.num_rows * A.max_nnz_row, A.col, 1, B->dcol, 1, queue ); } //ELLD-type else if ( A.storage_type == Magma_ELLD ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_DEV; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; // memory allocation CHECK( magma_dmalloc( &B->dval, A.num_rows * A.max_nnz_row )); CHECK( magma_index_malloc( &B->dcol, A.num_rows * A.max_nnz_row )); // data transfer magma_dsetvector( A.num_rows * A.max_nnz_row, A.val, 1, B->dval, 1, queue ); magma_index_setvector( A.num_rows * A.max_nnz_row, A.col, 1, B->dcol, 1, queue ); } //ELLRT-type else if ( A.storage_type == Magma_ELLRT ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_DEV; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; B->blocksize = A.blocksize; B->alignment = A.alignment; magma_int_t rowlength = magma_roundup( A.max_nnz_row, A.alignment ); // memory allocation CHECK( magma_dmalloc( &B->dval, A.num_rows * rowlength )); CHECK( magma_index_malloc( &B->dcol, A.num_rows * rowlength )); CHECK( magma_index_malloc( &B->drow, A.num_rows )); // data transfer magma_dsetvector( A.num_rows * rowlength, A.val, 1, B->dval, 1, queue ); magma_index_setvector( A.num_rows * rowlength, A.col, 1, B->dcol, 1, queue ); magma_index_setvector( A.num_rows, A.row, 1, B->drow, 1, queue ); } //SELLP-type else if ( A.storage_type == Magma_SELLP ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_DEV; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; B->blocksize = A.blocksize; B->numblocks = A.numblocks; B->alignment = A.alignment; // memory allocation CHECK( magma_dmalloc( &B->dval, A.nnz )); CHECK( magma_index_malloc( &B->dcol, A.nnz )); CHECK( magma_index_malloc( &B->drow, A.numblocks + 1 )); // data transfer magma_dsetvector( A.nnz, A.val, 1, B->dval, 1, queue ); magma_index_setvector( A.nnz, A.col, 1, B->dcol, 1, queue ); magma_index_setvector( A.numblocks + 1, A.row, 1, B->drow, 1, queue ); } //BCSR-type else if ( A.storage_type == Magma_BCSR ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_DEV; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; B->blocksize = A.blocksize; B->numblocks = A.numblocks; B->alignment = A.alignment; magma_int_t size_b = A.blocksize; //magma_int_t c_blocks = ceil( (float)A.num_cols / (float)size_b ); // max number of blocks per row //magma_int_t r_blocks = ceil( (float)A.num_rows / (float)size_b ); magma_int_t r_blocks = magma_ceildiv( A.num_rows, size_b ); // max number of blocks per column // memory allocation CHECK( magma_dmalloc( &B->dval, size_b * size_b * A.numblocks )); CHECK( magma_index_malloc( &B->drow, r_blocks + 1 )); CHECK( magma_index_malloc( &B->dcol, A.numblocks )); // data transfer magma_dsetvector( size_b * size_b * A.numblocks, A.val, 1, B->dval, 1, queue ); magma_index_setvector( r_blocks + 1, A.row, 1, B->drow, 1, queue ); magma_index_setvector( A.numblocks, A.col, 1, B->dcol, 1, queue ); } //DENSE-type else if ( A.storage_type == Magma_DENSE ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_DEV; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; B->major = A.major; B->ld = A.ld; // memory allocation CHECK( magma_dmalloc( &B->dval, A.num_rows * A.num_cols )); // data transfer magma_dsetvector( A.num_rows * A.num_cols, A.val, 1, B->dval, 1, queue ); } } // second case: copy matrix from host to host else if ( src == Magma_CPU && dst == Magma_CPU ) { //CSR-type if ( A.storage_type == Magma_CSR || A.storage_type == Magma_CUCSR || A.storage_type == Magma_CSC || A.storage_type == Magma_CSRD || A.storage_type == Magma_CSRL || A.storage_type == Magma_CSRU ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_CPU; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; // memory allocation CHECK( magma_dmalloc_cpu( &B->val, A.nnz )); CHECK( magma_index_malloc_cpu( &B->row, A.num_rows + 1 )); CHECK( magma_index_malloc_cpu( &B->col, A.nnz )); // data transfer for( magma_int_t i=0; i<A.nnz; i++ ) { B->val[i] = A.val[i]; B->col[i] = A.col[i]; } for( magma_int_t i=0; i<A.num_rows+1; i++ ) { B->row[i] = A.row[i]; } } //COO-type else if ( A.storage_type == Magma_COO ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_CPU; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; // memory allocation CHECK( magma_dmalloc_cpu( &B->val, A.nnz )); CHECK( magma_index_malloc_cpu( &B->col, A.nnz )); CHECK( magma_index_malloc_cpu( &B->rowidx, A.nnz )); // data transfer for( magma_int_t i=0; i<A.nnz; i++ ) { B->val[i] = A.val[i]; B->col[i] = A.col[i]; B->rowidx[i] = A.rowidx[i]; } } //CSRCOO-type else if ( A.storage_type == Magma_CSRCOO ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_CPU; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; // memory allocation CHECK( magma_dmalloc_cpu( &B->val, A.nnz )); CHECK( magma_index_malloc_cpu( &B->row, A.num_rows + 1 )); CHECK( magma_index_malloc_cpu( &B->col, A.nnz )); CHECK( magma_index_malloc_cpu( &B->rowidx, A.nnz )); // data transfer for( magma_int_t i=0; i<A.nnz; i++ ) { B->val[i] = A.val[i]; B->col[i] = A.col[i]; B->rowidx[i] = A.rowidx[i]; } for( magma_int_t i=0; i<A.num_rows+1; i++ ) { B->row[i] = A.row[i]; } } //ELL/ELLPACKT-type else if ( A.storage_type == Magma_ELLPACKT || A.storage_type == Magma_ELL ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_CPU; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; // memory allocation CHECK( magma_dmalloc_cpu( &B->val, A.num_rows * A.max_nnz_row )); CHECK( magma_index_malloc_cpu( &B->col, A.num_rows * A.max_nnz_row )); // data transfer for( magma_int_t i=0; i<A.num_rows*A.max_nnz_row; i++ ) { B->val[i] = A.val[i]; B->col[i] = A.col[i]; } } //ELLD-type else if ( A.storage_type == Magma_ELLD ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_CPU; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; // memory allocation CHECK( magma_dmalloc_cpu( &B->val, A.num_rows * A.max_nnz_row )); CHECK( magma_index_malloc_cpu( &B->col, A.num_rows * A.max_nnz_row )); // data transfer for( magma_int_t i=0; i<A.num_rows*A.max_nnz_row; i++ ) { B->val[i] = A.val[i]; B->col[i] = A.col[i]; } } //ELLRT-type else if ( A.storage_type == Magma_ELLRT ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_CPU; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; B->blocksize = A.blocksize; B->alignment = A.alignment; //int threads_per_row = A.alignment; //int rowlength = magma_roundup( A.max_nnz_row, threads_per_row ); magma_int_t rowlength = magma_roundup( A.max_nnz_row, A.alignment ); // memory allocation CHECK( magma_dmalloc_cpu( &B->val, rowlength * A.num_rows )); CHECK( magma_index_malloc_cpu( &B->row, A.num_rows )); CHECK( magma_index_malloc_cpu( &B->col, rowlength * A.num_rows )); // data transfer for( magma_int_t i=0; i<A.num_rows*rowlength; i++ ) { B->val[i] = A.val[i]; B->col[i] = A.col[i]; } for( magma_int_t i=0; i<A.num_rows; i++ ) { B->row[i] = A.row[i]; } } //SELLP-type else if ( A.storage_type == Magma_SELLP ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_CPU; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; B->blocksize = A.blocksize; B->alignment = A.alignment; B->numblocks = A.numblocks; // memory allocation CHECK( magma_dmalloc_cpu( &B->val, A.nnz )); CHECK( magma_index_malloc_cpu( &B->col, A.nnz )); CHECK( magma_index_malloc_cpu( &B->row, A.numblocks + 1 )); // data transfer for( magma_int_t i=0; i<A.nnz; i++ ) { B->val[i] = A.val[i]; B->col[i] = A.col[i]; } for( magma_int_t i=0; i<A.numblocks+1; i++ ) { B->row[i] = A.row[i]; } } //BCSR-type else if ( A.storage_type == Magma_BCSR ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_CPU; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; B->blocksize = A.blocksize; B->numblocks = A.numblocks; B->alignment = A.alignment; magma_int_t size_b = A.blocksize; //magma_int_t c_blocks = ceil( (float)A.num_cols / (float)size_b ); // max number of blocks per row //magma_int_t r_blocks = ceil( (float)A.num_rows / (float)size_b ); magma_int_t r_blocks = magma_ceildiv( A.num_rows, size_b ); // max number of blocks per column // memory allocation CHECK( magma_dmalloc_cpu( &B->val, size_b * size_b * A.numblocks )); CHECK( magma_index_malloc_cpu( &B->row, r_blocks + 1 )); CHECK( magma_index_malloc_cpu( &B->col, A.numblocks )); // data transfer //magma_dsetvector( size_b * size_b * A.numblocks, A.val, 1, B->dval, 1, queue ); for( magma_int_t i=0; i<size_b*size_b*A.numblocks; i++ ) { B->dval[i] = A.val[i]; } //magma_index_setvector( r_blocks + 1, A.row, 1, B->drow, 1, queue ); for( magma_int_t i=0; i<r_blocks+1; i++ ) { B->drow[i] = A.row[i]; } //magma_index_setvector( A.numblocks, A.col, 1, B->dcol, 1, queue ); for( magma_int_t i=0; i<A.numblocks; i++ ) { B->dcol[i] = A.col[i]; } } //DENSE-type else if ( A.storage_type == Magma_DENSE ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_CPU; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; B->major = A.major; B->ld = A.ld; // memory allocation CHECK( magma_dmalloc_cpu( &B->val, A.num_rows * A.num_cols )); // data transfer for( magma_int_t i=0; i<A.num_rows*A.num_cols; i++ ) { B->val[i] = A.val[i]; } } } // third case: copy matrix from device to host else if ( src == Magma_DEV && dst == Magma_CPU ) { //CSR-type if ( A.storage_type == Magma_CSR || A.storage_type == Magma_CUCSR || A.storage_type == Magma_CSC || A.storage_type == Magma_CSRD || A.storage_type == Magma_CSRL || A.storage_type == Magma_CSRU ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_CPU; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; // memory allocation CHECK( magma_dmalloc_cpu( &B->val, A.nnz )); CHECK( magma_index_malloc_cpu( &B->row, A.num_rows + 1 )); CHECK( magma_index_malloc_cpu( &B->col, A.nnz )); // data transfer magma_dgetvector( A.nnz, A.dval, 1, B->val, 1, queue ); magma_index_getvector( A.num_rows + 1, A.drow, 1, B->row, 1, queue ); magma_index_getvector( A.nnz, A.dcol, 1, B->col, 1, queue ); } //COO-type else if ( A.storage_type == Magma_COO ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_CPU; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; // memory allocation CHECK( magma_dmalloc_cpu( &B->val, A.nnz )); CHECK( magma_index_malloc_cpu( &B->col, A.nnz )); CHECK( magma_index_malloc_cpu( &B->rowidx, A.nnz )); // data transfer magma_dgetvector( A.nnz, A.dval, 1, B->val, 1, queue ); magma_index_getvector( A.nnz, A.dcol, 1, B->col, 1, queue ); magma_index_getvector( A.nnz, A.drowidx, 1, B->rowidx, 1, queue ); } //CSRCOO-type else if ( A.storage_type == Magma_CSRCOO ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_CPU; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; // memory allocation CHECK( magma_dmalloc_cpu( &B->val, A.nnz )); CHECK( magma_index_malloc_cpu( &B->row, A.num_rows + 1 )); CHECK( magma_index_malloc_cpu( &B->col, A.nnz )); CHECK( magma_index_malloc_cpu( &B->rowidx, A.nnz )); // data transfer magma_dgetvector( A.nnz, A.dval, 1, B->val, 1, queue ); magma_index_getvector( A.num_rows + 1, A.drow, 1, B->row, 1, queue ); magma_index_getvector( A.nnz, A.dcol, 1, B->col, 1, queue ); magma_index_getvector( A.nnz, A.drowidx, 1, B->rowidx, 1, queue ); } //ELL/ELLPACKT-type else if ( A.storage_type == Magma_ELLPACKT || A.storage_type == Magma_ELL ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_CPU; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; // memory allocation CHECK( magma_dmalloc_cpu( &B->val, A.num_rows * A.max_nnz_row )); CHECK( magma_index_malloc_cpu( &B->col, A.num_rows * A.max_nnz_row )); // data transfer magma_dgetvector( A.num_rows * A.max_nnz_row, A.dval, 1, B->val, 1, queue ); magma_index_getvector( A.num_rows * A.max_nnz_row, A.dcol, 1, B->col, 1, queue ); } //ELLD-type else if ( A.storage_type == Magma_ELLD ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_CPU; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; // memory allocation CHECK( magma_dmalloc_cpu( &B->val, A.num_rows * A.max_nnz_row )); CHECK( magma_index_malloc_cpu( &B->col, A.num_rows * A.max_nnz_row )); // data transfer magma_dgetvector( A.num_rows * A.max_nnz_row, A.dval, 1, B->val, 1, queue ); magma_index_getvector( A.num_rows * A.max_nnz_row, A.dcol, 1, B->col, 1, queue ); } //ELLRT-type else if ( A.storage_type == Magma_ELLRT ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_CPU; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; B->blocksize = A.blocksize; B->alignment = A.alignment; //int threads_per_row = A.alignment; //int rowlength = magma_roundup( A.max_nnz_row, threads_per_row ); magma_int_t rowlength = magma_roundup( A.max_nnz_row, A.alignment ); // memory allocation CHECK( magma_dmalloc_cpu( &B->val, rowlength * A.num_rows )); CHECK( magma_index_malloc_cpu( &B->row, A.num_rows )); CHECK( magma_index_malloc_cpu( &B->col, rowlength * A.num_rows )); // data transfer magma_dgetvector( A.num_rows * rowlength, A.dval, 1, B->val, 1, queue ); magma_index_getvector( A.num_rows * rowlength, A.dcol, 1, B->col, 1, queue ); magma_index_getvector( A.num_rows, A.drow, 1, B->row, 1, queue ); } //SELLP-type else if ( A.storage_type == Magma_SELLP ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_CPU; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; B->blocksize = A.blocksize; B->numblocks = A.numblocks; B->alignment = A.alignment; // memory allocation CHECK( magma_dmalloc_cpu( &B->val, A.nnz )); CHECK( magma_index_malloc_cpu( &B->col, A.nnz )); CHECK( magma_index_malloc_cpu( &B->row, A.numblocks + 1 )); // data transfer magma_dgetvector( A.nnz, A.dval, 1, B->val, 1, queue ); magma_index_getvector( A.nnz, A.dcol, 1, B->col, 1, queue ); magma_index_getvector( A.numblocks + 1, A.drow, 1, B->row, 1, queue ); } //BCSR-type else if ( A.storage_type == Magma_BCSR ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_CPU; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; B->blocksize = A.blocksize; B->numblocks = A.numblocks; B->alignment = A.alignment; magma_int_t size_b = A.blocksize; //magma_int_t c_blocks = ceil( (float)A.num_cols / (float)size_b ); // max number of blocks per row //magma_int_t r_blocks = ceil( (float)A.num_rows / (float)size_b ); magma_int_t r_blocks = magma_ceildiv( A.num_rows, size_b ); // max number of blocks per column // memory allocation CHECK( magma_dmalloc_cpu( &B->val, size_b * size_b * A.numblocks )); CHECK( magma_index_malloc_cpu( &B->row, r_blocks + 1 )); CHECK( magma_index_malloc_cpu( &B->col, A.numblocks )); // data transfer magma_dgetvector( size_b * size_b * A.numblocks, A.dval, 1, B->val, 1, queue ); magma_index_getvector( r_blocks + 1, A.drow, 1, B->row, 1, queue ); magma_index_getvector( A.numblocks, A.dcol, 1, B->col, 1, queue ); } //DENSE-type else if ( A.storage_type == Magma_DENSE ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_CPU; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; B->major = A.major; B->ld = A.ld; // memory allocation CHECK( magma_dmalloc_cpu( &B->val, A.num_rows * A.num_cols )); // data transfer magma_dgetvector( A.num_rows * A.num_cols, A.dval, 1, B->val, 1, queue ); } } // fourth case: copy matrix from device to device else if ( src == Magma_DEV && dst == Magma_DEV ) { //CSR-type if ( A.storage_type == Magma_CSR || A.storage_type == Magma_CUCSR || A.storage_type == Magma_CSC || A.storage_type == Magma_CSRD || A.storage_type == Magma_CSRL || A.storage_type == Magma_CSRU ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_DEV; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; // memory allocation CHECK( magma_dmalloc( &B->dval, A.nnz )); CHECK( magma_index_malloc( &B->drow, A.num_rows + 1 )); CHECK( magma_index_malloc( &B->dcol, A.nnz )); // data transfer magma_dcopyvector( A.nnz, A.dval, 1, B->dval, 1, queue ); magma_index_copyvector( A.num_rows + 1, A.drow, 1, B->drow, 1, queue ); magma_index_copyvector( A.nnz, A.dcol, 1, B->dcol, 1, queue ); } //COO-type else if ( A.storage_type == Magma_COO ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_DEV; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; // memory allocation CHECK( magma_dmalloc( &B->dval, A.nnz )); CHECK( magma_index_malloc( &B->dcol, A.nnz )); CHECK( magma_index_malloc( &B->drowidx, A.nnz )); // data transfer magma_dcopyvector( A.nnz, A.dval, 1, B->dval, 1, queue ); magma_index_copyvector( A.nnz, A.dcol, 1, B->dcol, 1, queue ); magma_index_copyvector( A.nnz, A.drowidx, 1, B->drowidx, 1, queue ); } //CSRCOO-type else if ( A.storage_type == Magma_CSRCOO ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_DEV; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; // memory allocation CHECK( magma_dmalloc( &B->dval, A.nnz )); CHECK( magma_index_malloc( &B->drow, A.num_rows + 1 )); CHECK( magma_index_malloc( &B->dcol, A.nnz )); CHECK( magma_index_malloc( &B->drowidx, A.nnz )); // data transfer magma_dcopyvector( A.nnz, A.dval, 1, B->dval, 1, queue ); magma_index_copyvector( A.num_rows + 1, A.drow, 1, B->drow, 1, queue ); magma_index_copyvector( A.nnz, A.dcol, 1, B->dcol, 1, queue ); magma_index_copyvector( A.nnz, A.drowidx, 1, B->drowidx, 1, queue ); } //ELL/ELLPACKT-type else if ( A.storage_type == Magma_ELLPACKT || A.storage_type == Magma_ELL ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_DEV; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; // memory allocation CHECK( magma_dmalloc( &B->dval, A.num_rows * A.max_nnz_row )); CHECK( magma_index_malloc( &B->dcol, A.num_rows * A.max_nnz_row )); // data transfer magma_dcopyvector( A.num_rows * A.max_nnz_row, A.dval, 1, B->dval, 1, queue ); magma_index_copyvector( A.num_rows * A.max_nnz_row, A.dcol, 1, B->dcol, 1, queue ); } //ELLD-type else if ( A.storage_type == Magma_ELLD ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_DEV; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; // memory allocation CHECK( magma_dmalloc( &B->dval, A.num_rows * A.max_nnz_row )); CHECK( magma_index_malloc( &B->dcol, A.num_rows * A.max_nnz_row )); // data transfer magma_dcopyvector( A.num_rows * A.max_nnz_row, A.dval, 1, B->dval, 1, queue ); magma_index_copyvector( A.num_rows * A.max_nnz_row, A.dcol, 1, B->dcol, 1, queue ); } //ELLRT-type else if ( A.storage_type == Magma_ELLRT ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_DEV; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; B->blocksize = A.blocksize; B->alignment = A.alignment; //int threads_per_row = A.alignment; //int rowlength = magma_roundup( A.max_nnz_row, threads_per_row ); magma_int_t rowlength = magma_roundup( A.max_nnz_row, A.alignment ); // memory allocation CHECK( magma_dmalloc( &B->dval, A.num_rows * rowlength )); CHECK( magma_index_malloc( &B->dcol, A.num_rows * rowlength )); CHECK( magma_index_malloc( &B->drow, A.num_rows )); // data transfer magma_dcopyvector( A.num_rows * rowlength, A.dval, 1, B->dval, 1, queue ); magma_index_copyvector( A.num_rows * rowlength, A.dcol, 1, B->dcol, 1, queue ); magma_index_copyvector( A.num_rows, A.drow, 1, B->drow, 1, queue ); } //SELLP-type else if ( A.storage_type == Magma_SELLP ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_DEV; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; B->blocksize = A.blocksize; B->numblocks = A.numblocks; B->alignment = A.alignment; // memory allocation CHECK( magma_dmalloc( &B->dval, A.nnz )); CHECK( magma_index_malloc( &B->dcol, A.nnz )); CHECK( magma_index_malloc( &B->drow, A.numblocks + 1 )); // data transfer magma_dcopyvector( A.nnz, A.dval, 1, B->dval, 1, queue ); magma_index_copyvector( A.nnz, A.dcol, 1, B->dcol, 1, queue ); magma_index_copyvector( A.numblocks + 1, A.drow, 1, B->drow, 1, queue ); } //BCSR-type else if ( A.storage_type == Magma_BCSR ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_DEV; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; B->blocksize = A.blocksize; B->numblocks = A.numblocks; B->alignment = A.alignment; magma_int_t size_b = A.blocksize; //magma_int_t c_blocks = ceil( (float)A.num_cols / (float)size_b ); // max number of blocks per row //magma_int_t r_blocks = ceil( (float)A.num_rows / (float)size_b ); magma_int_t r_blocks = magma_ceildiv( A.num_rows, size_b ); // max number of blocks per column // memory allocation CHECK( magma_dmalloc( &B->dval, size_b * size_b * A.numblocks )); CHECK( magma_index_malloc( &B->drow, r_blocks + 1 )); CHECK( magma_index_malloc( &B->dcol, A.numblocks )); // data transfer magma_dcopyvector( size_b * size_b * A.numblocks, A.dval, 1, B->dval, 1, queue ); magma_index_copyvector( r_blocks + 1, A.drow, 1, B->drow, 1, queue ); magma_index_copyvector( A.numblocks, A.dcol, 1, B->dcol, 1, queue ); } //DENSE-type else if ( A.storage_type == Magma_DENSE ) { // fill in information for B B->storage_type = A.storage_type; B->memory_location = Magma_DEV; B->sym = A.sym; B->diagorder_type = A.diagorder_type; B->fill_mode = A.fill_mode; B->num_rows = A.num_rows; B->num_cols = A.num_cols; B->nnz = A.nnz; B->true_nnz = A.true_nnz; B->max_nnz_row = A.max_nnz_row; B->diameter = A.diameter; B->major = A.major; B->ld = A.ld; // memory allocation CHECK( magma_dmalloc( &B->dval, A.num_rows * A.num_cols )); // data transfer magma_dcopyvector( A.num_rows * A.num_cols, A.dval, 1, B->dval, 1, queue ); } } cleanup: if( info != 0 ){ magma_dmfree( B, queue ); } return info; }
/** Purpose ------- SSYEVD_GPU computes all eigenvalues and, optionally, eigenvectors of a real symmetric matrix A. If eigenvectors are desired, it uses a divide and conquer algorithm. The divide and conquer algorithm makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments --------- @param[in] jobz magma_vec_t - = MagmaNoVec: Compute eigenvalues only; - = MagmaVec: Compute eigenvalues and eigenvectors. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] dA REAL array on the GPU, dimension (LDDA, N). On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. On exit, if JOBZ = MagmaVec, then if INFO = 0, A contains the orthonormal eigenvectors of the matrix A. If JOBZ = MagmaNoVec, then on exit the lower triangle (if UPLO=MagmaLower) or the upper triangle (if UPLO=MagmaUpper) of A, including the diagonal, is destroyed. @param[in] ldda INTEGER The leading dimension of the array DA. LDDA >= max(1,N). @param[out] w REAL array, dimension (N) If INFO = 0, the eigenvalues in ascending order. @param wA (workspace) REAL array, dimension (LDWA, N) @param[in] ldwa INTEGER The leading dimension of the array wA. LDWA >= max(1,N). @param[out] work (workspace) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The length of the array WORK. If N <= 1, LWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LWORK >= 2*N + N*NB. If JOBZ = MagmaVec and N > 1, LWORK >= max( 2*N + N*NB, 1 + 6*N + 2*N**2 ). NB can be obtained through magma_get_ssytrd_nb(N). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK and IWORK arrays, returns these values as the first entries of the WORK and IWORK arrays, and no error message related to LWORK or LIWORK is issued by XERBLA. @param[out] iwork (workspace) INTEGER array, dimension (MAX(1,LIWORK)) On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK. @param[in] liwork INTEGER The dimension of the array IWORK. If N <= 1, LIWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LIWORK >= 1. If JOBZ = MagmaVec and N > 1, LIWORK >= 3 + 5*N. \n If LIWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK and IWORK arrays, returns these values as the first entries of the WORK and IWORK arrays, and no error message related to LWORK or LIWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i and JOBZ = MagmaNoVec, then the algorithm failed to converge; i off-diagonal elements of an intermediate tridiagonal form did not converge to zero; if INFO = i and JOBZ = MagmaVec, then the algorithm failed to compute an eigenvalue while working on the submatrix lying in rows and columns INFO/(N+1) through mod(INFO,N+1). Further Details --------------- Based on contributions by Jeff Rutter, Computer Science Division, University of California at Berkeley, USA Modified description of INFO. Sven, 16 Feb 05. @ingroup magma_ssyev_driver ********************************************************************/ extern "C" magma_int_t magma_ssyevd_gpu( magma_vec_t jobz, magma_uplo_t uplo, magma_int_t n, magmaFloat_ptr dA, magma_int_t ldda, float *w, float *wA, magma_int_t ldwa, float *work, magma_int_t lwork, #ifdef COMPLEX float *rwork, magma_int_t lrwork, #endif magma_int_t *iwork, magma_int_t liwork, magma_int_t *info) { magma_int_t ione = 1; float d__1; float eps; magma_int_t inde; float anrm; float rmin, rmax; float sigma; magma_int_t iinfo, lwmin; magma_int_t lower; magma_int_t wantz; magma_int_t indwk2, llwrk2; magma_int_t iscale; float safmin; float bignum; magma_int_t indtau; magma_int_t indwrk, liwmin; magma_int_t llwork; float smlnum; magma_int_t lquery; magmaFloat_ptr dwork; magma_int_t lddc = ldda; wantz = (jobz == MagmaVec); lower = (uplo == MagmaLower); lquery = (lwork == -1 || liwork == -1); *info = 0; if (! (wantz || (jobz == MagmaNoVec))) { *info = -1; } else if (! (lower || (uplo == MagmaUpper))) { *info = -2; } else if (n < 0) { *info = -3; } else if (ldda < max(1,n)) { *info = -5; } magma_int_t nb = magma_get_ssytrd_nb( n ); if ( n <= 1 ) { lwmin = 1; liwmin = 1; } else if ( wantz ) { lwmin = max( 2*n + n*nb, 1 + 6*n + 2*n*n ); liwmin = 3 + 5*n; } else { lwmin = 2*n + n*nb; liwmin = 1; } work[0] = magma_smake_lwork( lwmin ); iwork[0] = liwmin; if ((lwork < lwmin) && !lquery) { *info = -10; } else if ((liwork < liwmin) && ! lquery) { *info = -12; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); /* If matrix is very small, then just call LAPACK on CPU, no need for GPU */ if (n <= 128) { magma_int_t lda = n; float *A; magma_smalloc_cpu( &A, lda*n ); magma_sgetmatrix( n, n, dA, ldda, A, lda, queue ); lapackf77_ssyevd( lapack_vec_const(jobz), lapack_uplo_const(uplo), &n, A, &lda, w, work, &lwork, iwork, &liwork, info ); magma_ssetmatrix( n, n, A, lda, dA, ldda, queue ); magma_free_cpu( A ); magma_queue_destroy( queue ); return *info; } // ssytrd2_gpu requires ldda*ceildiv(n,64) + 2*ldda*nb // sormtr_gpu requires lddc*n // slansy requires n magma_int_t ldwork = max( ldda*magma_ceildiv(n,64) + 2*ldda*nb, lddc*n ); ldwork = max( ldwork, n ); if ( wantz ) { // sstedx requires 3n^2/2 ldwork = max( ldwork, 3*n*(n/2 + 1) ); } if (MAGMA_SUCCESS != magma_smalloc( &dwork, ldwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* Get machine constants. */ safmin = lapackf77_slamch("Safe minimum"); eps = lapackf77_slamch("Precision"); smlnum = safmin / eps; bignum = 1. / smlnum; rmin = magma_ssqrt( smlnum ); rmax = magma_ssqrt( bignum ); /* Scale matrix to allowable range, if necessary. */ anrm = magmablas_slansy( MagmaMaxNorm, uplo, n, dA, ldda, dwork, ldwork, queue ); iscale = 0; sigma = 1; if (anrm > 0. && anrm < rmin) { iscale = 1; sigma = rmin / anrm; } else if (anrm > rmax) { iscale = 1; sigma = rmax / anrm; } if (iscale == 1) { magmablas_slascl( uplo, 0, 0, 1., sigma, n, n, dA, ldda, queue, info ); } /* Call SSYTRD to reduce symmetric matrix to tridiagonal form. */ // ssytrd work: e (n) + tau (n) + llwork (n*nb) ==> 2n + n*nb // sstedx work: e (n) + tau (n) + z (n*n) + llwrk2 (1 + 4*n + n^2) ==> 1 + 6n + 2n^2 inde = 0; indtau = inde + n; indwrk = indtau + n; indwk2 = indwrk + n*n; llwork = lwork - indwrk; llwrk2 = lwork - indwk2; magma_timer_t time=0; timer_start( time ); #ifdef FAST_SYMV magma_ssytrd2_gpu( uplo, n, dA, ldda, w, &work[inde], &work[indtau], wA, ldwa, &work[indwrk], llwork, dwork, ldwork, &iinfo ); #else magma_ssytrd_gpu( uplo, n, dA, ldda, w, &work[inde], &work[indtau], wA, ldwa, &work[indwrk], llwork, &iinfo ); #endif timer_stop( time ); #ifdef FAST_SYMV timer_printf( "time ssytrd2 = %6.2f\n", time ); #else timer_printf( "time ssytrd = %6.2f\n", time ); #endif /* For eigenvalues only, call SSTERF. For eigenvectors, first call SSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the tridiagonal matrix, then call SORMTR to multiply it to the Householder transformations represented as Householder vectors in A. */ if (! wantz) { lapackf77_ssterf( &n, w, &work[inde], info ); } else { timer_start( time ); magma_sstedx( MagmaRangeAll, n, 0., 0., 0, 0, w, &work[inde], &work[indwrk], n, &work[indwk2], llwrk2, iwork, liwork, dwork, info ); timer_stop( time ); timer_printf( "time sstedx = %6.2f\n", time ); timer_start( time ); magma_ssetmatrix( n, n, &work[indwrk], n, dwork, lddc, queue ); magma_sormtr_gpu( MagmaLeft, uplo, MagmaNoTrans, n, n, dA, ldda, &work[indtau], dwork, lddc, wA, ldwa, &iinfo ); magma_scopymatrix( n, n, dwork, lddc, dA, ldda, queue ); timer_stop( time ); timer_printf( "time sormtr + copy = %6.2f\n", time ); } /* If matrix was scaled, then rescale eigenvalues appropriately. */ if (iscale == 1) { d__1 = 1. / sigma; blasf77_sscal( &n, &d__1, w, &ione ); } work[0] = magma_smake_lwork( lwmin ); iwork[0] = liwmin; magma_queue_destroy( queue ); magma_free( dwork ); return *info; } /* magma_ssyevd_gpu */
/** Purpose ------- CHETRD reduces a complex Hermitian matrix A to real symmetric tridiagonal form T by an orthogonal similarity transformation: Q**H * A * Q = T. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] nqueue INTEGER The number of GPU queues used for update. 10 >= nqueue > 0. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A COMPLEX array, dimension (LDA,N) On entry, the Hermitian matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if UPLO = MagmaUpper, the diagonal and first superdiagonal of A are overwritten by the corresponding elements of the tridiagonal matrix T, and the elements above the first superdiagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; if UPLO = MagmaLower, the diagonal and first subdiagonal of A are over- written by the corresponding elements of the tridiagonal matrix T, and the elements below the first subdiagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] d COMPLEX array, dimension (N) The diagonal elements of the tridiagonal matrix T: D(i) = A(i,i). @param[out] e COMPLEX array, dimension (N-1) The off-diagonal elements of the tridiagonal matrix T: E(i) = A(i,i+1) if UPLO = MagmaUpper, E(i) = A(i+1,i) if UPLO = MagmaLower. @param[out] tau COMPLEX array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) COMPLEX array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= N*NB, where NB is the optimal blocksize given by magma_get_chetrd_nb(). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value Further Details --------------- If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary reflectors Q = H(n-1) . . . H(2) H(1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in A(1:i-1,i+1), and tau in TAU(i). If UPLO = MagmaLower, the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(n-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i), and tau in TAU(i). The contents of A on exit are illustrated by the following examples with n = 5: if UPLO = MagmaUpper: if UPLO = MagmaLower: ( d e v2 v3 v4 ) ( d ) ( d e v3 v4 ) ( e d ) ( d e v4 ) ( v1 e d ) ( d e ) ( v1 v2 e d ) ( d ) ( v1 v2 v3 e d ) where d and e denote diagonal and off-diagonal elements of T, and vi denotes an element of the vector defining H(i). @ingroup magma_cheev_comp ********************************************************************/ extern "C" magma_int_t magma_chetrd_mgpu( magma_int_t ngpu, magma_int_t nqueue, magma_uplo_t uplo, magma_int_t n, magmaFloatComplex *A, magma_int_t lda, float *d, float *e, magmaFloatComplex *tau, magmaFloatComplex *work, magma_int_t lwork, magma_int_t *info) { #define A(i, j) (A + (j)*lda + (i)) #define dA(id, i, j) (dA[(id)] + (j)*ldda + (i)) #define dW(id, i, j) (dW[(id)] + (j)*ldda + (i)) /* Constants */ const magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; const magmaFloatComplex c_one = MAGMA_C_ONE; const float d_one = MAGMA_D_ONE; /* Local variables */ const char* uplo_ = lapack_uplo_const( uplo ); magma_int_t nlocal, ldda; magma_int_t nb = magma_get_chetrd_nb(n), ib, ib2; #ifdef PROFILE_SY2RK float mv_time = 0.0; float up_time = 0.0; #endif magma_int_t kk, nx; magma_int_t i, ii, iii, j, dev, i_n; magma_int_t iinfo; magma_int_t ldwork, lddw, lwkopt, ldwork2, lhwork; // set pointers to NULL so it is safe to goto CLEANUP if any malloc fails. magma_queue_t queues[MagmaMaxGPUs][10] = { { NULL, NULL } }; magma_queue_t queues0[MagmaMaxGPUs] = { NULL }; magmaFloatComplex *hwork = NULL; magmaFloatComplex_ptr dwork2[MagmaMaxGPUs] = { NULL }; magmaFloatComplex_ptr dA[MagmaMaxGPUs] = { NULL }; magmaFloatComplex_ptr dW[MagmaMaxGPUs] = { NULL }; *info = 0; bool upper = (uplo == MagmaUpper); bool lquery = (lwork == -1); if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } else if (lwork < nb*n && ! lquery) { *info = -9; } else if ( nqueue > 2 ) { *info = 2; // TODO fix } /* Determine the block size. */ ldwork = n; lwkopt = n * nb; if (*info == 0) { work[0] = magma_cmake_lwork( lwkopt ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { work[0] = c_one; return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); //#define PROFILE_SY2RK #ifdef PROFILE_SY2RK float times[11] = { 0 }; magma_event_t start, stop; float etime; magma_setdevice( 0 ); magma_event_create( &start ); magma_event_create( &stop ); #endif ldda = magma_roundup( lda, 32 ); lddw = ldda; nlocal = nb*(1 + n/(nb*ngpu)); ldwork2 = ldda*( magma_ceildiv( n, nb ) + 1); // i.e., ldda*(blocks + 1) for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); // TODO fix memory leak if ( MAGMA_SUCCESS != magma_cmalloc( &dA[dev], nlocal*ldda + 3*lddw*nb ) || MAGMA_SUCCESS != magma_cmalloc( &dwork2[dev], ldwork2 ) ) { *info = MAGMA_ERR_DEVICE_ALLOC; goto CLEANUP; } dW[dev] = dA[dev] + nlocal*ldda; for( kk=0; kk < nqueue; kk++ ) { magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queues[dev][kk] ); } queues0[dev] = queues[dev][0]; } lhwork = nqueue*ngpu*n; if ( MAGMA_SUCCESS != magma_cmalloc_pinned( &hwork, lhwork ) ) { *info = MAGMA_ERR_HOST_ALLOC; goto CLEANUP; } // nx <= n is required // use LAPACK for n < 3000, otherwise switch at 512 if (n < 3000) nx = n; else nx = 512; if (upper) { /* Copy the matrix to the GPU */ if (1 <= n-nx) { magma_chtodhe( ngpu, uplo, n, nb, A, lda, dA, ldda, queues, &iinfo ); } /* Reduce the upper triangle of A. Columns 1:kk are handled by the unblocked method. */ for (i = nb*((n-1)/nb); i >= nx; i -= nb) { ib = min(nb, n-i); ii = nb*(i/(nb*ngpu)); dev = (i/nb)%ngpu; /* wait for the next panel */ if (i != nb*((n-1)/nb)) { magma_setdevice( dev ); magma_queue_sync( queues[dev][0] ); } magma_clatrd_mgpu( ngpu, uplo, i+ib, ib, nb, A(0, 0), lda, e, tau, work, ldwork, dA, ldda, 0, dW, i+ib, hwork, lhwork, dwork2, ldwork2, queues0 ); magma_cher2k_mgpu( ngpu, MagmaUpper, MagmaNoTrans, nb, i, ib, c_neg_one, dW, i+ib, 0, d_one, dA, ldda, 0, nqueue, queues ); /* get the next panel */ if (i-nb >= nx ) { ib2 = min(nb, n-(i-nb)); ii = nb*((i-nb)/(nb*ngpu)); dev = ((i-nb)/nb)%ngpu; magma_setdevice( dev ); magma_cgetmatrix_async( (i-nb)+ib2, ib2, dA(dev, 0, ii), ldda, A(0, i-nb), lda, queues[dev][0] ); } /* Copy superdiagonal elements back into A, and diagonal elements into D */ for (j = i; j < i+ib; ++j) { if ( j > 0 ) { *A(j-1,j) = MAGMA_C_MAKE( e[j - 1], 0 ); } d[j] = MAGMA_C_REAL( *A(j, j) ); } } /* end of for i=... */ if ( nx > 0 ) { if (1 <= n-nx) { /* else A is already on CPU */ for (i=0; i < nx; i += nb) { ib = min(nb, n-i); ii = nb*(i/(nb*ngpu)); dev = (i/nb)%ngpu; magma_setdevice( dev ); magma_cgetmatrix_async( nx, ib, dA(dev, 0, ii), ldda, A(0, i), lda, queues[dev][0] ); } } for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); magma_queue_sync( queues[dev][0] ); } /* Use CPU code to reduce the last or only block */ lapackf77_chetrd( uplo_, &nx, A(0, 0), &lda, d, e, tau, work, &lwork, &iinfo ); } } else { trace_init( 1, ngpu, nqueue, queues ); /* Copy the matrix to the GPU */ if (1 <= n-nx) { magma_chtodhe( ngpu, uplo, n, nb, A, lda, dA, ldda, queues, &iinfo ); } /* Reduce the lower triangle of A */ for (i = 0; i < n-nx; i += nb) { ib = min(nb, n-i); ii = nb*(i/(nb*ngpu)); dev = (i/nb)%ngpu; /* Reduce columns i:i+ib-1 to tridiagonal form and form the matrix W which is needed to update the unreduced part of the matrix */ /* Get the current panel (no need for the 1st iteration) */ if (i != 0) { magma_setdevice( dev ); trace_gpu_start( dev, 0, "comm", "get" ); magma_cgetmatrix_async( n-i, ib, dA(dev, i, ii), ldda, A(i,i), lda, queues[dev][0] ); trace_gpu_end( dev, 0 ); magma_queue_sync( queues[dev][0] ); magma_setdevice( 0 ); } magma_clatrd_mgpu( ngpu, uplo, n-i, ib, nb, A(i, i), lda, &e[i], &tau[i], work, ldwork, dA, ldda, i, dW, n-i, hwork, lhwork, dwork2, ldwork2, queues0 ); #ifdef PROFILE_SY2RK magma_setdevice( 0 ); if ( i > 0 ) { cudaEventElapsedTime( &etime, start, stop ); up_time += (etime/1000.0); } magma_event_record( start, 0 ); #endif magma_cher2k_mgpu( ngpu, MagmaLower, MagmaNoTrans, nb, n-i-ib, ib, c_neg_one, dW, n-i, ib, d_one, dA, ldda, i+ib, nqueue, queues ); #ifdef PROFILE_SY2RK magma_setdevice( 0 ); magma_event_record( stop, 0 ); #endif /* Copy subdiagonal elements back into A, and diagonal elements into D */ for (j = i; j < i+ib; ++j) { if ( j+1 < n ) { *A(j+1,j) = MAGMA_C_MAKE( e[j], 0 ); } d[j] = MAGMA_C_REAL( *A(j, j) ); } } /* for i=... */ /* Use CPU code to reduce the last or only block */ if ( i < n ) { iii = i; i_n = n-i; if ( i > 0 ) { for (; i < n; i += nb) { ib = min(nb, n-i); ii = nb*(i/(nb*ngpu)); dev = (i/nb)%ngpu; magma_setdevice( dev ); magma_cgetmatrix_async( i_n, ib, dA(dev, iii, ii), ldda, A(iii, i), lda, queues[dev][0] ); } for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); magma_queue_sync( queues[dev][0] ); } } lapackf77_chetrd( uplo_, &i_n, A(iii, iii), &lda, &d[iii], &e[iii], &tau[iii], work, &lwork, &iinfo ); } } for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); for( kk=0; kk < nqueue; kk++ ) { magma_queue_sync( queues[dev][kk] ); } } #ifdef PROFILE_SY2RK magma_setdevice( 0 ); if ( n > nx ) { cudaEventElapsedTime( &etime, start, stop ); up_time += (etime/1000.0); } magma_event_destroy( start ); magma_event_destroy( stop ); #endif trace_finalize( "chetrd.svg", "trace.css" ); #ifdef PROFILE_SY2RK printf( " n=%d nb=%d\n", n, nb ); printf( " Time in CLARFG: %.2e seconds\n", times[0] ); //printf( " Time in CHEMV : %.2e seconds\n", mv_time ); printf( " Time in CHER2K: %.2e seconds\n", up_time ); #endif CLEANUP: for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); for( kk=0; kk < nqueue; kk++ ) { magma_queue_destroy( queues[dev][kk] ); } magma_free( dA[dev] ); magma_free( dwork2[dev] ); } magma_free_pinned( hwork ); magma_setdevice( orig_dev ); work[0] = magma_cmake_lwork( lwkopt ); return *info; } /* magma_chetrd */
extern "C" magma_int_t magma_zbulge_applyQ_v2_m( magma_int_t ngpu, magma_side_t side, magma_int_t NE, magma_int_t N, magma_int_t NB, magma_int_t Vblksiz, magmaDoubleComplex *E, magma_int_t lde, magmaDoubleComplex *V, magma_int_t ldv, magmaDoubleComplex *T, magma_int_t ldt, magma_int_t *info) { //%=========================== //% local variables //%=========================== magma_int_t Vm, Vn, mt, nt; magma_int_t myrow, mycol, blkj, blki; magma_int_t blkid,vpos,tpos; magma_int_t firstrow, nbcolinvolvd; magma_int_t versionL = 113; magma_int_t versionR = 92; magma_int_t Vchunksiz = 10; *info=0; /* Quick return */ if ( NE == 0 ) { return *info; } if ( N == 0 ) { return *info; } if ( NB == 0 ) { return *info; } /* ========================================== * some infos for developer * Initialisation and checking nb of cores * ==========================================*/ /* we have 2 algo for left (113 114) and 2 algo for right (91 92) * which correspond to versionL versionR. * They are very similar (detail explained in tech report and matlab code) * however version 114 and 92 improve locality. * while version 113 is used in case WNATZ=1 (construct Q2) which allow * the construction to be done in an optimized way taking into * consideration that the matrix is Identity so making less flops. * */ // Initialize streaming and events magma_device_sync(); magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); magma_int_t nbevents =2, nstream=2; magma_queue_t streams[MagmaMaxGPUs][20]; magma_event_t myevent[MagmaMaxGPUs][20]; for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); for( magma_int_t i = 0; i < nstream; ++i ) { magma_queue_create( &streams[dev][i] ); } for( magma_int_t i = 0; i < nbevents; ++i ) { cudaEventCreateWithFlags(&myevent[dev][i],cudaEventDisableTiming); } } // Azzam 21/11/2012 // NOTE THAT dwork was of size 2*NE*Vblksiz+... // but I am thinking why not modifing it to NE*Vblksiz+... // BUT NO because the 2* is used because of making 2 streams working and so // they might be using dwork in parallel magmaDoubleComplex *dE[MagmaMaxGPUs]; magmaDoubleComplex *dwork[MagmaMaxGPUs], *dwork0[MagmaMaxGPUs], *dwork1[MagmaMaxGPUs]; //magmaDoubleComplex *dwvt[MagmaMaxGPUs]; magmaDoubleComplex *dwvt0[MagmaMaxGPUs], *dwvt1[MagmaMaxGPUs]; magmaDoubleComplex *dT0[MagmaMaxGPUs], *dV0[MagmaMaxGPUs], *dT1[MagmaMaxGPUs], *dV1[MagmaMaxGPUs]; magma_int_t dev; magma_int_t ldde = N; magma_int_t lddv = ldv; magma_int_t lddt = ldt; magma_int_t ne_loc = magma_ceildiv(NE, ngpu); if (ne_loc < 256) ne_loc=256; magma_int_t dwVTsiz = lddv*Vblksiz; // lddv*lddv + lddv*NE; // lddv*Vblksiz; magma_int_t dworksiz = ne_loc*Vblksiz; // lddv*Vblksiz; // NE*Vblksiz; ngpu = min(ngpu, magma_ceildiv(NE,ne_loc)); // Don't use GPU that will not have data. // copy dE to GPUs for (dev=0; dev < ngpu; ++dev) { magma_setdevice( dev ); if (MAGMA_SUCCESS != magma_zmalloc( &dE[dev], ldde * ne_loc)) { printf ("!!!! magma_zbulge_applyQ magma_alloc failed for: dE\n" ); exit(-1); } if (MAGMA_SUCCESS != magma_zmalloc( &dwork[dev], 2*dworksiz + 2*dwVTsiz + 2*Vchunksiz* (Vblksiz* (lddv+lddt)) )) { printf ("!!!! magma_zbulge_applyQ magma_alloc failed for: dwork\n" ); exit(-1); } dwork0[dev] = dwork[dev]; // size = dworksiz; dwork1[dev] = dwork0[dev] + dworksiz; // size = dworksiz; dwvt0[dev] = dwork[dev] + 2*dworksiz; // size = dwVTsiz; dwvt1[dev] = dwvt0[dev] + dwVTsiz; // size = dwVTsiz; dV0[dev] = dwork[dev] + 2*dworksiz + 2*dwVTsiz; dT0[dev] = dV0[dev] + Vchunksiz*Vblksiz*lddv; dV1[dev] = dT0[dev] + Vchunksiz*Vblksiz*lddt; dT1[dev] = dV1[dev] + Vchunksiz*Vblksiz*lddv; magma_int_t ie_loc = min(ne_loc, NE - ne_loc*dev); magma_zsetmatrix_async( N, ie_loc, E+lde*ne_loc*dev, lde, dE(dev, 0, 0), ldde, streams[dev][1] ); } // make overlapped copy magma_int_t ncpy = 0; magma_int_t copyed=0, copyst=0; magma_int_t blkcnt,nothing, mysiz, flip, vld,tld, locpos; findVTsiz(N, NB, Vblksiz, &blkcnt, ¬hing); flip = 0; /* SIDE LEFT meaning apply E = Q*E = (q_1*q_2*.....*q_n) * E ==> so traverse Vs in reverse order (forward) from q_n to q_1 * Also E is splitten by row meaning each apply consist in a block of row (horizontal block) */ /* SIDE RIGHT meaning apply E = E*Q = E * (q_1*q_2*.....*q_n) ==> so tarverse Vs in normal order (forward) from q_1 to q_n * Also E is splitten by col meaning each apply consist in a block of col (vertical block) */ #ifdef ENABLE_DEBUG printf(" APPLY Q_v22_m GPU with NGPU %d N %d, NE %d, NB %d, Vblksiz %d, versionL %d versionR %d SIDE %c \n", ngpu, N, NE, NB, Vblksiz, versionL, versionR, side); #endif /* * MagmamaLeft */ if (side == MagmaLeft) { /* * Version 113: * loop over the block_col (nt) and for each find the * number of tiles (mt) in this block_col. then loop over mt, find * the size of the V's(Vm,Vn) and apply it to the corresponding * portion of E. */ if ( versionL == 113 ) { nt = magma_ceildiv((N-1),Vblksiz); for (blkj=nt-1; blkj >= 0; blkj--) { /* the index of the first row on the top of block (blkj) */ firstrow = blkj * Vblksiz + 1; /*find the number of tile for this block */ if ( blkj == nt-1 ) mt = magma_ceildiv( N - firstrow, NB); else mt = magma_ceildiv( N - (firstrow+1), NB); /*loop over the tiles find the size of the Vs and apply it */ for (blki=mt; blki > 0; blki--) { /*calculate the size of each losange of Vs= (Vm,Vn)*/ myrow = firstrow + (mt-blki)*NB; mycol = blkj*Vblksiz; Vm = min( NB+Vblksiz-1, N-myrow); if ( ( blkj == nt-1 ) && ( blki == mt ) ) { Vn = min (Vblksiz, Vm); } else { Vn = min (Vblksiz, Vm-1); } /*calculate the pointer to the Vs and the Ts. * Note that Vs and Ts have special storage done * by the bulgechasing function*/ //printf("voici blkj %d blki %d Vm %d Vn %d mycol %d vpos %d \n",blkj,blki,Vm, Vn,mycol,vpos); magma_bulge_findpos113(N, NB, Vblksiz, mycol, myrow, &blkid); // COPY Vchunksiz Vs and Vchunksiz Ts to GPU and store it in dV0/dV1 and dT0/dT1 if (ncpy == 0) { // flip = 1 for this. copyst = 0; // meaning that copy will start copying from blkid =copyst copyed = min(copyst+Vchunksiz, blkcnt); // meaning that copy will end copying at blkid =copyed-1==> next copy had to start at copyed mysiz = copyed-copyst; // the size of the chunk to be copied if (mysiz > 0) { ncpy = 1; flip = 1; vpos = copyst*Vblksiz*ldv; tpos = copyst*Vblksiz*ldt; vld = mysiz * ldv; tld = mysiz * ldt; for( dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magmablasSetKernelStream( streams[ dev ][ 1 ] ); magma_zsetmatrix_async(vld, Vblksiz, V(vpos), vld, dV1[dev], vld, streams[dev][1]); magma_zsetmatrix_async(tld, Vblksiz, T(tpos), tld, dT1[dev], tld, streams[dev][1]); } //printf("doing the first copy of mysiz %2d copyst %2d copyed %2d vpos %8d tpos %8d into dV1 dT1\n",mysiz,copyst,copyed,vpos,tpos); } } if (blkid == copyst) { flip = ncpy % 2; copyst = copyed; // meaning that copy will start copying from blkid =copyst copyed = min(copyst+Vchunksiz, blkcnt); // meaning that copy will end copying at blkid =copyed-1==> next copy had to start at copyed mysiz = copyed-copyst; // the size of the chunk to be copied //printf(" get to copy blkid %d blkid+(2*Vchunksiz) %d copyst %d copyed %d\n",blkid,blkid+(Vchunksiz),copyst,copyed); if (mysiz > 0) { ncpy = ncpy + 1; vpos = copyst*Vblksiz*ldv; tpos = copyst*Vblksiz*ldt; vld = mysiz * ldv; tld = mysiz * ldt; if (flip == 0) { // now I am working on dV0 so copy the next and put it on dV1 //printf("doing overlapping copy of mysiz %2d copyst %2d copyed %2d vpos %8d tpos %8d into dV1 dT1\n",mysiz,copyst,copyed,vpos,tpos); for( dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magmablasSetKernelStream( streams[ dev ][ 1 ] ); magma_zsetmatrix_async(vld, Vblksiz, V(vpos), vld, dV1[dev], vld, streams[dev][1]); magma_zsetmatrix_async(tld, Vblksiz, T(tpos), tld, dT1[dev], tld, streams[dev][1]); } } else { // now I am working on dV1 so copy the next and put it on dV0 //printf("doing overlapping copy of mysiz %2d copyst %2d copyed %2d vpos %8d tpos %8d into dV0 dT0\n",mysiz,copyst,copyed,vpos,tpos); for( dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magmablasSetKernelStream( streams[ dev ][ 0 ] ); magma_zsetmatrix_async(vld, Vblksiz, V(vpos), vld, dV0[dev], vld, streams[dev][0]); magma_zsetmatrix_async(tld, Vblksiz, T(tpos), tld, dT0[dev], tld, streams[dev][0]); } } } } if ((Vm > 0) && (Vn > 0)) { locpos = blkid%Vchunksiz; magma_int_t lcvpos = locpos*Vblksiz*lddv; magma_int_t lctpos = locpos*Vblksiz*lddt; //printf("voici blkj %d blki %d Vm %d Vn %d mycol %d locvpos %5d loctpos %5d blkid %2d using data in dV%1d dT%1d \n",blkj,blki,Vm, Vn,mycol,lcvpos,lctpos, blkid,flip,flip); if (flip == 0) { for( dev = 0; dev < ngpu; ++dev ) { magma_int_t ie_loc = min(ne_loc, NE - ne_loc*dev); magma_int_t nr_bl = magma_ceildiv(ie_loc,10000); //nr of blocks magma_int_t sz_bl = magma_ceildiv(ie_loc,nr_bl*64)*64; //maximum size of blocks (to have blocks of around the same size and multiple of 64) magma_int_t ib; //size of current block magma_setdevice( dev ); magmablasSetKernelStream(streams[dev][0]); magma_queue_wait_event( streams[dev][0], myevent[dev][1] ); for (magma_int_t i=0; i < ie_loc; i += sz_bl) { ib = min(sz_bl, ie_loc-i); //magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, Vm, ib, Vn, dV0[dev]+lcvpos, lddv, dT0[dev]+lctpos, lddt, dE(dev,myrow,i), ldde, dwork0[dev], ib); magma_zlarfb_gpu_gemm( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, Vm, ib, Vn, dV0[dev]+lcvpos, lddv, dT0[dev]+lctpos, lddt, dE(dev,myrow,i), ldde, dwork0[dev], ib, dwvt0[dev], Vm); } magma_event_record( myevent[dev][0], streams[dev][0] ); } } else { for( dev = 0; dev < ngpu; ++dev ) { magma_int_t ie_loc = min(ne_loc, NE - ne_loc*dev); magma_int_t nr_bl = magma_ceildiv(ie_loc,10000); //nr of blocks magma_int_t sz_bl = magma_ceildiv(ie_loc,nr_bl*64)*64; //maximum size of blocks (to have blocks of around the same size and multiple of 64) magma_int_t ib; //size of current block magma_setdevice( dev ); magmablasSetKernelStream(streams[dev][1]); magma_queue_wait_event( streams[dev][1], myevent[dev][0] ); for (magma_int_t i=0; i < ie_loc; i += sz_bl) { ib = min(sz_bl, ie_loc-i); //magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, Vm, ib, Vn, dV1[dev]+lcvpos, lddv, dT1[dev]+lctpos, lddt, dE(dev,myrow,i), ldde, dwork1[dev], ib); magma_zlarfb_gpu_gemm( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, Vm, ib, Vn, dV1[dev]+lcvpos, lddv, dT1[dev]+lctpos, lddt, dE(dev,myrow,i), ldde, dwork1[dev], ib, dwvt1[dev], Vm); } magma_event_record( myevent[dev][1], streams[dev][1] ); } } } // end for (Vm &Vn) > 0 } // end for blki } // end for blkj } // end if version=113 /* * Version 114: * loop over the block_row (mt) and for each find diagonally the * number of tiles (nt) in this block_row. then loop over nt, find * the size of the V's(Vm,Vn) and apply it to the corresponding * portion of E. */ else { printf("versionL 114 not implemented in zbulge_applyQ_v2_m\n"); exit(-1); mt = magma_ceildiv((N-1),NB); for (blki = mt; blki > 0; blki--) { /* nbcolinvolvd = number of column corresponding to this block_row (blki) */ nbcolinvolvd = min(N-1, blki*NB); /*find the number of tile for this block (diagonal row of tiles) */ nt = magma_ceildiv(nbcolinvolvd,Vblksiz); /*loop over the tiles find the size of the Vs and apply it */ for (blkj = nt-1; blkj >= 0; blkj--) { /* the index of the first row of the first col meaning * the block on the top left (blki) */ firstrow = (mt-blki)*NB+1; /*calculate the size of each losange of Vs= (Vm,Vn)*/ myrow = firstrow + blkj*Vblksiz; mycol = blkj*Vblksiz; Vm = min( NB+Vblksiz-1, N-myrow); if ( ( blkj == nt-1 ) && ( blki == mt ) ) { Vn = min (Vblksiz, Vm); } else { Vn = min (Vblksiz, Vm-1); } if ((Vm > 0) && (Vn > 0)) { /*calculate the pointer to the Vs and the Ts. * Note that Vs and Ts have special storage done * by the bulgechasing function*/ /* magma_bulge_findVTpos(N, NB, Vblksiz, mycol, myrow, ldv, ldt, &vpos, &tpos); magma_zsetmatrix_async(Vm, Vn, V(vpos), ldv, dV0, lddv, NULL); magma_zsetmatrix_async(Vn, Vn, T(tpos), ldt, dT0, lddt, NULL); //printf("voici blki %d rownbm %d mycol %d coled %d blkid %d vpos %d tpos %d\n", blki, rownbm, mycol, coled, blkid, vpos, tpos); for (magma_int_t i=0; i < NE; i += sz_bl) { ib = min(sz_bl, NE-i); magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, Vm, ib, Vn, dV0, lddv, dT0, lddt, dE(myrow,i), ldde, dwork, NE); } */ } // end for (Vm &Vn) > 0 } // end for blkj } // end for blki } // end version 114 } // end LEFT /* * MagmaRight */ else { printf("Side 'R' not implemented in zbulge_applyQ_v2_m\n"); exit(-1); /* * Version 91: */ if ( versionR == 91 ) { nt = magma_ceildiv((N-1),Vblksiz); for (blkj=0; blkj < nt; blkj++) { /* the index of the first myrow on the top of block (blkj) */ firstrow = blkj * Vblksiz + 1; /*find the number of tile for this block */ if ( blkj == nt-1 ) mt = magma_ceildiv( N - firstrow, NB); else mt = magma_ceildiv( N - (firstrow+1), NB); /*loop over the tiles find the size of the Vs and apply it */ for (blki=1; blki <= mt; blki++) { /*calculate the size of each losange of Vs= (Vm,Vn)*/ myrow = firstrow + (mt-blki)*NB; Vm = min( NB+Vblksiz-1, N-myrow); if ( (blkj == nt-1) && (blki == mt) ) { Vn = min (Vblksiz, Vm); } else { Vn = min (Vblksiz, Vm-1); } mycol = blkj*Vblksiz; if ((Vm > 0) && (Vn > 0)) { /*calculate the pointer to the Vs and the Ts. * Note that Vs and Ts have special storage done * by the bulgechasing function*/ /* magma_bulge_findVTpos(N, NB, Vblksiz, mycol, myrow, ldv, ldt, &vpos, &tpos); magma_zsetmatrix_async(Vm, Vn, V(vpos), ldv, dV0, lddv, NULL); magma_zsetmatrix_async(Vn, Vn, T(tpos), ldt, dT0, lddt, NULL); magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, NE, Vm, Vn, dV0, lddv, dT0, lddt, dE(0, myrow), ldde, dwork, NE); */ } // end for (Vm &Vn) > 0 } // end for blki } // end fo blkj } // end of version 91 /* * Version 92: */ else { mt = magma_ceildiv((N-1),NB); for (blki = 1; blki <= mt; blki++) { /* nbcolinvolvd = number of column corresponding to this block_row (blki) */ nbcolinvolvd = min(N-1, blki*NB); /*find the number of tile for this block (diagonal row of tiles) */ nt = magma_ceildiv(nbcolinvolvd,Vblksiz); /*loop over the tiles find the size of the Vs and apply it */ for (blkj = 0; blkj < nt; blkj++) { /* the index of the first row of the first col meaning * the block on the top left (blki) */ firstrow = (mt-blki)*NB+1; /*calculate the size of each losange of Vs= (Vm,Vn)*/ myrow = firstrow + blkj*Vblksiz; mycol = blkj*Vblksiz; Vm = min( NB+Vblksiz-1, N-myrow); if ( ( blkj == nt-1 ) && ( blki == mt ) ) { Vn = min (Vblksiz, Vm); } else { Vn = min (Vblksiz, Vm-1); } if ((Vm > 0) && (Vn > 0)) { /*calculate the pointer to the Vs and the Ts. * Note that Vs and Ts have special storage done * by the bulgechasing function*/ /* magma_bulge_findVTpos(N, NB, Vblksiz, mycol, myrow, ldv, ldt, &vpos, &tpos); magma_zsetmatrix_async(Vm, Vn, V(vpos), ldv, dV0, lddv, NULL); magma_zsetmatrix_async(Vn, Vn, T(tpos), ldt, dT0, lddt, NULL); magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, NE, Vm, Vn, dV0, lddv, dT0, lddt, dE(0, myrow), ldde, dwork, NE); */ } // end for (Vm &Vn) > 0 } //end for blkj } // end for blki } //end of version 92 } // end RIGHT // copy back the dE form each GPU for( dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magmablasSetKernelStream(streams[dev][0]); magma_queue_wait_event( streams[dev][0], myevent[dev][1] ); magma_queue_wait_event( streams[dev][0], myevent[dev][0] ); magma_int_t ie_loc = min(ne_loc, NE - ne_loc*dev); magma_zgetmatrix_async( N, ie_loc, dE(dev, 0, 0), ldde, E+lde*ne_loc*dev, lde, streams[dev][0] ); magma_event_record( myevent[dev][0], streams[dev][0] ); } for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magmablasSetKernelStream(streams[dev][0]); magma_queue_wait_event( streams[dev][0], myevent[dev][0] ); magma_device_sync(); // no need for synchronize magma_free(dwork[dev]); magma_free(dE[dev]); for( magma_int_t i = 0; i < nbevents; ++i ) { magma_event_destroy( myevent[dev][i] ); } for( magma_int_t i = 0; i < nstream; ++i ) { magma_queue_destroy( streams[dev][i] ); } } magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); return *info; }
extern "C" magma_int_t magma_ssytrd_sb2st(magma_int_t threads, char uplo, magma_int_t n, magma_int_t nb, magma_int_t Vblksiz, float *A, magma_int_t lda, float *D, float *E, float *V, magma_int_t ldv, float *TAU, magma_int_t compT, float *T, magma_int_t ldt) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= Arguments ========= THREADS (input) INTEGER Specifies the number of pthreads used. THREADS > 0 UPLO (input) CHARACTER*1 = 'U': Upper triangles of A is stored; = 'L': Lower triangles of A is stored. N (input) INTEGER The order of the matrix A. N >= 0. NB (input) INTEGER The order of the band matrix A. N >= NB >= 0. VBLKSIZ (input) INTEGER The size of the block of householder vectors applied at once. A (input/workspace) REAL array, dimension (LDA, N) On entry the band matrix stored in the following way: LDA (input) INTEGER The leading dimension of the array A. LDA >= 2*NB. D (output) DOUBLE array, dimension (N) The diagonal elements of the tridiagonal matrix T: D(i) = A(i,i). E (output) DOUBLE array, dimension (N-1) The off-diagonal elements of the tridiagonal matrix T: E(i) = A(i,i+1) if UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'. V (output) REAL array, dimension (BLKCNT, LDV, VBLKSIZ) On exit it contains the blocks of householder reflectors BLKCNT is the number of block and it is returned by the funtion MAGMA_BULGE_GET_BLKCNT. LDV (input) INTEGER The leading dimension of V. LDV > NB + VBLKSIZ + 1 TAU (output) REAL dimension(BLKCNT, VBLKSIZ) ??? COMPT (input) INTEGER if COMPT = 0 T is not computed if COMPT = 1 T is computed T (output) REAL dimension(LDT *) if COMPT = 1 on exit contains the matrices T needed for Q2 if COMPT = 0 T is not referenced LDT (input) INTEGER The leading dimension of T. LDT > Vblksiz INFO (output) INTEGER ???????????????????????????????????????????????????????????????????????????????????? = 0: successful exit ===================================================================== */ #ifdef ENABLE_TIMER real_Double_t timeblg=0.0; #endif //char uplo_[2] = {uplo, 0}; magma_int_t mklth = threads; magma_int_t INgrsiz=1; magma_int_t blkcnt = magma_bulge_get_blkcnt(n, nb, Vblksiz); magma_int_t nbtiles = magma_ceildiv(n, nb); memset(T, 0, blkcnt*ldt*Vblksiz*sizeof(float)); memset(TAU, 0, blkcnt*Vblksiz*sizeof(float)); memset(V, 0, blkcnt*ldv*Vblksiz*sizeof(float)); magma_int_t* prog; magma_malloc_cpu((void**) &prog, (2*nbtiles+threads+10)*sizeof(magma_int_t)); memset(prog, 0, (2*nbtiles+threads+10)*sizeof(magma_int_t)); magma_sbulge_id_data* arg; magma_malloc_cpu((void**) &arg, threads*sizeof(magma_sbulge_id_data)); pthread_t* thread_id; magma_malloc_cpu((void**) &thread_id, threads*sizeof(pthread_t)); pthread_attr_t thread_attr; magma_setlapack_numthreads(1); magma_sbulge_data data_bulge(threads, n, nb, nbtiles, INgrsiz, Vblksiz, compT, A, lda, V, ldv, TAU, T, ldt, prog); // Set one thread per core pthread_attr_init(&thread_attr); pthread_attr_setscope(&thread_attr, PTHREAD_SCOPE_SYSTEM); pthread_setconcurrency(threads); //timing #ifdef ENABLE_TIMER timeblg = magma_wtime(); #endif // Launch threads for (magma_int_t thread = 1; thread < threads; thread++) { arg[thread] = magma_sbulge_id_data(thread, &data_bulge); pthread_create(&thread_id[thread], &thread_attr, magma_ssytrd_sb2st_parallel_section, &arg[thread]); } arg[0] = magma_sbulge_id_data(0, &data_bulge); magma_ssytrd_sb2st_parallel_section(&arg[0]); // Wait for completion for (magma_int_t thread = 1; thread < threads; thread++) { void *exitcodep; pthread_join(thread_id[thread], &exitcodep); } // timing #ifdef ENABLE_TIMER timeblg = magma_wtime()-timeblg; printf(" time BULGE+T = %f \n" ,timeblg); #endif magma_free_cpu(thread_id); magma_free_cpu(arg); magma_free_cpu(prog); magma_setlapack_numthreads(mklth); /*================================================ * store resulting diag and lower diag D and E * note that D and E are always real *================================================*/ /* Make diagonal and superdiagonal elements real, * storing them in D and E */ /* In real case, the off diagonal element are * not necessary real. we have to make off-diagonal * elements real and copy them to E. * When using HouseHolder elimination, * the SLARFG give us a real as output so, all the * diagonal/off-diagonal element except the last one are already * real and thus we need only to take the abs of the last * one. * */ #if defined(PRECISION_z) || defined(PRECISION_c) if(uplo==MagmaLower){ for (magma_int_t i=0; i < n-1 ; i++) { D[i] = MAGMA_S_REAL(A[i*lda ]); E[i] = MAGMA_S_REAL(A[i*lda+1]); } D[n-1] = MAGMA_S_REAL(A[(n-1)*lda]); } else { /* MagmaUpper not tested yet */ for (magma_int_t i=0; i<n-1; i++) { D[i] = MAGMA_S_REAL(A[i*lda+nb]); E[i] = MAGMA_S_REAL(A[i*lda+nb-1]); } D[n-1] = MAGMA_S_REAL(A[(n-1)*lda+nb]); } /* end MagmaUpper */ #else if( uplo == MagmaLower ){ for (magma_int_t i=0; i < n-1; i++) { D[i] = A[i*lda]; // diag E[i] = A[i*lda+1]; //lower diag } D[n-1] = A[(n-1)*lda]; } else { for (magma_int_t i=0; i < n-1; i++) { D[i] = A[i*lda+nb]; // diag E[i] = A[i*lda+nb-1]; //lower diag } D[n-1] = A[(n-1)*lda+nb]; } #endif return MAGMA_SUCCESS; }
static void magma_dtile_bulge_applyQ( magma_int_t core_id, magma_side_t side, magma_int_t n_loc, magma_int_t n, magma_int_t nb, magma_int_t Vblksiz, double *E, magma_int_t lde, double *V, magma_int_t ldv, double *TAU, double *T, magma_int_t ldt) //, magma_int_t* info) { //%=========================== //% local variables //%=========================== magma_int_t firstcolj; magma_int_t bg, rownbm; magma_int_t st,ed,fst,vlen,vnb,colj; magma_int_t vpos,tpos; magma_int_t cur_blksiz,avai_blksiz, ncolinvolvd; magma_int_t nbgr, colst, coled; if (n <= 0) return; if (n_loc <= 0) return; //info = 0; magma_int_t INFO=0; magma_int_t nbGblk = magma_ceildiv(n-1, Vblksiz); /* * version v1: for each chunck it apply all the V's then move to * the other chunck. the locality here inside each * chunck meaning that thread t apply V_k then move * to V_k+1 which overlap with V_k meaning that the * E_k+1 overlap with E_k. so here is the * locality however thread t had to read V_k+1 and * T_k+1 at each apply. note that all thread if they * run at same speed they might reading the same V_k * and T_k at the same time. * */ magma_int_t nb_loc = 128; //$$$$$$$$ magma_int_t lwork = 2*nb_loc*max(Vblksiz,64); double *work, *work2; magma_dmalloc_cpu(&work, lwork); magma_dmalloc_cpu(&work2, lwork); magma_int_t nbchunk = magma_ceildiv(n_loc, nb_loc); /* SIDE LEFT meaning apply E = Q*E = (q_1*q_2*.....*q_n) * E ==> so traverse Vs in reverse order (forward) from q_n to q_1 * each q_i consist of applying V to a block of row E(row_i,:) and applies are overlapped meaning * that q_i+1 overlap a portion of the E(row_i, :). * IN parallel E is splitten in vertical block over the threads */ /* SIDE RIGHT meaning apply E = E*Q = E * (q_1*q_2*.....*q_n) ==> so tarverse Vs in normal order (forward) from q_1 to q_n * each q_i consist of applying V to a block of col E(:, col_i,:) and the applies are overlapped meaning * that q_i+1 overlap a portion of the E(:, col_i). * IN parallel E is splitten in horizontal block over the threads */ #ifdef ENABLE_DEBUG if ((core_id == 0) || (core_id == 1)) printf(" APPLY Q2_cpu dbulge_back N %d N_loc %d nbchunk %d NB %d Vblksiz %d SIDE %c \n", n, n_loc, nbchunk, nb, Vblksiz, side); #endif for (magma_int_t i = 0; i < nbchunk; i++) { magma_int_t ib_loc = min(nb_loc, (n_loc - i*nb_loc)); if (side == MagmaLeft) { for (bg = nbGblk; bg > 0; bg--) { firstcolj = (bg-1)*Vblksiz + 1; rownbm = magma_ceildiv((n-(firstcolj+1)),nb); if (bg == nbGblk) rownbm = magma_ceildiv((n-(firstcolj)),nb); // last blk has size=1 used for real to handle A(N,N-1) for (magma_int_t j = rownbm; j > 0; j--) { vlen = 0; vnb = 0; colj = (bg-1)*Vblksiz; // for k=0; I compute the fst and then can remove it from the loop fst = (rownbm -j)*nb+colj +1; for (magma_int_t k=0; k < Vblksiz; k++) { colj = (bg-1)*Vblksiz + k; st = (rownbm -j)*nb+colj +1; ed = min(st+nb-1,n-1); if (st > ed) break; if ((st == ed) && (colj != n-2)) break; vlen = ed-fst+1; vnb = k+1; } colst = (bg-1)*Vblksiz; magma_bulge_findVTpos(n, nb, Vblksiz, colst, fst, ldv, ldt, &vpos, &tpos); if ((vlen > 0) && (vnb > 0)) { lapackf77_dlarfb( "L", "N", "F", "C", &vlen, &ib_loc, &vnb, V(vpos), &ldv, T(tpos), &ldt, E(fst,i*nb_loc), &lde, work, &ib_loc); } if (INFO != 0) printf("ERROR DORMQR INFO %d \n", (int) INFO); } } } else if (side == MagmaRight) { rownbm = magma_ceildiv((n-1),nb); for (magma_int_t k = 1; k <= rownbm; k++) { ncolinvolvd = min(n-1, k*nb); avai_blksiz = min(Vblksiz,ncolinvolvd); nbgr = magma_ceildiv(ncolinvolvd,avai_blksiz); for (magma_int_t j = 1; j <= nbgr; j++) { vlen = 0; vnb = 0; cur_blksiz = min(ncolinvolvd-(j-1)*avai_blksiz, avai_blksiz); colst = (j-1)*avai_blksiz; coled = colst + cur_blksiz -1; fst = (rownbm -k)*nb+colst +1; for (colj=colst; colj <= coled; colj++) { st = (rownbm -k)*nb+colj +1; ed = min(st+nb-1,n-1); if (st > ed) break; if ((st == ed) && (colj != n-2)) break; vlen = ed-fst+1; vnb = vnb+1; } magma_bulge_findVTpos(n, nb, Vblksiz, colst, fst, ldv, ldt, &vpos, &tpos); if ((vlen > 0) && (vnb > 0)) { lapackf77_dlarfb( "R", "N", "F", "C", &ib_loc, &vlen, &vnb, V(vpos), &ldv, T(tpos), &ldt, E(i*nb_loc,fst), &lde, work, &ib_loc); } } } } else { printf("ERROR SIDE %d \n",side); } } // END loop over the chunks magma_free_cpu(work); magma_free_cpu(work2); }
static void magma_stile_bulge_parallel(magma_int_t my_core_id, magma_int_t cores_num, float *A, magma_int_t lda, float *V, magma_int_t ldv, float *TAU, magma_int_t n, magma_int_t nb, magma_int_t nbtiles, magma_int_t grsiz, magma_int_t Vblksiz, volatile magma_int_t *prog) { magma_int_t sweepid, myid, shift, stt, st, ed, stind, edind; magma_int_t blklastind, colpt; magma_int_t stepercol; magma_int_t i,j,m,k; magma_int_t thgrsiz, thgrnb, thgrid, thed; magma_int_t coreid; magma_int_t colblktile,maxrequiredcores,colpercore,mycoresnb; magma_int_t fin; float *work; if(n<=0) return ; if(grsiz<=0) return ; //printf("=================> my core id %d of %d \n",my_core_id, cores_num); /* As I store V in the V vector there are overlap between * tasks so shift is now 4 where group need to be always * multiple of 2, because as example if grs=1 task 2 from * sweep 2 can run with task 6 sweep 1., but task 2 sweep 2 * will overwrite the V of tasks 5 sweep 1 which are used by * task 6, so keep in mind that group need to be multiple of 2, * and thus tasks 2 sweep 2 will never run with task 6 sweep 1. * However, when storing V in A, shift could be back to 3. * */ magma_smalloc_cpu(&work, n); mycoresnb = cores_num; shift = 5; if(grsiz==1) colblktile=1; else colblktile=grsiz/2; maxrequiredcores = nbtiles/colblktile; if(maxrequiredcores<1)maxrequiredcores=1; colpercore = colblktile*nb; if(mycoresnb > maxrequiredcores) mycoresnb = maxrequiredcores; thgrsiz = n; stepercol = magma_ceildiv(shift, grsiz); thgrnb = magma_ceildiv(n-1, thgrsiz); #ifdef ENABLE_DEBUG if(my_core_id==0){ if(cores_num > maxrequiredcores) { printf("==================================================================================\n"); printf(" WARNING only %3d threads are required to run this test optimizing cache reuse\n",maxrequiredcores); printf("==================================================================================\n"); } printf(" Static bulgechasing version v9_9col threads %4d N %5d NB %5d grs %4d thgrsiz %4d \n",cores_num, n, nb, grsiz,thgrsiz); } #endif for (thgrid = 1; thgrid<=thgrnb; thgrid++){ stt = (thgrid-1)*thgrsiz+1; thed = min( (stt + thgrsiz -1), (n-1)); for (i = stt; i <= n-1; i++){ ed=min(i,thed); if(stt>ed)break; for (m = 1; m <=stepercol; m++){ st=stt; for (sweepid = st; sweepid <=ed; sweepid++){ for (k = 1; k <=grsiz; k++){ myid = (i-sweepid)*(stepercol*grsiz) +(m-1)*grsiz + k; if(myid%2 ==0){ colpt = (myid/2)*nb+1+sweepid-1; stind = colpt-nb+1; edind = min(colpt,n); blklastind = colpt; if(stind>=edind){ printf("ERROR---------> st>=ed %d %d \n\n", (int) stind, (int) edind); exit(-10); } }else{ colpt = ((myid+1)/2)*nb + 1 +sweepid -1 ; stind = colpt-nb+1; edind = min(colpt,n); if( (stind>=edind-1) && (edind==n) ) blklastind=n; else blklastind=0; if(stind>edind){ printf("ERROR---------> st>=ed %d %d \n\n", (int) stind, (int) edind); exit(-10); } } coreid = (stind/colpercore)%mycoresnb; if(my_core_id==coreid) { fin=0; while(fin==0) { if(myid==1) { if( (prog[myid+shift-1]== (sweepid-1)) ) { magma_strdtype1cbHLsym_withQ_v2(n, nb, A, lda, V, ldv, TAU, stind, edind, sweepid, Vblksiz, work); fin=1; prog[myid]= sweepid; if(blklastind >= (n-1)) { for (j = 1; j <= shift; j++) prog[myid+j]=sweepid; } } // END progress condition }else{ if( (prog[myid-1]==sweepid) && (prog[myid+shift-1]== (sweepid-1)) ) { if(myid%2 == 0) magma_strdtype2cbHLsym_withQ_v2(n, nb, A, lda, V, ldv, TAU, stind, edind, sweepid, Vblksiz, work); else magma_strdtype3cbHLsym_withQ_v2(n, nb, A, lda, V, ldv, TAU, stind, edind, sweepid, Vblksiz, work); fin=1; prog[myid]= sweepid; if(blklastind >= (n-1)) { for (j = 1; j <= shift+mycoresnb; j++) prog[myid+j]=sweepid; } } // END progress condition } // END if myid==1 } // END while loop } // END if my_core_id==coreid if(blklastind >= (n-1)) { stt=stt+1; break; } } // END for k=1:grsiz } // END for sweepid=st:ed } // END for m=1:stepercol } // END for i=1:n-1 } // END for thgrid=1:thgrnb magma_free_cpu(work); } // END FUNCTION