void magmablas_ssymm_mgpu_com( magma_side_t side, magma_uplo_t uplo, magma_int_t m, magma_int_t n, float alpha, float *dA[], magma_int_t ldda, magma_int_t offset, float *dB[], magma_int_t lddb, float beta, float *dC[], magma_int_t lddc, float *dwork[], magma_int_t dworksiz, float *C, magma_int_t ldc, float *work[], magma_int_t worksiz, magma_int_t ngpu, magma_int_t nb, magma_queue_t streams[][20], magma_int_t nstream, magma_event_t redevents[][MagmaMaxGPUs*MagmaMaxGPUs+10], magma_int_t nbevents, magma_int_t gnode[MagmaMaxGPUs][MagmaMaxGPUs+2], magma_int_t nbcmplx ) { #define dA(dev, i, j) (dA[dev] + (i) + (j)*ldda) #define dB(dev, i, j) (dB[dev] + (i) + (j)*lddb) #define dC(dev, i, j) (dC[dev] + (i) + (j)*lddc) #define dwork(dev, i, j) (dwork[dev] + (i) + (j)*lddwork) #define C(i, j) (C + (i) + (j)*ldc) //printf("####################################################\n"); //printf(" start ssymm \n"); //printf("####################################################\n"); if ( side != MagmaLeft || uplo != MagmaLower ) { fprintf( stderr, "%s: only Left Lower implemented\n", __func__ ); } assert( ldda >= m ); assert( lddb >= m ); assert( lddc >= m ); assert( nstream >= ngpu ); assert( nbevents >= ngpu*ngpu ); float c_one = MAGMA_S_ONE; float *dwork1[MagmaMaxGPUs]; float *dwork2[MagmaMaxGPUs]; magma_int_t maxgsize = n*m; magma_int_t lddwork = lddc; magma_int_t ldwork = m; for( magma_int_t dev = 0; dev < ngpu; ++dev ) { dwork1[dev] = dwork[dev]; // size of dwork1 is n*lddwork dwork2[dev] = dwork[dev]+n*lddwork; // size of dwork2 is maxgsize*ngpu } assert( dworksiz >= (n*lddwork+maxgsize*ngpu) ); assert( worksiz >= (n*ldwork) ); magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_t cstream; magmablasGetKernelStream(&cstream); magma_int_t dev, devperm, myblk, mycolsize, myblkoffst; magma_int_t gmaster; magma_int_t masterdev, lcdev, lccolsize, myngpu; magma_int_t stdev = (offset/nb)%ngpu; magma_int_t blockoffset = offset % nb; magma_int_t fstblksiz = 0; if(blockoffset>0){ fstblksiz = min(m, (nb - blockoffset)); } //magma_int_t nbblk = magma_ceildiv(m, nb); magma_int_t nbblk = magma_ceildiv((m+blockoffset), nb); magma_int_t remm = m- fstblksiz; magma_int_t nbblkoffst = offset/nb; magma_int_t nblstblks = -1; magma_int_t devlstblk = -1; magma_int_t lstblksiz = remm%nb; if(lstblksiz>0){ nblstblks = nbblk%ngpu; devlstblk = (nblstblks-1+ngpu)%ngpu; } magma_int_t nbcmplxactive = 0; magma_int_t cmplxisactive[MagmaMaxGPUs]; magma_int_t gpuisactive[MagmaMaxGPUs]; memset(gpuisactive, 0, MagmaMaxGPUs*sizeof(magma_int_t)); memset(cmplxisactive, 0, MagmaMaxGPUs*sizeof(magma_int_t)); for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magmablasSetKernelStream( streams[ dev ][ 0 ] ); cudaMemset(dwork(dev,0,0), 0, (lddwork)*(n)*sizeof(float) ); // put all dC on all dev to 0 except the one which // hold i==0 because this one has to multiply by beta. if(dev!=stdev){ cudaMemset(dC(dev,0,0), 0, (lddc)*(n)*sizeof(float) ); } } magma_int_t newoffset = offset; // 1. 
// symmetrize
    if(blockoffset>0){
        newoffset = offset+fstblksiz; // newoffset is adjusted over nb
        magma_int_t myblkoffst = (nbblkoffst/ngpu)+(nbblkoffst%ngpu > stdev?1:0);
        //printf("STDEV %d voici offset %d remm %d myblockoffset %d siz %d \n", stdev, offset, remm, myblkoffst, fstblksiz);
        magma_setdevice( stdev );
        magmablasSetKernelStream( streams[ stdev ][ 0 ] );
        magmablas_ssymmetrize_tiles( MagmaLower, fstblksiz, dA(stdev, offset, myblkoffst*nb+blockoffset), ldda, 1, ngpu*nb, nb );
    }

    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_int_t newstdev   = (newoffset/nb)%ngpu;
        magma_int_t nbblk      = remm/nb; // number of blocks of size nb; if m%nb > 0 a last partial block of size ib = m%nb exists
        magma_int_t myblk      = (nbblk/ngpu) + (nbblk%ngpu > ((dev-newstdev+ngpu)%ngpu) ? 1:0 );
        magma_int_t devperm    = (dev-newstdev+ngpu)%ngpu;
        magma_int_t nbblkoffst = newoffset/nb;
        magma_int_t myblkoffst = (nbblkoffst/ngpu)+(nbblkoffst%ngpu > dev?1:0);
        //printf("dev %d devperm %d newoffset %d rowoff %d coloff %d myblk %d \n", dev, devperm, newoffset, newoffset+devperm*nb, myblkoffst*nb, myblk);
        magma_setdevice( dev );
        magmablasSetKernelStream( streams[ dev ][ 0 ] );
        magmablas_ssymmetrize_tiles( MagmaLower, nb, dA(dev, newoffset+devperm*nb, myblkoffst*nb), ldda, myblk, ngpu*nb, nb );
        if(remm%nb>0){
            magma_int_t nblstblks = (nbblk+1)%ngpu;
            magma_int_t devlstblk = (nblstblks-1+ngpu)%ngpu;
            //printf("==> siz %d devperm %d, devlstblk %d, newoffset+nbblk*nb %d, myblkoffst*nb+ myblk*nb %d\n", remm % nb, devperm, devlstblk, newoffset+nbblk*nb, myblkoffst*nb+ myblk*nb);
            if(devperm==devlstblk)
                magmablas_ssymmetrize( MagmaLower, remm % nb, dA(dev, newoffset+nbblk*nb, myblkoffst*nb+ myblk*nb), ldda );  // last partial tile
        }
    }

    /*
    magma_int_t siz = m+offset;
    float *R;
    magma_smalloc_cpu( &R, siz*siz );
    // collect back A
    magmablas_sgetmatrix_1D_bcyclic( siz, siz, dA, ldda, R, siz, ngpu, nb );
    magma_setdevice( 0 );
    magmablasSetKernelStream( streams[ dev ][ 0 ] );
    //magma_sgetmatrix( siz, siz, dA[0], ldda, R, siz );
    FILE *trace_file;
    trace_file = fopen("AJETE/Aafter", "w");
    for (int j = 0; j < siz ; j++)
        for (int i = 0; i < siz ; i++)
            fprintf(trace_file, "%10d%10d%40.30e\n", i+1, j+1, R[j*siz+i]);
    fclose(trace_file);
    return;
    */

    // ROW GEMM: transpose a row and make a gemm with a block.
    // If only 1 GPU is used, the ROW GEMM is integrated with the
    // COL GEMM (better accuracy observed) and better performance.
    if(ngpu>1){
        for( magma_int_t i = fstblksiz; i < m; i += nb ) {
            magma_int_t ib   = min( nb, m-i );   // block size
            magma_int_t ioff = i + offset;       // start global index in parent matrix
            //magma_int_t dev = (ioff / nb) % ngpu;
            magma_int_t nbblkoffst = offset/nb;
            magma_int_t nbblk      = magma_ceildiv(i, nb);
            for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
                magma_int_t myblk = (nbblk/ngpu) + (nbblk%ngpu > ((dev-stdev+ngpu)%ngpu) ?
1:0 ); magma_int_t myblkoffst = (nbblkoffst/ngpu)+(nbblkoffst%ngpu > dev?1:0); magma_int_t myrowsize = myblk * nb; magma_int_t coloffset = myblkoffst*nb; if(dev==stdev) { myrowsize = myrowsize -blockoffset; coloffset = myblkoffst*nb+blockoffset; } //printf("ROW GEMM: voici i %d ib %d ioff %d nbblkoffst %d stdev %d dev %d myblk %d myblkoffset %d coloffset %d rowsize %d\n", i, ib, ioff, nbblkoffst, stdev, dev, myblk, myblkoffst, coloffset, myrowsize); if(myrowsize>0){ magma_setdevice( dev ); magmablasSetKernelStream( streams[ dev ][ 1 ] ); magma_sgemm( MagmaConjTrans, MagmaNoTrans, myrowsize, n, ib, alpha, dA(dev,ioff,coloffset), ldda, dB(dev,i,0), lddb, c_one, dwork(dev,0,0), lddwork ); } } } for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_event_record(redevents[dev][1], streams[dev][1]); } } // COL GEMM // blockoffset is offset within first block; for subsequent blocks it is 0 if(blockoffset>0){ magma_int_t ib = min( nb-blockoffset, m ); // block size magma_int_t iblock = (offset / nb) / ngpu; // local block id magma_int_t di = iblock*nb+blockoffset; // local index in parent matrix magma_setdevice( stdev ); magmablasSetKernelStream( streams[ stdev ][ 0 ] ); //printf("DEV %d COL GEMM first ioff %d di %d m %d n %d ib %d \n", stdev, offset, di, m, n, ib); magma_sgemm( MagmaNoTrans, MagmaNoTrans, m, n, ib, alpha, dA(stdev,offset,di), ldda, dB(stdev,0,0), lddb, beta, dC(stdev,0,0), lddc ); } // COL GEMM for( magma_int_t i = fstblksiz; i < m; i += nb ) { magma_int_t ib = min( nb, m-i ); // block size magma_int_t ioff = i + offset; // start global index in parent matrix magma_int_t iblock = (ioff / nb) / ngpu; // local block id magma_int_t dev = (ioff / nb) % ngpu; magma_int_t di = iblock*nb; // local index in parent matrix //printf("DEV %d COL GEMM i %d ioff %d di %d m-i %d n %d ib %d \n", dev, i, ioff, di, m-i, n, ib); magma_setdevice( dev ); magmablasSetKernelStream( streams[ dev ][ 0 ] ); if(i==0){ magma_sgemm( MagmaNoTrans, MagmaNoTrans, m-i, n, ib, alpha, dA(dev,ioff,di), ldda, dB(dev,i,0), lddb, beta, dC(dev,i,0), lddc ); }else{ magma_sgemm( MagmaNoTrans, MagmaNoTrans, m-i, n, ib, alpha, dA(dev,ioff,di), ldda, dB(dev,i,0), lddb, c_one, dC(dev,i,0), lddc ); } magma_event_record(redevents[dev][0], streams[dev][0]); // if only 1 GPU is used, do the ROW GEMM if(ngpu==1){ // NOTE THAT because the COL gemm write dC below the diagonal (i) // and the ROW GEMM write dC from 0 to diag-1, so they could // run in parallel on different streams. // // NO NO NO because // it might happen that col finished i and strated i+1 while row still at i // magmablasSetKernelStream( streams[ dev ][ 0 ] ); magma_sgemm( MagmaConjTrans, MagmaNoTrans, i, n, ib, alpha, dA(dev,ioff,offset), ldda, dB(dev,i,0), lddb, c_one, dC(dev,0,0), lddc ); } } if(ngpu>1){ for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_int_t nbblk = magma_ceildiv((m+blockoffset), nb); magma_int_t nbblkrow = nbblk-1; magma_int_t devperm = (dev-stdev+ngpu)%ngpu; magma_int_t myblk = (nbblkrow/ngpu) + (nbblkrow%ngpu > devperm ? 
1:0 ); magma_int_t myrowsize = myblk * nb; if(dev==stdev) { myrowsize = myrowsize - blockoffset; } //printf("blockoffset %d nbblkrow %d devperm %d DEV %d RECEIVING myblk %d myrowsize %d\n", blockoffset, nbblkrow, devperm, dev, myblk, myrowsize); if(myrowsize>0){ magma_setdevice( dev ); magmablasSetKernelStream( streams[ dev ][ 0 ] ); magma_queue_wait_event(streams[ dev ][ 0 ], redevents[dev][1]); //magma_queue_sync( streams[ dev ][ 1 ] ); // for each dev add the computed ROW block each on its placment with dC for( magma_int_t blki = 0; blki < myblk; ++blki){ magma_int_t gbblki = (blki*ngpu + devperm)*nb - blockoffset; magma_int_t lcblki = blki*nb; magma_int_t ib = nb;// min(nb, m-gbblki); if(dev==stdev){ lcblki = blki*nb-blockoffset; if(blki==0){ gbblki = 0; lcblki = 0; ib = nb-blockoffset; } } magmablas_sgeadd(ib, n, c_one, &dwork[dev][lcblki], lddwork, &dC[dev][gbblki] , lddc ); } magma_event_record(redevents[dev][0], streams[dev][0]); } } } // =========================================================== // COMMUNICATION ALL_REDUCE_SUM // =========================================================== if(ngpu==1){ return; } // INITIALIZE COMM for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) { masterdev = -1; gnode[cmplxid][MagmaMaxGPUs+1] = -1; myngpu = gnode[cmplxid][MagmaMaxGPUs]; for( magma_int_t idev = 0; idev < myngpu; ++idev ) { dev = gnode[cmplxid][idev]; devperm = (dev-stdev+ngpu)%ngpu; myblk = (nbblk/ngpu) + (nbblk%ngpu > devperm ? 1:0 ); mycolsize = myblk*nb; myblkoffst = nb*((nbblkoffst/ngpu)+(nbblkoffst%ngpu > dev?1:0)); if(dev==stdev){ mycolsize -= blockoffset; myblkoffst += blockoffset; // local index in parent matrix } if((devperm==devlstblk)&&(lstblksiz>0)){ mycolsize -= (nb-(remm%nb)); } mycolsize = min(mycolsize, m); if(mycolsize>0){ gpuisactive[dev] = mycolsize; if(masterdev==-1) { masterdev = dev; nbcmplxactive = nbcmplxactive +1; cmplxisactive[cmplxid] = 1; gnode[cmplxid][MagmaMaxGPUs+1] = masterdev; } } } } /* for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_device_sync(); } */ //******************************* // each GPU send its result // to its master. The master make // the addition and then send to // to the masters of other real // and receive from the masters of // other real make the addition // and broadcast locally the final // result. //******************************* //printf("=======================================================================\n"); //printf(" sending to my master \n"); //printf("=======================================================================\n"); for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) { myngpu = gnode[cmplxid][MagmaMaxGPUs]; masterdev = gnode[cmplxid][MagmaMaxGPUs+1]; //check if real is active if(masterdev!=-1){ for( magma_int_t idev = 0; idev < myngpu; ++idev ) { dev = gnode[cmplxid][idev]; mycolsize = gpuisactive[dev]; if(mycolsize>0){ // I am an active GPU. if I am not the master, then send my result to my master. 
// store result on dwork[masterdev][dev*maxgsize] if(dev!=masterdev){ magma_setdevice( dev ); //printf(" GPU %d sending to my master %d\n", dev, masterdev); // wait the geadd of my ROW and COL GEMM is done magma_queue_wait_event(streams[ dev ][ 0 ], redevents[dev][0]); // sending to the master of my real magma_scopymatrix_async( m, n, &dC[dev][0], lddc, &dwork2[masterdev][maxgsize*dev], m, streams[dev][0] ); magma_event_record(redevents[dev][masterdev], streams[dev][0]); } // end I am not the masterdev }// end if mycolsize>0 }// for idev }// end of if masterdev!=-1 maening real is active }// for cmplxid /* for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_device_sync(); } */ //printf("=======================================================================\n"); //printf(" each master do addition of local result and broadcast to other masters \n"); //printf("=======================================================================\n"); for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) { myngpu = gnode[cmplxid][MagmaMaxGPUs]; masterdev = gnode[cmplxid][MagmaMaxGPUs+1]; //check if real is active if(masterdev!=-1){ magma_setdevice( masterdev ); // addition is done on stream 0 sequentially magmablasSetKernelStream( streams[ masterdev ][ 0 ] ); // wait the geadd of my ROW and COL GEMM is done magma_queue_wait_event(streams[ masterdev ][ 0 ], redevents[masterdev][0]); // ======================================== // local addition // ======================================== for( magma_int_t l = 0; l < myngpu; ++l ) { lcdev = gnode[cmplxid][l]; lccolsize = gpuisactive[lcdev]; if((lcdev!=masterdev)&&(lccolsize>0)){ //printf(" master %d receiving from %d and adding \n", masterdev, lcdev); // this is an active GPU of my real. // wait I received what he send it to me and then do addition. magma_queue_wait_event(streams[ masterdev ][ 0 ], redevents[lcdev][masterdev]); magmablas_sgeadd(m, n, c_one, &dwork2[masterdev][maxgsize*lcdev], m, &dC[masterdev][0] , lddc ); } }// for l=1:myngpu // because addition is done sequentially on stream 0, // I have to record this to be able to synch using it magma_event_record(redevents[masterdev][masterdev], streams[masterdev][0]); // ======================================== // // ======================================== // send to other masters // ======================================== for( magma_int_t k = 0; k < nbcmplx; ++k ) { if(k!=cmplxid){ gmaster = gnode[k][MagmaMaxGPUs+1]; if(gmaster!=-1){ //real is active //Master has to wait until finish the local addition then send using gmaster stream. //use stream 0 to make it sequential or stream gmaster to make it parallel. //Now both re the same. 
//printf(" master %d from cmplx %d sending to other master %d on cmplx %d \n", masterdev, cmplxid, gmaster, k); magma_queue_wait_event(streams[ masterdev ][ gmaster ], redevents[masterdev][masterdev]); magma_scopymatrix_async( m, n, &dC[masterdev][0], lddc, &dwork2[gmaster][maxgsize*masterdev], m, streams[masterdev][gmaster] ); magma_event_record(redevents[masterdev][gmaster], streams[masterdev][gmaster]); magma_event_record(redevents[masterdev][masterdev], streams[masterdev][gmaster]); } // end of gmaster!=-1 } // end of k!=cmplxid }// for k = 0: nbcmplx // ======================================== }// end of if masterdev!=-1 maening real is active }// for cmplxid /* for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_device_sync(); } */ //printf("=======================================================================\n"); //printf(" each master wait receiving other masters results, do the addition and broadcast locally \n"); //printf("=======================================================================\n"); for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) { myngpu = gnode[cmplxid][MagmaMaxGPUs]; masterdev = gnode[cmplxid][MagmaMaxGPUs+1]; //check if real is active if(masterdev!=-1){ magma_setdevice( masterdev ); // addition is done on stream 0 sequentially magmablasSetKernelStream( streams[ masterdev ][ 0 ] ); // master has to wait until finishing all the send to other masters. magma_queue_wait_event(streams[ masterdev ][ 0 ], redevents[masterdev][masterdev]); // ======================================== // addition of results from other masters // ======================================== for( magma_int_t k = 0; k < nbcmplx; ++k ) { if(k!=cmplxid){ gmaster = gnode[k][MagmaMaxGPUs+1]; if(gmaster!=-1){ //real is active //Master has to wait until receiving from gmaster, then do addition using stream 0 //printf(" master %d from cmplx %d receiving from other master %d on cmplx %d and adding \n", masterdev, cmplxid, gmaster, k); magma_queue_wait_event(streams[ masterdev ][ 0 ], redevents[gmaster][masterdev]); magmablas_sgeadd(m, n, c_one, &dwork2[masterdev][maxgsize*gmaster], m, &dC[masterdev][0] , lddc ); } // end of gmaster!=-1 } // end of k!=cmplxid }// for k = 0: nbcmplx // because addition is done sequentially on stream 0, // I have to record this to be able to synch using it magma_event_record(redevents[masterdev][masterdev], streams[masterdev][0]); // ======================================== // ======================================== // local broadcast of final results // ======================================== for( magma_int_t l = 0; l < myngpu; ++l ) { lcdev = gnode[cmplxid][l]; lccolsize = gpuisactive[lcdev]; if((lcdev!=masterdev)&&(lccolsize>0)){ // this is an active GPU of my real. // wait the previous addition is done maening stream 0 is finished and broadcast sequentially for now. 
// to make it parallel put stream lcdev instead of stream 0
                    //printf(" master %d broadcasting local to %d \n", masterdev, lcdev);
                    magma_queue_wait_event(streams[ masterdev ][ 0 ], redevents[masterdev][masterdev]);
                    magma_scopymatrix_async( m, n,
                                             &dC[masterdev][0], lddc,
                                             &dC[lcdev][0],     lddc, streams[masterdev][0] );
                    magma_event_record(redevents[masterdev][lcdev], streams[masterdev][0]);
                }
            }// for l=1:myngpu
            // ========================================
        }// end of if masterdev!=-1 meaning real is active
    }// for cmplxid

    /*
    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_setdevice( dev );
        magma_device_sync();
    }
    */

    for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
        myngpu    = gnode[cmplxid][MagmaMaxGPUs];
        masterdev = gnode[cmplxid][MagmaMaxGPUs+1];
        //check if real is active
        if(masterdev!=-1){
            for( magma_int_t l = 0; l < myngpu; ++l ) {
                lcdev     = gnode[cmplxid][l];
                lccolsize = gpuisactive[lcdev];
                if(lccolsize>0){
                    magma_setdevice( lcdev );
                    magma_queue_wait_event(streams[ lcdev ][ 0 ], redevents[lcdev][0]);
                    magma_queue_wait_event(streams[ lcdev ][ 0 ], redevents[masterdev][lcdev]);
                }
            }// for l=1:myngpu
        }// end of if masterdev!=-1 meaning real is active
    }// for cmplxid

    //printf("****************************************************\n");
    //printf(" finish ssymm \n");
    //printf("****************************************************\n");

    magma_setdevice( cdev );
    magmablasSetKernelStream( cstream );
}
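
/*
 * Illustrative sketch (not part of the MAGMA sources): the 1-D block-cyclic
 * mapping that the COL GEMM loop above relies on.  A global block column that
 * starts at index ioff = i + offset in the parent matrix is owned by device
 * (ioff/nb) % ngpu and lives at local column offset ((ioff/nb)/ngpu)*nb on
 * that device.  The helper name below is hypothetical.
 */
static void block_cyclic_owner_sketch( long ioff, long nb, long ngpu,
                                        long *dev, long *di )
{
    long gblock = ioff / nb;        /* global block index                       */
    *dev = gblock % ngpu;           /* owning GPU, as in "dev = (ioff/nb)%ngpu" */
    *di  = (gblock / ngpu) * nb;    /* local offset, as in "di = iblock*nb"     */
}
/* Example: with nb = 64 and ngpu = 3, the block starting at ioff = 192 is
 * global block 3, so it is owned by GPU 0 at local offset 64. */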
/** Purpose ------- SORGQR generates an M-by-N REAL matrix Q with orthonormal columns, which is defined as the first N columns of a product of K elementary reflectors of order M Q = H(1) H(2) . . . H(k) as returned by SGEQRF_GPU. Arguments --------- @param[in] m INTEGER The number of rows of the matrix Q. M >= 0. @param[in] n INTEGER The number of columns of the matrix Q. M >= N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. N >= K >= 0. @param[in,out] dA REAL array A on the GPU, dimension (LDDA,N). On entry, the i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by SGEQRF_GPU in the first k columns of its array argument A. On exit, the M-by-N matrix Q. @param[in] ldda INTEGER The first dimension of the array A. LDDA >= max(1,M). @param[in] tau REAL array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by SGEQRF_GPU. @param[in] dT (workspace) REAL work space array on the GPU, dimension (2*MIN(M, N) + (N+31)/32*32 )*NB. This must be the 6th argument of magma_sgeqrf_gpu [ note that if N here is bigger than N in magma_sgeqrf_gpu, the workspace requirement DT in magma_sgeqrf_gpu must be as specified in this routine ]. @param[in] nb INTEGER This is the block size used in SGEQRF_GPU, and correspondingly the size of the T matrices, used in the factorization, and stored in DT. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument has an illegal value @ingroup magma_sgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_sorgqr_gpu(magma_int_t m, magma_int_t n, magma_int_t k, float *dA, magma_int_t ldda, float *tau, float *dT, magma_int_t nb, magma_int_t *info) { #define dA(i,j) (dA + (i) + (j)*ldda) #define dT(j) (dT + (j)*nb) float c_zero = MAGMA_S_ZERO; float c_one = MAGMA_S_ONE; magma_int_t m_kk, n_kk, k_kk, mi; magma_int_t lwork, lpanel; magma_int_t i, ib, ki, kk, iinfo; magma_int_t lddwork; float *dV, *dW; float *work, *panel; *info = 0; if (m < 0) { *info = -1; } else if ((n < 0) || (n > m)) { *info = -2; } else if ((k < 0) || (k > n)) { *info = -3; } else if (ldda < max(1,m)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (n <= 0) { return *info; } // first kk columns are handled by blocked method. // ki is start of 2nd-to-last block if ((nb > 1) && (nb < k)) { ki = (k - nb - 1) / nb * nb; kk = min( k, ki+nb ); } else { ki = 0; kk = 0; } // Allocate CPU work space // n*nb for sorgqr workspace // (m - kk)*(n - kk) for last block's panel lwork = n*nb; lpanel = (m - kk)*(n - kk); magma_smalloc_cpu( &work, lwork + lpanel ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } panel = work + lwork; // Allocate work space on GPU if (MAGMA_SUCCESS != magma_smalloc( &dV, ldda*nb )) { magma_free_cpu( work ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } // dT workspace has: // 2*min(m,n)*nb for T and R^{-1} matrices from geqrf // ((n+31)/32*32 )*nb for dW larfb workspace. lddwork = min(m,n); dW = dT + 2*lddwork*nb; magma_queue_t stream; magma_queue_create( &stream ); // Use unblocked code for the last or only block. 
if (kk < n) { m_kk = m - kk; n_kk = n - kk; k_kk = k - kk; magma_sgetmatrix( m_kk, k_kk, dA(kk, kk), ldda, panel, m_kk ); lapackf77_sorgqr( &m_kk, &n_kk, &k_kk, panel, &m_kk, &tau[kk], work, &lwork, &iinfo ); magma_ssetmatrix( m_kk, n_kk, panel, m_kk, dA(kk, kk), ldda ); // Set A(1:kk,kk+1:n) to zero. magmablas_slaset( MagmaFull, kk, n - kk, c_zero, c_zero, dA(0, kk), ldda ); } if (kk > 0) { // Use blocked code // stream: copy Aii to V --> laset --> laset --> larfb --> [next] // CPU has no computation magmablasSetKernelStream( stream ); for (i = ki; i >= 0; i -= nb) { ib = min( nb, k-i ); mi = m - i; // Copy current panel on the GPU from dA to dV magma_scopymatrix_async( mi, ib, dA(i,i), ldda, dV, ldda, stream ); // set panel to identity magmablas_slaset( MagmaFull, i, ib, c_zero, c_zero, dA(0, i), ldda ); magmablas_slaset( MagmaFull, mi, ib, c_zero, c_one, dA(i, i), ldda ); if (i < n) { // Apply H to A(i:m,i:n) from the left magma_slarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, mi, n-i, ib, dV, ldda, dT(i), nb, dA(i, i), ldda, dW, lddwork ); } } } magma_queue_sync( stream ); magmablasSetKernelStream( NULL ); magma_free( dV ); magma_free_cpu( work ); magma_queue_destroy( stream ); return *info; } /* magma_sorgqr_gpu */
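
/*
 * Hypothetical usage sketch (not from the MAGMA sources): factor A = Q*R on
 * the GPU and then form Q explicitly with magma_sorgqr_gpu above.  It assumes
 * the MAGMA 1.x-era interfaces magma_get_sgeqrf_nb(m) and
 * magma_sgeqrf_gpu(m, n, dA, ldda, tau, dT, &info), and sizes dT with the
 * formula (2*MIN(M,N) + (N+31)/32*32)*NB from the documentation above; check
 * these against your MAGMA release.  Error handling is kept minimal.
 */
extern "C" magma_int_t
sorgqr_gpu_usage_sketch( magma_int_t m, magma_int_t n, float *dA, magma_int_t ldda )
{
    magma_int_t info  = 0;
    magma_int_t nb    = magma_get_sgeqrf_nb( m );    // block size used by geqrf/orgqr
    magma_int_t minmn = min( m, n );
    magma_int_t ldt   = 2*minmn + ((n + 31)/32)*32;  // rows of the dT workspace

    float *tau = NULL, *dT = NULL;
    magma_smalloc_cpu( &tau, minmn );
    magma_smalloc( &dT, ldt*nb );
    if ( tau == NULL || dT == NULL ) {
        magma_free( dT );
        magma_free_cpu( tau );
        return MAGMA_ERR_DEVICE_ALLOC;
    }

    magma_sgeqrf_gpu( m, n, dA, ldda, tau, dT, &info );                 // A = Q*R
    if ( info == 0 )
        magma_sorgqr_gpu( m, n, minmn, dA, ldda, tau, dT, nb, &info );  // dA := Q

    magma_free( dT );
    magma_free_cpu( tau );
    return info;
}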
extern "C" magma_int_t magma_sorgqr_gpu(magma_int_t m, magma_int_t n, magma_int_t k, float *dA, magma_int_t ldda, float *tau, float *dT, magma_int_t nb, magma_int_t *info) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= SORGQR generates an M-by-N REAL matrix Q with orthonormal columns, which is defined as the first N columns of a product of K elementary reflectors of order M Q = H(1) H(2) . . . H(k) as returned by SGEQRF_GPU. Arguments ========= M (input) INTEGER The number of rows of the matrix Q. M >= 0. N (input) INTEGER The number of columns of the matrix Q. M >= N >= 0. K (input) INTEGER The number of elementary reflectors whose product defines the matrix Q. N >= K >= 0. DA (input/output) REAL array A on the GPU, dimension (LDDA,N). On entry, the i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by SGEQRF_GPU in the first k columns of its array argument A. On exit, the M-by-N matrix Q. LDDA (input) INTEGER The first dimension of the array A. LDDA >= max(1,M). TAU (input) REAL array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by SGEQRF_GPU. DT (input/workspace) REAL work space array on the GPU, dimension (2*MIN(M, N) + (N+31)/32*32 )*NB. This must be the 6th argument of magma_sgeqrf_gpu [ note that if N here is bigger than N in magma_sgeqrf_gpu, the workspace requirement DT in magma_sgeqrf_gpu must be as specified in this routine ]. NB (input) INTEGER This is the block size used in SGEQRF_GPU, and correspondingly the size of the T matrices, used in the factorization, and stored in DT. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument has an illegal value ===================================================================== */ #define dA(i,j) (dA + (i) + (j)*ldda) #define dT(j) (dT + (j)*nb) magma_int_t m_kk, n_kk, k_kk, mi; magma_int_t lwork, lpanel; magma_int_t i, ib, ki, kk, iinfo; magma_int_t lddwork; float *dV, *dW; float *work, *panel; *info = 0; if (m < 0) { *info = -1; } else if ((n < 0) || (n > m)) { *info = -2; } else if ((k < 0) || (k > n)) { *info = -3; } else if (ldda < max(1,m)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (n <= 0) { return *info; } // first kk columns are handled by blocked method. if ((nb > 1) && (nb < k)) { ki = (k - nb - 1) / nb * nb; kk = min( k, ki+nb ); } else { kk = 0; } // Allocate CPU work space // n*nb for sorgqr workspace // (m - kk)*(n - kk) for last block's panel lwork = n*nb; lpanel = (m - kk)*(n - kk); magma_smalloc_cpu( &work, lwork + lpanel ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } panel = work + lwork; // Allocate work space on GPU if (MAGMA_SUCCESS != magma_smalloc( &dV, ldda*nb )) { magma_free_cpu( work ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } // dT workspace has: // 2*min(m,n)*nb for T and R^{-1} matrices from geqrf // ((n+31)/32*32 )*nb for dW larfb workspace. lddwork = min(m,n); dW = dT + 2*lddwork*nb; cudaStream_t stream; magma_queue_create( &stream ); // Use unblocked code for the last or only block. if (kk < n) { m_kk = m - kk; n_kk = n - kk; k_kk = k - kk; magma_sgetmatrix( m_kk, n_kk, dA(kk, kk), ldda, panel, m_kk ); lapackf77_sorgqr( &m_kk, &n_kk, &k_kk, panel, &m_kk, &tau[kk], work, &lwork, &iinfo ); magma_ssetmatrix( m_kk, n_kk, panel, m_kk, dA(kk, kk), ldda ); // Set A(1:kk,kk+1:n) to zero. 
magmablas_slaset( MagmaUpperLower, kk, n - kk, dA(0, kk), ldda ); } if (kk > 0) { // Use blocked code // stream: copy Aii to V --> laset --> laset --> larfb --> [next] // CPU has no computation magmablasSetKernelStream( stream ); for (i = ki; i >= 0; i -= nb) { ib = min( nb, k-i ); mi = m - i; // Copy current panel on the GPU from dA to dV magma_scopymatrix_async( mi, ib, dA(i,i), ldda, dV, ldda, stream ); // set panel to identity magmablas_slaset( MagmaUpperLower, i, ib, dA(0, i), ldda ); magmablas_slaset_identity( mi, ib, dA(i, i), ldda ); if (i < n) { // Apply H to A(i:m,i:n) from the left magma_slarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, mi, n-i, ib, dV, ldda, dT(i), nb, dA(i, i), ldda, dW, lddwork ); } } } magma_queue_sync( stream ); magmablasSetKernelStream( NULL ); magma_free( dV ); magma_free_cpu( work ); magma_queue_destroy( stream ); return *info; } /* magma_sorgqr_gpu */
void magmablas_ssymm_mgpu_spec( magma_side_t side, magma_uplo_t uplo, magma_int_t m, magma_int_t n, float alpha, magmaFloat_ptr dA[], magma_int_t ldda, magma_int_t offset, magmaFloat_ptr dB[], magma_int_t lddb, float beta, magmaFloat_ptr dC[], magma_int_t lddc, magmaFloat_ptr dwork[], magma_int_t dworksiz, float *C, magma_int_t ldc, float *work[], magma_int_t worksiz, // TODO unused magma_int_t ngpu, magma_int_t nb, magma_queue_t queues[][20], magma_int_t nqueue, magma_event_t redevents[][MagmaMaxGPUs*MagmaMaxGPUs+10], magma_int_t nbevents, magma_int_t gnode[MagmaMaxGPUs][MagmaMaxGPUs+2], magma_int_t nbcmplx ) { #define dA(dev, i, j) (dA[dev] + (i) + (j)*ldda) #define dB(dev, i, j) (dB[dev] + (i) + (j)*lddb) #define dC(dev, i, j) (dC[dev] + (i) + (j)*lddc) #define dwork(dev, i, j) (dwork[dev] + (i) + (j)*lddwork) #define C(i, j) (C + (i) + (j)*ldc) if ( side != MagmaLeft || uplo != MagmaLower ) { fprintf( stderr, "%s: only Left Lower implemented\n", __func__ ); } assert( ldda >= m ); assert( lddb >= m ); assert( lddc >= m ); assert( nqueue >= ngpu ); assert( nbevents >= ngpu*ngpu ); magmaFloat_ptr dwork1[MagmaMaxGPUs]; magmaFloat_ptr dwork2[MagmaMaxGPUs]; magma_int_t lddwork = lddc; for( magma_int_t dev = 0; dev < ngpu; ++dev ) { dwork1[dev] = dwork[dev]; dwork2[dev] = dwork[dev]+n*lddwork; } assert( dworksiz >= (2*n*lddwork) ); magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_t cstream; magmablasGetKernelStream(&cstream); magma_int_t dev, devperm, myblk, mycolsize, myblkoffst; magma_int_t gdev, gcolsize, gmaster, gngpu; magma_int_t masterdev, lcdev, lccolsize, myngpu; magma_int_t stdev = (offset/nb)%ngpu; magma_int_t blockoffset = offset % nb; magma_int_t fstblksiz = 0; if(blockoffset>0){ fstblksiz = min(m, (nb - blockoffset)); } //magma_int_t nbblk = magma_ceildiv(m, nb); magma_int_t nbblk = magma_ceildiv((m+blockoffset), nb); magma_int_t maxgsize = n*nb*magma_ceildiv(nbblk, ngpu); magma_int_t remm = m- fstblksiz; magma_int_t nbblkoffst = offset/nb; magma_int_t nblstblks = -1; magma_int_t devlstblk = -1; magma_int_t lstblksiz = remm%nb; if(lstblksiz>0){ nblstblks = nbblk%ngpu; devlstblk = (nblstblks-1+ngpu)%ngpu; } magma_int_t nbcmplxactive = 0; magma_int_t cmplxisactive[MagmaMaxGPUs]; magma_int_t gpuisactive[MagmaMaxGPUs]; memset(gpuisactive, 0, MagmaMaxGPUs*sizeof(magma_int_t)); memset(cmplxisactive, 0, MagmaMaxGPUs*sizeof(magma_int_t)); //******************************* // each GPU make a GEMM with the // transpose of its blocks to compute // a final portion of X=A*VT //******************************* /* dB = V*T already ==> dB**H = T**H * V**H * compute T**H * V**H * X is equal to compute locally (VT)**H_i*X_i * then each GPU broadcast its X_i to assemble the full X which is used * to compute W = X - 0.5 * V * T**H * V**H * X = X - 0.5 * V *dwork3 */ if(ngpu ==1){ magma_setdevice( 0 ); magmablasSetKernelStream( queues[ 0 ][ 0 ] ); // compute X[me] = A*VT = A[me]^tr *VT; magma_sgemm( MagmaConjTrans, MagmaNoTrans, m, n, m, alpha, dA(0, offset, offset), ldda, dB[0], lddb, beta, dC[0], lddc ); return; } //ngpu>1 for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) { masterdev = -1; gnode[cmplxid][MagmaMaxGPUs+1] = -1; myngpu = gnode[cmplxid][MagmaMaxGPUs]; for( magma_int_t idev = 0; idev < myngpu; ++idev ) { dev = gnode[cmplxid][idev]; devperm = (dev-stdev+ngpu)%ngpu; myblk = (nbblk/ngpu) + (nbblk%ngpu > devperm ? 
1:0 ); mycolsize = myblk*nb; myblkoffst = nb*((nbblkoffst/ngpu)+(nbblkoffst%ngpu > dev?1:0)); if(dev==stdev){ mycolsize -= blockoffset; myblkoffst += blockoffset; // local index in parent matrix } if((devperm==devlstblk)&&(lstblksiz>0)){ mycolsize -= (nb-(remm%nb)); } mycolsize = min(mycolsize, m); if(mycolsize>0){ if(masterdev==-1) masterdev = dev; //printf("dev %d devperm %d on cmplx %d master %d nbblk %d myblk %d m %d n %d mycolsize %d stdev %d fstblksize %d lastdev %d lastsize %d dA(%d, %d, %d) ==> dwork(%d, %d)\n", dev, devperm, cmplxid, masterdev, nbblk, myblk, m, n, mycolsize, stdev, fstblksiz, devlstblk, remm%nb, dev, offset, myblkoffst, dev, maxgsize*dev); gpuisactive[dev] = mycolsize; magma_setdevice( dev ); magmablasSetKernelStream( queues[ dev ][ dev ] ); magma_sgemm( MagmaConjTrans, MagmaNoTrans, mycolsize, n, m, alpha, dA(dev, offset, myblkoffst), ldda, dB(dev, 0, 0), lddb, beta, &dwork[dev][maxgsize*dev], mycolsize ); magma_event_record(redevents[dev][dev*ngpu+dev], queues[dev][dev]); } if(dev == masterdev){ nbcmplxactive = nbcmplxactive +1; cmplxisactive[cmplxid] = 1; gnode[cmplxid][MagmaMaxGPUs+1] = masterdev; } } } /* for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_queue_sync( queues[ dev ][ dev ] ); } */ //******************************* // each Master GPU has the final // result either by receiving // from CPU of by making the add // by himself, so now it is time // to broadcast over the GPUs of // its board. //******************************* //printf("=======================================================================\n"); //printf(" sending \n"); //printf("=======================================================================\n"); for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) { myngpu = gnode[cmplxid][MagmaMaxGPUs]; masterdev = gnode[cmplxid][MagmaMaxGPUs+1]; for( magma_int_t idev = 0; idev < myngpu; ++idev ) { dev = gnode[cmplxid][idev]; mycolsize = gpuisactive[dev]; if(mycolsize>0){ // I am an active GPU send my portion local // to all active gpu of my cmplex and global to the // active master of the other real and they should // send it out to their actives slaves. 
magma_setdevice( dev ); //============================================== // sending to the master of the active real //============================================== //printf ("\n\n**************GPU %d\n ", dev); //printf (" GPU %d sending to cmplx masters\n", dev); for( magma_int_t k = 0; k < nbcmplx; ++k ) { if(k!=cmplxid){ gmaster = gnode[k][MagmaMaxGPUs+1]; if(gmaster!=-1){ //real is active //printf (" device %d from cmplx %d is sending to master %d on cmplx %d block of size %d event %d\n", dev, cmplxid, gmaster, k, mycolsize, redevents[dev][gmaster*ngpu+dev]); magma_queue_wait_event(queues[ dev ][ gmaster ], redevents[dev][dev*ngpu+dev]); magma_scopymatrix_async( mycolsize, n, &dwork[dev ][maxgsize*dev], mycolsize, &dwork[gmaster][maxgsize*dev], mycolsize, queues[dev][gmaster] ); magma_event_record(redevents[dev][gmaster*ngpu+dev], queues[dev][gmaster]); } } } //============================================== // //============================================== // sending to the active GPUs of my real //============================================== //printf (" GPU %d sending internal\n", dev); for( magma_int_t l = 0; l < myngpu; ++l ) { lcdev = gnode[cmplxid][l]; lccolsize = gpuisactive[lcdev]; if((lcdev!=dev)&&(lccolsize>0)){ //printf (" device %d from cmplx %d is sending internal to dev %d block of size %d event %d\n", dev, cmplxid, lcdev, mycolsize, redevents[dev][lcdev*ngpu+dev]); magma_queue_wait_event(queues[ dev ][ lcdev ], redevents[dev][dev*ngpu+dev]); magma_scopymatrix_async( mycolsize, n, &dwork[dev ][maxgsize*dev], mycolsize, &dwork[lcdev][maxgsize*dev], mycolsize, queues[dev][lcdev] ); magma_event_record(redevents[dev][lcdev*ngpu+dev], queues[dev][lcdev]); } } //============================================== }// end if mycolsize>0 }// for idev }// for cmplxid //printf("=======================================================================\n"); //printf(" master wait and resend internally \n"); //printf("=======================================================================\n"); for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) { myngpu = gnode[cmplxid][MagmaMaxGPUs]; masterdev = gnode[cmplxid][MagmaMaxGPUs+1]; //============================================== // if I am active master so wait receiving contribution // of the GPUs of other real and send it locally //============================================== if(masterdev != -1){ mycolsize = gpuisactive[masterdev]; magma_setdevice( masterdev ); //printf(" GPU %d distributing internal\n", masterdev); for( magma_int_t k = 0; k < nbcmplx; ++k ) { if(k!=cmplxid){ gngpu = gnode[k][MagmaMaxGPUs]; for( magma_int_t g = 0; g < gngpu; ++g ) { gdev = gnode[k][g]; gcolsize = gpuisactive[gdev]; // check if I received from this GPU, // if yes send it to my group if(gcolsize>0){ magma_queue_wait_event(queues[ masterdev ][ gdev ], redevents[gdev][masterdev*ngpu+gdev]); for( magma_int_t l = 0; l < myngpu; ++l ) { lcdev = gnode[cmplxid][l]; lccolsize = gpuisactive[lcdev]; if((lcdev!=masterdev)&&(lccolsize>0)){ //printf(" Master %d on cmplx %d waiting on event %d is distributing internal results of %d to lcdev %d block of size %d event %d\n", masterdev, cmplxid, redevents[gdev][masterdev*ngpu+gdev], gdev, lcdev, gcolsize, redevents[masterdev][lcdev*ngpu+gdev]); magma_scopymatrix_async( gcolsize, n, &dwork[masterdev][maxgsize*gdev], gcolsize, &dwork[lcdev ][maxgsize*gdev], gcolsize, queues[masterdev][gdev] ); magma_event_record(redevents[masterdev][lcdev*ngpu+gdev], queues[masterdev][gdev]); } } } } } } }// if active master 
//============================================== }// for cmplxid /* for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_queue_sync( queues[ dev ][ 0 ] ); for( magma_int_t s = 0; s < ngpu; ++s ) { magma_queue_sync( queues[ dev ][ s ] ); } } */ //printf("=======================================================================\n"); //printf(" distributing \n"); //printf("=======================================================================\n"); magma_int_t lcblki, gbblki, gblk, ib; for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) { myngpu = gnode[cmplxid][MagmaMaxGPUs]; masterdev = gnode[cmplxid][MagmaMaxGPUs+1]; for( magma_int_t idev = 0; idev < myngpu; ++idev ) { dev = gnode[cmplxid][idev]; mycolsize = gpuisactive[dev]; if(mycolsize>0){ // I am an active GPU //printf("\n\n==============GPU %d collecting\n", dev); magma_setdevice( dev ); // collect my results first as tyhere is no need to wait to // receive nothing, just wait that my gemm are done. // in theory this should be inside the loop but cuda was not // able to run it first for all gpu and on gpu>0 it was waiting // however it was on different stream so it should run. but maybe // this is because there are too many function call and this make // cuda not handleit so nice. anyway it coul dbe removed when cuda // is able to lunch it first without wait. gdev = dev; gcolsize = gpuisactive[gdev]; if(gcolsize>0){ devperm = (gdev-stdev+ngpu)%ngpu; gblk = (nbblk/ngpu) + (nbblk%ngpu > devperm ? 1:0 ); magmablasSetKernelStream( queues[ dev ][ gdev ] ); magma_queue_wait_event(queues[ dev ][ gdev ], redevents[gdev][dev*ngpu+gdev]); //printf (" GPU %d stream %d doing slacpy\n", dev, queues[ dev ][ gdev ]); for( magma_int_t blki = 0; blki < gblk; ++blki){ gbblki = (blki*ngpu + devperm)*nb - blockoffset; lcblki = blki*nb; ib = nb;//min(nb, m-gbblki); if(gdev==stdev){ lcblki = blki*nb-blockoffset; if(blki==0){ gbblki = 0; lcblki = 0; ib = nb-blockoffset; } } ib = min(ib, m-gbblki); //printf(" blockoffset %d nbblk %d stdev %d receiving from gdev %d gblk %d gcolsize %d copying blki %d of size ibxn %dx%d from work[%d] to C[%d]\n", blockoffset, nbblk, stdev, gdev, gblk, gcolsize, blki, ib, n, lcblki, gbblki); magmablas_slacpy( MagmaFull, ib, n, &dwork[dev][maxgsize*gdev+lcblki], gcolsize, &dC[dev][gbblki], lddc); }// end blki } for( magma_int_t k = 0; k < nbcmplx; ++k ) { gngpu = gnode[k][MagmaMaxGPUs]; for( magma_int_t g = 0; g < gngpu; ++g ) { gdev = gnode[k][g]; gcolsize = gpuisactive[gdev]; // if gcolsize>0, ==> gpu gdev was active and so // I received from him/computed a portion of dwork, // so go over its gblk and distribute it on dC. if(gdev!=dev){ if(gcolsize>0){ devperm = (gdev-stdev+ngpu)%ngpu; gblk = (nbblk/ngpu) + (nbblk%ngpu > devperm ? 
1:0 ); magmablasSetKernelStream( queues[ dev ][ gdev ] ); if(k==cmplxid){ //we are on the same group so wait on event issued by gdev for me citing his id magma_queue_wait_event(queues[ dev ][ gdev ], redevents[gdev][dev*ngpu+gdev]); //printf (" GPU %d queue %d waiting on event %d to collecte from %d the size of gcolsize %d\n", dev, queues[ dev ][ gdev ], redevents[gdev][dev*ngpu+gdev], gdev, gcolsize); }else{ //we are on different group so: //if I am the master wait on the event issued by gdev for me citing his id //else wait event issued by my master for me on the behalf of gdev //printf (" GPU %d queue %d waiting on event %d to collecte from %d the size of gcolsize %d\n", dev, queues[ dev ][ gdev ], redevents[masterdev][dev*ngpu+gdev], gdev, gcolsize); if(dev==masterdev) magma_queue_wait_event(queues[ dev ][ gdev ], redevents[gdev][dev*ngpu+gdev]); else magma_queue_wait_event(queues[ dev ][ gdev ], redevents[masterdev][dev*ngpu+gdev]); } //printf (" GPU %d stream %d doing slacpy\n", dev, queues[ dev ][ gdev ]); for( magma_int_t blki = 0; blki < gblk; ++blki){ gbblki = (blki*ngpu + devperm)*nb - blockoffset; lcblki = blki*nb; ib = nb;//min(nb, m-gbblki); if(gdev==stdev){ lcblki = blki*nb-blockoffset; if(blki==0){ gbblki = 0; lcblki = 0; ib = nb-blockoffset; } } ib = min(ib, m-gbblki); //printf(" blockoffset %d nbblk %d stdev %d receiving from gdev %d gblk %d gcolsize %d copying blki %d of size ibxn %dx%d from work[%d] to C[%d]\n", blockoffset, nbblk, stdev, gdev, gblk, gcolsize, blki, ib, n, lcblki, gbblki); magmablas_slacpy( MagmaFull, ib, n, &dwork[dev][maxgsize*gdev+lcblki], gcolsize, &dC[dev][gbblki], lddc); }// end blki }// en gcolsize>0 meaning gdev is active } // end if gdev != dev }// end loop over the g gpus of the cmplx k }//end loop over the real k }// end mycolsize>0 meaning that I am active }// end loop over idev of cmplxid }// end loop of the cmplx for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_device_sync(); } // put back the input gpu and its input stream magma_setdevice( cdev ); magmablasSetKernelStream( cstream ); }
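
/*
 * Illustrative sketch (not part of the MAGMA sources): how the per-GPU column
 * slab size ("mycolsize" in both ssymm routines above) follows from the 1-D
 * block-cyclic layout.  m rows starting at global index `offset`, block size
 * nb, ngpu GPUs; stdev is the GPU holding the first (possibly partial) block.
 * The helper name is hypothetical and very-small-m edge cases are glossed over.
 */
static magma_int_t
slab_size_sketch( magma_int_t dev, magma_int_t m, magma_int_t offset,
                  magma_int_t nb, magma_int_t ngpu )
{
    magma_int_t stdev       = (offset/nb) % ngpu;                    // GPU holding the first block
    magma_int_t blockoffset = offset % nb;                           // offset inside that block
    magma_int_t nbblk       = magma_ceildiv( m + blockoffset, nb );  // total blocks touched
    magma_int_t devperm     = (dev - stdev + ngpu) % ngpu;           // position of dev after stdev
    magma_int_t myblk       = nbblk/ngpu + (nbblk % ngpu > devperm ? 1 : 0);
    magma_int_t mycolsize   = myblk*nb;

    if ( dev == stdev )
        mycolsize -= blockoffset;                  // first block is only partially owned
    if ( devperm == (nbblk - 1) % ngpu && (m + blockoffset) % nb > 0 )
        mycolsize -= nb - (m + blockoffset) % nb;  // trim the partial last block
    return min( mycolsize, m );                    // never more than m, as in the code
}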
/** Purpose ------- SPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix dA. Auxiliary subroutine for spotrf2_ooc. It is multiple gpu interface to compute Cholesky of a "rectangular" matrix. The factorization has the form dA = U**H * U, if UPLO = MagmaUpper, or dA = L * L**H, if UPLO = MagmaLower, where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of dA is stored; - = MagmaLower: Lower triangle of dA is stored. @param[in] n INTEGER The order of the matrix dA. N >= 0. @param[in,out] dA REAL array on the GPU, dimension (LDDA,N) On entry, the symmetric matrix dA. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. @ingroup magma_sposv_comp ********************************************************************/ extern "C" magma_int_t magma_spotrf3_mgpu( magma_int_t ngpu, magma_uplo_t uplo, magma_int_t m, magma_int_t n, magma_int_t off_i, magma_int_t off_j, magma_int_t nb, magmaFloat_ptr d_lA[], magma_int_t ldda, magmaFloat_ptr d_lP[], magma_int_t lddp, float *A, magma_int_t lda, magma_int_t h, magma_queue_t queues[][3], magma_event_t events[][5], magma_int_t *info ) { #define Alo(i, j) (A + ((j)+off_j)*lda + (nb*(((i)/nb)%h)+off_i)) #define Aup(i, j) (A + (nb*(((j)/nb)%h)+off_j)*lda + (i+off_i)) #define dlA(id, i, j) (d_lA[(id)] + (j)*ldda + (i)) #define dlP(id, i, j, k) (d_lP[(id)] + (k)*nb*lddp + (j)*lddp + (i)) #define dlPT(id, i, j, k) (d_lP[(id)] + (k)*nb*lddp + (j)*nb + (i)) magma_int_t j, jb, nb0, nb2, d, dd, id, j_local, j_local2, buf; float c_one = MAGMA_S_ONE; float c_neg_one = MAGMA_S_NEG_ONE; float d_one = 1.0; float d_neg_one = -1.0; int upper = (uplo == MagmaUpper); float *dlpanel; magma_int_t n_local[MagmaMaxGPUs], ldpanel; const magma_int_t stream1 = 0, stream2 = 1, stream3 = 2; *info = 0; if (! 
upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (!upper && ngpu*ldda < max(1,n)) { *info = -4; } else if (upper && ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK) /* used by strsm_work */ float c_zero = MAGMA_S_ZERO; int trsm_nb = 128; int trsm_n = trsm_nb*((nb+trsm_nb-1)/trsm_nb); float *d_dinvA[MagmaMaxGPUs]; float *d_x[MagmaMaxGPUs]; #define dinvA(d,j) &(d_dinvA[(d)][(j)*trsm_nb*trsm_n]) #define dx(d,j) &(d_x[(d)][(j)*nb*m]) /* * Allocate device memory for the inversed diagonal blocks, size=N*BLOCK_SIZE */ // TODO free memory on failure. for( d=0; d < ngpu; d++ ) { magma_setdevice(d); if ( (MAGMA_SUCCESS != magma_smalloc( &d_dinvA[d], 2*trsm_nb*trsm_n )) || (MAGMA_SUCCESS != magma_smalloc( &d_x[d], 2*nb*(upper ? n : m) )) ) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } } magma_setdevice(0); #endif /* initialization */ for( d=0; d < ngpu; d++ ) { /* local-n and local-ld */ if (upper) { n_local[d] = (n/(nb*ngpu))*nb; if (d < (n/nb)%ngpu) n_local[d] += nb; else if (d == (n/nb)%ngpu) n_local[d] += n%nb; } else { n_local[d] = (m/(nb*ngpu))*nb; if (d < (m/nb)%ngpu) n_local[d] += nb; else if (d == (m/nb)%ngpu) n_local[d] += m%nb; } } /* == initialize the trace */ trace_init( 1, ngpu, 3, (CUstream_st**)queues ); if (upper) { /* ---------------------------------------------- */ /* Upper-triangular case */ /* > Compute the Cholesky factorization A = U'*U. */ /* ---------------------------------------------- */ for (j=0; j < m; j += nb) { /* Set the GPU number that holds the current panel */ id = (j/nb)%ngpu; buf = (j/nb)%ngpu; // right now, we have ngpu buffers, so id and buf are the same.. /* Set the local index where the current panel is */ j_local = j/(nb*ngpu); jb = min(nb, (m-j)); /* Update the current diagonal block on stream1 */ magma_setdevice(id); if ( j > 0 ) { magmablasSetKernelStream( queues[id][stream1] ); trace_gpu_start( id, stream1, "syrk", "syrk" ); magma_ssyrk(MagmaUpper, MagmaConjTrans, jb, j, d_neg_one, dlA(id, 0, nb*j_local), ldda, d_one, dlA(id, j, nb*j_local), ldda); trace_gpu_end( id, stream1 ); } /* send the diagonal to cpu on stream1 */ trace_gpu_start( id, stream1, "comm", "D to CPU" ); magma_sgetmatrix_async( jb, jb, dlA(id, j, nb*j_local), ldda, Aup(j,j), lda, queues[id][stream1] ); trace_gpu_end( id, stream1 ); /* update off-diagonal blocks in the panel */ if ( j > 0 ) { d = (j/nb+1)%ngpu; for( dd=0; dd < ngpu; dd++ ) { j_local2 = j_local+1; if ( d > id ) j_local2 --; nb0 = nb*j_local2; // number of local columns in the panel, while jb is panel-size (number of rows) if ( n_local[d] > nb0 ) { magma_setdevice(d); magmablasSetKernelStream( queues[d][stream2] ); if ( d == id ) { dlpanel = dlA(d,0,nb*j_local); ldpanel = ldda; // the GPU owns the row from start, and no need of synch. 
//magma_queue_wait_event( queues[d][stream2], events[d][0] ); // rows arrived at gpu magma_queue_wait_event( queues[d][stream2], events[d][4] ); // wait for look-ahead trsm to finish } else { dlpanel = dlP(d,nb,0,buf); ldpanel = lddp; magma_queue_wait_event( queues[d][stream2], events[d][0] ); // rows arrived at gpu } trace_gpu_start( d, stream2, "gemm", "gemm" ); magma_sgemm(MagmaConjTrans, MagmaNoTrans, jb, n_local[d]-nb0, j, c_neg_one, dlpanel, ldpanel, dlA(d, 0, nb0), ldda, c_one, dlA(d, j, nb0), ldda); trace_gpu_end( d, stream2 ); magma_event_record( events[d][2], queues[d][stream2] ); } d = (d+1)%ngpu; } } /* wait for panel and factorize it on cpu */ magma_setdevice(id); magma_queue_sync( queues[id][stream1] ); trace_cpu_start( 0, "getrf", "getrf" ); lapackf77_spotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info); trace_cpu_end( 0 ); if (*info != 0) { *info = *info + j; break; } /* send the diagonal to gpus on stream1 */ if ( (j+jb) < n) { d = (j/nb+1)%ngpu; for( dd=0; dd < ngpu; dd++ ) { if ( d == id ) { dlpanel = dlA(d, j, nb*j_local); ldpanel = ldda; } else { dlpanel = dlP(d,0,0,buf); ldpanel = lddp; } magma_setdevice(d); trace_gpu_start( d, stream1, "comm", "comm" ); magma_ssetmatrix_async( jb, jb, Aup(j,j), lda, dlpanel, ldpanel, queues[d][stream1] ); trace_gpu_end( d, stream1 ); magma_event_record( events[d][1], queues[d][stream1] ); d = (d+1)%ngpu; } } else { magma_setdevice(id); trace_gpu_start( id, stream1, "comm", "comm" ); magma_ssetmatrix_async( jb, jb, Aup(j,j), lda, dlA(id, j, nb*j_local), ldda, queues[id][stream1] ); trace_gpu_end( id, stream1 ); } /* panel-factorize the off-diagonal */ if ( (j+jb) < n) { d = (j/nb+1)%ngpu; for( dd=0; dd < ngpu; dd++ ) { /* next column */ j_local2 = j_local+1; if ( d > id ) j_local2--; if ( d == id ) { dlpanel = dlA(d,j,nb*j_local); ldpanel = ldda; } else { dlpanel = dlP(d,0,0,buf); ldpanel = lddp; } nb2 = n_local[d] - j_local2*nb; magma_setdevice(d); if ( j+jb < m && d == (j/nb+1)%ngpu ) { /* owns the next column, look-ahead next block on stream1 */ nb0 = min(nb, nb2); magmablasSetKernelStream( queues[d][stream1] ); magma_queue_wait_event( queues[d][stream1], events[d][2] ); // wait for gemm update trace_gpu_start( d, stream1, "trsm", "trsm" ); #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK) magmablas_slaset( MagmaFull, trsm_nb, trsm_n, c_zero, c_zero, dinvA(d,0), trsm_nb ); magmablas_slaset( MagmaFull, nb0, jb, c_zero, c_zero, dx(d,0), nb0 ); magmablas_strsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb0, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda, 1, dinvA(d,0), dx(d,0) ); #else magma_strsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb0, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda); #endif magma_event_record( events[d][4], queues[d][stream1] ); trace_gpu_end( d, stream1 ); } else if ( nb2 > 0 ) { /* update all the blocks on stream2 */ magma_queue_wait_event( queues[d][stream2], events[d][1] ); // wait for cholesky factor trace_gpu_start( d, stream2, "trsm", "trsm" ); magmablasSetKernelStream( queues[d][stream2] ); #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK) magmablas_slaset( MagmaFull, trsm_nb, trsm_n, c_zero, c_zero, dinvA(d,0), trsm_nb ); magmablas_slaset( MagmaFull, nb2, jb, c_zero, c_zero, dx(d,0), nb2 ); magmablas_strsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda, 1, dinvA(d,0), dx(d,0) ); #else magma_strsm( MagmaLeft, MagmaUpper, MagmaConjTrans, 
MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda); #endif trace_gpu_end( d, stream2 ); } d = (d+1)%ngpu; } /* end of for */ /* ========================================================== */ if ( j+jb < m ) { d = (j/nb+1)%ngpu; /* next column */ j_local2 = j_local+1; if ( d > id ) j_local2--; nb0 = min(nb, n_local[d]-nb*j_local2 ); /* even on 1 gpu, off-diagonals are copied to cpu (synchronize at the end). * * so we have the Cholesky factor, but only diagonal submatrix of the big panel, * * on cpu at the end. */ int d2, buf2; magma_setdevice(d); /* lookahead done */ magma_queue_wait_event( queues[d][stream3], events[d][4] ); trace_gpu_start( d, stream3, "comm", "row to CPU" ); magma_sgetmatrix_async( (j+jb), nb0, dlA(d, 0, nb*j_local2), ldda, Aup(0,j+jb), lda, queues[d][stream3] ); trace_gpu_end( d, stream3 ); magma_event_record( events[d][3], queues[d][stream3] ); /* needed on pluto */ //magma_queue_sync( queues[d][stream3] ); /* broadcast rows to gpus on stream2 */ buf2 = ((j+jb)/nb)%ngpu; for( d2=0; d2 < ngpu; d2++ ) { if ( d2 != d ) { magma_setdevice(d2); trace_gpu_start( d2, stream3, "comm", "row to GPUs" ); magma_queue_wait_event( queues[d2][stream3], events[d][3] ); // rows arrived at cpu on stream3 magma_ssetmatrix_async( j+jb, nb0, Aup(0,j+jb), lda, dlP(d2,nb,0,buf2), lddp, queues[d2][stream3] ); trace_gpu_end( d2, stream3 ); magma_event_record( events[d2][0], queues[d2][stream3] ); } } /* =========================== */ /* update the remaining blocks */ nb2 = n_local[d]-(nb*j_local2 + nb0); if ( nb2 > 0 ) { if ( d == id ) { dlpanel = dlA(d, j, nb*j_local); ldpanel = ldda; } else { dlpanel = dlP(d,0,0,buf); ldpanel = lddp; } magma_setdevice(d); magmablasSetKernelStream( queues[d][stream2] ); trace_gpu_start( d, stream2, "trsm", "trsm" ); #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK) int flag = 0; if (flag == 0) { magma_queue_wait_event( queues[d][stream2], events[d][4] ); // lookahead -> diagonal inversion } else { magmablas_slaset( MagmaFull, trsm_nb, trsm_n, c_zero, c_zero, dinvA(d,flag), trsm_nb ); magma_queue_wait_event( queues[d][stream2], events[d][1] ); // panel received } magmablas_slaset( MagmaFull, nb2, jb, c_zero, c_zero, dx(d,1), nb2 ); magmablas_strsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2+nb0), ldda, flag, dinvA(d,flag), dx(d,1) ); #else magma_queue_wait_event( queues[d][stream2], events[d][1] ); // wait for cholesky factor magma_strsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2+nb0), ldda); #endif trace_gpu_end( d, stream2 ); } } } /* end of strsm */ } /* end of for j=1, .., n */ } else { /* ---------------------------------------------- */ /* Lower-triangular case */ /* > Compute the Cholesky factorization A = L*L'. 
*/ /* ---------------------------------------------- */ for (j=0; j < n; j += nb) { /* Set the GPU number that holds the current panel */ id = (j/nb)%ngpu; buf = (j/nb)%ngpu; /* Set the local index where the current panel is */ j_local = j/(nb*ngpu); jb = min(nb, (n-j)); /* Update the current diagonal block on stream1 */ magma_setdevice(id); if ( j > 0 ) { magmablasSetKernelStream( queues[id][stream1] ); magma_ssyrk(MagmaLower, MagmaNoTrans, jb, j, d_neg_one, dlA(id, nb*j_local, 0), ldda, d_one, dlA(id, nb*j_local, j), ldda); } /* send the diagonal to cpu on stream1 */ magma_sgetmatrix_async( jb, jb, dlA(id, nb*j_local, j), ldda, Alo(j,j), lda, queues[id][stream1] ); /* update off-diagonal blocks of the panel */ if ( j > 0 ) { d = (j/nb+1)%ngpu; for( dd=0; dd < ngpu; dd++ ) { j_local2 = j_local+1; if ( d > id ) j_local2 --; nb0 = nb*j_local2; if ( nb0 < n_local[d] ) { magma_setdevice(d); magmablasSetKernelStream( queues[d][stream2] ); if ( d == id ) { dlpanel = dlA(d, nb*j_local, 0); ldpanel = ldda; magma_queue_wait_event( queues[d][stream2], events[d][4] ); // wait for look-ahead trsm to finish } else { dlpanel = dlPT(d,0,nb,buf); ldpanel = nb; magma_queue_wait_event( queues[d][stream2], events[d][0] ); // rows arrived at gpu } magma_sgemm( MagmaNoTrans, MagmaConjTrans, n_local[d]-nb0, jb, j, c_neg_one, dlA(d, nb0, 0), ldda, dlpanel, ldpanel, c_one, dlA(d, nb0, j), ldda); magma_event_record( events[d][2], queues[d][stream2] ); } d = (d+1)%ngpu; } } /* wait for the panel and factorized it on cpu */ magma_setdevice(id); magma_queue_sync( queues[id][stream1] ); lapackf77_spotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info); if (*info != 0) { *info = *info + j; break; } /* send the diagonal to gpus on stream1 */ if ( (j+jb) < m) { d = (j/nb+1)%ngpu; for( dd=0; dd < ngpu; dd++ ) { if ( d == id ) { dlpanel = dlA(d, nb*j_local, j); ldpanel = ldda; } else { dlpanel = dlPT(d, 0, 0, buf); ldpanel = nb; } magma_setdevice(d); magma_ssetmatrix_async( jb, jb, Alo(j,j), lda, dlpanel, ldpanel, queues[d][stream1] ); magma_event_record( events[d][1], queues[d][stream1] ); d = (d+1)%ngpu; } } else { magma_setdevice(id); magma_ssetmatrix_async( jb, jb, Alo(j,j), lda, dlA(id, nb*j_local, j), ldda, queues[id][stream1] ); } /* panel factorize the off-diagonal */ if ( (j+jb) < m) { d = (j/nb+1)%ngpu; for( dd=0; dd < ngpu; dd++ ) { /* next column */ j_local2 = j_local+1; if ( d > id ) j_local2--; if ( d == id ) { dlpanel = dlA(d, nb*j_local, j); ldpanel = ldda; } else { dlpanel = dlPT(d, 0, 0, buf); ldpanel = nb; } nb2 = n_local[d] - j_local2*nb; nb0 = min(nb, nb2); magma_setdevice(d); if ( j+nb < n && d == (j/nb+1)%ngpu ) { /* owns next column, look-ahead next block on stream1 */ if ( j > 0 ) magma_queue_wait_event( queues[d][stream1], events[d][2] ); // wait for gemm update magmablasSetKernelStream( queues[d][stream1] ); #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK) magmablas_slaset( MagmaFull, trsm_nb, trsm_n, c_zero, c_zero, dinvA(d,0), trsm_nb ); magmablas_slaset( MagmaFull, nb0, jb, c_zero, c_zero, dx(d,0), nb0 ); magmablas_strsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb0, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda, 1, dinvA(d,0), dx(d,0) ); #else magma_strsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb0, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda); #endif magma_event_record( events[d][4], queues[d][stream1] ); } else if ( nb2 > 0 ) { /* other gpus updating all the blocks on stream2 */ /* update the entire column */ 
magma_queue_wait_event( queues[d][stream2], events[d][1] ); // wait for the cholesky factor magmablasSetKernelStream( queues[d][stream2] ); #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK) magmablas_slaset( MagmaFull, trsm_nb, trsm_n, c_zero, c_zero, dinvA(d,0), trsm_nb ); magmablas_slaset( MagmaFull, nb2, jb, c_zero, c_zero, dx(d,0), nb2 ); magmablas_strsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda, 1, dinvA(d,0), dx(d,0) ); #else magma_strsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda); #endif } d = (d+1)%ngpu; } /* end for d */ /* ========================================================== */ if ( j+jb < n ) { d = (j/nb+1)%ngpu; /* next column */ j_local2 = j_local+1; if ( d > id ) j_local2--; nb0 = min(nb, n_local[d]-nb*j_local2 ); /* even on 1 gpu, we copy off-diagonal to cpu (but don't synchronize). */ /* so we have the Cholesky factor on cpu at the end. */ int d2, buf2; //#define SPOTRF_DEVICE_TO_DEVICE #ifdef SPOTRF_DEVICE_TO_DEVICE // lookahead done /* broadcast the rows to gpus */ buf2 = ((j+jb)/nb)%ngpu; for( d2=0; d2 < ngpu; d2++ ) { magma_setdevice(d2); magma_queue_wait_event( queues[d2][stream3], events[d][4] ); if ( d2 != d ) { magma_scopymatrix_async( nb0, j+jb, dlPT(d2,0,nb,buf2), nb, // first nbxnb reserved for diagonal block dlA(d, nb*j_local2, 0), ldda, queues[d2][stream3] ); magma_event_record( events[d2][0], queues[d2][stream3] ); } else { magma_sgetmatrix_async( nb0, j+jb, dlA(d, nb*j_local2, 0), ldda, Alo(j+jb,0), lda, queues[d][stream3] ); } } #else // lookahead done magma_setdevice(d); magma_queue_wait_event( queues[d][stream3], events[d][4] ); magma_sgetmatrix_async( nb0, j+jb, dlA(d, nb*j_local2, 0), ldda, Alo(j+jb,0), lda, queues[d][stream3] ); magma_event_record( events[d][3], queues[d][stream3] ); /* syn on rows on CPU, seem to be needed on Pluto */ //magma_queue_sync( queues[d][stream3] ); /* broadcast the rows to gpus */ buf2 = ((j+jb)/nb)%ngpu; for( d2=0; d2 < ngpu; d2++ ) { if ( d2 != d ) { magma_setdevice(d2); magma_queue_wait_event( queues[d2][stream3], events[d][3] ); // getmatrix done magma_ssetmatrix_async( nb0, j+jb, Alo(j+jb,0), lda, dlPT(d2,0,nb,buf2), nb, // first nbxnb reserved for diagonal block queues[d2][stream3] ); magma_event_record( events[d2][0], queues[d2][stream3] ); } } #endif /* =================================== */ /* updates remaining blocks on stream2 */ nb2 = n_local[d] - (j_local2*nb + nb0); if ( nb2 > 0 ) { if ( d == id ) { dlpanel = dlA(d, nb*j_local, j); ldpanel = ldda; } else { dlpanel = dlPT(d,0,0,buf); ldpanel = nb; } magma_setdevice(d); magmablasSetKernelStream( queues[d][stream2] ); /* update the remaining blocks in the column */ #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK) int flag = 0; if (flag == 0) { magma_queue_wait_event( queues[d][stream2], events[d][4] ); // lookahead -> diagonal inversion } else { magmablas_slaset( MagmaFull, trsm_nb, trsm_n, c_zero, c_zero, dinvA(d,flag), trsm_nb ); magma_queue_wait_event( queues[d][stream2], events[d][1] ); // panel received } magmablas_slaset( MagmaFull, nb2, jb, c_zero, c_zero, dx(d,1), nb2 ); magmablas_strsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2+nb0, j), ldda, flag, dinvA(d,flag), dx(d,1) ); #else magma_queue_wait_event( queues[d][stream2], events[d][1] ); // panel received magma_strsm( 
MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2+nb0, j), ldda); #endif } } } } } /* end of else not upper */ /* == finalize the trace == */ trace_finalize( "spotrf.svg", "trace.css" ); for( d=0; d < ngpu; d++ ) { magma_setdevice(d); for( j=0; j < 3; j++ ) { magma_queue_sync( queues[d][j] ); } #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK) magma_free( d_dinvA[d] ); magma_free( d_x[d] ); #endif } magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); return *info; } /* magma_spotrf_mgpu */
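
/*
 * Illustrative sketch (not part of the MAGMA sources): the local panel width
 * n_local[d] computed in the initialization loop of magma_spotrf3_mgpu above,
 * for the lower-triangular case (block columns distributed 1-D block-cyclically
 * over ngpu GPUs; m is the number of columns handled).  The helper name is
 * hypothetical.
 */
static magma_int_t
n_local_sketch( magma_int_t d, magma_int_t m, magma_int_t nb, magma_int_t ngpu )
{
    magma_int_t nloc = (m/(nb*ngpu))*nb;            // full rounds of nb-wide blocks
    if      ( d <  (m/nb) % ngpu ) nloc += nb;      // one extra full block
    else if ( d == (m/nb) % ngpu ) nloc += m % nb;  // the trailing partial block
    return nloc;
}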