extern "C" int calc_bounding_box(magmaFloatComplex *M, magma_int_t M_lead_dim, float *wReEig, float *wImEig) { magma_int_t rslt = 0; //magmaFloatComplex *AT = nullptr; magmaFloatComplex *dA = nullptr, *dAT = nullptr, *dreA = nullptr, *dimA = nullptr; float *dreEig = nullptr; float *dimEig = nullptr; //magma_int_t *ipiv = NULL; magma_int_t lda = M_lead_dim; //magma_int_t ldx = lda; magma_int_t info = 0; magma_int_t nb = 0; //magma_vec_t jobvl; //magma_vec_t jobvr; magmaFloatComplex *work = nullptr; magma_int_t lwork = 0; float *rwork = nullptr; magma_int_t lrwork = 0; magma_int_t *iwork = nullptr; magma_int_t liwork = 0; nb = magma_get_cgehrd_nb( M_lead_dim ); lwork = 2 * (M_lead_dim + M_lead_dim*nb); // MagmaNoVec //lwork = 2 * max(M_lead_dim + M_lead_dim*nb, 2*M_lead_dim + M_lead_dim*M_lead_dim); // MagmaVec lrwork = M_lead_dim; // MagmaNoVec //lrwork = 1 + 5 * M_lead_dim + 2*M_lead_dim*M_lead_dim; // MagmaVec liwork = 1; // MagmaNoVec //liwork = 3 + 5*M_lead_dim; // MagmaVec magma_imalloc_cpu(&iwork, liwork); magma_smalloc_cpu(&rwork, lrwork); //magma_cmalloc_cpu(&A, lda*M_lead_dim); //magma_cmalloc_cpu(&AT, lda*M_lead_dim); //magma_smalloc_cpu(&reEig, M_lead_dim); //magma_smalloc_cpu(&imEig, M_lead_dim); magma_cmalloc_pinned(&dA, lda*M_lead_dim); magma_cmalloc_pinned(&dAT, lda*M_lead_dim); magma_cmalloc_pinned(&dreA, lda*M_lead_dim); magma_cmalloc_pinned(&dimA, lda*M_lead_dim); //magma_cmalloc_pinned(&VL, lda*M_lead_dim); //magma_cmalloc_pinned(&VR, lda*M_lead_dim); magma_cmalloc_pinned(&work, lwork); magma_smalloc_pinned(&dreEig, M_lead_dim); magma_smalloc_pinned(&dimEig, M_lead_dim); //matrix_fillzero(AT, M_lead_dim); //vector_fillzero(reEig, M_lead_dim); //vector_fillzero(imEig, M_lead_dim); //prepare_matrix_2(M); magma_csetmatrix(M_lead_dim, M_lead_dim, M, lda, dA, M_lead_dim, queue); //magma_csetmatrix(M_lead_dim, M_lead_dim, AT, lda, dAT, M_lead_dim, queue); //magma_ssetvector(M_lead_dim, wReEig, 1, dreEig, 1, queue); //magma_ssetvector(M_lead_dim, wImEig, 
1, dimEig, 1, queue); //magma_cprint_gpu(M_lead_dim, M_lead_dim, dA, lda); // reA = ( (A + A')/2.0 ) // A' magmablas_ctranspose(M_lead_dim, M_lead_dim, dA, M_lead_dim, dAT, M_lead_dim, queue); //magma_cprint_gpu(M_lead_dim, M_lead_dim, dAT, lda); // AT = A + A' magmablas_cgeadd(M_lead_dim, M_lead_dim, MAGMA_C_MAKE(1.0f, 0.0f), dA, M_lead_dim, dAT, M_lead_dim, queue); //magma_cprint_gpu(M_lead_dim, M_lead_dim, dAT, lda); // AT=AT*0.5 magma_cscal(lda*M_lead_dim, MAGMA_C_MAKE(0.5f, 0.0f), dAT, 1, queue); //magma_cprint_gpu(M_lead_dim, M_lead_dim, dAT, lda); // reA = AT magma_ccopy(lda*M_lead_dim, dAT, 1, dreA, 1, queue); //magma_cprint_gpu(M_lead_dim, M_lead_dim, dreA, lda); magma_sync_wtime(queue); // imA = ( -1im*(A - A')/2.0 ) // A' magmablas_ctranspose(M_lead_dim, M_lead_dim, dA, M_lead_dim, dAT, M_lead_dim, queue); //magma_cprint_gpu(M_lead_dim, M_lead_dim, dAT, lda); // AT = A + A' magmablas_cgeadd(M_lead_dim, M_lead_dim, MAGMA_C_MAKE(-1.0f, 0.0f), dAT, M_lead_dim, dA, M_lead_dim, queue); // A=A*-1j*0.5 magma_cscal(lda*M_lead_dim, MAGMA_C_MAKE(0.0f, -0.5f), dA, 1, queue); // imA = A magma_ccopy(lda*M_lead_dim, dA, 1, dimA, 1, queue); magma_sync_wtime(queue); //magma_cprint_gpu(M_lead_dim, M_lead_dim, dreA, lda); //magma_cprint_gpu(M_lead_dim, M_lead_dim, dimA, lda); // reEig::Vector=eigvals(reA) rslt = magma_cheevd(MagmaNoVec, MagmaLower, M_lead_dim, dreA, lda, dreEig, work, lwork, rwork, lrwork, iwork, liwork, &info); // imEig::Vector=eigvals(imA) rslt = magma_cheevd(MagmaNoVec, MagmaLower, M_lead_dim, dimA, lda, dimEig, work, lwork, rwork, lrwork, iwork, liwork, &info); //magma_sprint_gpu(M_lead_dim, 1, dreEig, M_lead_dim); //magma_sprint_gpu(M_lead_dim, 1, dimEig, M_lead_dim); magma_sgetvector(M_lead_dim, dreEig, 1, wReEig, 1, queue); //magma_sync_wtime(queue); magma_sgetvector(M_lead_dim, dimEig, 1, wImEig, 1, queue); //magma_sync_wtime(queue); /* maxReIdx = magma_isamax(M_lead_dim, dreEig, 1, queue) - 1; minReIdx = magma_isamin(M_lead_dim, dreEig, 1, queue) 
- 1; maxImIdx = magma_isamax(M_lead_dim, dimEig, 1, queue) - 1; minImIdx = magma_isamin(M_lead_dim, dimEig, 1, queue) - 1; printf("max re idx = %d\nmin re idx = %d\n", maxReIdx, minReIdx); printf("%f %f\n", wReEig[maxReIdx], wReEig[minReIdx]); printf("max im idx = %d\nmin im idx = %d\n", maxImIdx, minImIdx); printf("%f %f\n", wImEig[maxImIdx], wImEig[minImIdx]); */ //printf("test wReEig: %f %f\n", wReEig[0], wReEig[1]); //printf("test wImEig: %f %f\n", wImEig[0], wImEig[1]); magma_free_cpu(iwork); magma_free_cpu(rwork); //magma_free_cpu(AT); magma_free_pinned(dA); magma_free_pinned(dAT); magma_free_pinned(dreA); magma_free_pinned(dimA); magma_free_pinned(work); magma_free_pinned(dreEig); magma_free_pinned(dimEig); return rslt; }
// ---------------------------------------------------------------------------
// magmablas_chemm_mgpu_com
//
// Multi-GPU HEMM-style update with an all-reduce: accumulates
//     dC = alpha * A * dB + beta * dC
// where A is stored 1-D block-cyclically (block width nb) over ngpu GPUs in
// dA[], starting at row/column `offset` of the parent matrix. Each GPU does
// "ROW" gemms (conjugate-transposed panels, above the diagonal block) on
// stream 1 and "COL" gemms (below/including the diagonal block) on stream 0,
// then the partial results are summed across GPUs via a master-per-complex
// reduction and broadcast back (events in redevents[][] order the streams).
//
// NOTE(review): `side` and `uplo` are never read in this body — only the
// lower triangle is symmetrized (MagmaLower is hard-coded). Likewise the
// host buffer C/ldc and work/worksiz are only referenced by the C() macro
// and an assert here; confirm against callers before removing.
// NOTE(review): gnode[cmplxid][MagmaMaxGPUs] holds the GPU count of a
// "complex" (node/island) and gnode[cmplxid][MagmaMaxGPUs+1] is (re)written
// here to hold that complex's master device — this function mutates gnode.
void magmablas_chemm_mgpu_com(
    char side, char uplo, magma_int_t m, magma_int_t n,
    magmaFloatComplex alpha,
    magmaFloatComplex *dA[], magma_int_t ldda, magma_int_t offset,
    magmaFloatComplex *dB[], magma_int_t lddb,
    magmaFloatComplex beta,
    magmaFloatComplex *dC[], magma_int_t lddc,
    magmaFloatComplex *dwork[], magma_int_t dworksiz,
    magmaFloatComplex *C, magma_int_t ldc,
    magmaFloatComplex *work[], magma_int_t worksiz,
    magma_int_t ngpu, magma_int_t nb,
    magma_queue_t streams[][20], magma_int_t nstream,
    magma_event_t redevents[][MagmaMaxGPUs*MagmaMaxGPUs+10], magma_int_t nbevents,
    magma_int_t gnode[MagmaMaxGPUs][MagmaMaxGPUs+2], magma_int_t nbcmplx )
{
    // Per-device indexing helpers into the distributed matrices.
#define dA(dev, i, j) (dA[dev] + (i) + (j)*ldda)
#define dB(dev, i, j) (dB[dev] + (i) + (j)*lddb)
#define dC(dev, i, j) (dC[dev] + (i) + (j)*lddc)
#define dwork(dev, i, j) (dwork[dev] + (i) + (j)*lddwork)
#define C(i, j) (C + (i) + (j)*ldc)
    //printf("####################################################\n");
    //printf(" start chemm \n");
    //printf("####################################################\n");
    assert( ldda >= m );
    assert( lddb >= m );
    assert( lddc >= m );
    assert( nstream >= ngpu );
    assert( nbevents >= ngpu*ngpu );

    magmaFloatComplex c_one = MAGMA_C_ONE;
    // dwork[dev] is split in two: dwork1 for the ROW-gemm accumulator,
    // dwork2 for staging buffers received from other GPUs.
    magmaFloatComplex *dwork1[MagmaMaxGPUs];
    magmaFloatComplex *dwork2[MagmaMaxGPUs];
    magma_int_t maxgsize = n*m;
    magma_int_t lddwork = lddc;
    magma_int_t ldwork = m;
    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        dwork1[dev] = dwork[dev]; // size of dwork1 is n*lddwork
        dwork2[dev] = dwork[dev]+n*lddwork; // size of dwork2 is maxgsize*ngpu
    }
    assert( dworksiz >= (n*lddwork+maxgsize*ngpu) );
    assert( worksiz >= (n*ldwork) );

    // Save caller's device/stream so they can be restored on exit.
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_t cstream;
    magmablasGetKernelStream(&cstream);

    magma_int_t dev, devperm, myblk, mycolsize, myblkoffst;
    magma_int_t gmaster;
    magma_int_t masterdev, lcdev, lccolsize, myngpu;

    // stdev: device owning the first (possibly partial) block;
    // blockoffset: offset within that first nb-block.
    magma_int_t stdev = (offset/nb)%ngpu;
    magma_int_t blockoffset = offset % nb;
    magma_int_t fstblksiz = 0;
    if(blockoffset>0){
        fstblksiz = min(m, (nb - blockoffset));
    }
    //magma_int_t nbblk = magma_ceildiv(m, nb);
    magma_int_t nbblk = magma_ceildiv((m+blockoffset), nb);
    magma_int_t remm = m- fstblksiz;
    magma_int_t nbblkoffst = offset/nb;

    // Device owning the last (partial) block, if any.
    magma_int_t nblstblks = -1;
    magma_int_t devlstblk = -1;
    magma_int_t lstblksiz = remm%nb;
    if(lstblksiz>0){
        nblstblks = nbblk%ngpu;
        devlstblk = (nblstblks-1+ngpu)%ngpu;
    }

    magma_int_t nbcmplxactive = 0;
    magma_int_t cmplxisactive[MagmaMaxGPUs];
    magma_int_t gpuisactive[MagmaMaxGPUs];
    memset(gpuisactive, 0, MagmaMaxGPUs*sizeof(magma_int_t));
    memset(cmplxisactive, 0, MagmaMaxGPUs*sizeof(magma_int_t));

    // Zero accumulators.
    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_setdevice( dev );
        magmablasSetKernelStream( streams[ dev ][ 0 ] );
        cudaMemset(dwork(dev,0,0), 0, (lddwork)*(n)*sizeof(magmaFloatComplex) );
        // put all dC on all dev to 0 except the one which
        // hold i==0 because this one has to multiply by beta.
        if(dev!=stdev){
            cudaMemset(dC(dev,0,0), 0, (lddc)*(n)*sizeof(magmaFloatComplex) );
        }
    }

    magma_int_t newoffset = offset;

    // 1. symmetrize (fill the upper triangle of each diagonal tile from the
    //    stored lower triangle so plain gemms can be used afterwards)
    if(blockoffset>0){
        newoffset = offset+fstblksiz; // newoffset is adjusted over nb
        magma_int_t myblkoffst = (nbblkoffst/ngpu)+(nbblkoffst%ngpu > stdev?1:0);
        //printf("STDEV %d voici offset %d remm %d myblockoffset %d siz %d \n", stdev, offset, remm, myblkoffst, fstblksiz);
        magma_setdevice( stdev );
        magmablasSetKernelStream( streams[ stdev ][ 0 ] );
        magmablas_csymmetrize_tiles( MagmaLower, fstblksiz, dA(stdev, offset, myblkoffst*nb+blockoffset), ldda, 1, ngpu*nb, nb );
    }

    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_int_t newstdev = (newoffset/nb)%ngpu;
        magma_int_t nbblk = remm/nb; // number of block of size nb. if m%nb>0 then a last block exist and is of size ib=m%nb
        magma_int_t myblk = (nbblk/ngpu) + (nbblk%ngpu > ((dev-newstdev+ngpu)%ngpu) ? 1:0 );
        magma_int_t devperm = (dev-newstdev+ngpu)%ngpu;
        magma_int_t nbblkoffst = newoffset/nb;
        magma_int_t myblkoffst = (nbblkoffst/ngpu)+(nbblkoffst%ngpu > dev?1:0);
        //printf("dev %d devperm %d newoffset %d rowoff %d coloff %d myblk %d \n", dev, devperm, newoffset, newoffset+devperm*nb, myblkoffst*nb, myblk);
        magma_setdevice( dev );
        magmablasSetKernelStream( streams[ dev ][ 0 ] );
        magmablas_csymmetrize_tiles( MagmaLower, nb, dA(dev, newoffset+devperm*nb, myblkoffst*nb), ldda, myblk, ngpu*nb, nb );
        if(remm%nb>0){
            magma_int_t nblstblks = (nbblk+1)%ngpu;
            magma_int_t devlstblk = (nblstblks-1+ngpu)%ngpu;
            //printf("==> siz %d devperm %d, devlstblk %d, newoffset+nbblk*nb %d, myblkoffst*nb+ myblk*nb %d\n", remm % nb, devperm, devlstblk, newoffset+nbblk*nb, myblkoffst*nb+ myblk*nb);
            if(devperm==devlstblk)
                magmablas_csymmetrize( MagmaLower, remm % nb, dA(dev, newoffset+nbblk*nb, myblkoffst*nb+ myblk*nb), ldda ); // last partial tile
        }
    }

/*
    magma_int_t siz = m+offset;
    magmaFloatComplex *R=(magmaFloatComplex *) malloc(siz*siz*sizeof(magmaFloatComplex));
    // collecte back A
    magmablas_cgetmatrix_1D_bcyclic( siz, siz, dA, ldda, R, siz, ngpu, nb );
    magma_setdevice( 0 );
    magmablasSetKernelStream( streams[ dev ][ 0 ] );
    //magma_cgetmatrix( siz, siz, dA[0], ldda, R, siz );
    FILE *trace_file;
    trace_file = fopen("AJETE/Aafter", "w");
    for (int j = 0; j < siz ; j++)
        for (int i = 0; i < siz ; i++)
            fprintf(trace_file, "%10d%10d%40.30e\n", i+1, j+1, R[j*siz+i]);
    fclose(trace_file);
    return;
*/

    // ROW GEMM transpose a row and make a gemm with a block
    // if only 1 GPU used the ROW GEMM is integrated with the
    // COL GEMM (better accuracy observed) and better perf
    if(ngpu>1){
        for( magma_int_t i = fstblksiz; i < m; i += nb ) {
            magma_int_t ib = min( nb, m-i ); // block size
            magma_int_t ioff = i + offset; // start global index in parent matrix
            //magma_int_t dev = (ioff / nb) % ngpu;
            magma_int_t nbblkoffst = offset/nb;
            magma_int_t nbblk = magma_ceildiv(i, nb);
            for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
                magma_int_t myblk = (nbblk/ngpu) + (nbblk%ngpu > ((dev-stdev+ngpu)%ngpu) ? 1:0 );
                magma_int_t myblkoffst = (nbblkoffst/ngpu)+(nbblkoffst%ngpu > dev?1:0);
                magma_int_t myrowsize = myblk * nb;
                magma_int_t coloffset = myblkoffst*nb;
                if(dev==stdev) {
                    myrowsize = myrowsize -blockoffset;
                    coloffset = myblkoffst*nb+blockoffset;
                }
                //printf("ROW GEMM: voici i %d ib %d ioff %d nbblkoffst %d stdev %d dev %d myblk %d myblkoffset %d coloffset %d rowsize %d\n", i, ib, ioff, nbblkoffst, stdev, dev, myblk, myblkoffst, coloffset, myrowsize);
                if(myrowsize>0){
                    magma_setdevice( dev );
                    magmablasSetKernelStream( streams[ dev ][ 1 ] );
                    magma_cgemm( MagmaConjTrans, MagmaNoTrans, myrowsize, n, ib,
                                 alpha, dA(dev,ioff,coloffset), ldda,
                                        dB(dev,i,0), lddb,
                                 c_one, dwork(dev,0,0), lddwork );
                }
            }
        }
        // Mark the end of all ROW gemms on stream 1 of each device.
        for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
            magma_setdevice( dev );
            magma_event_record(redevents[dev][1], streams[dev][1]);
        }
    }

    // COL GEMM
    // blockoffset is offset within first block; for subsequent blocks it is 0
    if(blockoffset>0){
        magma_int_t ib = min( nb-blockoffset, m ); // block size
        magma_int_t iblock = (offset / nb) / ngpu; // local block id
        magma_int_t di = iblock*nb+blockoffset; // local index in parent matrix
        magma_setdevice( stdev );
        magmablasSetKernelStream( streams[ stdev ][ 0 ] );
        //printf("DEV %d COL GEMM first ioff %d di %d m %d n %d ib %d \n", stdev, offset, di, m, n, ib);
        magma_cgemm( MagmaNoTrans, MagmaNoTrans, m, n, ib,
                     alpha, dA(stdev,offset,di), ldda,
                            dB(stdev,0,0), lddb,
                     beta,  dC(stdev,0,0), lddc );
    }

    // COL GEMM
    for( magma_int_t i = fstblksiz; i < m; i += nb ) {
        magma_int_t ib = min( nb, m-i ); // block size
        magma_int_t ioff = i + offset; // start global index in parent matrix
        magma_int_t iblock = (ioff / nb) / ngpu; // local block id
        magma_int_t dev = (ioff / nb) % ngpu;
        magma_int_t di = iblock*nb; // local index in parent matrix
        //printf("DEV %d COL GEMM i %d ioff %d di %d m-i %d n %d ib %d \n", dev, i, ioff, di, m-i, n, ib);
        magma_setdevice( dev );
        magmablasSetKernelStream( streams[ dev ][ 0 ] );
        // Only the very first block applies beta; later blocks accumulate.
        if(i==0){
            magma_cgemm( MagmaNoTrans, MagmaNoTrans, m-i, n, ib,
                         alpha, dA(dev,ioff,di), ldda,
                                dB(dev,i,0), lddb,
                         beta,  dC(dev,i,0), lddc );
        }else{
            magma_cgemm( MagmaNoTrans, MagmaNoTrans, m-i, n, ib,
                         alpha, dA(dev,ioff,di), ldda,
                                dB(dev,i,0), lddb,
                         c_one, dC(dev,i,0), lddc );
        }
        magma_event_record(redevents[dev][0], streams[dev][0]);
        // if only 1 GPU is used, do the ROW GEMM
        if(ngpu==1){
            // NOTE THAT because the COL gemm write dC below the diagonal (i)
            // and the ROW GEMM write dC from 0 to diag-1, so they could
            // run in parallel on diferent stream.
            //
            // NO NO NO because
            //    it might happen that col finished i and strated i+1 while row still at i
            // magmablasSetKernelStream( streams[ dev ][ 0 ] );
            magma_cgemm( MagmaConjTrans, MagmaNoTrans, i, n, ib,
                         alpha, dA(dev,ioff,offset), ldda,
                                dB(dev,i,0), lddb,
                         c_one, dC(dev,0,0), lddc );
        }
    }

    // Fold each device's ROW-gemm accumulator (dwork1) into its dC.
    if(ngpu>1){
        for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
            magma_int_t nbblk = magma_ceildiv((m+blockoffset), nb);
            magma_int_t nbblkrow = nbblk-1;
            magma_int_t devperm = (dev-stdev+ngpu)%ngpu;
            magma_int_t myblk = (nbblkrow/ngpu) + (nbblkrow%ngpu > devperm ? 1:0 );
            magma_int_t myrowsize = myblk * nb;
            if(dev==stdev) {
                myrowsize = myrowsize - blockoffset;
            }
            //printf("blockoffset %d nbblkrow %d devperm %d DEV %d RECEIVING myblk %d myrowsize %d\n", blockoffset, nbblkrow, devperm, dev, myblk, myrowsize);
            if(myrowsize>0){
                magma_setdevice( dev );
                magmablasSetKernelStream( streams[ dev ][ 0 ] );
                // Wait for this device's ROW gemms (recorded on stream 1).
                magma_queue_wait_event(streams[ dev ][ 0 ], redevents[dev][1]);
                //magma_queue_sync( streams[ dev ][ 1 ] );
                // for each dev add the computed ROW block each on its placment with dC
                for( magma_int_t blki = 0; blki < myblk; ++blki){
                    magma_int_t gbblki = (blki*ngpu + devperm)*nb - blockoffset;
                    magma_int_t lcblki = blki*nb;
                    magma_int_t ib = nb;// min(nb, m-gbblki);
                    if(dev==stdev){
                        lcblki = blki*nb-blockoffset;
                        if(blki==0){
                            gbblki = 0;
                            lcblki = 0;
                            ib = nb-blockoffset;
                        }
                    }
                    magmablas_cgeadd(ib, n, c_one, &dwork[dev][lcblki], lddwork, &dC[dev][gbblki] , lddc );
                }
                magma_event_record(redevents[dev][0], streams[dev][0]);
            }
        }
    }

    // ===========================================================
    //             COMMUNICATION ALL_REDUCE_SUM
    // ===========================================================
    if(ngpu==1){
        return;
    }
    // INITIALIZE COMM: decide which GPUs hold data and elect one master
    // per complex (stored back into gnode[cmplxid][MagmaMaxGPUs+1]).
    for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
        masterdev = -1;
        gnode[cmplxid][MagmaMaxGPUs+1] = -1;
        myngpu = gnode[cmplxid][MagmaMaxGPUs];
        for( magma_int_t idev = 0; idev < myngpu; ++idev ) {
            dev = gnode[cmplxid][idev];
            devperm = (dev-stdev+ngpu)%ngpu;
            myblk = (nbblk/ngpu) + (nbblk%ngpu > devperm ? 1:0 );
            mycolsize = myblk*nb;
            myblkoffst = nb*((nbblkoffst/ngpu)+(nbblkoffst%ngpu > dev?1:0));
            if(dev==stdev){
                mycolsize -= blockoffset;
                myblkoffst += blockoffset; // local index in parent matrix
            }
            if((devperm==devlstblk)&&(lstblksiz>0)){
                mycolsize -= (nb-(remm%nb));
            }
            mycolsize = min(mycolsize, m);
            if(mycolsize>0){
                gpuisactive[dev] = mycolsize;
                if(masterdev==-1) {
                    masterdev = dev;
                    nbcmplxactive = nbcmplxactive +1;
                    cmplxisactive[cmplxid] = 1;
                    gnode[cmplxid][MagmaMaxGPUs+1] = masterdev;
                }
            }
        }
    }
/*
    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_setdevice( dev );
        cudaDeviceSynchronize();
    }
*/
    //*******************************
    // each GPU send its result
    // to its master. The master make
    // the addition and then send to
    // to the masters of other complex
    // and receive from the masters of
    // other complex make the addition
    // and broadcast locally the final
    // result.
    //*******************************
    //printf("=======================================================================\n");
    //printf(" sending to my master \n");
    //printf("=======================================================================\n");
    for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
        myngpu = gnode[cmplxid][MagmaMaxGPUs];
        masterdev = gnode[cmplxid][MagmaMaxGPUs+1];
        //check if complex is active
        if(masterdev!=-1){
            for( magma_int_t idev = 0; idev < myngpu; ++idev ) {
                dev = gnode[cmplxid][idev];
                mycolsize = gpuisactive[dev];
                if(mycolsize>0){
                    // I am an active GPU. if I am not the master, then send my result to my master.
                    // store result on dwork[masterdev][dev*maxgsize]
                    if(dev!=masterdev){
                        magma_setdevice( dev );
                        //printf(" GPU %d sending to my master %d\n", dev, masterdev);
                        // wait the geadd of my ROW and COL GEMM is done
                        magma_queue_wait_event(streams[ dev ][ 0 ], redevents[dev][0]);
                        // sending to the master of my complex
                        cudaMemcpy2DAsync(&dwork2[masterdev][maxgsize*dev], m*sizeof(magmaFloatComplex),
                                          &dC[dev][0], lddc*sizeof(magmaFloatComplex),
                                          m*sizeof(magmaFloatComplex), n,
                                          cudaMemcpyDeviceToDevice, streams[dev][0]);
                        magma_event_record(redevents[dev][masterdev], streams[dev][0]);
                    } // end I am not the masterdev
                }// end if mycolsize>0
            }// for idev
        }// end of if masterdev!=-1 maening complex is active
    }// for cmplxid
/*
    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_setdevice( dev );
        cudaDeviceSynchronize();
    }
*/
    //printf("=======================================================================\n");
    //printf(" each master do addition of local result and broadcast to other masters \n");
    //printf("=======================================================================\n");
    for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
        myngpu = gnode[cmplxid][MagmaMaxGPUs];
        masterdev = gnode[cmplxid][MagmaMaxGPUs+1];
        //check if complex is active
        if(masterdev!=-1){
            magma_setdevice( masterdev );
            // addition is done on stream 0 sequentially
            magmablasSetKernelStream( streams[ masterdev ][ 0 ] );
            // wait the geadd of my ROW and COL GEMM is done
            magma_queue_wait_event(streams[ masterdev ][ 0 ], redevents[masterdev][0]);
            // ========================================
            //     local addition
            // ========================================
            for( magma_int_t l = 0; l < myngpu; ++l ) {
                lcdev = gnode[cmplxid][l];
                lccolsize = gpuisactive[lcdev];
                if((lcdev!=masterdev)&&(lccolsize>0)){
                    //printf(" master %d receiving from %d and adding \n", masterdev, lcdev);
                    // this is an active GPU of my complex.
                    // wait I received what he send it to me and then do addition.
                    magma_queue_wait_event(streams[ masterdev ][ 0 ], redevents[lcdev][masterdev]);
                    magmablas_cgeadd(m, n, c_one, &dwork2[masterdev][maxgsize*lcdev], m, &dC[masterdev][0] , lddc );
                }
            }// for l=1:myngpu
            // because addition is done sequentially on stream 0,
            // I have to record this to be able to synch using it
            magma_event_record(redevents[masterdev][masterdev], streams[masterdev][0]);
            // ========================================
            //
            // ========================================
            //      send to other masters
            // ========================================
            for( magma_int_t k = 0; k < nbcmplx; ++k ) {
                if(k!=cmplxid){
                    gmaster = gnode[k][MagmaMaxGPUs+1];
                    if(gmaster!=-1){ //complex is active
                        //Master has to wait until finish the local addition then send using gmaster stream.
                        //use stream 0 to make it sequential or stream gmaster to make it parallel.
                        //Now both are the same.
                        //printf(" master %d from cmplx %d sending to other master %d on cmplx %d \n", masterdev, cmplxid, gmaster, k);
                        magma_queue_wait_event(streams[ masterdev ][ gmaster ], redevents[masterdev][masterdev]);
                        cudaMemcpy2DAsync(&dwork2[gmaster][maxgsize*masterdev], m*sizeof(magmaFloatComplex),
                                          &dC[masterdev][0], lddc*sizeof(magmaFloatComplex),
                                          m*sizeof(magmaFloatComplex), n,
                                          cudaMemcpyDeviceToDevice, streams[masterdev][gmaster]);
                        magma_event_record(redevents[masterdev][gmaster], streams[masterdev][gmaster]);
                        magma_event_record(redevents[masterdev][masterdev], streams[masterdev][gmaster]);
                    } // end of gmaster!=-1
                } // end of k!=cmplxid
            }// for k = 0: nbcmplx
            // ========================================
        }// end of if masterdev!=-1 maening complex is active
    }// for cmplxid
/*
    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_setdevice( dev );
        cudaDeviceSynchronize();
    }
*/
    //printf("=======================================================================\n");
    //printf(" each master wait receiving other masters results, do the addition and broadcast locally \n");
    //printf("=======================================================================\n");
    for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
        myngpu = gnode[cmplxid][MagmaMaxGPUs];
        masterdev = gnode[cmplxid][MagmaMaxGPUs+1];
        //check if complex is active
        if(masterdev!=-1){
            magma_setdevice( masterdev );
            // addition is done on stream 0 sequentially
            magmablasSetKernelStream( streams[ masterdev ][ 0 ] );
            // master has to wait until finishing all the send to other masters.
            magma_queue_wait_event(streams[ masterdev ][ 0 ], redevents[masterdev][masterdev]);
            // ========================================
            //  addition of results from other masters
            // ========================================
            for( magma_int_t k = 0; k < nbcmplx; ++k ) {
                if(k!=cmplxid){
                    gmaster = gnode[k][MagmaMaxGPUs+1];
                    if(gmaster!=-1){ //complex is active
                        //Master has to wait until receiving from gmaster, then do addition using stream 0
                        //printf(" master %d from cmplx %d receiving from other master %d on cmplx %d and adding \n", masterdev, cmplxid, gmaster, k);
                        magma_queue_wait_event(streams[ masterdev ][ 0 ], redevents[gmaster][masterdev]);
                        magmablas_cgeadd(m, n, c_one, &dwork2[masterdev][maxgsize*gmaster], m, &dC[masterdev][0] , lddc );
                    } // end of gmaster!=-1
                } // end of k!=cmplxid
            }// for k = 0: nbcmplx
            // because addition is done sequentially on stream 0,
            // I have to record this to be able to synch using it
            magma_event_record(redevents[masterdev][masterdev], streams[masterdev][0]);
            // ========================================
            // ========================================
            //     local broadcast of final results
            // ========================================
            for( magma_int_t l = 0; l < myngpu; ++l ) {
                lcdev = gnode[cmplxid][l];
                lccolsize = gpuisactive[lcdev];
                if((lcdev!=masterdev)&&(lccolsize>0)){
                    // this is an active GPU of my complex.
                    // wait the previous addition is done maening stream 0 is finished and broadcast sequentially for now.
                    // to make it parallel put stream lcdev instead of stream 0
                    //printf(" master %d broadcasting local to %d \n", masterdev, lcdev);
                    magma_queue_wait_event(streams[ masterdev ][ 0 ], redevents[masterdev][masterdev]);
                    cudaMemcpy2DAsync(&dC[lcdev][0], lddc*sizeof(magmaFloatComplex),
                                      &dC[masterdev][0], lddc*sizeof(magmaFloatComplex),
                                      m*sizeof(magmaFloatComplex), n,
                                      cudaMemcpyDeviceToDevice, streams[masterdev][0]);
                    magma_event_record(redevents[masterdev][lcdev], streams[masterdev][0]);
                }
            }// for l=1:myngpu
            // ========================================
        }// end of if masterdev!=-1 maening complex is active
    }// for cmplxid
/*
    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_setdevice( dev );
        cudaDeviceSynchronize();
    }
*/
    // Final fence: every active GPU waits for its own work and for the
    // master's broadcast before this routine returns control.
    for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
        myngpu = gnode[cmplxid][MagmaMaxGPUs];
        masterdev = gnode[cmplxid][MagmaMaxGPUs+1];
        //check if complex is active
        if(masterdev!=-1){
            for( magma_int_t l = 0; l < myngpu; ++l ) {
                lcdev = gnode[cmplxid][l];
                lccolsize = gpuisactive[lcdev];
                if(lccolsize>0){
                    magma_setdevice( lcdev );
                    magma_queue_wait_event(streams[ lcdev ][ 0 ], redevents[lcdev][0]);
                    magma_queue_wait_event(streams[ lcdev ][ 0 ], redevents[masterdev][lcdev]);
                }
            }// for l=1:myngpu
        }// end of if masterdev!=-1 maening complex is active
    }// for cmplxid
    //printf("****************************************************\n");
    //printf(" finish chemm \n");
    //printf("****************************************************\n");
    // Restore the caller's device and stream.
    magma_setdevice( cdev );
    magmablasSetKernelStream( cstream );
}
extern "C" int calc_numerical_range(magmaFloatComplex *M, magma_int_t M_lead_dim, float _from, float _step, magma_int_t _steps, magmaFloatComplex *pts) { magma_int_t idx = 0, rslt = 0; magmaFloatComplex p, scalar; std::complex<float> vtmp; float j; magmaFloatComplex *dA = nullptr; magmaFloatComplex *dAth = NULL, *dAthT = NULL, *dX = NULL, *dY = NULL; float *dE = NULL; //float *hE = NULL; //magma_int_t *ipiv = NULL; magma_int_t lda = M_lead_dim; //magma_int_t ldx = lda; magma_int_t info = 0; magma_int_t nb = 0; //magma_vec_t jobvl; //magma_vec_t jobvr; magmaFloatComplex *work = nullptr; magma_int_t lwork = 0; float *rwork = nullptr; magma_int_t lrwork = 0; magma_int_t *iwork = nullptr; magma_int_t liwork = 0; nb = magma_get_cgehrd_nb( M_lead_dim ); lwork = 2 * max(M_lead_dim + M_lead_dim*nb, 2 * M_lead_dim + M_lead_dim*M_lead_dim); // MagmaVec lrwork = 1 + 5 * M_lead_dim + 2 * M_lead_dim*M_lead_dim; // MagmaVec liwork = (3 + 5 * M_lead_dim); // MagmaVec magma_imalloc_cpu(&iwork, liwork); magma_smalloc_cpu(&rwork, lrwork); magma_cmalloc_pinned(&work, lwork); magma_cmalloc_pinned(&dA, lda*M_lead_dim); magma_cmalloc_pinned(&dAth, lda*M_lead_dim); magma_cmalloc_pinned(&dAthT, lda*M_lead_dim); magma_smalloc_pinned(&dE, M_lead_dim); //magma_smalloc_cpu(&hE, M_lead_dim); magma_cmalloc_pinned(&dX, M_lead_dim); magma_cmalloc_pinned(&dY, M_lead_dim); magma_csetmatrix(M_lead_dim, M_lead_dim, M, lda, dA, M_lead_dim, queue); // th=[0:resolution:2*pi] j = _from; for (idx = 0; idx < _steps; idx++) { //scalar = exp( 1im * -j); vtmp.real( 0.0f ); vtmp.imag( -j ); //vtmp = _FCbuild(0.0f, -j); //printf("vtmp = %f + i%f\n", vtmp._Val[0], vtmp._Val[1]); vtmp = exp(vtmp); scalar.x = vtmp.real(); scalar.y = vtmp.imag(); //printf("scalar = %f + i%f\n", scalar.x, scalar.y); magma_ccopy(lda * M_lead_dim, dA, 1, dAth, 1, queue); // Ath = exp(1im * -j) * As magma_cscal(lda * M_lead_dim, scalar, dAth, 1, queue); //magma_cprint_gpu(N, N, dA, lda); //magma_cprint_gpu(N, N, dAth, lda); // AthT = 
(Ath + Ath') magmablas_ctranspose_conj(M_lead_dim, M_lead_dim, dAth, M_lead_dim, dAthT, M_lead_dim, queue); magmablas_cgeadd(M_lead_dim, M_lead_dim, MAGMA_C_MAKE(1.0f, 0.0f), dAth, M_lead_dim, dAthT, M_lead_dim, queue); // AthT = AthT / 2 magma_cscal(lda*M_lead_dim, MAGMA_C_MAKE(0.5f, 0.0f), dAthT, 1, queue); magma_sync_wtime(queue); //magma_cprint_gpu(M_lead_dim, M_lead_dim, dAthT, lda); // e, r = eig(AthT) rslt = magma_cheevd(MagmaVec, MagmaLower, M_lead_dim, dAthT, lda, dE, work, lwork, rwork, lrwork, iwork, liwork, &info); magma_sync_wtime(queue); //printf("magma_cheevd info=%d\n", info); //magma_cprint_gpu(M_lead_dim, M_lead_dim, dAthT, lda); //magma_sprint_gpu(M_lead_dim, 1, dE, M_lead_dim); //magma_sgetvector(M_lead_dim, dE, 1, hE, 1, queue); //printf("%f %f\n", hE[0], hE[1]); // p = r[:,s]' * A * r[:,s] // r = r[:,s] magma_ccopy( M_lead_dim, dAthT + (M_lead_dim*(M_lead_dim-1)), 1, // dAthT + (N), where (N) is a column offset dX, 1, queue); magma_sync_wtime(queue); //magma_cprint_gpu(M_lead_dim, 1, dX, M_lead_dim); // pp = A * r[:,s] magma_cgemv(MagmaNoTrans, M_lead_dim, M_lead_dim, MAGMA_C_MAKE(1.0f, 0.0f), dA, lda, dX, 1, MAGMA_C_MAKE(0.0f, 0.0f), dY, 1, queue); magma_sync_wtime(queue); //magma_cprint_gpu(M_lead_dim, 1, dY, M_lead_dim); // p = r' * pp p = magma_cdotc(M_lead_dim, dX, 1, dY, 1, queue); magma_sync_wtime(queue); pts[idx] = p; //printf("p = %f %fi\n", p.x, p.y); j += _step; } // end of for (idx = 0; idx < _steps; idx++) magma_free_pinned(dY); magma_free_pinned(dX); //magma_free_cpu(hE); magma_free_pinned(dE); magma_free_pinned(dAthT); magma_free_pinned(dAth); magma_free_pinned(dA); magma_free_pinned(work); magma_free_cpu(rwork); magma_free_cpu(iwork); //magma_free_cpu(w); //magma_free_cpu(A); return rslt; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgeadd */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; magmaFloatComplex *h_A, *h_B, *d_A, *d_B; magmaFloatComplex alpha = MAGMA_C_MAKE( 3.1415, 2.718 ); magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t M, N, size, lda, ldda; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; parse_opts( argc, argv, &opts ); /* Uncomment these lines to check parameters. * magma_xerbla calls lapack's xerbla to print out error. */ //magmablas_cgeadd( -1, N, alpha, d_A, ldda, d_B, ldda ); //magmablas_cgeadd( M, -1, alpha, d_A, ldda, d_B, ldda ); //magmablas_cgeadd( M, N, alpha, d_A, M-1, d_B, ldda ); //magmablas_cgeadd( M, N, alpha, d_A, ldda, d_B, N-1 ); printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |Bl-Bm|/|Bl|\n"); printf("=========================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; lda = M; ldda = ((M+31)/32)*32; size = lda*N; gflops = 2.*M*N / 1e9; TESTING_MALLOC( h_A, magmaFloatComplex, lda *N ); TESTING_MALLOC( h_B, magmaFloatComplex, lda *N ); TESTING_DEVALLOC( d_A, magmaFloatComplex, ldda*N ); TESTING_DEVALLOC( d_B, magmaFloatComplex, ldda*N ); lapackf77_clarnv( &ione, ISEED, &size, h_A ); lapackf77_clarnv( &ione, ISEED, &size, h_B ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_csetmatrix( M, N, h_A, lda, d_A, ldda ); magma_csetmatrix( M, N, h_B, lda, d_B, ldda ); gpu_time = magma_sync_wtime( NULL ); magmablas_cgeadd( M, N, alpha, d_A, ldda, d_B, ldda ); gpu_time = magma_sync_wtime( NULL ) - gpu_time; gpu_perf = gflops / gpu_time; /* 
===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); for( int j = 0; j < N; ++j ) { blasf77_caxpy( &M, &alpha, &h_A[j*lda], &ione, &h_B[j*lda], &ione ); } cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check result =================================================================== */ magma_cgetmatrix( M, N, d_B, ldda, h_A, lda ); error = lapackf77_clange( "F", &M, &N, h_B, &lda, work ); blasf77_caxpy( &size, &c_neg_one, h_A, &ione, h_B, &ione ); error = lapackf77_clange( "F", &M, &N, h_B, &lda, work ) / error; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e\n", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error ); TESTING_FREE( h_A ); TESTING_FREE( h_B ); TESTING_DEVFREE( d_A ); TESTING_DEVFREE( d_B ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }