double QRMatrix_magma(Tensor_core<complex<double>,2>& ph, Tensor_core<double,1>& det_list) { if( det_list.rank(0) != ph.rank(1) ) {cout<<"det_list size is not consistent with ph!"<<endl; exit(1); } magma_int_t L=ph.rank(0); magma_int_t N=ph.rank(1); magma_int_t info; int L_cpu = L; int N_cpu = N; magmaDoubleComplex* tau; magma_zmalloc_cpu( &tau, N ); magmaDoubleComplex work_test[1]; magma_int_t lwork=-1; magma_zgeqrf(L, N, (magmaDoubleComplex *)ph.data(), L, tau, work_test, lwork, &info); lwork=lround( MAGMA_Z_REAL(work_test[0]) ); magmaDoubleComplex* work; magma_zmalloc_cpu( &work, lwork ); magma_zgeqrf(L, N, (magmaDoubleComplex *)ph.data(), L, tau, work, lwork, &info); if(info!=0) {cout<<"QR run is not suceesful: "<<info<<"-th parameter is illegal!"<<endl; exit(1);} complex<double> det={1.0,0.0}; for (int i=0; i<N_cpu; i++) {det_list(i)=ph(i,i).real(); det*=ph(i,i);} magma_zungqr2(L, N, N, (magmaDoubleComplex *)ph.data(), L, tau, &info ); if(info!=0) {cout<<"magma_zungqr2 run is not suceesful: "<<info<<"-th parameter is illegal!"<<endl; exit(1);} //Reshape the phi to get positive det if(det.real()<0) { det=-det; det_list(0)=-det_list(0); for(int i=0; i<L_cpu; i++) ph(i,0)=-ph(i,0); } magma_free_cpu(tau); magma_free_cpu(work); return det.real(); }
// ------------------------------------------------------------ // Solve A * X = B, where A and X are stored in CPU host memory. // Internally, MAGMA transfers data to the GPU device // and uses a hybrid CPU + GPU algorithm. void cpu_interface( magma_int_t n, magma_int_t nrhs ) { magmaDoubleComplex *A=NULL, *X=NULL; magma_int_t *ipiv=NULL; magma_int_t lda = n; magma_int_t ldx = lda; magma_int_t info = 0; // magma_*malloc_cpu routines for CPU memory are type-safe and align to memory boundaries, // but you can use malloc or new if you prefer. magma_zmalloc_cpu( &A, lda*n ); magma_zmalloc_cpu( &X, ldx*nrhs ); magma_imalloc_cpu( &ipiv, n ); if ( A == NULL || X == NULL || ipiv == NULL ) { fprintf( stderr, "malloc failed\n" ); goto cleanup; } // Replace these with your code to initialize A and X zfill_matrix( n, n, A, lda ); zfill_rhs( n, nrhs, X, ldx ); magma_zgesv( n, 1, A, lda, ipiv, X, lda, &info ); if ( info != 0 ) { fprintf( stderr, "magma_zgesv failed with info=%d\n", info ); } // TODO: use result in X cleanup: magma_free_cpu( A ); magma_free_cpu( X ); magma_free_cpu( ipiv ); }
extern "C" void magma_zlarfxsym(magma_int_t N, magmaDoubleComplex *A, magma_int_t LDA, magmaDoubleComplex *V, magmaDoubleComplex *TAU) { magma_int_t IONE=1; magmaDoubleComplex dtmp; magmaDoubleComplex Z_ZERO = MAGMA_Z_ZERO; //magmaDoubleComplex Z_ONE = MAGMA_Z_ONE; magmaDoubleComplex Z_MONE = MAGMA_Z_NEG_ONE; magmaDoubleComplex Z_HALF = MAGMA_Z_HALF; //magmaDoubleComplex WORK[N]; magmaDoubleComplex *WORK; magma_zmalloc_cpu( &WORK, N ); /* apply left and right on A(st:ed,st:ed)*/ //magma_zlarfxsym(len,A(st,st),LDX,V(st),TAU(st)); /* X = AVtau */ blasf77_zhemv("L",&N, TAU, A, &LDA, V, &IONE, &Z_ZERO, WORK, &IONE); /* je calcul dtmp= X'*V */ dtmp = magma_cblas_zdotc(N, WORK, IONE, V, IONE); /* je calcul 1/2 X'*V*t = 1/2*dtmp*tau */ dtmp = -dtmp * Z_HALF * (*TAU); /* je calcul W=X-1/2VX'Vt = X - dtmp*V */ /* for (j = 0; j < N; j++) WORK[j] = WORK[j] + (dtmp*V[j]); */ blasf77_zaxpy(&N, &dtmp, V, &IONE, WORK, &IONE); /* performs the symmetric rank 2 operation A := alpha*x*y' + alpha*y*x' + A */ blasf77_zher2("L",&N,&Z_MONE,WORK,&IONE,V,&IONE,A,&LDA); magma_free_cpu(WORK); }
magma_int_t magma_znan_inf_gpu( magma_uplo_t uplo, magma_int_t m, magma_int_t n, magmaDoubleComplex_const_ptr dA, magma_int_t dA_offset, magma_int_t ldda, magma_int_t *cnt_nan, magma_int_t *cnt_inf, magma_queue_t queue ) { magma_int_t info = 0; if ( uplo != MagmaLower && uplo != MagmaUpper && uplo != MagmaFull ) info = -1; else if ( m < 0 ) info = -2; else if ( n < 0 ) info = -3; else if ( ldda < max(1,m) ) info = -5; if (info != 0) { magma_xerbla( __func__, -(info) ); return info; } magma_int_t lda = m; magmaDoubleComplex* A; magma_zmalloc_cpu( &A, lda*n ); magma_zgetmatrix( m, n, dA, dA_offset, ldda, A, lda, queue ); magma_int_t cnt = magma_znan_inf( uplo, m, n, A, lda, cnt_nan, cnt_inf ); magma_free_cpu( A ); return cnt; }
void SVDMatrix_magma(Tensor_core<complex<double>,2>& U, Tensor_core<double,1>& D, Tensor_core<complex<double>,2>& V) { if( U.rank(0)!=U.rank(1) || U.rank(1)!=D.rank(0) || D.rank(0)!=V.rank(0) || V.rank(0)!=V.rank(1) ) { cout<<"Size is not consistent in SVDMatrix_magma! Only support square matrix."<<endl; exit(1); } magma_int_t m=U.rank(0); magma_int_t n=V.rank(0); magma_vec_t jobz(MagmaOverwriteVec); magma_int_t lda=m; magmaDoubleComplex* u=nullptr; magma_int_t ldu=1; magma_int_t ldv=n; magmaDoubleComplex work_test[1]; magma_int_t lwork=-1; double* rwork; magma_int_t* iwork; magma_dmalloc_cpu( &rwork, 5*m*m+7*m ); magma_imalloc_cpu(&iwork, 8*m); magma_int_t info; magma_zgesdd(jobz, m, n, (magmaDoubleComplex *) U.data(), lda, D.data(), u, ldu, (magmaDoubleComplex *) V.data(), ldv, work_test, lwork, rwork, iwork, &info); lwork=lround( MAGMA_Z_REAL(work_test[0]) ); magmaDoubleComplex* work; magma_zmalloc_cpu(&work, lwork); magma_zgesdd(jobz, m, n, (magmaDoubleComplex *) U.data(), lda, D.data(), u, ldu, (magmaDoubleComplex *) V.data(), ldv, work, lwork, rwork, iwork, &info); magma_free_cpu(work); magma_free_cpu(rwork); magma_free_cpu(iwork); if(info!=0) { cout<<"SVDMatrix_magma is not suceesful, info= "<<info<<endl; exit(1); } }
extern "C" void magma_ztrdtype1cbHLsym_withQ(magma_int_t N, magma_int_t NB, magmaDoubleComplex *A, magma_int_t LDA, magmaDoubleComplex *V, magmaDoubleComplex *TAU, magma_int_t st, magma_int_t ed, magma_int_t sweep, magma_int_t Vblksiz) { //magma_int_t J1, J2, J3, i, j; magma_int_t len, LDX; magma_int_t IONE=1; magma_int_t blkid, vpos, taupos, tpos; //magmaDoubleComplex conjtmp; magmaDoubleComplex Z_ONE = MAGMA_Z_ONE; magmaDoubleComplex *WORK; magma_zmalloc_cpu( &WORK, N ); findVTpos(N,NB,Vblksiz,sweep-1,st-1, &vpos, &taupos, &tpos, &blkid); //printf("voici vpos %d taupos %d tpos %d blkid %d \n", vpos, taupos, tpos, blkid); LDX = LDA-1; len = ed-st+1; *V(vpos) = Z_ONE; memcpy(V(vpos+1), A(st+1, st-1), (len-1)*sizeof(magmaDoubleComplex)); memset(A(st+1, st-1), 0, (len-1)*sizeof(magmaDoubleComplex)); /* Eliminate the col at st-1 */ lapackf77_zlarfg( &len, A(st, st-1), V(vpos+1), &IONE, TAU(taupos) ); /* apply left and right on A(st:ed,st:ed)*/ magma_zlarfxsym(len,A(st,st),LDX,V(vpos),TAU(taupos)); //conjtmp = MAGMA_Z_CNJG(*TAU(taupos)); //lapackf77_zlarfy("L", &len, V(vpos), &IONE, &conjtmp, A(st,st), &LDX, WORK); //&(MAGMA_Z_CNJG(*TAU(taupos))) magma_free_cpu(WORK); }
magma_int_t magma_zmLdiagadd( magma_z_matrix *L, magma_queue_t queue ) { magma_int_t info = 0; magma_z_matrix LL={Magma_CSR}; if( L->row[1]==1 ){ // lower triangular with unit diagonal //printf("L lower triangular.\n"); LL.diagorder_type = Magma_UNITY; CHECK( magma_zmconvert( *L, &LL, Magma_CSR, Magma_CSRL, queue )); } else if ( L->row[1]==0 ){ // strictly lower triangular //printf("L strictly lower triangular.\n"); CHECK( magma_zmtransfer( *L, &LL, Magma_CPU, Magma_CPU, queue )); magma_free_cpu( LL.col ); magma_free_cpu( LL.val ); LL.nnz = L->nnz+L->num_rows; CHECK( magma_zmalloc_cpu( &LL.val, LL.nnz )); CHECK( magma_index_malloc_cpu( &LL.col, LL.nnz )); magma_int_t z=0; for( magma_int_t i=0; i<L->num_rows; i++){ LL.row[i] = z; for( magma_int_t j=L->row[i]; j<L->row[i+1]; j++){ LL.val[z] = L->val[j]; LL.col[z] = L->col[j]; z++; } // add unit diagonal LL.val[z] = MAGMA_Z_MAKE(1.0, 0.0); LL.col[z] = i; z++; } LL.row[LL.num_rows] = z; LL.nnz = z; } else { printf("error: L neither lower nor strictly lower triangular!\n"); } magma_zmfree( L, queue ); CHECK( magma_zmtransfer(LL, L, Magma_CPU, Magma_CPU, queue )); cleanup: if( info != 0 ){ magma_zmfree( L, queue ); } magma_zmfree( &LL, queue ); return info; }
// ------------------------------------------------------------ // Replace with your code to initialize the dA matrix on the GPU device. // This simply leverages the CPU version above to initialize it to random values, // and copies the matrix to the GPU. void zfill_matrix_gpu( magma_int_t m, magma_int_t n, magmaDoubleComplex *dA, magma_int_t ldda ) { magmaDoubleComplex *A; int lda = ldda; magma_zmalloc_cpu( &A, m*lda ); if ( A == NULL ) { fprintf( stderr, "malloc failed\n" ); return; } zfill_matrix( m, n, A, lda ); magma_zsetmatrix( m, n, A, lda, dA, ldda ); magma_free_cpu( A ); }
extern "C" void magma_ztrdtype3cbHLsym_withQ(magma_int_t N, magma_int_t NB, magmaDoubleComplex *A, magma_int_t LDA, magmaDoubleComplex *V, magmaDoubleComplex *TAU, magma_int_t st, magma_int_t ed, magma_int_t sweep, magma_int_t Vblksiz) { //magma_int_t J1, J2, J3, i, j; magma_int_t len, LDX; //magma_int_t IONE=1; magma_int_t blkid, vpos, taupos, tpos; //magmaDoubleComplex conjtmp; magmaDoubleComplex *WORK; magma_zmalloc_cpu( &WORK, N ); findVTpos(N,NB,Vblksiz,sweep-1,st-1, &vpos, &taupos, &tpos, &blkid); LDX = LDA-1; len = ed-st+1; /* apply left and right on A(st:ed,st:ed)*/ magma_zlarfxsym(len,A(st,st),LDX,V(vpos),TAU(taupos)); //conjtmp = MAGMA_Z_CNJG(*TAU(taupos)); //lapackf77_zlarfy("L", &len, V(vpos), &IONE, &(MAGMA_Z_CNJG(*TAU(taupos))), A(st,st), &LDX, WORK); magma_free_cpu(WORK); }
void eigen_magma(Tensor_core<complex<double>,2>& A, Tensor_core<double,1>& W, char JOBZ, char UPLO) { if( A.rank(0) != A.rank(1) ) {cout<<"Input for eigen is not square matrix!"<<endl; exit(1);} if( A.rank(0) != W.rank(0) ) {cout<<"Input size of W is not consistent with A!"<<endl; exit(1);} magma_vec_t jobz = magma_vec_const(JOBZ); magma_uplo_t uplo = magma_uplo_const(UPLO); magma_int_t N=A.rank(0); magma_int_t info; magmaDoubleComplex work_test[1]; double rwork_test[1]; magma_int_t iwork_test[1]; magma_int_t lwork=-1; magma_int_t lrwork=-1; magma_int_t liwork=-1; magma_zheevd( jobz, uplo, N, (magmaDoubleComplex* ) A.data(), N, W.data(), work_test, lwork, rwork_test, lrwork, iwork_test, liwork, &info ); lwork=lround( MAGMA_Z_REAL(work_test[0]) ); lrwork=lround(rwork_test[0]); liwork=iwork_test[0]; magmaDoubleComplex* work; double* rwork; magma_int_t* iwork; magma_zmalloc_cpu(&work, lwork); magma_dmalloc_cpu(&rwork, lrwork); magma_imalloc_cpu(&iwork, liwork); magma_zheevd( jobz, uplo, N, (magmaDoubleComplex* ) A.data(), N, W.data(), work, lwork, rwork, lrwork, iwork, liwork, &info ); magma_free_cpu(work); magma_free_cpu(rwork); magma_free_cpu(iwork); if(info!=0) {cout<<"Zheevd failed: info= "<< info<<endl; exit(1);} }
extern "C" void magma_ztrdtype2cbHLsym_withQ(magma_int_t N, magma_int_t NB, magmaDoubleComplex *A, magma_int_t LDA, magmaDoubleComplex *V, magmaDoubleComplex *TAU, magma_int_t st, magma_int_t ed, magma_int_t sweep, magma_int_t Vblksiz) { magma_int_t J1, J2, len, lem, LDX; //magma_int_t i, j; magma_int_t IONE=1; magma_int_t blkid, vpos, taupos, tpos; magmaDoubleComplex conjtmp; magmaDoubleComplex Z_ONE = MAGMA_Z_ONE; //magmaDoubleComplex WORK[NB]; magmaDoubleComplex *WORK; magma_zmalloc_cpu( &WORK, NB ); findVTpos(N,NB,Vblksiz,sweep-1,st-1, &vpos, &taupos, &tpos, &blkid); LDX = LDA-1; J1 = ed+1; J2 = min(ed+NB,N); len = ed-st+1; lem = J2-J1+1; if (lem > 0) { /* apply remaining right commming from the top block */ lapackf77_zlarfx("R", &lem, &len, V(vpos), TAU(taupos), A(J1, st), &LDX, WORK); } if (lem > 1) { findVTpos(N,NB,Vblksiz,sweep-1,J1-1, &vpos, &taupos, &tpos, &blkid); /* remove the first column of the created bulge */ *V(vpos) = Z_ONE; memcpy(V(vpos+1), A(J1+1, st), (lem-1)*sizeof(magmaDoubleComplex)); memset(A(J1+1, st),0,(lem-1)*sizeof(magmaDoubleComplex)); /* Eliminate the col at st */ lapackf77_zlarfg( &lem, A(J1, st), V(vpos+1), &IONE, TAU(taupos) ); /* apply left on A(J1:J2,st+1:ed) */ len = len-1; /* because we start at col st+1 instead of st. col st is the col that has been revomved; */ conjtmp = MAGMA_Z_CNJG(*TAU(taupos)); lapackf77_zlarfx("L", &lem, &len, V(vpos), &conjtmp, A(J1, st+1), &LDX, WORK); } magma_free_cpu(WORK); }
extern "C" magma_int_t magma_zgels3_gpu( char trans, magma_int_t m, magma_int_t n, magma_int_t nrhs, magmaDoubleComplex *dA, magma_int_t ldda, magmaDoubleComplex *dB, magma_int_t lddb, magmaDoubleComplex *hwork, magma_int_t lwork, magma_int_t *info) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= Solves the overdetermined, least squares problem min || A*X - C || using the QR factorization A. The underdetermined problem (m < n) is not currently handled. Arguments ========= TRANS (input) CHARACTER*1 = 'N': the linear system involves A. Only trans='N' is currently handled. M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. M >= N >= 0. NRHS (input) INTEGER The number of columns of the matrix C. NRHS >= 0. A (input/output) COMPLEX_16 array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, A is overwritten by details of its QR factorization as returned by ZGEQRF3. LDDA (input) INTEGER The leading dimension of the array A, LDDA >= M. DB (input/output) COMPLEX_16 array on the GPU, dimension (LDDB,NRHS) On entry, the M-by-NRHS matrix C. On exit, the N-by-NRHS solution matrix X. LDDB (input) INTEGER The leading dimension of the array DB. LDDB >= M. HWORK (workspace/output) COMPLEX_16 array, dimension MAX(1,LWORK). On exit, if INFO = 0, HWORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array HWORK, LWORK >= (M - N + NB)*(NRHS + NB) + NRHS*NB, where NB is the blocksize given by magma_get_zgeqrf_nb( M ). If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the HWORK array, returns this value as the first entry of the HWORK array. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ #define a_ref(a_1,a_2) (dA + (a_2)*(ldda) + (a_1)) magmaDoubleComplex *dT, *tau; magma_int_t k; magma_int_t nb = magma_get_zgeqrf_nb(m); magma_int_t lwkopt = (m - n + nb)*(nrhs + nb) + nrhs*nb; int lquery = (lwork == -1); hwork[0] = MAGMA_Z_MAKE( (double)lwkopt, 0. ); *info = 0; /* For now, N is the only case working */ if ( (trans != 'N') && (trans != 'n' ) ) *info = -1; else if (m < 0) *info = -2; else if (n < 0 || m < n) /* LQ is not handle for now*/ *info = -3; else if (nrhs < 0) *info = -4; else if (ldda < max(1,m)) *info = -6; else if (lddb < max(1,m)) *info = -8; else if (lwork < lwkopt && ! lquery) *info = -10; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { hwork[0] = MAGMA_Z_ONE; return *info; } /* * Allocate temporary buffers */ int ldtwork = ( 2*k + ((n+31)/32)*32 )*nb; if (nb < nrhs) ldtwork = ( 2*k + ((n+31)/32)*32 )*nrhs; if (MAGMA_SUCCESS != magma_zmalloc( &dT, ldtwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_zmalloc_cpu( &tau, k ); if ( tau == NULL ) { magma_free( dT ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_zgeqrf3_gpu( m, n, dA, ldda, tau, dT, info ); if ( *info == 0 ) { magma_zgeqrs3_gpu( m, n, nrhs, dA, ldda, tau, dT, dB, lddb, hwork, lwork, info ); } magma_free( dT ); magma_free_cpu(tau); return *info; }
static void magma_ztile_bulge_applyQ( magma_int_t core_id, magma_side_t side, magma_int_t n_loc, magma_int_t n, magma_int_t nb, magma_int_t Vblksiz, magmaDoubleComplex *E, magma_int_t lde, magmaDoubleComplex *V, magma_int_t ldv, magmaDoubleComplex *TAU, magmaDoubleComplex *T, magma_int_t ldt) //, magma_int_t* info) { //%=========================== //% local variables //%=========================== magma_int_t firstcolj; magma_int_t bg, rownbm; magma_int_t st,ed,fst,vlen,vnb,colj; magma_int_t vpos,tpos; magma_int_t cur_blksiz,avai_blksiz, ncolinvolvd; magma_int_t nbgr, colst, coled; if (n <= 0) return; if (n_loc <= 0) return; //info = 0; magma_int_t INFO=0; magma_int_t nbGblk = magma_ceildiv(n-1, Vblksiz); /* * version v1: for each chunck it apply all the V's then move to * the other chunck. the locality here inside each * chunck meaning that thread t apply V_k then move * to V_k+1 which overlap with V_k meaning that the * E_k+1 overlap with E_k. so here is the * locality however thread t had to read V_k+1 and * T_k+1 at each apply. note that all thread if they * run at same speed they might reading the same V_k * and T_k at the same time. * */ magma_int_t nb_loc = 128; //$$$$$$$$ magma_int_t lwork = 2*nb_loc*max(Vblksiz,64); magmaDoubleComplex *work, *work2; magma_zmalloc_cpu(&work, lwork); magma_zmalloc_cpu(&work2, lwork); magma_int_t nbchunk = magma_ceildiv(n_loc, nb_loc); /* SIDE LEFT meaning apply E = Q*E = (q_1*q_2*.....*q_n) * E ==> so traverse Vs in reverse order (forward) from q_n to q_1 * each q_i consist of applying V to a block of row E(row_i,:) and applies are overlapped meaning * that q_i+1 overlap a portion of the E(row_i, :). * IN parallel E is splitten in vertical block over the threads */ /* SIDE RIGHT meaning apply E = E*Q = E * (q_1*q_2*.....*q_n) ==> so tarverse Vs in normal order (forward) from q_1 to q_n * each q_i consist of applying V to a block of col E(:, col_i,:) and the applies are overlapped meaning * that q_i+1 overlap a portion of the E(:, col_i). * IN parallel E is splitten in horizontal block over the threads */ #ifdef ENABLE_DEBUG if ((core_id == 0) || (core_id == 1)) printf(" APPLY Q2_cpu zbulge_back_m N %d N_loc %d nbchunk %d NB %d Vblksiz %d SIDE %c \n", n, n_loc, nbchunk, nb, Vblksiz, side); #endif for (magma_int_t i = 0; i < nbchunk; i++) { magma_int_t ib_loc = min(nb_loc, (n_loc - i*nb_loc)); if (side == MagmaLeft) { for (bg = nbGblk; bg > 0; bg--) { firstcolj = (bg-1)*Vblksiz + 1; rownbm = magma_ceildiv((n-(firstcolj+1)),nb); if (bg == nbGblk) rownbm = magma_ceildiv((n-(firstcolj)),nb); // last blk has size=1 used for complex to handle A(N,N-1) for (magma_int_t j = rownbm; j > 0; j--) { vlen = 0; vnb = 0; colj = (bg-1)*Vblksiz; // for k=0; I compute the fst and then can remove it from the loop fst = (rownbm -j)*nb+colj +1; for (magma_int_t k=0; k < Vblksiz; k++) { colj = (bg-1)*Vblksiz + k; st = (rownbm -j)*nb+colj +1; ed = min(st+nb-1,n-1); if (st > ed) break; if ((st == ed) && (colj != n-2)) break; vlen=ed-fst+1; vnb=k+1; } colst = (bg-1)*Vblksiz; magma_bulge_findVTpos(n, nb, Vblksiz, colst, fst, ldv, ldt, &vpos, &tpos); if ((vlen > 0) && (vnb > 0)) { lapackf77_zlarfb( "L", "N", "F", "C", &vlen, &ib_loc, &vnb, V(vpos), &ldv, T(tpos), &ldt, E(fst,i*nb_loc), &lde, work, &ib_loc); } if (INFO != 0) printf("ERROR ZUNMQR INFO %d \n", (int) INFO); } } } else if (side == MagmaRight) { rownbm = magma_ceildiv((n-1),nb); for (magma_int_t k = 1; k <= rownbm; k++) { ncolinvolvd = min(n-1, k*nb); avai_blksiz=min(Vblksiz,ncolinvolvd); nbgr = magma_ceildiv(ncolinvolvd,avai_blksiz); for (magma_int_t j = 1; j <= nbgr; j++) { vlen = 0; vnb = 0; cur_blksiz = min(ncolinvolvd-(j-1)*avai_blksiz, avai_blksiz); colst = (j-1)*avai_blksiz; coled = colst + cur_blksiz -1; fst = (rownbm -k)*nb+colst +1; for (colj=colst; colj <= coled; colj++) { st = (rownbm -k)*nb+colj +1; ed = min(st+nb-1,n-1); if (st > ed) break; if ((st == ed) && (colj != n-2)) break; vlen=ed-fst+1; vnb=vnb+1; } magma_bulge_findVTpos(n, nb, Vblksiz, colst, fst, ldv, ldt, &vpos, &tpos); if ((vlen > 0) && (vnb > 0)) { lapackf77_zlarfb( "R", "N", "F", "C", &ib_loc, &vlen, &vnb, V(vpos), &ldv, T(tpos), &ldt, E(i*nb_loc,fst), &lde, work, &ib_loc); } } } } else { printf("ERROR SIDE %d \n",side); } } // END loop over the chunks magma_free_cpu(work); magma_free_cpu(work2); }
/** Purpose ------- ZUNGQR generates an M-by-N COMPLEX_16 matrix Q with orthonormal columns, which is defined as the first N columns of a product of K elementary reflectors of order M Q = H(1) H(2) . . . H(k) as returned by ZGEQRF. Arguments --------- @param[in] m INTEGER The number of rows of the matrix Q. M >= 0. @param[in] n INTEGER The number of columns of the matrix Q. M >= N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. N >= K >= 0. @param[in,out] A COMPLEX_16 array A, dimension (LDDA,N). On entry, the i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by ZGEQRF_GPU in the first k columns of its array argument A. On exit, the M-by-N matrix Q. @param[in] lda INTEGER The first dimension of the array A. LDA >= max(1,M). @param[in] tau COMPLEX_16 array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by ZGEQRF_GPU. @param[in] T COMPLEX_16 array, dimension (NB, min(M,N)). T contains the T matrices used in blocking the elementary reflectors H(i), e.g., this can be the 6th argument of magma_zgeqrf_gpu (except stored on the CPU, not the GPU). @param[in] nb INTEGER This is the block size used in ZGEQRF_GPU, and correspondingly the size of the T matrices, used in the factorization, and stored in T. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument has an illegal value @ingroup magma_zgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_zungqr_m( magma_int_t m, magma_int_t n, magma_int_t k, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *tau, magmaDoubleComplex *T, magma_int_t nb, magma_int_t *info) { #define A(i,j) ( A + (i) + (j)*lda ) #define dA(d,i,j) (dA[d] + (i) + (j)*ldda) #define dT(d,i,j) (dT[d] + (i) + (j)*nb) magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; magma_int_t m_kk, n_kk, k_kk, mi; magma_int_t lwork, ldwork; magma_int_t i, ib, ki, kk, iinfo; magmaDoubleComplex *work; *info = 0; if (m < 0) { *info = -1; } else if ((n < 0) || (n > m)) { *info = -2; } else if ((k < 0) || (k > n)) { *info = -3; } else if (lda < max(1,m)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (n <= 0) { return *info; } magma_int_t di, dn; magma_int_t dpanel; magma_int_t ngpu = magma_num_gpus(); magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); // Allocate memory on GPUs for A and workspaces magma_int_t ldda = ((m + 31) / 32) * 32; magma_int_t lddwork = ((n + 31) / 32) * 32; magma_int_t min_lblocks = (n / nb) / ngpu; // min. blocks per gpu magma_int_t last_dev = (n / nb) % ngpu; // device with last block magma_int_t nlocal[ MagmaMaxGPUs ] = { 0 }; magmaDoubleComplex *dA[ MagmaMaxGPUs ] = { NULL }; magmaDoubleComplex *dT[ MagmaMaxGPUs ] = { NULL }; magmaDoubleComplex *dV[ MagmaMaxGPUs ] = { NULL }; magmaDoubleComplex *dW[ MagmaMaxGPUs ] = { NULL }; magma_queue_t stream[ MagmaMaxGPUs ] = { NULL }; for( int d = 0; d < ngpu; ++d ) { // example with n = 75, nb = 10, ngpu = 3 // min_lblocks = 2 // last_dev = 1 // gpu 0: 2 blocks, cols: 0- 9, 30-39, 60-69 // gpu 1: 1+ blocks, cols: 10-19, 40-49, 70-74 (partial) // gpu 2: 1 block, cols: 20-29, 50-59 magma_setdevice( d ); nlocal[d] = min_lblocks*nb; if ( d < last_dev ) { nlocal[d] += nb; } else if ( d == last_dev ) { nlocal[d] += (n % nb); } ldwork = nlocal[d]*ldda // dA + nb*m // dT + nb*ldda // dV + nb*lddwork; // dW if ( MAGMA_SUCCESS != magma_zmalloc( &dA[d], ldwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; goto CLEANUP; } dT[d] = dA[d] + nlocal[d]*ldda; dV[d] = dT[d] + nb*m; dW[d] = dV[d] + nb*ldda; magma_queue_create( &stream[d] ); } trace_init( 1, ngpu, 1, stream ); // first kk columns are handled by blocked method. // ki is start of 2nd-to-last block if ((nb > 1) && (nb < k)) { ki = (k - nb - 1) / nb * nb; kk = min(k, ki + nb); } else { ki = 0; kk = 0; } // Allocate CPU work space // n*nb for zungqr workspace lwork = n * nb; magma_zmalloc_cpu( &work, lwork ); if (work == NULL) { *info = MAGMA_ERR_HOST_ALLOC; goto CLEANUP; } // Use unblocked code for the last or only block. if (kk < n) { trace_cpu_start( 0, "ungqr", "ungqr last block" ); m_kk = m - kk; n_kk = n - kk; k_kk = k - kk; dpanel = (kk / nb) % ngpu; di = ((kk / nb) / ngpu) * nb; magma_setdevice( dpanel ); lapackf77_zungqr( &m_kk, &n_kk, &k_kk, A(kk, kk), &lda, &tau[kk], work, &lwork, &iinfo ); magma_zsetmatrix( m_kk, n_kk, A(kk, kk), lda, dA(dpanel, kk, di), ldda ); // Set A(1:kk,kk+1:n) to zero. magmablas_zlaset( MagmaFull, kk, n - kk, c_zero, c_zero, dA(dpanel, 0, di), ldda ); trace_cpu_end( 0 ); } if (kk > 0) { // Use blocked code // send T to all GPUs for( int d = 0; d < ngpu; ++d ) { magma_setdevice( d ); trace_gpu_start( d, 0, "set", "set T" ); magma_zsetmatrix_async( nb, min(m,n), T, nb, dT[d], nb, stream[d] ); trace_gpu_end( d, 0 ); } // stream: set Aii (V) --> laset --> laset --> larfb --> [next] // CPU has no computation for( i = ki; i >= 0; i -= nb ) { ib = min(nb, k - i); mi = m - i; dpanel = (i / nb) % ngpu; di = ((i / nb) / ngpu) * nb; // Send current panel to the GPUs lapackf77_zlaset( "Upper", &ib, &ib, &c_zero, &c_one, A(i, i), &lda ); for( int d = 0; d < ngpu; ++d ) { magma_setdevice( d ); trace_gpu_start( d, 0, "set", "set V" ); magma_zsetmatrix_async( mi, ib, A(i, i), lda, dV[d], ldda, stream[d] ); trace_gpu_end( d, 0 ); } // set panel to identity magma_setdevice( dpanel ); magmablasSetKernelStream( stream[dpanel] ); trace_gpu_start( dpanel, 0, "laset", "laset" ); magmablas_zlaset( MagmaFull, i, ib, c_zero, c_zero, dA(dpanel, 0, di), ldda ); magmablas_zlaset( MagmaFull, mi, ib, c_zero, c_one, dA(dpanel, i, di), ldda ); trace_gpu_end( dpanel, 0 ); if (i < n) { // Apply H to A(i:m,i:n) from the left for( int d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magmablasSetKernelStream( stream[d] ); magma_indices_1D_bcyclic( nb, ngpu, d, i, n, &di, &dn ); trace_gpu_start( d, 0, "larfb", "larfb" ); magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, mi, dn-di, ib, dV[d], ldda, dT(d,0,i), nb, dA(d, i, di), ldda, dW[d], lddwork ); trace_gpu_end( d, 0 ); } } } } // copy result back to CPU trace_cpu_start( 0, "get", "get A" ); magma_zgetmatrix_1D_col_bcyclic( m, n, dA, ldda, A, lda, ngpu, nb ); trace_cpu_end( 0 ); #ifdef TRACING char name[80]; snprintf( name, sizeof(name), "zungqr-n%d-ngpu%d.svg", m, ngpu ); trace_finalize( name, "trace.css" ); #endif CLEANUP: for( int d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_free( dA[d] ); magma_queue_destroy( stream[d] ); } magma_free_cpu( work ); magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); return *info; } /* magma_zungqr */
/** Purpose ------- Solves the overdetermined, least squares problem min || A*X - C || using the QR factorization A. The underdetermined problem (m < n) is not currently handled. Arguments --------- @param[in] trans magma_trans_t - = MagmaNoTrans: the linear system involves A. Only TRANS=MagmaNoTrans is currently handled. @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. M >= N >= 0. @param[in] nrhs INTEGER The number of columns of the matrix C. NRHS >= 0. @param[in,out] dA COMPLEX_16 array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, A is overwritten by details of its QR factorization as returned by ZGEQRF3. @param[in] ldda INTEGER The leading dimension of the array A, LDDA >= M. @param[in,out] dB COMPLEX_16 array on the GPU, dimension (LDDB,NRHS) On entry, the M-by-NRHS matrix C. On exit, the N-by-NRHS solution matrix X. @param[in] lddb INTEGER The leading dimension of the array dB. LDDB >= M. @param[out] hwork (workspace) COMPLEX_16 array, dimension MAX(1,LWORK). On exit, if INFO = 0, HWORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array HWORK, LWORK >= (M - N + NB)*(NRHS + NB) + NRHS*NB, where NB is the blocksize given by magma_get_zgeqrf_nb( M ). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the HWORK array, returns this value as the first entry of the HWORK array. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_zgels_driver ********************************************************************/ extern "C" magma_int_t magma_zgels3_gpu( magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t nrhs, magmaDoubleComplex *dA, magma_int_t ldda, magmaDoubleComplex *dB, magma_int_t lddb, magmaDoubleComplex *hwork, magma_int_t lwork, magma_int_t *info) { magmaDoubleComplex *dT, *tau; magma_int_t k; magma_int_t nb = magma_get_zgeqrf_nb(m); magma_int_t lwkopt = (m - n + nb)*(nrhs + nb) + nrhs*nb; int lquery = (lwork == -1); hwork[0] = MAGMA_Z_MAKE( (double)lwkopt, 0. ); *info = 0; /* For now, N is the only case working */ if ( trans != MagmaNoTrans ) *info = -1; else if (m < 0) *info = -2; else if (n < 0 || m < n) /* LQ is not handle for now*/ *info = -3; else if (nrhs < 0) *info = -4; else if (ldda < max(1,m)) *info = -6; else if (lddb < max(1,m)) *info = -8; else if (lwork < lwkopt && ! lquery) *info = -10; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { hwork[0] = MAGMA_Z_ONE; return *info; } /* * Allocate temporary buffers */ int ldtwork = ( 2*k + ((n+31)/32)*32 )*nb; if (nb < nrhs) ldtwork = ( 2*k + ((n+31)/32)*32 )*nrhs; if (MAGMA_SUCCESS != magma_zmalloc( &dT, ldtwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_zmalloc_cpu( &tau, k ); if ( tau == NULL ) { magma_free( dT ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_zgeqrf3_gpu( m, n, dA, ldda, tau, dT, info ); if ( *info == 0 ) { magma_zgeqrs3_gpu( m, n, nrhs, dA, ldda, tau, dT, dB, lddb, hwork, lwork, info ); } magma_free( dT ); magma_free_cpu(tau); return *info; }
/** Purpose ------- ZGEBRD reduces a general complex M-by-N matrix A to upper or lower bidiagonal form B by an orthogonal transformation: Q**H * A * P = B. If m >= n, B is upper bidiagonal; if m < n, B is lower bidiagonal. Arguments --------- @param[in] m INTEGER The number of rows in the matrix A. M >= 0. @param[in] n INTEGER The number of columns in the matrix A. N >= 0. @param[in,out] A COMPLEX_16 array, dimension (LDA,N) On entry, the M-by-N general matrix to be reduced. On exit, if m >= n, the diagonal and the first superdiagonal are overwritten with the upper bidiagonal matrix B; the elements below the diagonal, with the array TAUQ, represent the orthogonal matrix Q as a product of elementary reflectors, and the elements above the first superdiagonal, with the array TAUP, represent the orthogonal matrix P as a product of elementary reflectors; \n if m < n, the diagonal and the first subdiagonal are overwritten with the lower bidiagonal matrix B; the elements below the first subdiagonal, with the array TAUQ, represent the orthogonal matrix Q as a product of elementary reflectors, and the elements above the diagonal, with the array TAUP, represent the orthogonal matrix P as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] d double precision array, dimension (min(M,N)) The diagonal elements of the bidiagonal matrix B: D(i) = A(i,i). @param[out] e double precision array, dimension (min(M,N)-1) The off-diagonal elements of the bidiagonal matrix B: if m >= n, E(i) = A(i,i+1) for i = 1,2,...,n-1; if m < n, E(i) = A(i+1,i) for i = 1,2,...,m-1. @param[out] tauq COMPLEX_16 array dimension (min(M,N)) The scalar factors of the elementary reflectors which represent the orthogonal matrix Q. See Further Details. @param[out] taup COMPLEX_16 array, dimension (min(M,N)) The scalar factors of the elementary reflectors which represent the orthogonal matrix P. See Further Details. @param[out] work (workspace) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The length of the array WORK. LWORK >= (M+N)*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value. Further Details --------------- The matrices Q and P are represented as products of elementary reflectors: If m >= n, Q = H(1) H(2) . . . H(n) and P = G(1) G(2) . . . G(n-1) Each H(i) and G(i) has the form: H(i) = I - tauq * v * v' and G(i) = I - taup * u * u' where tauq and taup are complex scalars, and v and u are complex vectors; v(1:i-1) = 0, v(i) = 1, and v(i+1:m) is stored on exit in A(i+1:m,i); u(1:i) = 0, u(i+1) = 1, and u(i+2:n) is stored on exit in A(i,i+2:n); tauq is stored in TAUQ(i) and taup in TAUP(i). If m < n, Q = H(1) H(2) . . . H(m-1) and P = G(1) G(2) . . . G(m) Each H(i) and G(i) has the form: H(i) = I - tauq * v * v' and G(i) = I - taup * u * u' where tauq and taup are complex scalars, and v and u are complex vectors; v(1:i) = 0, v(i+1) = 1, and v(i+2:m) is stored on exit in A(i+2:m,i); u(1:i-1) = 0, u(i) = 1, and u(i+1:n) is stored on exit in A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i). The contents of A on exit are illustrated by the following examples: @verbatim m = 6 and n = 5 (m > n): m = 5 and n = 6 (m < n): ( d e u1 u1 u1 ) ( d u1 u1 u1 u1 u1 ) ( v1 d e u2 u2 ) ( e d u2 u2 u2 u2 ) ( v1 v2 d e u3 ) ( v1 e d u3 u3 u3 ) ( v1 v2 v3 d e ) ( v1 v2 e d u4 u4 ) ( v1 v2 v3 v4 d ) ( v1 v2 v3 e d u5 ) ( v1 v2 v3 v4 v5 ) @endverbatim where d and e denote diagonal and off-diagonal elements of B, vi denotes an element of the vector defining H(i), and ui an element of the vector defining G(i). @ingroup magma_zgesvd_comp ********************************************************************/ extern "C" magma_int_t magma_zgebrd( magma_int_t m, magma_int_t n, magmaDoubleComplex *A, magma_int_t lda, double *d, double *e, magmaDoubleComplex *tauq, magmaDoubleComplex *taup, magmaDoubleComplex *work, magma_int_t lwork, magma_int_t *info) { #define A(i, j) (A + (j)*lda + (i)) #define dA(i, j) (dA + (j)*ldda + (i)) magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex *dA, *dwork; magma_int_t ncol, nrow, jmax, nb, ldda; magma_int_t i, j, nx; magma_int_t iinfo; magma_int_t minmn; magma_int_t ldwrkx, ldwrky, lwkopt; magma_int_t lquery; nb = magma_get_zgebrd_nb( m, n ); ldda = m; lwkopt = (m + n) * nb; work[0] = magma_zmake_lwork( lwkopt ); lquery = (lwork == -1); /* Check arguments */ *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } else if (lwork < lwkopt && (! lquery) ) { *info = -10; } if (*info < 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; /* Quick return if possible */ minmn = min(m,n); if (minmn == 0) { work[0] = c_one; return *info; } magma_queue_t queue = NULL; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); magmaDoubleComplex *work2; magma_int_t lwork2 = max(m,n); if (MAGMA_SUCCESS != magma_zmalloc_cpu( &work2, lwork2 )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } if (MAGMA_SUCCESS != magma_zmalloc( &dA, n*ldda + (m + n)*nb )) { magma_free_cpu( work2 ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dwork = dA + n*ldda; ldwrkx = m; ldwrky = n; /* Set the block/unblock crossover point NX. */ nx = 128; /* Copy the matrix to the GPU */ if (minmn - nx >= 1) { magma_zsetmatrix( m, n, A, lda, dA, ldda, queue ); } for (i=0; i < (minmn - nx); i += nb) { /* Reduce rows and columns i:i+nb-1 to bidiagonal form and return the matrices X and Y which are needed to update the unreduced part of the matrix */ nrow = m - i; ncol = n - i; /* Get the current panel (no need for the 1st iteration) */ if ( i > 0 ) { magma_zgetmatrix( nrow, nb, dA(i, i), ldda, A( i, i), lda, queue ); magma_zgetmatrix( nb, ncol - nb, dA(i, i+nb), ldda, A( i, i+nb), lda, queue ); } magma_zlabrd_gpu(nrow, ncol, nb, A(i, i), lda, dA(i, i), ldda, d+i, e+i, tauq+i, taup+i, work, ldwrkx, dwork, ldwrkx, // x, dx work+(ldwrkx*nb), ldwrky, dwork+(ldwrkx*nb), ldwrky, work2, lwork2, queue ); // y, dy /* Update the trailing submatrix A(i+nb:m,i+nb:n), using an update of the form A := A - V*Y' - X*U' */ nrow = m - i - nb; ncol = n - i - nb; // Send Y back to the GPU magma_zsetmatrix( nrow, nb, work + nb, ldwrkx, dwork + nb, ldwrkx, queue ); magma_zsetmatrix( ncol, nb, work + (ldwrkx+1)*nb, ldwrky, dwork + (ldwrkx+1)*nb, ldwrky, queue ); magma_zgemm( MagmaNoTrans, MagmaConjTrans, nrow, ncol, nb, c_neg_one, dA(i+nb, i ), ldda, dwork+(ldwrkx+1)*nb, ldwrky, c_one, dA(i+nb, i+nb), ldda, queue ); magma_zgemm( MagmaNoTrans, MagmaNoTrans, nrow, ncol, nb, c_neg_one, dwork+nb, ldwrkx, dA( i, i+nb ), ldda, c_one, dA( i+nb, i+nb ), ldda, queue ); /* Copy diagonal and off-diagonal elements of B back into A */ if (m >= n) { jmax = i + nb; for (j = i; j < jmax; ++j) { *A(j, j ) = MAGMA_Z_MAKE( d[j], 0. ); *A(j, j+1) = MAGMA_Z_MAKE( e[j], 0. ); } } else { jmax = i + nb; for (j = i; j < jmax; ++j) { *A(j, j ) = MAGMA_Z_MAKE( d[j], 0. ); *A(j+1, j ) = MAGMA_Z_MAKE( e[j], 0. ); } } } /* Use unblocked code to reduce the remainder of the matrix */ nrow = m - i; ncol = n - i; if ( 0 < minmn - nx ) { magma_zgetmatrix( nrow, ncol, dA(i, i), ldda, A( i, i), lda, queue ); } lapackf77_zgebrd( &nrow, &ncol, A(i, i), &lda, d+i, e+i, tauq+i, taup+i, work, &lwork, &iinfo); work[0] = magma_zmake_lwork( lwkopt ); magma_free_cpu( work2 ); magma_free( dA ); magma_queue_destroy( queue ); return *info; } /* magma_zgebrd */
extern "C" magma_int_t magma_zgeev_m( char jobvl, char jobvr, magma_int_t n, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *W, magmaDoubleComplex *vl, magma_int_t ldvl, magmaDoubleComplex *vr, magma_int_t ldvr, magmaDoubleComplex *work, magma_int_t lwork, double *rwork, magma_int_t *info ) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= ZGEEV computes for an N-by-N complex nonsymmetric matrix A, the eigenvalues and, optionally, the left and/or right eigenvectors. The right eigenvector v(j) of A satisfies A * v(j) = lambda(j) * v(j) where lambda(j) is its eigenvalue. The left eigenvector u(j) of A satisfies u(j)**H * A = lambda(j) * u(j)**H where u(j)**H denotes the conjugate transpose of u(j). The computed eigenvectors are normalized to have Euclidean norm equal to 1 and largest component real. Arguments ========= JOBVL (input) CHARACTER*1 = 'N': left eigenvectors of A are not computed; = 'V': left eigenvectors of are computed. JOBVR (input) CHARACTER*1 = 'N': right eigenvectors of A are not computed; = 'V': right eigenvectors of A are computed. N (input) INTEGER The order of the matrix A. N >= 0. A (input/output) COMPLEX*16 array, dimension (LDA,N) On entry, the N-by-N matrix A. On exit, A has been overwritten. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). W (output) COMPLEX*16 array, dimension (N) W contains the computed eigenvalues. VL (output) COMPLEX*16 array, dimension (LDVL,N) If JOBVL = 'V', the left eigenvectors u(j) are stored one after another in the columns of VL, in the same order as their eigenvalues. If JOBVL = 'N', VL is not referenced. u(j) = VL(:,j), the j-th column of VL. LDVL (input) INTEGER The leading dimension of the array VL. LDVL >= 1; if JOBVL = 'V', LDVL >= N. VR (output) COMPLEX*16 array, dimension (LDVR,N) If JOBVR = 'V', the right eigenvectors v(j) are stored one after another in the columns of VR, in the same order as their eigenvalues. If JOBVR = 'N', VR is not referenced. v(j) = VR(:,j), the j-th column of VR. LDVR (input) INTEGER The leading dimension of the array VR. LDVR >= 1; if JOBVR = 'V', LDVR >= N. WORK (workspace/output) COMPLEX*16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= (1+nb)*N. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. RWORK (workspace) DOUBLE PRECISION array, dimension (2*N) INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value. > 0: if INFO = i, the QR algorithm failed to compute all the eigenvalues, and no eigenvectors have been computed; elements and i+1:N of W contain eigenvalues which have converged. ===================================================================== */ #define vl(i,j) (vl + (i) + (j)*ldvl) #define vr(i,j) (vr + (i) + (j)*ldvr) magma_int_t c_one = 1; magma_int_t c_zero = 0; double d__1, d__2; magmaDoubleComplex z__1, z__2; magmaDoubleComplex tmp; double scl; double dum[1], eps; double anrm, cscale, bignum, smlnum; magma_int_t i, k, ilo, ihi; magma_int_t ibal, ierr, itau, iwrk, nout, liwrk, i__1, i__2, nb; magma_int_t scalea, minwrk, irwork, lquery, wantvl, wantvr, select[1]; char side[2] = {0, 0}; char jobvl_[2] = {jobvl, 0}; char jobvr_[2] = {jobvr, 0}; irwork = 0; *info = 0; lquery = lwork == -1; wantvl = lapackf77_lsame( jobvl_, "V" ); wantvr = lapackf77_lsame( jobvr_, "V" ); if (! wantvl && ! lapackf77_lsame( jobvl_, "N" )) { *info = -1; } else if (! wantvr && ! lapackf77_lsame( jobvr_, "N" )) { *info = -2; } else if (n < 0) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if ( (ldvl < 1) || (wantvl && (ldvl < n))) { *info = -8; } else if ( (ldvr < 1) || (wantvr && (ldvr < n))) { *info = -10; } /* Compute workspace */ nb = magma_get_zgehrd_nb( n ); if (*info == 0) { minwrk = (1+nb)*n; work[0] = MAGMA_Z_MAKE( minwrk, 0 ); if (lwork < minwrk && ! lquery) { *info = -12; } } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { return *info; } #if defined(Version3) || defined(Version4) || defined(Version5) magmaDoubleComplex *dT; if (MAGMA_SUCCESS != magma_zmalloc( &dT, nb*n )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } #endif #if defined(Version4) || defined(Version5) magmaDoubleComplex *T; if (MAGMA_SUCCESS != magma_zmalloc_cpu( &T, nb*n )) { magma_free( dT ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } #endif /* Get machine constants */ eps = lapackf77_dlamch( "P" ); smlnum = lapackf77_dlamch( "S" ); bignum = 1. / smlnum; lapackf77_dlabad( &smlnum, &bignum ); smlnum = magma_dsqrt( smlnum ) / eps; bignum = 1. / smlnum; /* Scale A if max element outside range [SMLNUM,BIGNUM] */ anrm = lapackf77_zlange( "M", &n, &n, A, &lda, dum ); scalea = 0; if (anrm > 0. && anrm < smlnum) { scalea = 1; cscale = smlnum; } else if (anrm > bignum) { scalea = 1; cscale = bignum; } if (scalea) { lapackf77_zlascl( "G", &c_zero, &c_zero, &anrm, &cscale, &n, &n, A, &lda, &ierr ); } /* Balance the matrix * (CWorkspace: none) * (RWorkspace: need N) */ ibal = 0; lapackf77_zgebal( "B", &n, A, &lda, &ilo, &ihi, &rwork[ibal], &ierr ); /* Reduce to upper Hessenberg form * (CWorkspace: need 2*N, prefer N + N*NB) * (RWorkspace: none) */ itau = 0; iwrk = itau + n; liwrk = lwork - iwrk; #if defined(Version1) // Version 1 - LAPACK lapackf77_zgehrd( &n, &ilo, &ihi, A, &lda, &work[itau], &work[iwrk], &liwrk, &ierr ); #elif defined(Version2) // Version 2 - LAPACK consistent HRD magma_zgehrd2( n, ilo, ihi, A, lda, &work[itau], &work[iwrk], &liwrk, &ierr ); #elif defined(Version3) // Version 3 - LAPACK consistent MAGMA HRD + matrices T stored, magma_zgehrd( n, ilo, ihi, A, lda, &work[itau], &work[iwrk], liwrk, dT, &ierr ); #elif defined(Version4) || defined(Version5) // Version 4 - Multi-GPU, T on host magma_zgehrd_m( n, ilo, ihi, A, lda, &work[itau], &work[iwrk], liwrk, T, &ierr ); magma_zsetmatrix( nb, n, T, nb, dT, nb ); #endif if (wantvl) { /* Want left eigenvectors * Copy Householder vectors to VL */ side[0] = 'L'; lapackf77_zlacpy( MagmaLowerStr, &n, &n, A, &lda, vl, &ldvl ); /* Generate unitary matrix in VL * (CWorkspace: need 2*N-1, prefer N + (N-1)*NB) * (RWorkspace: none) */ #if defined(Version1) || defined(Version2) // Version 1 & 2 - LAPACK lapackf77_zunghr( &n, &ilo, &ihi, vl, &ldvl, &work[itau], &work[iwrk], &liwrk, &ierr ); #elif defined(Version3) || defined(Version4) // Version 3 - LAPACK consistent MAGMA HRD + matrices T stored magma_zunghr( n, ilo, ihi, vl, ldvl, &work[itau], dT, nb, &ierr ); #elif defined(Version5) // Version 5 - Multi-GPU, T on host magma_zunghr_m( n, ilo, ihi, vl, ldvl, &work[itau], T, nb, &ierr ); #endif /* Perform QR iteration, accumulating Schur vectors in VL * (CWorkspace: need 1, prefer HSWORK (see comments) ) * (RWorkspace: none) */ iwrk = itau; liwrk = lwork - iwrk; lapackf77_zhseqr( "S", "V", &n, &ilo, &ihi, A, &lda, W, vl, &ldvl, &work[iwrk], &liwrk, info ); if (wantvr) { /* Want left and right eigenvectors * Copy Schur vectors to VR */ side[0] = 'B'; lapackf77_zlacpy( "F", &n, &n, vl, &ldvl, vr, &ldvr ); } } else if (wantvr) { /* Want right eigenvectors * Copy Householder vectors to VR */ side[0] = 'R'; lapackf77_zlacpy( "L", &n, &n, A, &lda, vr, &ldvr ); /* Generate unitary matrix in VR * (CWorkspace: need 2*N-1, prefer N + (N-1)*NB) * (RWorkspace: none) */ #if defined(Version1) || defined(Version2) // Version 1 & 2 - LAPACK lapackf77_zunghr( &n, &ilo, &ihi, vr, &ldvr, &work[itau], &work[iwrk], &liwrk, &ierr ); #elif defined(Version3) || defined(Version4) // Version 3 - LAPACK consistent MAGMA HRD + matrices T stored magma_zunghr( n, ilo, ihi, vr, ldvr, &work[itau], dT, nb, &ierr ); #elif defined(Version5) // Version 5 - Multi-GPU, T on host magma_zunghr_m( n, ilo, ihi, vr, ldvr, &work[itau], T, nb, &ierr ); #endif /* Perform QR iteration, accumulating Schur vectors in VR * (CWorkspace: need 1, prefer HSWORK (see comments) ) * (RWorkspace: none) */ iwrk = itau; liwrk = lwork - iwrk; lapackf77_zhseqr( "S", "V", &n, &ilo, &ihi, A, &lda, W, vr, &ldvr, &work[iwrk], &liwrk, info ); } else { /* Compute eigenvalues only * (CWorkspace: need 1, prefer HSWORK (see comments) ) * (RWorkspace: none) */ iwrk = itau; liwrk = lwork - iwrk; lapackf77_zhseqr( "E", "N", &n, &ilo, &ihi, A, &lda, W, vr, &ldvr, &work[iwrk], &liwrk, info ); } /* If INFO > 0 from ZHSEQR, then quit */ if (*info > 0) { goto CLEANUP; } if (wantvl || wantvr) { /* Compute left and/or right eigenvectors * (CWorkspace: need 2*N) * (RWorkspace: need 2*N) */ irwork = ibal + n; lapackf77_ztrevc( side, "B", select, &n, A, &lda, vl, &ldvl, vr, &ldvr, &n, &nout, &work[iwrk], &rwork[irwork], &ierr ); } if (wantvl) { /* Undo balancing of left eigenvectors * (CWorkspace: none) * (RWorkspace: need N) */ lapackf77_zgebak( "B", "L", &n, &ilo, &ihi, &rwork[ibal], &n, vl, &ldvl, &ierr ); /* Normalize left eigenvectors and make largest component real */ for (i = 0; i < n; ++i) { scl = 1. / cblas_dznrm2( n, vl(0,i), 1 ); cblas_zdscal( n, scl, vl(0,i), 1 ); for (k = 0; k < n; ++k) { /* Computing 2nd power */ d__1 = MAGMA_Z_REAL( *vl(k,i) ); d__2 = MAGMA_Z_IMAG( *vl(k,i) ); rwork[irwork + k] = d__1*d__1 + d__2*d__2; } k = cblas_idamax( n, &rwork[irwork], 1 ); z__2 = MAGMA_Z_CNJG( *vl(k,i) ); d__1 = magma_dsqrt( rwork[irwork + k] ); MAGMA_Z_DSCALE( z__1, z__2, d__1 ); tmp = z__1; cblas_zscal( n, CBLAS_SADDR(tmp), vl(0,i), 1 ); d__1 = MAGMA_Z_REAL( *vl(k,i) ); z__1 = MAGMA_Z_MAKE( d__1, 0 ); *vl(k,i) = z__1; } } if (wantvr) { /* Undo balancing of right eigenvectors * (CWorkspace: none) * (RWorkspace: need N) */ lapackf77_zgebak( "B", "R", &n, &ilo, &ihi, &rwork[ibal], &n, vr, &ldvr, &ierr ); /* Normalize right eigenvectors and make largest component real */ for (i = 0; i < n; ++i) { scl = 1. / cblas_dznrm2( n, vr(0,i), 1 ); cblas_zdscal( n, scl, vr(0,i), 1 ); for (k = 0; k < n; ++k) { /* Computing 2nd power */ d__1 = MAGMA_Z_REAL( *vr(k,i) ); d__2 = MAGMA_Z_IMAG( *vr(k,i) ); rwork[irwork + k] = d__1*d__1 + d__2*d__2; } k = cblas_idamax( n, &rwork[irwork], 1 ); z__2 = MAGMA_Z_CNJG( *vr(k,i) ); d__1 = magma_dsqrt( rwork[irwork + k] ); MAGMA_Z_DSCALE( z__1, z__2, d__1 ); tmp = z__1; cblas_zscal( n, CBLAS_SADDR(tmp), vr(0,i), 1 ); d__1 = MAGMA_Z_REAL( *vr(k,i) ); z__1 = MAGMA_Z_MAKE( d__1, 0 ); *vr(k,i) = z__1; } } CLEANUP: /* Undo scaling if necessary */ if (scalea) { i__1 = n - (*info); i__2 = max( n - (*info), 1 ); lapackf77_zlascl( "G", &c_zero, &c_zero, &cscale, &anrm, &i__1, &c_one, W + (*info), &i__2, &ierr ); if (*info > 0) { i__1 = ilo - 1; lapackf77_zlascl( "G", &c_zero, &c_zero, &cscale, &anrm, &i__1, &c_one, W, &n, &ierr ); } } #if defined(Version3) || defined(Version4) || defined(Version5) magma_free( dT ); #endif #if defined(Version4) || defined(Version5) magma_free_cpu( T ); #endif return *info; } /* magma_zgeev */
/* //////////////////////////////////////////////////////////////////////////// -- Testing zhetrd_he2hb */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_time, gpu_perf; magmaDoubleComplex *h_A, *h_R, *h_work; magmaDoubleComplex *tau; double *D, *E; magma_int_t N, n2, lda, ldda, lwork, ldt, info, nstream; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; // TODO add these options to parse_opts magma_int_t NE = 0; magma_int_t distblk = 0; magma_opts opts; opts.parse_opts( argc, argv ); magma_int_t WANTZ = (opts.jobz == MagmaVec); double tol = opts.tolerance * lapackf77_dlamch("E"); if (opts.nb == 0) opts.nb = 64; //magma_get_zhetrd_he2hb_nb(N); if (NE < 1) NE = N; //64; //magma_get_zhetrd_he2hb_nb(N); nstream = max(3, opts.ngpu+2); magma_queue_t streams[MagmaMaxGPUs][20]; magmaDoubleComplex_ptr da[MagmaMaxGPUs], dT1[MagmaMaxGPUs]; if ((distblk == 0) || (distblk < opts.nb)) distblk = max(256, opts.nb); printf("%% ngpu %d, distblk %d, NB %d, nstream %d\n", (int) opts.ngpu, (int) distblk, (int) opts.nb, (int) nstream); for( magma_int_t dev = 0; dev < opts.ngpu; ++dev ) { magma_setdevice( dev ); for( int i = 0; i < nstream; ++i ) { magma_queue_create( &streams[dev][i] ); } } magma_setdevice( 0 ); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldt = N; ldda = magma_roundup( N, opts.align ); // multiple of 32 by default n2 = lda*N; /* We suppose the magma NB is bigger than lapack NB */ lwork = N*opts.nb; //gflops = ....? /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( tau, magmaDoubleComplex, N-1 ); TESTING_MALLOC_PIN( h_A, magmaDoubleComplex, lda*N ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, lda*N ); TESTING_MALLOC_PIN( h_work, magmaDoubleComplex, lwork ); TESTING_MALLOC_PIN( D, double, N ); TESTING_MALLOC_PIN( E, double, N ); for( magma_int_t dev = 0; dev < opts.ngpu; ++dev ) { magma_int_t mlocal = ((N / distblk) / opts.ngpu + 1) * distblk; magma_setdevice( dev ); TESTING_MALLOC_DEV( da[dev], magmaDoubleComplex, ldda*mlocal ); TESTING_MALLOC_DEV( dT1[dev], magmaDoubleComplex, N*opts.nb ); } /* ==================================================================== Initialize the matrix =================================================================== */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); magma_zmake_hermitian( N, h_A, lda ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ /* Copy the matrix to the GPU */ magma_zsetmatrix_1D_col_bcyclic( N, N, h_R, lda, da, ldda, opts.ngpu, distblk); //magmaDoubleComplex_ptr dabis; //TESTING_MALLOC_DEV( dabis, magmaDoubleComplex, ldda*N ); //magma_zsetmatrix(N, N, h_R, lda, dabis, ldda); for (int count=0; count < 1; ++count) { magma_setdevice(0); gpu_time = magma_wtime(); if (opts.version == 30) { // see src/obsolete and magmablas/obsolete printf( "magma_zhetrd_he2hb_mgpu_spec not compiled\n" ); //magma_zhetrd_he2hb_mgpu_spec( // opts.uplo, N, opts.nb, h_R, lda, tau, h_work, lwork, // da, ldda, dT1, opts.nb, opts.ngpu, distblk, // streams, nstream, opts.nthread, &info); } else { nstream = 3; magma_zhetrd_he2hb_mgpu( opts.uplo, N, opts.nb, h_R, lda, tau, h_work, lwork, da, ldda, dT1, opts.nb, opts.ngpu, distblk, streams, nstream, opts.nthread, &info); } // magma_zhetrd_he2hb(opts.uplo, N, opts.nb, h_R, lda, tau, h_work, lwork, dT1[0], &info); gpu_time = magma_wtime() - gpu_time; printf(" Finish BAND N %d NB %d dist %d ngpu %d version %d timing= %f\n", N, opts.nb, distblk, opts.ngpu, opts.version, gpu_time); } magma_setdevice(0); for( magma_int_t dev = 0; dev < opts.ngpu; ++dev ) { magma_setdevice(dev); magma_device_sync(); } magma_setdevice(0); magmablasSetKernelStream( NULL ); // todo neither of these is declared in headers // magma_zhetrd_bhe2trc_v5(opts.nthread, WANTZ, opts.uplo, NE, N, opts.nb, h_R, lda, D, E, dT1[0], ldt); // magma_zhetrd_bhe2trc(opts.nthread, WANTZ, opts.uplo, NE, N, opts.nb, h_R, lda, D, E, dT1[0], ldt); // todo where is this timer started? // gpu_time = magma_wtime() - gpu_time; // todo what are the gflops? gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zhetrd_he2hb returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Print performance and error. =================================================================== */ #if defined(CHECKEIG) #if defined(PRECISION_z) || defined(PRECISION_d) if ( opts.check ) { printf(" Total N %5d flops %6.2f timing %6.2f seconds\n", (int) N, gpu_perf, gpu_time ); double nrmI=0.0, nrm1=0.0, nrm2=0.0; int lwork2 = 256*N; magmaDoubleComplex *work2, *AINIT; double *rwork2, *D2; // TODO free this memory ! magma_zmalloc_cpu( &work2, lwork2 ); magma_dmalloc_cpu( &rwork2, N ); magma_dmalloc_cpu( &D2, N ); magma_zmalloc_cpu( &AINIT, N*lda ); memcpy(AINIT, h_A, N*lda*sizeof(magmaDoubleComplex)); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); int nt = min(12, opts.nthread); magma_set_lapack_numthreads(nt); lapackf77_zheev( "N", "L", &N, h_A, &lda, D2, work2, &lwork2, #ifdef COMPLEX rwork2, #endif &info ); ///* call eigensolver for our resulting tridiag [D E] and for Q */ //dstedc_withZ('V', N, D, E, h_R, lda); ////dsterf_( &N, D, E, &info); cpu_time = magma_wtime() - cpu_time; printf(" Finish CHECK - EIGEN timing= %f threads %d\n", cpu_time, nt); /* compare result */ cmp_vals(N, D2, D, &nrmI, &nrm1, &nrm2); magmaDoubleComplex *WORKAJETER; double *RWORKAJETER, *RESU; // TODO free this memory ! magma_zmalloc_cpu( &WORKAJETER, (2* N * N + N) ); magma_dmalloc_cpu( &RWORKAJETER, N ); magma_dmalloc_cpu( &RESU, 10 ); int MATYPE; memset(RESU, 0, 10*sizeof(double)); MATYPE=3; double NOTHING=0.0; cpu_time = magma_wtime(); // check results zcheck_eig_( lapack_vec_const(opts.jobz), &MATYPE, &N, &opts.nb, AINIT, &lda, &NOTHING, &NOTHING, D2, D, h_R, &lda, WORKAJETER, RWORKAJETER, RESU ); cpu_time = magma_wtime() - cpu_time; printf(" Finish CHECK - results timing= %f\n", cpu_time); magma_set_lapack_numthreads(1); printf("\n"); printf(" ================================================================================================================\n"); printf(" ==> INFO voici threads=%d N=%d NB=%d WANTZ=%d\n", (int) opts.nthread, (int) N, (int) opts.nb, (int) WANTZ); printf(" ================================================================================================================\n"); printf(" DSBTRD : %15s \n", "STATblgv9withQ "); printf(" ================================================================================================================\n"); if (WANTZ > 0) printf(" | A - U S U' | / ( |A| n ulp ) : %15.3E \n", RESU[0]); if (WANTZ > 0) printf(" | I - U U' | / ( n ulp ) : %15.3E \n", RESU[1]); printf(" | D1 - EVEIGS | / (|D| ulp) : %15.3E \n", RESU[2]); printf(" max | D1 - EVEIGS | : %15.3E \n", RESU[6]); printf(" ================================================================================================================\n\n\n"); printf(" ****************************************************************************************************************\n"); printf(" * Hello here are the norm Infinite (max)=%8.2e norm one (sum)=%8.2e norm2(sqrt)=%8.2e *\n", nrmI, nrm1, nrm2); printf(" ****************************************************************************************************************\n\n"); } #endif // PRECISION_z || PRECISION_d #endif // CHECKEIG printf(" Total N %5d flops %6.2f timing %6.2f seconds\n", (int) N, 0.0, gpu_time ); printf("%%===========================================================================\n\n\n"); TESTING_FREE_CPU( tau ); TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_work ); TESTING_FREE_PIN( D ); TESTING_FREE_PIN( E ); for( magma_int_t dev = 0; dev < opts.ngpu; ++dev ) { magma_setdevice( dev ); TESTING_FREE_DEV( da[dev] ); TESTING_FREE_DEV( dT1[dev] ); } magma_setdevice( 0 ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } for( magma_int_t dev = 0; dev < opts.ngpu; ++dev ) { for( int i = 0; i < nstream; ++i ) { magma_queue_destroy( streams[dev][i] ); } } TESTING_FINALIZE(); return status; }
/** Purpose ------- Solves a system of linear equations A * X = B, A**T * X = B, or A**H * X = B with a general N-by-N matrix A using the LU factorization computed by ZGETRF_GPU. Arguments --------- @param[in] trans magma_trans_t Specifies the form of the system of equations: - = MagmaNoTrans: A * X = B (No transpose) - = MagmaTrans: A**T * X = B (Transpose) - = MagmaConjTrans: A**H * X = B (Conjugate transpose) @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] nrhs INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. @param[in] dA COMPLEX_16 array on the GPU, dimension (LDA,N) The factors L and U from the factorization A = P*L*U as computed by ZGETRF_GPU. @param[in] ldda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[in] ipiv INTEGER array, dimension (N) The pivot indices from ZGETRF; for 1 <= i <= N, row i of the matrix was interchanged with row IPIV(i). @param[in,out] dB COMPLEX_16 array on the GPU, dimension (LDB,NRHS) On entry, the right hand side matrix B. On exit, the solution matrix X. @param[in] lddb INTEGER The leading dimension of the array B. LDB >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_zgesv_comp ********************************************************************/ extern "C" magma_int_t magma_zgetrs_gpu( magma_trans_t trans, magma_int_t n, magma_int_t nrhs, magmaDoubleComplex_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magmaDoubleComplex_ptr dB, magma_int_t lddb, magma_int_t *info) { magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex *work = NULL; int notran = (trans == MagmaNoTrans); magma_int_t i1, i2, inc; *info = 0; if ( (! notran) && (trans != MagmaTrans) && (trans != MagmaConjTrans) ) { *info = -1; } else if (n < 0) { *info = -2; } else if (nrhs < 0) { *info = -3; } else if (ldda < max(1,n)) { *info = -5; } else if (lddb < max(1,n)) { *info = -8; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (n == 0 || nrhs == 0) { return *info; } magma_zmalloc_cpu( &work, n * nrhs ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } i1 = 1; i2 = n; if (notran) { inc = 1; /* Solve A * X = B. */ magma_zgetmatrix( n, nrhs, dB, lddb, work, n ); lapackf77_zlaswp(&nrhs, work, &n, &i1, &i2, ipiv, &inc); magma_zsetmatrix( n, nrhs, work, n, dB, lddb ); if ( nrhs == 1) { magma_ztrsv(MagmaLower, MagmaNoTrans, MagmaUnit, n, dA, ldda, dB, 1 ); magma_ztrsv(MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, dA, ldda, dB, 1 ); } else { magma_ztrsm(MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, n, nrhs, c_one, dA, ldda, dB, lddb ); magma_ztrsm(MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb ); } } else { inc = -1; /* Solve A**T * X = B or A**H * X = B. */ if ( nrhs == 1) { magma_ztrsv(MagmaUpper, trans, MagmaNonUnit, n, dA, ldda, dB, 1 ); magma_ztrsv(MagmaLower, trans, MagmaUnit, n, dA, ldda, dB, 1 ); } else { magma_ztrsm(MagmaLeft, MagmaUpper, trans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb ); magma_ztrsm(MagmaLeft, MagmaLower, trans, MagmaUnit, n, nrhs, c_one, dA, ldda, dB, lddb ); } magma_zgetmatrix( n, nrhs, dB, lddb, work, n ); lapackf77_zlaswp(&nrhs, work, &n, &i1, &i2, ipiv, &inc); magma_zsetmatrix( n, nrhs, work, n, dB, lddb ); } magma_free_cpu(work); return *info; }
/** Purpose ------- ZUNGQR generates an M-by-N COMPLEX_16 matrix Q with orthonormal columns, which is defined as the first N columns of a product of K elementary reflectors of order M Q = H(1) H(2) . . . H(k) as returned by ZGEQRF_GPU. Arguments --------- @param[in] m INTEGER The number of rows of the matrix Q. M >= 0. @param[in] n INTEGER The number of columns of the matrix Q. M >= N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. N >= K >= 0. @param[in,out] dA COMPLEX_16 array A on the GPU, dimension (LDDA,N). On entry, the i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by ZGEQRF_GPU in the first k columns of its array argument A. On exit, the M-by-N matrix Q. @param[in] ldda INTEGER The first dimension of the array A. LDDA >= max(1,M). @param[in] tau COMPLEX_16 array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by ZGEQRF_GPU. @param[in] dT (workspace) COMPLEX_16 work space array on the GPU, dimension (2*MIN(M, N) + ceil(N/32)*32 )*NB. This must be the 6th argument of magma_zgeqrf_gpu [ note that if N here is bigger than N in magma_zgeqrf_gpu, the workspace requirement DT in magma_zgeqrf_gpu must be as specified in this routine ]. @param[in] nb INTEGER This is the block size used in ZGEQRF_GPU, and correspondingly the size of the T matrices, used in the factorization, and stored in DT. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_zgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_zungqr_gpu( magma_int_t m, magma_int_t n, magma_int_t k, magmaDoubleComplex_ptr dA, magma_int_t ldda, magmaDoubleComplex *tau, magmaDoubleComplex_ptr dT, magma_int_t nb, magma_int_t *info) { #define dA(i,j) (dA + (i) + (j)*ldda) #define dT(j) (dT + (j)*nb) magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; magma_int_t m_kk, n_kk, k_kk, mi; magma_int_t lwork, lpanel; magma_int_t i, ib, ki, kk, iinfo; magma_int_t lddwork; magmaDoubleComplex_ptr dV, dW; magmaDoubleComplex *work, *panel; *info = 0; if (m < 0) { *info = -1; } else if ((n < 0) || (n > m)) { *info = -2; } else if ((k < 0) || (k > n)) { *info = -3; } else if (ldda < max(1,m)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (n <= 0) { return *info; } // first kk columns are handled by blocked method. // ki is start of 2nd-to-last block if ((nb > 1) && (nb < k)) { ki = (k - nb - 1) / nb * nb; kk = min( k, ki+nb ); } else { ki = 0; kk = 0; } // Allocate CPU work space // n*nb for zungqr workspace // (m - kk)*(n - kk) for last block's panel lwork = n*nb; lpanel = (m - kk)*(n - kk); magma_zmalloc_cpu( &work, lwork + lpanel ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } panel = work + lwork; // Allocate work space on GPU if (MAGMA_SUCCESS != magma_zmalloc( &dV, ldda*nb )) { magma_free_cpu( work ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } // dT workspace has: // 2*min(m,n)*nb for T and R^{-1} matrices from geqrf // roundup(n,32) * nb for dW larfb workspace. lddwork = min(m,n); dW = dT + 2*lddwork*nb; magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); // Use unblocked code for the last or only block. if (kk < n) { m_kk = m - kk; n_kk = n - kk; k_kk = k - kk; magma_zgetmatrix( m_kk, k_kk, dA(kk, kk), ldda, panel, m_kk, queue ); lapackf77_zungqr( &m_kk, &n_kk, &k_kk, panel, &m_kk, &tau[kk], work, &lwork, &iinfo ); magma_zsetmatrix( m_kk, n_kk, panel, m_kk, dA(kk, kk), ldda, queue ); // Set A(1:kk,kk+1:n) to zero. magmablas_zlaset( MagmaFull, kk, n - kk, c_zero, c_zero, dA(0, kk), ldda, queue ); } if (kk > 0) { // Use blocked code // queue: copy Aii to V --> laset --> laset --> larfb --> [next] // CPU has no computation for (i = ki; i >= 0; i -= nb) { ib = min( nb, k-i ); mi = m - i; // Copy current panel on the GPU from dA to dV magma_zcopymatrix_async( mi, ib, dA(i,i), ldda, dV, ldda, queue ); // set panel to identity magmablas_zlaset( MagmaFull, i, ib, c_zero, c_zero, dA(0, i), ldda, queue ); magmablas_zlaset( MagmaFull, mi, ib, c_zero, c_one, dA(i, i), ldda, queue ); if (i < n) { // Apply H to A(i:m,i:n) from the left magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, mi, n-i, ib, dV, ldda, dT(i), nb, dA(i, i), ldda, dW, lddwork, queue ); } } } magma_queue_sync( queue ); magma_free( dV ); magma_free_cpu( work ); magma_queue_destroy( queue ); return *info; } /* magma_zungqr_gpu */
magma_int_t magma_zinitguess( magma_z_matrix A, magma_z_matrix *L, magma_z_matrix *U, magma_queue_t queue ) { magma_int_t info = 0; magmaDoubleComplex one = MAGMA_Z_MAKE( 1.0, 0.0 ); magma_z_matrix hAL={Magma_CSR}, hAU={Magma_CSR}, dAL={Magma_CSR}, dAU={Magma_CSR}, dALU={Magma_CSR}, hALU={Magma_CSR}, hD={Magma_CSR}, dD={Magma_CSR}, dL={Magma_CSR}, hL={Magma_CSR}; magma_int_t i,j; magma_int_t offdiags = 0; magma_index_t *diag_offset; magmaDoubleComplex *diag_vals=NULL; // need only lower triangular hAL.diagorder_type = Magma_VALUE; CHECK( magma_zmconvert( A, &hAL, Magma_CSR, Magma_CSRL, queue )); //magma_zmconvert( hAL, &hALCOO, Magma_CSR, Magma_CSRCOO ); // need only upper triangular //magma_zmconvert( A, &hAU, Magma_CSR, Magma_CSRU ); CHECK( magma_z_cucsrtranspose( hAL, &hAU, queue )); //magma_zmconvert( hAU, &hAUCOO, Magma_CSR, Magma_CSRCOO ); CHECK( magma_zmtransfer( hAL, &dAL, Magma_CPU, Magma_DEV, queue )); CHECK( magma_z_spmm( one, dAL, dAU, &dALU, queue )); CHECK( magma_zmtransfer( dALU, &hALU, Magma_DEV, Magma_CPU, queue )); magma_zmfree( &dAU, queue); magma_zmfree( &dALU, queue); CHECK( magma_zmalloc_cpu( &diag_vals, offdiags+1 )); CHECK( magma_index_malloc_cpu( &diag_offset, offdiags+1 )); diag_offset[0] = 0; diag_vals[0] = MAGMA_Z_MAKE( 1.0, 0.0 ); CHECK( magma_zmgenerator( hALU.num_rows, offdiags, diag_offset, diag_vals, &hD, queue )); magma_zmfree( &hALU, queue ); for(i=0; i<hALU.num_rows; i++){ for(j=hALU.row[i]; j<hALU.row[i+1]; j++){ if( hALU.col[j] == i ){ //printf("%d %d %d == %d -> %f -->", i, j, hALU.col[j], i, hALU.val[j]); hD.val[i] = MAGMA_Z_MAKE( 1.0 / sqrt(fabs(MAGMA_Z_REAL(hALU.val[j]))) , 0.0 ); //printf("insert %f at %d\n", hD.val[i], i); } } } CHECK( magma_zmtransfer( hD, &dD, Magma_CPU, Magma_DEV, queue )); magma_zmfree( &hD, queue); CHECK( magma_z_spmm( one, dD, dAL, &dL, queue )); magma_zmfree( &dAL, queue ); magma_zmfree( &dD, queue ); /* // check for diagonal = 1 magma_z_matrix dLt={Magma_CSR}, dLL={Magma_CSR}, LL={Magma_CSR}; CHECK( magma_z_cucsrtranspose( dL, &dLt )); CHECK( magma_zcuspmm( dL, dLt, &dLL )); CHECK( magma_zmtransfer( dLL, &LL, Magma_DEV, Magma_CPU )); //for(i=0; i < hALU.num_rows; i++) { for(i=0; i < 100; i++) { for(j=hALU.row[i]; j < hALU.row[i+1]; j++) { if( hALU.col[j] == i ){ printf("%d %d -> %f -->", i, i, LL.val[j]); } } } */ CHECK( magma_zmtransfer( dL, &hL, Magma_DEV, Magma_CPU, queue )); CHECK( magma_zmconvert( hL, L, Magma_CSR, Magma_CSRCOO, queue )); cleanup: if( info !=0 ){ magma_zmfree( L, queue ); magma_zmfree( U, queue ); } magma_zmfree( &dAU, queue); magma_zmfree( &dALU, queue); magma_zmfree( &dL, queue ); magma_zmfree( &hL, queue ); magma_zmfree( &dAL, queue ); magma_zmfree( &dD, queue ); magma_zmfree( &hD, queue); magma_zmfree( &hALU, queue ); return info; }
magma_int_t magma_zilures( magma_z_matrix A, magma_z_matrix L, magma_z_matrix U, magma_z_matrix *LU, real_Double_t *res, real_Double_t *nonlinres, magma_queue_t queue ) { magma_int_t info = 0; magmaDoubleComplex tmp; real_Double_t tmp2; magma_int_t i, j, k; magmaDoubleComplex one = MAGMA_Z_MAKE( 1.0, 0.0 ); magma_z_matrix LL={Magma_CSR}, L_d={Magma_CSR}, U_d={Magma_CSR}, LU_d={Magma_CSR}; if( L.row[1]==1 ){ // lower triangular with unit diagonal //printf("L lower triangular.\n"); LL.diagorder_type = Magma_UNITY; CHECK( magma_zmconvert( L, &LL, Magma_CSR, Magma_CSRL, queue )); } else if ( L.row[1]==0 ){ // strictly lower triangular //printf("L strictly lower triangular.\n"); CHECK( magma_zmtransfer( L, &LL, Magma_CPU, Magma_CPU, queue )); magma_free_cpu( LL.col ); magma_free_cpu( LL.val ); LL.nnz = L.nnz+L.num_rows; CHECK( magma_zmalloc_cpu( &LL.val, LL.nnz )); CHECK( magma_index_malloc_cpu( &LL.col, LL.nnz )); magma_int_t z=0; for (i=0; i < L.num_rows; i++) { LL.row[i] = z; for (j=L.row[i]; j < L.row[i+1]; j++) { LL.val[z] = L.val[j]; LL.col[z] = L.col[j]; z++; } // add unit diagonal LL.val[z] = MAGMA_Z_MAKE(1.0, 0.0); LL.col[z] = i; z++; } LL.row[LL.num_rows] = z; } else { printf("error: L neither lower nor strictly lower triangular!\n"); } CHECK( magma_zmtransfer( LL, &L_d, Magma_CPU, Magma_DEV, queue )); CHECK( magma_zmtransfer( U, &U_d, Magma_CPU, Magma_DEV, queue )); magma_zmfree( &LL, queue ); CHECK( magma_z_spmm( one, L_d, U_d, &LU_d, queue )); CHECK( magma_zmtransfer(LU_d, LU, Magma_DEV, Magma_CPU, queue )); magma_zmfree( &L_d, queue ); magma_zmfree( &U_d, queue ); magma_zmfree( &LU_d, queue ); // compute Frobenius norm of A-LU for(i=0; i<A.num_rows; i++){ for(j=A.row[i]; j<A.row[i+1]; j++){ magma_index_t lcol = A.col[j]; for(k=LU->row[i]; k<LU->row[i+1]; k++){ if( LU->col[k] == lcol ){ tmp = MAGMA_Z_MAKE( MAGMA_Z_REAL( LU->val[k] )- MAGMA_Z_REAL( A.val[j] ) , 0.0 ); LU->val[k] = tmp; tmp2 = (real_Double_t) fabs( MAGMA_Z_REAL(tmp) ); (*nonlinres) = (*nonlinres) + tmp2*tmp2; } } } } for(i=0; i<LU->num_rows; i++){ for(j=LU->row[i]; j<LU->row[i+1]; j++){ tmp2 = (real_Double_t) fabs( MAGMA_Z_REAL(LU->val[j]) ); (*res) = (*res) + tmp2* tmp2; } } (*res) = sqrt((*res)); (*nonlinres) = sqrt((*nonlinres)); cleanup: if( info !=0 ){ magma_zmfree( LU, queue ); } magma_zmfree( &LL, queue ); magma_zmfree( &L_d, queue ); magma_zmfree( &U_d, queue ); magma_zmfree( &LU_d, queue ); return info; }
extern "C" magma_int_t magma_zgetrf2_gpu( magma_int_t m, magma_int_t n, magmaDoubleComplex_ptr dA, size_t dA_offset, magma_int_t ldda, magma_int_t *ipiv, magma_queue_t queues[2], magma_int_t *info ) { /* -- clMAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date November 2014 Purpose ======= ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. LDDA (input) INTEGER The leading dimension of the array A. LDDA >= max(1,M). IPIV (output) INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. ===================================================================== */ #define dA(i_, j_) dA, dA_offset + (i_)*nb + (j_)*nb*ldda #define dAT(i_, j_) dAT, dAT_offset + (i_)*nb*lddat + (j_)*nb #define dAP(i_, j_) dAP, (i_) + (j_)*maxm #define work(i_) (work + (i_)) magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t iinfo, nb; magma_int_t maxm, maxn, mindim; magma_int_t i, j, rows, s, lddat, ldwork; magmaDoubleComplex_ptr dAT, dAP; magmaDoubleComplex *work; size_t dAT_offset; /* Check arguments */ *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (ldda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ mindim = min(m, n); nb = magma_get_zgetrf_nb(m); s = mindim / nb; if (nb <= 1 || nb >= min(m,n)) { /* Use CPU code. */ if ( MAGMA_SUCCESS != magma_zmalloc_cpu( &work, m*n )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_zgetmatrix( m, n, dA(0,0), ldda, work(0), m, queues[0] ); lapackf77_zgetrf( &m, &n, work, &m, ipiv, info ); magma_zsetmatrix( m, n, work(0), m, dA(0,0), ldda, queues[0] ); magma_free_cpu( work ); } else { /* Use hybrid blocked code. */ maxm = ((m + 31)/32)*32; maxn = ((n + 31)/32)*32; if ( MAGMA_SUCCESS != magma_zmalloc( &dAP, nb*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } // square matrices can be done in place; // rectangular requires copy to transpose if ( m == n ) { dAT = dA; dAT_offset = dA_offset; lddat = ldda; magmablas_ztranspose_inplace( m, dAT(0,0), lddat, queues[0] ); } else { lddat = maxn; // N-by-M dAT_offset = 0; if ( MAGMA_SUCCESS != magma_zmalloc( &dAT, lddat*maxm )) { magma_free( dAP ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magmablas_ztranspose( m, n, dA(0,0), ldda, dAT(0,0), lddat, queues[0] ); } ldwork = maxm; /* if ( MAGMA_SUCCESS != magma_zmalloc_cpu( &work, ldwork*nb ) ) { magma_free( dAP ); if ( dA != dAT ) magma_free( dAT ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } */ cl_mem work_mapped = clCreateBuffer( gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, ldwork*nb * sizeof(magmaDoubleComplex), NULL, NULL ); work = (magmaDoubleComplex*) clEnqueueMapBuffer( queues[0], work_mapped, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, ldwork*nb * sizeof(magmaDoubleComplex), 0, NULL, NULL, NULL ); for( j=0; j < s; j++ ) { // download j-th panel magmablas_ztranspose( nb, m-j*nb, dAT(j,j), lddat, dAP(0,0), maxm, queues[0] ); clFlush( queues[0] ); magma_queue_sync( queues[0] ); magma_zgetmatrix_async( m-j*nb, nb, dAP(0,0), maxm, work(0), ldwork, queues[1], NULL ); clFlush( queues[1] ); if ( j > 0 ) { magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n - (j+1)*nb, nb, c_one, dAT(j-1,j-1), lddat, dAT(j-1,j+1), lddat, queues[0] ); magma_zgemm( MagmaNoTrans, MagmaNoTrans, n-(j+1)*nb, m-j*nb, nb, c_neg_one, dAT(j-1,j+1), lddat, dAT(j, j-1), lddat, c_one, dAT(j, j+1), lddat, queues[0] ); } magma_queue_sync( queues[1] ); // do the cpu part rows = m - j*nb; lapackf77_zgetrf( &rows, &nb, work, &ldwork, ipiv+j*nb, &iinfo ); if ( *info == 0 && iinfo > 0 ) *info = iinfo + j*nb; for( i=j*nb; i < j*nb + nb; ++i ) { ipiv[i] += j*nb; } magmablas_zlaswp( n, dAT(0,0), lddat, j*nb + 1, j*nb + nb, ipiv, 1, queues[0] ); clFlush( queues[0] ); // upload j-th panel magma_zsetmatrix_async( m-j*nb, nb, work(0), ldwork, dAP(0,0), maxm, queues[1], NULL ); magma_queue_sync( queues[1] ); magmablas_ztranspose( m-j*nb, nb, dAP(0,0), maxm, dAT(j,j), lddat, queues[0] ); clFlush( queues[0] ); // do the small non-parallel computations (next panel update) if ( s > (j+1) ) { magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb, nb, c_one, dAT(j, j ), lddat, dAT(j, j+1), lddat, queues[0] ); magma_zgemm( MagmaNoTrans, MagmaNoTrans, nb, m-(j+1)*nb, nb, c_neg_one, dAT(j, j+1), lddat, dAT(j+1, j ), lddat, c_one, dAT(j+1, j+1), lddat, queues[0] ); } else { magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb, nb, c_one, dAT(j, j ), lddat, dAT(j, j+1), lddat, queues[0] ); magma_zgemm( MagmaNoTrans, MagmaNoTrans, n-(j+1)*nb, m-(j+1)*nb, nb, c_neg_one, dAT(j, j+1), lddat, dAT(j+1, j ), lddat, c_one, dAT(j+1, j+1), lddat, queues[0] ); } } magma_int_t nb0 = min( m - s*nb, n - s*nb ); if ( nb0 > 0 ) { rows = m - s*nb; magmablas_ztranspose( nb0, rows, dAT(s,s), lddat, dAP(0,0), maxm, queues[0] ); clFlush( queues[0] ); magma_queue_sync( queues[0] ); magma_zgetmatrix_async( rows, nb0, dAP(0,0), maxm, work(0), ldwork, queues[1], NULL ); magma_queue_sync( queues[1] ); // do the cpu part lapackf77_zgetrf( &rows, &nb0, work, &ldwork, ipiv+s*nb, &iinfo ); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + s*nb; for( i=s*nb; i < s*nb + nb0; ++i ) { ipiv[i] += s*nb; } magmablas_zlaswp( n, dAT(0,0), lddat, s*nb + 1, s*nb + nb0, ipiv, 1, queues[0] ); clFlush( queues[0] ); // upload j-th panel magma_zsetmatrix_async( rows, nb0, work(0), ldwork, dAP(0,0), maxm, queues[1], NULL ); magma_queue_sync( queues[1] ); magmablas_ztranspose( rows, nb0, dAP(0,0), maxm, dAT(s,s), lddat, queues[0] ); clFlush( queues[0] ); magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb-nb0, nb0, c_one, dAT(s,s), lddat, dAT(s,s)+nb0, lddat, queues[0] ); } // undo transpose if ( dA == dAT ) { magmablas_ztranspose_inplace( m, dAT(0,0), lddat, queues[0] ); } else { magmablas_ztranspose( n, m, dAT(0,0), lddat, dA(0,0), ldda, queues[0] ); magma_free( dAT ); } magma_queue_sync( queues[0] ); magma_queue_sync( queues[1] ); magma_free( dAP ); // magma_free_cpu( work ); clEnqueueUnmapMemObject( queues[0], work_mapped, work, 0, NULL, NULL ); clReleaseMemObject( work_mapped ); } return *info; } /* magma_zgetrf_gpu */
/** Purpose ------- ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. If the current stream is NULL, this version replaces it with a new stream to overlap computation with communication. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] dA COMPLEX_16 array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,M). @param[out] ipiv INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. - > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_zgesv_comp ********************************************************************/ extern "C" magma_int_t magma_zgetrf_gpu( magma_int_t m, magma_int_t n, magmaDoubleComplex_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info) { #define dAT(i_, j_) (dAT + (i_)*nb*lddat + (j_)*nb) magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t iinfo, nb; magma_int_t maxm, maxn, mindim; magma_int_t i, j, rows, cols, s, lddat, ldwork; magmaDoubleComplex *dAT, *dAP, *work; /* Check arguments */ *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (ldda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ mindim = min(m, n); nb = magma_get_zgetrf_nb(m); s = mindim / nb; if (nb <= 1 || nb >= min(m,n)) { /* Use CPU code. */ magma_zmalloc_cpu( &work, m * n ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_zgetmatrix( m, n, dA, ldda, work, m ); lapackf77_zgetrf(&m, &n, work, &m, ipiv, info); magma_zsetmatrix( m, n, work, m, dA, ldda ); magma_free_cpu(work); } else { /* Use hybrid blocked code. */ maxm = ((m + 31)/32)*32; maxn = ((n + 31)/32)*32; if (MAGMA_SUCCESS != magma_zmalloc( &dAP, nb*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } // square matrices can be done in place; // rectangular requires copy to transpose if ( m == n ) { dAT = dA; lddat = ldda; magmablas_ztranspose_inplace( m, dAT, ldda ); } else { lddat = maxn; // N-by-M if (MAGMA_SUCCESS != magma_zmalloc( &dAT, lddat*maxm )) { magma_free( dAP ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magmablas_ztranspose( m, n, dA, ldda, dAT, lddat ); } ldwork = maxm; if (MAGMA_SUCCESS != magma_zmalloc_pinned( &work, ldwork*nb )) { magma_free( dAP ); if ( ! (m == n)) magma_free( dAT ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } /* Define user stream if current stream is NULL */ magma_queue_t stream[2]; magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); magma_queue_create( &stream[0] ); if (orig_stream == NULL) { magma_queue_create( &stream[1] ); magmablasSetKernelStream(stream[1]); } else { stream[1] = orig_stream; } for( j=0; j < s; j++ ) { // download j-th panel cols = maxm - j*nb; magmablas_ztranspose( nb, m-j*nb, dAT(j,j), lddat, dAP, cols ); // make sure that the transpose has completed magma_queue_sync( stream[1] ); magma_zgetmatrix_async( m-j*nb, nb, dAP, cols, work, ldwork, stream[0]); if ( j > 0 ) { magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n - (j+1)*nb, nb, c_one, dAT(j-1,j-1), lddat, dAT(j-1,j+1), lddat ); magma_zgemm( MagmaNoTrans, MagmaNoTrans, n-(j+1)*nb, m-j*nb, nb, c_neg_one, dAT(j-1,j+1), lddat, dAT(j, j-1), lddat, c_one, dAT(j, j+1), lddat ); } // do the cpu part rows = m - j*nb; magma_queue_sync( stream[0] ); lapackf77_zgetrf( &rows, &nb, work, &ldwork, ipiv+j*nb, &iinfo); if ( *info == 0 && iinfo > 0 ) *info = iinfo + j*nb; // upload j-th panel magma_zsetmatrix_async( m-j*nb, nb, work, ldwork, dAP, maxm, stream[0]); for( i=j*nb; i < j*nb + nb; ++i ) { ipiv[i] += j*nb; } magmablas_zlaswp( n, dAT, lddat, j*nb + 1, j*nb + nb, ipiv, 1 ); magma_queue_sync( stream[0] ); magmablas_ztranspose( m-j*nb, nb, dAP, maxm, dAT(j,j), lddat ); // do the small non-parallel computations (next panel update) if ( s > (j+1) ) { magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb, nb, c_one, dAT(j, j ), lddat, dAT(j, j+1), lddat); magma_zgemm( MagmaNoTrans, MagmaNoTrans, nb, m-(j+1)*nb, nb, c_neg_one, dAT(j, j+1), lddat, dAT(j+1, j ), lddat, c_one, dAT(j+1, j+1), lddat ); } else { magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb, nb, c_one, dAT(j, j ), lddat, dAT(j, j+1), lddat); magma_zgemm( MagmaNoTrans, MagmaNoTrans, n-(j+1)*nb, m-(j+1)*nb, nb, c_neg_one, dAT(j, j+1), lddat, dAT(j+1, j ), lddat, c_one, dAT(j+1, j+1), lddat ); } } magma_int_t nb0 = min(m - s*nb, n - s*nb); if ( nb0 > 0 ) { rows = m - s*nb; cols = maxm - s*nb; magmablas_ztranspose( nb0, rows, dAT(s,s), lddat, dAP, maxm ); magma_zgetmatrix( rows, nb0, dAP, maxm, work, ldwork ); // do the cpu part lapackf77_zgetrf( &rows, &nb0, work, &ldwork, ipiv+s*nb, &iinfo); if ( *info == 0 && iinfo > 0 ) *info = iinfo + s*nb; for( i=s*nb; i < s*nb + nb0; ++i ) { ipiv[i] += s*nb; } magmablas_zlaswp( n, dAT, lddat, s*nb + 1, s*nb + nb0, ipiv, 1 ); // upload j-th panel magma_zsetmatrix( rows, nb0, work, ldwork, dAP, maxm ); magmablas_ztranspose( rows, nb0, dAP, maxm, dAT(s,s), lddat ); magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb-nb0, nb0, c_one, dAT(s,s), lddat, dAT(s,s)+nb0, lddat); } // undo transpose if ( m == n ) { magmablas_ztranspose_inplace( m, dAT, lddat ); } else { magmablas_ztranspose( n, m, dAT, lddat, dA, ldda ); magma_free( dAT ); } magma_free( dAP ); magma_free_pinned( work ); magma_queue_destroy( stream[0] ); if (orig_stream == NULL) { magma_queue_destroy( stream[1] ); } magmablasSetKernelStream( orig_stream ); } return *info; } /* magma_zgetrf_gpu */
extern "C" magma_int_t magma_zmlumerge( magma_z_matrix L, magma_z_matrix U, magma_z_matrix *A, magma_queue_t queue ) { magma_int_t info = 0; if( L.storage_type == Magma_CSR && U.storage_type == Magma_CSR ){ if( L.memory_location == Magma_CPU && U.memory_location == Magma_CPU ){ CHECK( magma_zmtransfer( L, A, Magma_CPU, Magma_CPU, queue )); magma_free_cpu( A->col ); magma_free_cpu( A->val ); // make sure it is strictly lower triangular magma_int_t z = 0; for(magma_int_t i=0; i<A->num_rows; i++){ for(magma_int_t j=L.row[i]; j<L.row[i+1]; j++){ if( L.col[j] < i ){// make sure it is strictly lower triangular z++; } } for(magma_int_t j=U.row[i]; j<U.row[i+1]; j++){ z++; } } A->nnz = z; // fill A with the new structure; CHECK( magma_index_malloc_cpu( &A->col, A->nnz )); CHECK( magma_zmalloc_cpu( &A->val, A->nnz )); z = 0; for(magma_int_t i=0; i<A->num_rows; i++){ A->row[i] = z; for(magma_int_t j=L.row[i]; j<L.row[i+1]; j++){ if( L.col[j] < i ){// make sure it is strictly lower triangular A->col[z] = L.col[j]; A->val[z] = L.val[j]; z++; } } for(magma_int_t j=U.row[i]; j<U.row[i+1]; j++){ A->col[z] = U.col[j]; A->val[z] = U.val[j]; z++; } } A->row[A->num_rows] = z; A->nnz = z; } else{ printf("error: matrix not on CPU.\n"); info = MAGMA_ERR_NOT_SUPPORTED; } } else{ printf("error: matrix in wrong format.\n"); info = MAGMA_ERR_NOT_SUPPORTED; } cleanup: if( info != 0 ){ magma_zmfree( A, queue ); } return info; }
static void magma_ztile_bulge_parallel(magma_int_t my_core_id, magma_int_t cores_num, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *V, magma_int_t ldv, magmaDoubleComplex *TAU, magma_int_t n, magma_int_t nb, magma_int_t nbtiles, magma_int_t grsiz, magma_int_t Vblksiz, volatile magma_int_t *prog) { magma_int_t sweepid, myid, shift, stt, st, ed, stind, edind; magma_int_t blklastind, colpt; magma_int_t stepercol; magma_int_t i, j, m, k; magma_int_t thgrsiz, thgrnb, thgrid, thed; magma_int_t coreid; magma_int_t colblktile, maxrequiredcores, colpercore, mycoresnb; magma_int_t fin; magmaDoubleComplex *work; if (n <= 0) return; if (grsiz <= 0) return; //printf("=================> my core id %d of %d \n",my_core_id, cores_num); /* As I store V in the V vector there are overlap between * tasks so shift is now 4 where group need to be always * multiple of 2, because as example if grs=1 task 2 from * sweep 2 can run with task 6 sweep 1., but task 2 sweep 2 * will overwrite the V of tasks 5 sweep 1 which are used by * task 6, so keep in mind that group need to be multiple of 2, * and thus tasks 2 sweep 2 will never run with task 6 sweep 1. * However, when storing V in A, shift could be back to 3. * */ magma_zmalloc_cpu(&work, n); mycoresnb = cores_num; shift = 5; if (grsiz == 1) colblktile=1; else colblktile=grsiz/2; maxrequiredcores = nbtiles/colblktile; if (maxrequiredcores < 1)maxrequiredcores=1; colpercore = colblktile*nb; if (mycoresnb > maxrequiredcores) mycoresnb = maxrequiredcores; thgrsiz = n; stepercol = magma_ceildiv(shift, grsiz); thgrnb = magma_ceildiv(n-1, thgrsiz); #ifdef ENABLE_DEBUG if (my_core_id == 0) { if (cores_num > maxrequiredcores) { printf("==================================================================================\n"); printf(" WARNING only %3d threads are required to run this test optimizing cache reuse\n", maxrequiredcores); printf("==================================================================================\n"); } printf(" Static bulgechasing version v9_9col threads %4d N %5d NB %5d grs %4d thgrsiz %4d \n", cores_num, n, nb, grsiz, thgrsiz); } #endif for (thgrid = 1; thgrid <= thgrnb; thgrid++) { stt = (thgrid-1)*thgrsiz+1; thed = min( (stt + thgrsiz -1), (n-1)); for (i = stt; i <= n-1; i++) { ed = min(i,thed); if (stt > ed) break; for (m = 1; m <= stepercol; m++) { st=stt; for (sweepid = st; sweepid <= ed; sweepid++) { for (k = 1; k <= grsiz; k++) { myid = (i-sweepid)*(stepercol*grsiz) +(m-1)*grsiz + k; if (myid%2 == 0) { colpt = (myid/2)*nb+1+sweepid-1; stind = colpt-nb+1; edind = min(colpt,n); blklastind = colpt; if (stind >= edind) { printf("ERROR---------> st >= ed %d %d \n\n", (int) stind, (int) edind); exit(-10); } } else { colpt = ((myid+1)/2)*nb + 1 +sweepid -1; stind = colpt-nb+1; edind = min(colpt,n); if ( (stind >= edind-1) && (edind == n) ) blklastind=n; else blklastind=0; if (stind > edind) { printf("ERROR---------> st >= ed %d %d \n\n", (int) stind, (int) edind); exit(-10); } } coreid = (stind/colpercore)%mycoresnb; if (my_core_id == coreid) { fin=0; while(fin == 0) { if (myid == 1) { if ( prog[myid+shift-1] == (sweepid-1) ) { magma_ztrdtype1cbHLsym_withQ_v2(n, nb, A, lda, V, ldv, TAU, stind, edind, sweepid, Vblksiz, work); fin=1; prog[myid]= sweepid; if (blklastind >= (n-1)) { for (j = 1; j <= shift; j++) prog[myid+j]=sweepid; } } // END progress condition } else { if ( (prog[myid-1] == sweepid) && (prog[myid+shift-1] == (sweepid-1)) ) { if (myid%2 == 0) magma_ztrdtype2cbHLsym_withQ_v2(n, nb, A, lda, V, ldv, TAU, stind, edind, sweepid, Vblksiz, work); else magma_ztrdtype3cbHLsym_withQ_v2(n, nb, A, lda, V, ldv, TAU, stind, edind, sweepid, Vblksiz, work); fin=1; prog[myid]= sweepid; if (blklastind >= (n-1)) { for (j = 1; j <= shift+mycoresnb; j++) prog[myid+j]=sweepid; } } // END progress condition } // END if myid == 1 } // END while loop } // END if my_core_id == coreid if (blklastind >= (n-1)) { stt=stt+1; break; } } // END for k=1:grsiz } // END for sweepid=st:ed } // END for m=1:stepercol } // END for i=1:n-1 } // END for thgrid=1:thgrnb magma_free_cpu(work); } // END FUNCTION
extern "C" magma_int_t magma_zgetrf_mgpu(magma_int_t num_gpus, magma_int_t m, magma_int_t n, cuDoubleComplex **d_lA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Arguments ========= NUM_GPUS (input) INTEGER The number of GPUS to be used for the factorization. M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. LDDA (input) INTEGER The leading dimension of the array A. LDDA >= max(1,M). IPIV (output) INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. ===================================================================== */ #define inAT(id,i,j) (d_lAT[(id)] + (i)*nb*lddat + (j)*nb) cuDoubleComplex c_one = MAGMA_Z_ONE; cuDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t iinfo, nb, n_local[MagmaMaxGPUs]; magma_int_t maxm, mindim; magma_int_t i, j, d, rows, cols, s, lddat, lddwork; magma_int_t id, i_local, i_local2, nb0, nb1; cuDoubleComplex *d_lAT[MagmaMaxGPUs]; cuDoubleComplex *d_panel[MagmaMaxGPUs], *work; cudaStream_t streaml[4][2]; /* Check arguments */ *info = 0; if (m < 0) *info = -2; else if (n < 0) *info = -3; else if (ldda < max(1,m)) *info = -5; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ mindim = min(m, n); nb = magma_get_zgetrf_nb(m); if (nb <= 1 || nb >= n) { /* Use CPU code. */ magma_zmalloc_cpu( &work, m * n ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_zgetmatrix( m, n, d_lA[0], ldda, work, m ); lapackf77_zgetrf(&m, &n, work, &m, ipiv, info); magma_zsetmatrix( m, n, work, m, d_lA[0], ldda ); magma_free_cpu(work); } else { /* Use hybrid blocked code. */ maxm = ((m + 31)/32)*32; if( num_gpus > ceil((double)n/nb) ) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) num_gpus ); *info = -1; return *info; } /* allocate workspace for each GPU */ lddat = ((((((n+nb-1)/nb)/num_gpus)*nb)+31)/32)*32; lddat = (n+nb-1)/nb; /* number of block columns */ lddat = (lddat+num_gpus-1)/num_gpus; /* number of block columns per GPU */ lddat = nb*lddat; /* number of columns per GPU */ lddat = ((lddat+31)/32)*32; /* make it a multiple of 32 */ for(i=0; i<num_gpus; i++){ magma_setdevice(i); /* local-n and local-ld */ n_local[i] = ((n/nb)/num_gpus)*nb; if (i < (n/nb)%num_gpus) n_local[i] += nb; else if (i == (n/nb)%num_gpus) n_local[i] += n%nb; /* workspaces */ if (MAGMA_SUCCESS != magma_zmalloc( &d_panel[i], 3*nb*maxm )) { for( j=0; j<=i; j++ ) { magma_setdevice(j); } for( j=0; j<i; j++ ) { magma_setdevice(j); magma_free( d_panel[j] ); magma_free( d_lAT[j] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* local-matrix storage */ if (MAGMA_SUCCESS != magma_zmalloc( &d_lAT[i], lddat*maxm )) { for( j=0; j<=i; j++ ) { magma_setdevice(j); magma_free( d_panel[j] ); } for( j=0; j<i; j++ ) { magma_setdevice(j); magma_free( d_lAT[j] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* create the streams */ magma_queue_create( &streaml[i][0] ); magma_queue_create( &streaml[i][1] ); magmablasSetKernelStream(streaml[i][1]); magmablas_ztranspose2( d_lAT[i], lddat, d_lA[i], ldda, m, n_local[i] ); } for(i=0; i<num_gpus; i++){ magma_setdevice(i); cudaStreamSynchronize(streaml[i][0]); magmablasSetKernelStream(NULL); } magma_setdevice(0); /* cpu workspace */ lddwork = maxm; if (MAGMA_SUCCESS != magma_zmalloc_pinned( &work, lddwork*nb*num_gpus )) { for(i=0; i<num_gpus; i++ ) { magma_setdevice(i); magma_free( d_panel[i] ); magma_free( d_lAT[i] ); } *info = MAGMA_ERR_HOST_ALLOC; return *info; } /* calling multi-gpu interface with allocated workspaces and streams */ //magma_zgetrf1_mgpu( num_gpus, m, n, nb, 0, d_lAT, lddat, ipiv, d_panel, work, maxm, // (cudaStream_t **)streaml, info ); magma_zgetrf2_mgpu(num_gpus, m, n, nb, 0, d_lAT, lddat, ipiv, d_panel, work, maxm, streaml, info); /* clean up */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); /* save on output */ magmablas_ztranspose2( d_lA[d], ldda, d_lAT[d], lddat, n_local[d], m ); magma_device_sync(); magma_free( d_lAT[d] ); magma_free( d_panel[d] ); magma_queue_destroy( streaml[d][0] ); magma_queue_destroy( streaml[d][1] ); magmablasSetKernelStream(NULL); } /* end of for d=1,..,num_gpus */ magma_setdevice(0); magma_free_pinned( work ); } return *info; /* End of MAGMA_ZGETRF_MGPU */ }
extern "C" magma_err_t magma_zgetrf_msub(magma_int_t trans, magma_int_t num_subs, magma_int_t num_gpus, magma_int_t m, magma_int_t n, magmaDoubleComplex_ptr *d_lA, size_t dlA_offset, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info, magma_queue_t *queues) { /* -- clMAGMA (version 1.1.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date January 2014 Purpose ======= ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Arguments ========= NUM_GPUS (input) INTEGER The number of GPUS to be used for the factorization. M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. LDDA (input) INTEGER The leading dimension of the array A. LDDA >= max(1,M). IPIV (output) INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. ===================================================================== */ magma_int_t maxm, tot_subs = num_subs*num_gpus; magma_int_t i, j, d, lddat; /* submatrix info */ magma_int_t nb, n_local[ MagmaMaxSubs * MagmaMaxGPUs ]; magmaDoubleComplex_ptr d_lAT[ MagmaMaxSubs * MagmaMaxGPUs ]; /* local workspace per GPU */ magmaDoubleComplex_ptr d_panel[ MagmaMaxGPUs ]; magmaDoubleComplex_ptr d_lAP[ MagmaMaxGPUs ]; magmaDoubleComplex *work; /* Check arguments */ *info = 0; if (m < 0) *info = -2; else if (n < 0) *info = -3; else if (trans == MagmaTrans && ldda < max(1,m)) *info = -5; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ nb = magma_get_zgetrf_nb(m); if (nb <= 1 || nb >= n) { /* Use CPU code. */ magma_zmalloc_cpu( &work, m * n ); if (work == NULL) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_zgetmatrix( m, n, d_lA[0], 0, ldda, work, 0, m, queues[0] ); lapackf77_zgetrf(&m, &n, work, &m, ipiv, info); magma_zsetmatrix( m, n, work, 0, m, d_lA[0], 0, ldda, queues[0] ); magma_free_cpu(work); } else { /* Use hybrid blocked code. */ maxm = ((m + 31)/32)*32; if (tot_subs > ceil((double)n/nb)) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) tot_subs ); *info = -1; return *info; } /* allocate workspace for each GPU */ lddat = n/nb; /* number of block columns */ lddat = lddat/tot_subs; /* number of block columns per GPU */ lddat = nb*lddat; /* number of columns per GPU */ if (lddat * tot_subs < n) { /* left over */ if (n-lddat*tot_subs >= nb) { lddat += nb; } else { lddat += (n-lddat*tot_subs)%nb; } } lddat = ((lddat+31)/32)*32; /* make it a multiple of 32 */ /* allocating workspace */ for (d=0; d<num_gpus; d++) { //#define SINGLE_GPU_PER_CONTEXT #ifdef SINGLE_GPU_PER_CONTEXT if ((MAGMA_SUCCESS != magma_zmalloc_mgpu( d, &d_panel[d], (2+num_gpus)*nb*maxm )) || (MAGMA_SUCCESS != magma_zmalloc_mgpu( d, &d_lAP[d], (2+num_gpus)*nb*maxm )) ) { #else if ((MAGMA_SUCCESS != magma_zmalloc( &d_panel[d], (2+num_gpus)*nb*maxm )) || (MAGMA_SUCCESS != magma_zmalloc( &d_lAP[d], (2+num_gpus)*nb*maxm )) ) { #endif for( i=0; i<d; i++ ) { magma_free( d_panel[i] ); magma_free( d_lAP[i] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } } /* transposing the local matrix */ for (i=0; i<tot_subs; i++) { /* local-n and local-ld */ n_local[i] = ((n/nb)/tot_subs)*nb; if (i < (n/nb)%tot_subs) n_local[i] += nb; else if (i == (n/nb)%tot_subs) n_local[i] += n%nb; /* local-matrix storage */ if (trans == MagmaNoTrans) { d_lAT[i] = d_lA[i]; } else { if ((m == n_local[i]) && (m%32 == 0) && (ldda%32 == 0) && (ldda == lddat)) { d_lAT[i] = d_lA[i]; magma_ztranspose_inplace( d_lA[i], 0, ldda, ldda, queues[2*(i%num_gpus)+1] ); } else { #ifdef SINGLE_GPU_PER_CONTEXT if (MAGMA_SUCCESS != magma_zmalloc_mgpu( i%num_gpus, &d_lAT[i], lddat*maxm )) { #else if (MAGMA_SUCCESS != magma_zmalloc( &d_lAT[i], lddat*maxm )) { #endif for (j=0; j<=i; j++) { magma_free( d_panel[j] ); magma_free( d_lAP[j] ); } for (j=0; j<i; j++) { if (d_lAT[j] != d_lA[j]) magma_free( d_lAT[j] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_ztranspose2(d_lAT[i], 0, lddat, d_lA[i], 0, ldda, m, n_local[i], queues[2*(i%num_gpus)+1]); } } } if (trans == MagmaNoTrans) { for (d=0; d<num_gpus; d++){ magma_queue_sync(queues[2*d+1]); } } /* cpu workspace */ #ifdef USE_PINNED_CLMEMORY cl_mem buffer = clCreateBuffer(gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(magmaDoubleComplex)*maxm*nb*(1+num_gpus), NULL, NULL); for (d=0; d<num_gpus; d++) { work = (magmaDoubleComplex*)clEnqueueMapBuffer(queues[2*d], buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(magmaDoubleComplex)*maxm*nb*(1+num_gpus), 0, NULL, NULL, NULL); } #else if (MAGMA_SUCCESS != magma_zmalloc_cpu( &work, maxm*nb*(1+num_gpus) )) { for(d=0; d<num_gpus; d++ ) magma_free( d_panel[d] ); for(d=0; d<tot_subs; d++ ) { if( d_lAT[d] != d_lA[d] ) magma_free( d_lAT[d] ); } *info = MAGMA_ERR_HOST_ALLOC; return *info; } #endif /* calling multi-gpu interface with allocated workspaces and streams */ magma_zgetrf2_msub(num_subs, num_gpus, m, n, nb, 0, d_lAT, 0, lddat, ipiv, d_lAP, d_panel, 0, work, maxm, info, queues); /* save on output */ for (d=0; d<tot_subs; d++) { if (trans == MagmaNoTrans) { //magma_zcopymatrix( n_local[d], m, d_lAT[d], 0, lddat, d_lA[d], 0, ldda, queues[2*d+1] ); } else { if (d_lAT[d] == d_lA[d]) { magma_ztranspose_inplace( d_lA[d], 0, ldda, ldda, queues[2*(d%num_gpus)+1] ); } else { magma_ztranspose2( d_lA[d], 0, ldda, d_lAT[d], 0, lddat, n_local[d], m, queues[2*(d%num_gpus)+1] ); } } } /* clean up */ for (d=0; d<num_gpus; d++) { magma_queue_sync(queues[2*d+1]); magma_free( d_panel[d] ); magma_free( d_lAP[d] ); d_panel[d] = d_lAP[d] = NULL; } for (d=0; d<tot_subs; d++) { if (d_lAT[d] != d_lA[d]) { magma_free( d_lAT[d] ); d_lAT[d] = NULL; } } #ifdef USE_PINNED_CLMEMORY for (d=0; d<num_gpus; d++) { clEnqueueUnmapMemObject(queues[2*d], buffer, work, 0, NULL, NULL); } clReleaseMemObject( buffer ); #else magma_free_cpu( work ); #endif work = NULL; } return *info; /* End of MAGMA_ZGETRF_MSUB */ }
extern "C" magma_int_t magma_zpcg_merge( magma_z_matrix A, magma_z_matrix b, magma_z_matrix *x, magma_z_solver_par *solver_par, magma_z_preconditioner *precond_par, magma_queue_t queue ) { magma_int_t info = MAGMA_NOTCONVERGED; // prepare solver feedback solver_par->solver = Magma_PCGMERGE; solver_par->numiter = 0; solver_par->spmv_count = 0; // solver variables magmaDoubleComplex alpha, beta, gamma, rho, tmp1, *skp_h={0}; double nom, nom0, r0, res, nomb; magmaDoubleComplex den; // some useful variables magmaDoubleComplex c_zero = MAGMA_Z_ZERO, c_one = MAGMA_Z_ONE; magma_int_t dofs = A.num_rows*b.num_cols; magma_z_matrix r={Magma_CSR}, d={Magma_CSR}, z={Magma_CSR}, h={Magma_CSR}, rt={Magma_CSR}; magmaDoubleComplex *d1=NULL, *d2=NULL, *skp=NULL; // GPU workspace CHECK( magma_zvinit( &r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &d, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &z, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &rt, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zvinit( &h, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue )); CHECK( magma_zmalloc( &d1, dofs*(2) )); CHECK( magma_zmalloc( &d2, dofs*(2) )); // array for the parameters CHECK( magma_zmalloc( &skp, 7 )); // skp = [alpha|beta|gamma|rho|tmp1|tmp2|res] // solver setup CHECK( magma_zresidualvec( A, b, *x, &r, &nom0, queue)); // preconditioner CHECK( magma_z_applyprecond_left( MagmaNoTrans, A, r, &rt, precond_par, queue )); CHECK( magma_z_applyprecond_right( MagmaNoTrans, A, rt, &h, precond_par, queue )); magma_zcopy( dofs, h.dval, 1, d.dval, 1, queue ); nom = MAGMA_Z_ABS( magma_zdotc( dofs, r.dval, 1, h.dval, 1, queue )); CHECK( magma_z_spmv( c_one, A, d, c_zero, z, queue )); // z = A d den = magma_zdotc( dofs, d.dval, 1, z.dval, 1, queue ); // den = d'* z solver_par->init_res = nom0; nomb = magma_dznrm2( dofs, b.dval, 1, queue ); if ( nomb == 0.0 ){ nomb=1.0; } if ( (r0 = nomb * solver_par->rtol) < ATOLERANCE ){ r0 = ATOLERANCE; } solver_par->final_res = solver_par->init_res; solver_par->iter_res = solver_par->init_res; if ( solver_par->verbose > 0 ) { solver_par->res_vec[0] = (real_Double_t)nom0; solver_par->timing[0] = 0.0; } if ( nom < r0 ) { info = MAGMA_SUCCESS; goto cleanup; } // check positive definite if ( MAGMA_Z_ABS(den) <= 0.0 ) { info = MAGMA_NONSPD; goto cleanup; } // array on host for the parameters CHECK( magma_zmalloc_cpu( &skp_h, 7 )); alpha = rho = gamma = tmp1 = c_one; beta = magma_zdotc( dofs, h.dval, 1, r.dval, 1, queue ); skp_h[0]=alpha; skp_h[1]=beta; skp_h[2]=gamma; skp_h[3]=rho; skp_h[4]=tmp1; skp_h[5]=MAGMA_Z_MAKE(nom, 0.0); skp_h[6]=MAGMA_Z_MAKE(nom, 0.0); magma_zsetvector( 7, skp_h, 1, skp, 1, queue ); //Chronometry real_Double_t tempo1, tempo2, tempop1, tempop2; tempo1 = magma_sync_wtime( queue ); solver_par->numiter = 0; solver_par->spmv_count = 0; // start iteration do { solver_par->numiter++; // computes SpMV and dot product CHECK( magma_zcgmerge_spmv1( A, d1, d2, d.dval, z.dval, skp, queue )); solver_par->spmv_count++; if( precond_par->solver == Magma_JACOBI ){ CHECK( magma_zjcgmerge_xrbeta( dofs, d1, d2, precond_par->d.dval, x->dval, r.dval, d.dval, z.dval, h.dval, skp, queue )); } else if( precond_par->solver == Magma_NONE ){ // updates x, r CHECK( magma_zpcgmerge_xrbeta1( dofs, x->dval, r.dval, d.dval, z.dval, skp, queue )); // computes scalars and updates d CHECK( magma_zpcgmerge_xrbeta2( dofs, d1, d2, r.dval, r.dval, d.dval, skp, queue )); } else { // updates x, r CHECK( magma_zpcgmerge_xrbeta1( dofs, x->dval, r.dval, d.dval, z.dval, skp, queue )); // preconditioner in between tempop1 = magma_sync_wtime( queue ); CHECK( magma_z_applyprecond_left( MagmaNoTrans, A, r, &rt, precond_par, queue )); CHECK( magma_z_applyprecond_right( MagmaNoTrans, A, rt, &h, precond_par, queue )); // magma_zcopy( dofs, r.dval, 1, h.dval, 1 ); tempop2 = magma_sync_wtime( queue ); precond_par->runtime += tempop2-tempop1; // computes scalars and updates d CHECK( magma_zpcgmerge_xrbeta2( dofs, d1, d2, h.dval, r.dval, d.dval, skp, queue )); } //if( solver_par->numiter==1){ // magma_zcopy( dofs, h.dval, 1, d.dval, 1 ); //} // updates x, r, computes scalars and updates d //CHECK( magma_zcgmerge_xrbeta( dofs, d1, d2, x->dval, r.dval, d.dval, z.dval, skp, queue )); // check stopping criterion (asynchronous copy) magma_zgetvector( 1 , skp+6, 1, skp_h+6, 1, queue ); res = sqrt(MAGMA_Z_ABS(skp_h[6])); if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) res; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( res/nomb <= solver_par->rtol || res <= solver_par->atol ){ break; } } while ( solver_par->numiter+1 <= solver_par->maxiter ); tempo2 = magma_sync_wtime( queue ); solver_par->runtime = (real_Double_t) tempo2-tempo1; double residual; CHECK( magma_zresidualvec( A, b, *x, &r, &residual, queue)); solver_par->iter_res = res; solver_par->final_res = residual; if ( solver_par->numiter < solver_par->maxiter ) { info = MAGMA_SUCCESS; } else if ( solver_par->init_res > solver_par->final_res ) { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) res; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } info = MAGMA_SLOW_CONVERGENCE; if( solver_par->iter_res < solver_par->atol || solver_par->iter_res/solver_par->init_res < solver_par->rtol ){ info = MAGMA_SUCCESS; } } else { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) res; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = MAGMA_DIVERGENCE; } cleanup: magma_zmfree(&r, queue ); magma_zmfree(&z, queue ); magma_zmfree(&d, queue ); magma_zmfree(&rt, queue ); magma_zmfree(&h, queue ); magma_free( d1 ); magma_free( d2 ); magma_free( skp ); magma_free_cpu( skp_h ); solver_par->info = info; return info; } /* magma_zpcg_merge */
magma_int_t magma_zsymbilu( magma_z_matrix *A, magma_int_t levels, magma_z_matrix *L, magma_z_matrix *U, magma_queue_t queue ) { magma_int_t info = 0; magma_z_matrix A_copy={Magma_CSR}, B={Magma_CSR}; magma_z_matrix hA={Magma_CSR}, CSRCOOA={Magma_CSR}; if( A->memory_location == Magma_CPU && A->storage_type == Magma_CSR ){ CHECK( magma_zmtransfer( *A, &A_copy, Magma_CPU, Magma_CPU, queue )); CHECK( magma_zmtransfer( *A, &B, Magma_CPU, Magma_CPU, queue )); // possibility to scale to unit diagonal //magma_zmscale( &B, Magma_UNITDIAG ); CHECK( magma_zmconvert( B, L, Magma_CSR, Magma_CSR , queue)); CHECK( magma_zmconvert( B, U, Magma_CSR, Magma_CSR, queue )); magma_int_t num_lnnz = (levels > 0 ) ? B.nnz/2*(2*levels+50) : B.nnz; magma_int_t num_unnz = (levels > 0 ) ? B.nnz/2*(2*levels+50) : B.nnz; magma_free_cpu( L->col ); magma_free_cpu( U->col ); CHECK( magma_index_malloc_cpu( &L->col, num_lnnz )); CHECK( magma_index_malloc_cpu( &U->col, num_unnz )); magma_zsymbolic_ilu( levels, A->num_rows, &num_lnnz, &num_unnz, B.row, B.col, L->row, L->col, U->row, U->col ); L->nnz = num_lnnz; U->nnz = num_unnz; magma_free_cpu( L->val ); magma_free_cpu( U->val ); CHECK( magma_zmalloc_cpu( &L->val, L->nnz )); CHECK( magma_zmalloc_cpu( &U->val, U->nnz )); for( magma_int_t i=0; i<L->nnz; i++ ) L->val[i] = MAGMA_Z_MAKE( 0.0, 0.0 ); for( magma_int_t i=0; i<U->nnz; i++ ) U->val[i] = MAGMA_Z_MAKE( 0.0, 0.0 ); // take the original values (scaled) as initial guess for L for(magma_int_t i=0; i<L->num_rows; i++){ for(magma_int_t j=B.row[i]; j<B.row[i+1]; j++){ magma_index_t lcol = B.col[j]; for(magma_int_t k=L->row[i]; k<L->row[i+1]; k++){ if( L->col[k] == lcol ){ L->val[k] = B.val[j]; } } } } // take the original values (scaled) as initial guess for U for(magma_int_t i=0; i<U->num_rows; i++){ for(magma_int_t j=B.row[i]; j<B.row[i+1]; j++){ magma_index_t lcol = B.col[j]; for(magma_int_t k=U->row[i]; k<U->row[i+1]; k++){ if( U->col[k] == lcol ){ U->val[k] = B.val[j]; } } } } magma_zmfree( &B, queue ); // fill A with the new structure; magma_free_cpu( A->col ); magma_free_cpu( A->val ); CHECK( magma_index_malloc_cpu( &A->col, L->nnz+U->nnz )); CHECK( magma_zmalloc_cpu( &A->val, L->nnz+U->nnz )); A->nnz = L->nnz+U->nnz; magma_int_t z = 0; for(magma_int_t i=0; i<A->num_rows; i++){ A->row[i] = z; for(magma_int_t j=L->row[i]; j<L->row[i+1]; j++){ A->col[z] = L->col[j]; A->val[z] = L->val[j]; z++; } for(magma_int_t j=U->row[i]; j<U->row[i+1]; j++){ A->col[z] = U->col[j]; A->val[z] = U->val[j]; z++; } } A->row[A->num_rows] = z; // reset the values of A to the original entries for(magma_int_t i=0; i<A->num_rows; i++){ for(magma_int_t j=A_copy.row[i]; j<A_copy.row[i+1]; j++){ magma_index_t lcol = A_copy.col[j]; for(magma_int_t k=A->row[i]; k<A->row[i+1]; k++){ if( A->col[k] == lcol ){ A->val[k] = A_copy.val[j]; } } } } } else { magma_storage_t A_storage = A->storage_type; magma_location_t A_location = A->memory_location; CHECK( magma_zmtransfer( *A, &hA, A->memory_location, Magma_CPU, queue )); CHECK( magma_zmconvert( hA, &CSRCOOA, hA.storage_type, Magma_CSR, queue )); CHECK( magma_zsymbilu( &CSRCOOA, levels, L, U, queue )); magma_zmfree( &hA, queue ); magma_zmfree( A, queue ); CHECK( magma_zmconvert( CSRCOOA, &hA, Magma_CSR, A_storage, queue )); CHECK( magma_zmtransfer( hA, A, Magma_CPU, A_location, queue )); } cleanup: if( info != 0 ){ magma_zmfree( L, queue ); magma_zmfree( U, queue ); } magma_zmfree( &A_copy, queue ); magma_zmfree( &B, queue ); magma_zmfree( &hA, queue ); magma_zmfree( &CSRCOOA, queue ); return info; }