float verif_representativity(int m, int n, float *A, int ia, int ja, int *descA, float *U, int iu, int ju, int *descU, float *VT, int ivt, int jvt, int *descVT, float *S){ float *Acpy=NULL, *Ucpy=NULL; int nprow, npcol, myrow, mycol; int min_mn, max_mn, mpA, pcol, localcol, i, nqA; int ictxt, nbA, rsrcA, csrcA, nbU, rsrcU, csrcU, mpU, nqU; int ctxt_ = 1, nb_ = 5, rsrc_ = 6, csrc_ = 7; int izero = 0, ione = 1; float *wwork=NULL; float tmone= -1.0e+00, tpone= +1.0e+00; float residF, AnormF; min_mn = min(m,n); max_mn = max(m,n); ictxt = descA[ctxt_]; Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol ); nbA = descA[nb_]; rsrcA = descA[rsrc_] ; csrcA = descA[csrc_] ; mpA = numroc_( &m , &nbA, &myrow, &rsrcA, &nprow ); nqA = numroc_( &n , &nbA, &mycol, &csrcA, &npcol ); Acpy = (float *)calloc(mpA*nqA,sizeof(float)) ; if (Acpy==NULL){ printf("error of memory allocation Acpy on proc %dx%d\n",myrow,mycol); exit(0); } pslacpy_( "All", &m, &n, A, &ia, &ja, descA, Acpy, &ia, &ja, descA ); nbU = descU[nb_]; rsrcU = descU[rsrc_] ; csrcU = descU[csrc_] ; mpU = numroc_( &m , &nbU, &myrow, &rsrcU, &nprow ); nqU = numroc_( &min_mn, &nbU, &mycol, &csrcU, &npcol ); Ucpy = (float *)calloc(mpU*nqU,sizeof(float)) ; if (Ucpy==NULL){ printf("error of memory allocation Ucpy on proc %dx%d\n",myrow,mycol); exit(0); } pslacpy_( "All", &m, &min_mn, U, &iu, &ju, descU, Ucpy, &iu, &ju, descU ); AnormF = pslange_( "F", &m, &n, A, &ia, &ja, descA, wwork); for (i=1;i<min_mn+1;i++){ pcol = indxg2p_( &i, &(descU[5]), &izero, &izero, &npcol ); localcol = indxg2l_( &i, &(descU[5]), &izero, &izero, &npcol ); if( mycol==pcol ) sscal_( &mpA, &(S[i-1]), &(Ucpy[ ( localcol-1 )*descU[8] ]), &ione ); } psgemm_( "N", "N", &m, &n, &min_mn, &tpone, Ucpy, &iu, &ju, descU, VT, &ivt, &jvt, descVT, &tmone, Acpy, &ia, &ja, descA ); residF = pslange_( "F", &m, &n, Acpy, &ione, &ione, descA, wwork); residF = residF/AnormF/((float) max_mn); free(Ucpy); free(Acpy); return residF; }
float verif_repres_VN(int m, int n, float *A, int ia, int ja, int *descA, float *U, int iu, int ju, int *descU, float *S){ float *VTcpy=NULL; int nprow, npcol, myrow, mycol; int min_mn, max_mn, mpA, prow, localcol, i, nqA; int ictxt, nbA, rsrcA, csrcA, mpVT, nqVT, descVTcpy[9], itemp, ivtcpy, jvtcpy; int ctxt_ = 1, nb_ = 5, rsrc_ = 6, csrc_ = 7; int izero = 0, info; float tpone= +1.0e+00, tzero= +0.0e+00; float verif_repres_VN, invStemp; min_mn = min(m,n); max_mn = max(m,n); ictxt = descA[ctxt_]; Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol ); nbA = descA[nb_]; rsrcA = descA[rsrc_] ; csrcA = descA[csrc_] ; mpA = numroc_( &m , &nbA, &myrow, &rsrcA, &nprow ); nqA = numroc_( &n , &nbA, &mycol, &csrcA, &npcol ); mpVT = numroc_( &min_mn, &nbA, &myrow, &rsrcA, &nprow ); nqVT = numroc_( &n , &nbA, &mycol, &csrcA, &npcol ); itemp = max( 1, mpVT ); descinit_( descVTcpy, &min_mn, &n, &nbA, &nbA, &rsrcA, &csrcA, &ictxt, &itemp, &info ); ivtcpy = 1; jvtcpy = 1; VTcpy = (float *)calloc(mpVT*nqVT,sizeof(float)) ; if (VTcpy==NULL){ printf("error of memory allocation VTcpy on proc %dx%d\n",myrow,mycol); exit(0); } psgemm_( "T", "N", &min_mn, &n, &m, &tpone, U, &iu, &ju, descU, A, &ia, &ja, descA, &tzero, VTcpy, &ivtcpy, &jvtcpy, descVTcpy ); for (i=1;i<min_mn+1;i++){ prow = indxg2p_( &i, &nbA, &izero, &izero, &nprow ); localcol = indxg2l_( &i, &nbA, &izero, &izero, &nprow ); invStemp = 1/S[i-1]; if( myrow==prow ) sscal_( &nqA, &invStemp, &(VTcpy[localcol-1]), &mpVT ); } verif_repres_VN = verif_orthogonality(min_mn,n,VTcpy, ivtcpy, jvtcpy, descVTcpy); free(VTcpy); return verif_repres_VN; }
float verif_repres_NV(int m, int n, float *A, int ia, int ja, int *descA, float *VT, int ivt, int jvt, int *descVT, float *S){ float *Ucpy=NULL; int nprow, npcol, myrow, mycol; int min_mn, max_mn, mpA, pcol, localcol, i, nqA; int ictxt, nbA, rsrcA, csrcA, mpU, nqU, descUcpy[9], itemp, iucpy, jucpy; int ctxt_ = 1, nb_ = 5, rsrc_ = 6, csrc_ = 7; int izero = 0, ione = 1, info; float tpone= +1.0e+00, tzero= +0.0e+00; float verif_repres_NV, invStemp; min_mn = min(m,n); max_mn = max(m,n); ictxt = descA[ctxt_]; Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol ); nbA = descA[nb_]; rsrcA = descA[rsrc_] ; csrcA = descA[csrc_] ; mpA = numroc_( &m , &nbA, &myrow, &rsrcA, &nprow ); nqA = numroc_( &n , &nbA, &mycol, &csrcA, &npcol ); itemp = max( 1, mpA ); descinit_( descUcpy, &m, &min_mn, &nbA, &nbA, &rsrcA, &csrcA, &ictxt, &itemp, &info ); iucpy = 1; jucpy = 1; mpU = numroc_( &m , &nbA, &myrow, &rsrcA, &nprow ); nqU = numroc_( &min_mn, &nbA, &mycol, &csrcA, &npcol ); Ucpy = (float *)calloc(mpU*nqU,sizeof(float)) ; if (Ucpy==NULL){ printf("error of memory allocation Ucpy on proc %dx%d\n",myrow,mycol); exit(0); } psgemm_( "N", "T", &m, &min_mn, &n, &tpone, A, &ia, &ja, descA, VT, &ivt, &jvt, descVT, &tzero, Ucpy, &iucpy, &jucpy, descUcpy ); for (i=1;i<min_mn+1;i++){ pcol = indxg2p_( &i, &(descUcpy[5]), &izero, &izero, &npcol ); localcol = indxg2l_( &i, &(descUcpy[5]), &izero, &izero, &npcol ); invStemp = 1/S[i-1]; if( mycol==pcol ) sscal_( &mpA, &invStemp, &(Ucpy[ ( localcol-1 )*descUcpy[8] ]), &ione ); } verif_repres_NV = verif_orthogonality(m,min_mn,Ucpy, iucpy, jucpy, descUcpy); free(Ucpy); return verif_repres_NV; }
void pzgecopy_hd( F_CHAR_T TRANS, int *m_in, int *n_in, double *A, int *ia_in, int *ja_in, int *descA, double *dB, int *ib_in, int *jb_in, int *descB ) { /* Copy m by n distributed submatrix from host to GPU */ int m = *m_in; int n = *n_in; int ia = *ia_in; int ja = *ja_in; int ib = *ib_in; int jb = *jb_in; int nprow = 0; int npcol = 0; int myprow = 0; int mypcol = 0; int mmb = 0; int nnb = 0; int istart = 0; int iend = 0; int isize = 0; int Locp = 0; int Locq = 0; int lld = 0; int jstart = 0; int jend = 0; int jsize = 0; int iib = 0; int jjb = 0; cuDoubleComplex *dBptr = 0; double *Btmp = 0; F_CHAR_T NoTrans = "N"; int descBtmp[DLEN_]; int elmSize = sizeof(cuDoubleComplex); double z_one[2]; double z_zero[2]; /* Tuneable parameters */ int mfactor = 4; int nfactor = 4; int rsrc = 0; int csrc = 0; int irsrc = 0; int jcsrc = 0; int info = 0; int notran = 0; int TrA = 0; int lrindx = 0; int lcindx = 0; int nrow = 0; int ncol = 0; int mm = 0; int nn = 0; int iia = 0; int jja = 0; cublasStatus cu_status; z_one[REAL_PART] = 1; z_one[IMAG_PART] = 0; z_zero[REAL_PART] = 0; z_zero[IMAG_PART] = 0; Cblacs_gridinfo( descA[CTXT_], &nprow, &npcol, &myprow, &mypcol ); notran = ( ( TrA = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); /* * check arguments */ if ((m <= 0) || (n <= 0)) { return; }; assert( (1 <= ia) && ((ia + m-1) <= descA[M_] ) ); assert( (1 <= ja) && ((ja + n-1) <= descA[N_] ) ); assert( (1 <= ib) && ((ib + m-1) <= descB[M_] ) ); assert( (1 <= jb) && ((jb + n-1) <= descB[N_] ) ); /* * Create a temp matrix that is aligned to descB. * Assume size is mmb by nnb */ if (notran) { mmb = MIN( m, descB[MB_] * nprow * mfactor); nnb = MIN( n, descB[NB_] * npcol * nfactor); } else { nnb = MIN( m, descB[MB_] * nprow * mfactor); mmb = MIN( n, descB[NB_] * npcol * nfactor); }; mmb = MAX( mmb,1); nnb = MAX( nnb,1); rsrc = indxg2p_( &ib, &descB[MB_], &myprow, &descB[RSRC_], &nprow); csrc = indxg2p_( &jb, &descB[NB_], &mypcol, &descB[CSRC_], &npcol); Locp = numroc_( &mmb, &descB[MB_], &myprow, &rsrc, &nprow ); Locq = numroc_( &nnb, &descB[NB_], &mypcol, &csrc, &npcol ); Btmp = (double*) malloc( MAX(1,(Locp * Locq))*elmSize ); assert( Btmp != 0 ); lld = MAX(1, Locp); descinit_( descBtmp, &mmb, &nnb, &descB[MB_], &descB[NB_], &rsrc, &csrc, &descB[CTXT_], &lld, &info ); assert( info == 0); for( jstart=ja; jstart <= ja + n-1; jstart = jend + 1) { jend = MIN( ja + n -1, jstart + nnb - 1); jsize = jend - jstart + 1; for( istart=ia; istart <= ia + m-1; istart = iend + 1) { iend = MIN( ia + m-1, istart + mmb -1); isize = iend - istart + 1; iia = ia + (istart-1); jja = ja + (jstart-1); iib = 1; jjb = 1; if (notran) { mm = isize; nn = jsize; } else { mm = jsize; nn = isize; }; pzgeadd_( TRAN, &mm, &nn, z_one, A, &iia, &jja, descA, z_zero, Btmp, &iib, &jjb, descBtmp ); /* * find local extent */ if (notran) { iib = ib + (istart-1); jjb = jb + (jstart-1); } else { iib = ib + (jstart-1); jjb = jb + (istart-1); }; if (notran) { nrow = numroc_( &isize, &descB[MB_], &myprow, &rsrc, &nprow); ncol = numroc_( &jsize, &descB[NB_], &mypcol, &csrc, &npcol); } else { nrow = numroc_( &jsize, &descB[MB_], &myprow, &rsrc, &nprow); ncol = numroc_( &isize, &descB[NB_], &mypcol, &csrc, &npcol); }; /* Perform global dB( iib:(iib+isize-1), jjb:(jjb+jsize-1)) <- B(1:isize,1:jsize) Perform local dB( lrindx:(lrindx+nrow-1), lcindx:(lcindx+ncol-1)) <- B(1:nrow, 1:ncol) */ infog2l_( &iib, &jjb, descB, &nprow, &npcol, &myprow, &mypcol, &lrindx, &lcindx, &irsrc, &jcsrc ); dBptr = (cuDoubleComplex *) dB; dBptr = dBptr + INDX2F( lrindx,lcindx, descB[LLD_]); cu_status = cublasSetMatrix( nrow,ncol,elmSize, (cuDoubleComplex *) Btmp, descBtmp[LLD_], dBptr, descB[LLD_] ); assert( cu_status == CUBLAS_STATUS_SUCCESS ); }; }; free( Btmp ); return; }