예제 #1
0
float verif_representativity(int m, int n, float *A, int ia, int ja, int *descA,
                                              float *U, int iu, int ju, int *descU,
                                              float *VT, int ivt, int jvt, int *descVT,
                                              float *S){

        float *Acpy=NULL, *Ucpy=NULL;
        int nprow, npcol, myrow, mycol;
        int min_mn, max_mn, mpA, pcol, localcol, i, nqA;
        int ictxt, nbA, rsrcA, csrcA, nbU, rsrcU, csrcU, mpU, nqU;
        int ctxt_ = 1, nb_ = 5, rsrc_ = 6, csrc_ = 7;
        int izero = 0, ione = 1;
        float *wwork=NULL;
        float tmone= -1.0e+00, tpone= +1.0e+00;
        float residF, AnormF;

        min_mn = min(m,n);
        max_mn = max(m,n);
        ictxt = descA[ctxt_];
        Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol );

        nbA = descA[nb_]; rsrcA = descA[rsrc_] ; csrcA = descA[csrc_] ;
        mpA    = numroc_( &m     , &nbA, &myrow, &rsrcA, &nprow );
        nqA    = numroc_( &n     , &nbA, &mycol, &csrcA, &npcol );
        Acpy = (float *)calloc(mpA*nqA,sizeof(float)) ;
        if (Acpy==NULL){ printf("error of memory allocation Acpy on proc %dx%d\n",myrow,mycol); exit(0); }
        pslacpy_( "All", &m, &n, A, &ia, &ja, descA, Acpy, &ia, &ja, descA );

        nbU = descU[nb_]; rsrcU = descU[rsrc_] ; csrcU = descU[csrc_] ;
        mpU    = numroc_( &m     , &nbU, &myrow, &rsrcU, &nprow );
        nqU    = numroc_( &min_mn, &nbU, &mycol, &csrcU, &npcol );
        Ucpy = (float *)calloc(mpU*nqU,sizeof(float)) ;
        if (Ucpy==NULL){ printf("error of memory allocation Ucpy on proc %dx%d\n",myrow,mycol); exit(0); }
        pslacpy_( "All", &m, &min_mn, U, &iu, &ju, descU, Ucpy, &iu, &ju, descU );

        AnormF = pslange_( "F", &m, &n, A, &ia, &ja, descA, wwork);

        for (i=1;i<min_mn+1;i++){
                pcol = indxg2p_( &i, &(descU[5]), &izero, &izero, &npcol );
                localcol = indxg2l_( &i, &(descU[5]), &izero, &izero, &npcol );
                if( mycol==pcol )
                        sscal_( &mpA, &(S[i-1]), &(Ucpy[ ( localcol-1 )*descU[8] ]), &ione );
        }
        psgemm_( "N", "N", &m, &n, &min_mn, &tpone, Ucpy, &iu, &ju, descU, VT, &ivt, &jvt, descVT,
                        &tmone, Acpy, &ia, &ja, descA ); 
        residF = pslange_( "F", &m, &n, Acpy, &ione, &ione, descA, wwork);
        residF = residF/AnormF/((float) max_mn);

        free(Ucpy);
        free(Acpy);

        return residF;
}
예제 #2
0
float verif_repres_VN(int m, int n, float *A, int ia, int ja, int *descA,
                                       float *U, int iu, int ju, int *descU,
                                       float *S){

        float *VTcpy=NULL;
        int nprow, npcol, myrow, mycol;
        int min_mn, max_mn, mpA, prow, localcol, i, nqA;
        int ictxt, nbA, rsrcA, csrcA, mpVT, nqVT, descVTcpy[9], itemp, ivtcpy, jvtcpy;
        int ctxt_ = 1, nb_ = 5, rsrc_ = 6, csrc_ = 7;
        int izero = 0, info;
        float tpone= +1.0e+00,  tzero= +0.0e+00;
        float verif_repres_VN, invStemp;

        min_mn = min(m,n);
        max_mn = max(m,n);
        ictxt = descA[ctxt_];
        Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol );

        nbA = descA[nb_]; rsrcA = descA[rsrc_] ; csrcA = descA[csrc_] ;

        mpA    = numroc_( &m     , &nbA, &myrow, &rsrcA, &nprow );
        nqA    = numroc_( &n     , &nbA, &mycol, &csrcA, &npcol );

        mpVT   = numroc_( &min_mn, &nbA, &myrow, &rsrcA, &nprow );
        nqVT   = numroc_( &n     , &nbA, &mycol, &csrcA, &npcol );

        itemp = max( 1, mpVT );
        descinit_( descVTcpy, &min_mn, &n, &nbA, &nbA, &rsrcA, &csrcA, &ictxt, &itemp, &info );

        ivtcpy = 1; jvtcpy = 1; 
        VTcpy = (float *)calloc(mpVT*nqVT,sizeof(float)) ;
        if (VTcpy==NULL){ printf("error of memory allocation VTcpy on proc %dx%d\n",myrow,mycol); exit(0); }

        psgemm_( "T", "N", &min_mn, &n, &m, &tpone, U, &iu, &ju, descU, A, &ia, &ja, descA,
                        &tzero, VTcpy, &ivtcpy, &jvtcpy, descVTcpy ); 

        for (i=1;i<min_mn+1;i++){
                prow = indxg2p_( &i, &nbA, &izero, &izero, &nprow );
                localcol = indxg2l_( &i, &nbA, &izero, &izero, &nprow );
                invStemp = 1/S[i-1];
                if( myrow==prow )
                        sscal_( &nqA, &invStemp, &(VTcpy[localcol-1]), &mpVT );
        }

        verif_repres_VN = verif_orthogonality(min_mn,n,VTcpy, ivtcpy, jvtcpy, descVTcpy);

        free(VTcpy);

        return verif_repres_VN;
}
예제 #3
0
float verif_repres_NV(int m, int n, float *A, int ia, int ja, int *descA,
                                float *VT, int ivt, int jvt, int *descVT,
                                float *S){

        float *Ucpy=NULL;
        int nprow, npcol, myrow, mycol;
        int min_mn, max_mn, mpA, pcol, localcol, i, nqA;
        int ictxt, nbA, rsrcA, csrcA, mpU, nqU, descUcpy[9], itemp, iucpy, jucpy;
        int ctxt_ = 1, nb_ = 5, rsrc_ = 6, csrc_ = 7;
        int izero = 0, ione = 1, info;
        float tpone= +1.0e+00,  tzero= +0.0e+00;
        float verif_repres_NV, invStemp;

        min_mn = min(m,n);
        max_mn = max(m,n);
        ictxt = descA[ctxt_];
        Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol );

        nbA = descA[nb_]; rsrcA = descA[rsrc_] ; csrcA = descA[csrc_] ;

        mpA    = numroc_( &m     , &nbA, &myrow, &rsrcA, &nprow );
        nqA    = numroc_( &n     , &nbA, &mycol, &csrcA, &npcol );

        itemp = max( 1, mpA );
        descinit_( descUcpy,  &m, &min_mn, &nbA, &nbA, &rsrcA, &csrcA, &ictxt, &itemp, &info );

        iucpy = 1; jucpy = 1;   
        mpU    = numroc_( &m     , &nbA, &myrow, &rsrcA, &nprow );
        nqU    = numroc_( &min_mn, &nbA, &mycol, &csrcA, &npcol );
        Ucpy = (float *)calloc(mpU*nqU,sizeof(float)) ;
        if (Ucpy==NULL){ printf("error of memory allocation Ucpy on proc %dx%d\n",myrow,mycol); exit(0); }

        psgemm_( "N", "T", &m, &min_mn, &n, &tpone, A, &ia, &ja, descA, VT, &ivt, &jvt, descVT,
                        &tzero, Ucpy, &iucpy, &jucpy, descUcpy ); 

        for (i=1;i<min_mn+1;i++){
                pcol = indxg2p_( &i, &(descUcpy[5]), &izero, &izero, &npcol );
                localcol = indxg2l_( &i, &(descUcpy[5]), &izero, &izero, &npcol );
                invStemp = 1/S[i-1];
                if( mycol==pcol )
                        sscal_( &mpA, &invStemp, &(Ucpy[ ( localcol-1 )*descUcpy[8] ]), &ione );
        }

        verif_repres_NV = verif_orthogonality(m,min_mn,Ucpy, iucpy, jucpy, descUcpy);

        free(Ucpy);

        return verif_repres_NV;
}
void pzgecopy_hd( F_CHAR_T TRANS, int *m_in, int *n_in,
    double *A, int *ia_in, int *ja_in, int *descA,
    double *dB, int *ib_in, int *jb_in, int *descB )
{
/*
 Copy m by n distributed submatrix  from host to GPU
 */

int m  = *m_in;
int n  = *n_in;

int ia = *ia_in;
int ja = *ja_in;
int ib = *ib_in;
int jb = *jb_in;

int nprow = 0;
int npcol = 0;
int myprow = 0;
int mypcol = 0;
int mmb = 0;
int nnb = 0;

int istart = 0;
int iend = 0;
int isize = 0;

int Locp = 0;
int Locq = 0;
int lld = 0;

int jstart = 0;
int jend = 0;
int jsize = 0;

int iib = 0;
int jjb = 0;

cuDoubleComplex *dBptr = 0;
double *Btmp = 0;
F_CHAR_T NoTrans = "N";

int descBtmp[DLEN_];
int elmSize = sizeof(cuDoubleComplex);

double z_one[2];
double z_zero[2];

/*
 Tuneable parameters
 */
int mfactor = 4;
int nfactor = 4;


int rsrc = 0;
int csrc = 0;
int irsrc = 0;
int jcsrc = 0;
int info = 0;

int notran = 0;
int TrA = 0;

int lrindx = 0;
int lcindx = 0;
int nrow = 0;
int ncol = 0;
int mm = 0;
int nn = 0;
int iia = 0;
int jja = 0;

cublasStatus cu_status;

z_one[REAL_PART] = 1;
z_one[IMAG_PART] = 0;
z_zero[REAL_PART] = 0;
z_zero[IMAG_PART] = 0;

Cblacs_gridinfo( descA[CTXT_], &nprow, &npcol, &myprow, &mypcol );

notran = ( ( TrA = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN );


/*
 * check arguments
 */
if ((m <= 0) || (n <= 0)) {
   return;
   };

assert( (1 <= ia) && ((ia + m-1) <= descA[M_] ) );
assert( (1 <= ja) && ((ja + n-1) <= descA[N_] ) );
assert( (1 <= ib) && ((ib + m-1) <= descB[M_] ) );
assert( (1 <= jb) && ((jb + n-1) <= descB[N_] ) );

/*
 * Create a temp matrix that is aligned to descB.
 * Assume size is   mmb by nnb
 */

if (notran) {
  mmb = MIN( m, descB[MB_] * nprow * mfactor);
  nnb = MIN( n, descB[NB_] * npcol * nfactor);
  }
else {
  nnb = MIN( m, descB[MB_] * nprow * mfactor);
  mmb = MIN( n, descB[NB_] * npcol * nfactor);
  };
  

mmb = MAX( mmb,1);
nnb = MAX( nnb,1);

rsrc = indxg2p_( &ib, &descB[MB_], &myprow, &descB[RSRC_], &nprow);
csrc = indxg2p_( &jb, &descB[NB_], &mypcol, &descB[CSRC_], &npcol);


Locp = numroc_( &mmb, &descB[MB_], &myprow, &rsrc, &nprow );
Locq = numroc_( &nnb, &descB[NB_], &mypcol, &csrc, &npcol );

Btmp = (double*) malloc( MAX(1,(Locp * Locq))*elmSize );
assert( Btmp != 0 );


lld = MAX(1, Locp);
descinit_( descBtmp, &mmb, &nnb, &descB[MB_], &descB[NB_],
            &rsrc, &csrc, &descB[CTXT_], &lld, &info );

assert( info == 0);



for( jstart=ja; jstart <= ja + n-1; jstart = jend + 1) {
  jend = MIN( ja + n -1, jstart + nnb - 1);
  jsize = jend - jstart + 1;

  for( istart=ia; istart <= ia + m-1; istart = iend + 1) {
     iend = MIN( ia + m-1, istart + mmb -1);
     isize = iend - istart + 1;

     iia = ia + (istart-1);
     jja = ja + (jstart-1);


     iib = 1;
     jjb = 1;
     

     if (notran) {
        mm = isize;
        nn = jsize;
        }
    else {
        mm = jsize;
        nn = isize;
        };


     pzgeadd_( TRAN,  &mm, &nn,   z_one, A, &iia, &jja, descA,  
                         z_zero, Btmp, &iib, &jjb, descBtmp );

     /* 
      * find local extent
      */

     
     if (notran) {
       iib = ib + (istart-1);
       jjb = jb + (jstart-1);
       }
     else {
       iib = ib + (jstart-1);
       jjb = jb + (istart-1);
       };

     if (notran) {
       nrow = numroc_( &isize, &descB[MB_], &myprow, &rsrc, &nprow);
       ncol = numroc_( &jsize, &descB[NB_], &mypcol, &csrc, &npcol);
       }
     else {
       nrow = numroc_( &jsize, &descB[MB_], &myprow, &rsrc, &nprow);
       ncol = numroc_( &isize, &descB[NB_], &mypcol, &csrc, &npcol);

     };

     /*
      Perform global
      dB( iib:(iib+isize-1), jjb:(jjb+jsize-1)) <- B(1:isize,1:jsize)

      Perform local
      dB( lrindx:(lrindx+nrow-1), lcindx:(lcindx+ncol-1)) <-
              B(1:nrow, 1:ncol)
      */


     infog2l_( &iib, &jjb, descB, &nprow, &npcol, &myprow, &mypcol,
               &lrindx, &lcindx,  &irsrc, &jcsrc );


     dBptr = (cuDoubleComplex *) dB;
     dBptr = dBptr + INDX2F( lrindx,lcindx, descB[LLD_]);


     cu_status = cublasSetMatrix( nrow,ncol,elmSize,
               (cuDoubleComplex *) Btmp,  descBtmp[LLD_],
                                   dBptr, descB[LLD_] );
     assert( cu_status == CUBLAS_STATUS_SUCCESS );

     };
  };

  free( Btmp );

  return;
}