예제 #1
0
SEXP R_blacs_gridinit(SEXP NPROW_in, SEXP NPCOL_in, SEXP SHANDLE)
{
  R_INIT;
  SEXP NPROW, NPCOL, MYROW, MYCOL, RET, RET_NAMES, ICTXT;
  newRvec(NPROW, 1, "int");
  newRvec(NPCOL, 1, "int");
  newRvec(MYROW, 1, "int");
  newRvec(MYCOL, 1, "int");
  newRvec(ICTXT, 1, "int");
  
  INT(NPROW) = INT(NPROW_in);
  INT(NPCOL) = INT(NPCOL_in);
  INT(ICTXT) = INT(SHANDLE);
  
  char order = 'R';
  
  Cblacs_gridinit(INTP(ICTXT), &order, INT(NPROW), INT(NPCOL));
  
  Cblacs_gridinfo(INT(ICTXT), INTP(NPROW), INTP(NPCOL), INTP(MYROW), INTP(MYCOL));
  
  make_list_names(RET_NAMES, 5, "NPROW", "NPCOL", "ICTXT", "MYROW", "MYCOL");
  make_list(RET, RET_NAMES, 5, NPROW, NPCOL, ICTXT, MYROW, MYCOL);
  R_END;
  return(RET);
}
예제 #2
0
파일: mat_redist.c 프로젝트: operedo/fnlsdp
void 
dgather(int ictxt, int n, int numc, int nb, double *A, double *A_d, int *descAd){

int RootNodeic, ione=1, izero=0, isRootNode=0, nru, info;
int nprow, npcol, myrow, mycol, descA[9], itemp;
int i,k;

   sl_init_(&RootNodeic, &ione, &ione);

   Cblacs_gridinfo(ictxt, &nprow,&npcol, &myrow, &mycol);

   if (myrow==0 && mycol ==0){ isRootNode = 1;}

   if(isRootNode){
      nru = numroc_(&n, &n, &myrow, &izero, &nprow);
      itemp = max(1,nru);
      descinit_(descA, &n, &numc, &n, &n, &izero, &izero, &RootNodeic, &itemp, &info );
   }
   else{
      k=0;
      for(i=0;i<9;i++){
          descA[k]=0;
          k++;
      }
      descA[1]=-1;
    }

   pdgemr2d_(&n,&numc,A_d,&ione, &ione, descAd, A, &ione, &ione, descA, &ictxt );

   if (isRootNode){
       Cblacs_gridexit(RootNodeic);
   }
  
}
예제 #3
0
SEXP R_blacs_init(SEXP NPROW_in, SEXP NPCOL_in, SEXP ICTXT_in)
{
    R_INIT;
    SEXP NPROW, NPCOL, ICTXT, MYROW, MYCOL, RET, RET_NAMES;

    newRvec(NPROW, 1, "int");
    newRvec(NPCOL, 1, "int");
    newRvec(ICTXT, 1, "int");
    newRvec(MYROW, 1, "int");
    newRvec(MYCOL, 1, "int");

    INT(NPROW) = INT(NPROW_in);
    INT(NPCOL) = INT(NPCOL_in);
    INT(ICTXT) = INT(ICTXT_in);

    char order = 'R';

    /*  sl_init_(INTP(ICTXT), INTP(NPROW), INTP(NPCOL));*/
    Cblacs_get(INT(ICTXT_in), 0, INTP(ICTXT));
    Cblacs_gridinit(INTP(ICTXT), &order, INT(NPROW), INT(NPCOL));
    Cblacs_gridinfo(INT(ICTXT), INTP(NPROW), INTP(NPCOL), INTP(MYROW), INTP(MYCOL));

    RET_NAMES = make_list_names(5, "NPROW", "NPCOL", "ICTXT", "MYROW", "MYCOL");
    RET = make_list(RET_NAMES, 5, NPROW, NPCOL, ICTXT, MYROW, MYCOL);

    R_END;
    return(RET);
}
예제 #4
0
파일: redist.c 프로젝트: snoweye/pbdBASE
/**
* Distribute a global matrix stored on rank 0 to all processors
* in the grid.
* 
* @param desc
* Descriptor array for desired output distributed matrix.
* @param 
*/
int dmat_as_ddmatrix(int *desc, double *A_global, double *A_local)
{
  const int M = desc[2], N = desc[3];
  const int Mb = desc[4], Nb = desc[5];
  int nrows, ncols;         // size of A_local
  
  int row, col;             // index over global matrix 
  int recvr = 0, recvc = 0; // local matrix index
  
  int ictxt = desc[1];      // blacs context
  int nprow, npcol;         // number process rows/cols
  int myprow, mypcol;       // current process row/col
  
  int rdest = 0, cdest = 0; // process grid row x col destination
  int nr, nc;               // message size; max Mb/Nb
  
  dmat_ldimget(desc, &nrows, &ncols);
  if (nrows < 1 || ncols < 1)
    return -1;
  
  Cblacs_gridinfo(ictxt, &nprow, &npcol, &myprow, &mypcol);
  const int owner = (myprow == 0 && mypcol == 0);
  
  
  for (row=0; row<M; row+=Mb, rdest=(rdest+1)%nprow)
  {
    cdest = 0;
    nr = Mb;
    // Is this the last row block?
    if (M - row < Mb)
      nr = M - row;
    
    for (col=0; col<N; col+=Nb, cdest=(cdest+1)%npcol)
    {
      nc = Nb;
      // Is this the last col block?
      if (N - col < Nb)
        nc = N - col;
    
      if (owner)
      {
        // Send a nr x nc submatrix to process (rdest, cdest)
        Cdgesd2d(ictxt, nr, nc, A_global + (row + M*col), M, rdest, cdest);
      }

      if (myprow == rdest && mypcol == cdest)
      {
        // Receive nr x nc submatrix to recvr x recvc in A_local
        Cdgerv2d(ictxt, nr, nc, A_local + (recvr + nrows*recvc), nrows, 0, 0);
        recvc = (recvc+nc)%ncols;
      }
    }
    
    if (myprow == rdest)
      recvr = (recvr+nr)%nrows;
  }
  
  return 0;
}
예제 #5
0
        int driver_psgesvd( char jobU, char jobVT, int m, int n, float *A, int ia, int ja, int *descA,
                float *S_NN, float *U_NN, int iu, int ju, int *descU, float *VT_NN, int ivt, int jvt, int *descVT,
                double *MPIelapsedNN){

        float *Acpy=NULL, *work=NULL;
        int lwork;
/**/
        int ione=1;
/**/
        int nprow, npcol, myrow, mycol;
        int mpA, nqA;
        int ictxt, nbA, rsrcA, csrcA;
        int ctxt_ = 1, nb_ = 5, rsrc_ = 6, csrc_ = 7;
        int infoNN;

        double MPIt1, MPIt2;
/**/
        ictxt = descA[ctxt_];
        Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol );

        nbA = descA[nb_]; rsrcA = descA[rsrc_] ; csrcA = descA[csrc_] ;

        mpA    = numroc_( &m     , &nbA, &myrow, &rsrcA, &nprow );
        nqA    = numroc_( &n     , &nbA, &mycol, &csrcA, &npcol );

        Acpy = (float *)calloc(mpA*nqA,sizeof(float)) ;
        if (Acpy==NULL){ printf("error of memory allocation Acpy on proc %dx%d\n",myrow,mycol); exit(0); }

        pslacpy_( "All", &m, &n, A, &ione, &ione, descA, Acpy, &ione, &ione, descA );


        work = (float *)calloc(1,sizeof(float)) ;
        if (work==NULL){ printf("error of memory allocation for work on proc %dx%d (1st time)\n",myrow,mycol); exit(0); }

        lwork=-1;

        psgesvd_( &jobU, &jobVT, &m, &n, Acpy, &ione, &ione, descA,
                S_NN, U_NN, &ione, &ione, descU, VT_NN, &ione, &ione, descVT,
                work, &lwork, &infoNN);

        lwork = (int) (work[0]);
        free(work);

        work = (float *)calloc(lwork,sizeof(float)) ;
        if (work==NULL){ printf("error of memory allocation work on proc %dx%d\n",myrow,mycol); exit(0); }
/**/            
        MPIt1 = MPI_Wtime();
/**/
        psgesvd_( &jobU, &jobVT, &m, &n, Acpy, &ione, &ione, descA,
                S_NN, U_NN, &ione, &ione, descU, VT_NN, &ione, &ione, descVT,
                work, &lwork, &infoNN);
/**/
        MPIt2 = MPI_Wtime();
        (*MPIelapsedNN)=MPIt2-MPIt1;
/**/
        free(work);
        free(Acpy);
        return infoNN;
}
예제 #6
0
float verif_representativity(int m, int n, float *A, int ia, int ja, int *descA,
                                              float *U, int iu, int ju, int *descU,
                                              float *VT, int ivt, int jvt, int *descVT,
                                              float *S){

        float *Acpy=NULL, *Ucpy=NULL;
        int nprow, npcol, myrow, mycol;
        int min_mn, max_mn, mpA, pcol, localcol, i, nqA;
        int ictxt, nbA, rsrcA, csrcA, nbU, rsrcU, csrcU, mpU, nqU;
        int ctxt_ = 1, nb_ = 5, rsrc_ = 6, csrc_ = 7;
        int izero = 0, ione = 1;
        float *wwork=NULL;
        float tmone= -1.0e+00, tpone= +1.0e+00;
        float residF, AnormF;

        min_mn = min(m,n);
        max_mn = max(m,n);
        ictxt = descA[ctxt_];
        Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol );

        nbA = descA[nb_]; rsrcA = descA[rsrc_] ; csrcA = descA[csrc_] ;
        mpA    = numroc_( &m     , &nbA, &myrow, &rsrcA, &nprow );
        nqA    = numroc_( &n     , &nbA, &mycol, &csrcA, &npcol );
        Acpy = (float *)calloc(mpA*nqA,sizeof(float)) ;
        if (Acpy==NULL){ printf("error of memory allocation Acpy on proc %dx%d\n",myrow,mycol); exit(0); }
        pslacpy_( "All", &m, &n, A, &ia, &ja, descA, Acpy, &ia, &ja, descA );

        nbU = descU[nb_]; rsrcU = descU[rsrc_] ; csrcU = descU[csrc_] ;
        mpU    = numroc_( &m     , &nbU, &myrow, &rsrcU, &nprow );
        nqU    = numroc_( &min_mn, &nbU, &mycol, &csrcU, &npcol );
        Ucpy = (float *)calloc(mpU*nqU,sizeof(float)) ;
        if (Ucpy==NULL){ printf("error of memory allocation Ucpy on proc %dx%d\n",myrow,mycol); exit(0); }
        pslacpy_( "All", &m, &min_mn, U, &iu, &ju, descU, Ucpy, &iu, &ju, descU );

        AnormF = pslange_( "F", &m, &n, A, &ia, &ja, descA, wwork);

        for (i=1;i<min_mn+1;i++){
                pcol = indxg2p_( &i, &(descU[5]), &izero, &izero, &npcol );
                localcol = indxg2l_( &i, &(descU[5]), &izero, &izero, &npcol );
                if( mycol==pcol )
                        sscal_( &mpA, &(S[i-1]), &(Ucpy[ ( localcol-1 )*descU[8] ]), &ione );
        }
        psgemm_( "N", "N", &m, &n, &min_mn, &tpone, Ucpy, &iu, &ju, descU, VT, &ivt, &jvt, descVT,
                        &tmone, Acpy, &ia, &ja, descA ); 
        residF = pslange_( "F", &m, &n, Acpy, &ione, &ione, descA, wwork);
        residF = residF/AnormF/((float) max_mn);

        free(Ucpy);
        free(Acpy);

        return residF;
}
예제 #7
0
float verif_repres_VN(int m, int n, float *A, int ia, int ja, int *descA,
                                       float *U, int iu, int ju, int *descU,
                                       float *S){

        float *VTcpy=NULL;
        int nprow, npcol, myrow, mycol;
        int min_mn, max_mn, mpA, prow, localcol, i, nqA;
        int ictxt, nbA, rsrcA, csrcA, mpVT, nqVT, descVTcpy[9], itemp, ivtcpy, jvtcpy;
        int ctxt_ = 1, nb_ = 5, rsrc_ = 6, csrc_ = 7;
        int izero = 0, info;
        float tpone= +1.0e+00,  tzero= +0.0e+00;
        float verif_repres_VN, invStemp;

        min_mn = min(m,n);
        max_mn = max(m,n);
        ictxt = descA[ctxt_];
        Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol );

        nbA = descA[nb_]; rsrcA = descA[rsrc_] ; csrcA = descA[csrc_] ;

        mpA    = numroc_( &m     , &nbA, &myrow, &rsrcA, &nprow );
        nqA    = numroc_( &n     , &nbA, &mycol, &csrcA, &npcol );

        mpVT   = numroc_( &min_mn, &nbA, &myrow, &rsrcA, &nprow );
        nqVT   = numroc_( &n     , &nbA, &mycol, &csrcA, &npcol );

        itemp = max( 1, mpVT );
        descinit_( descVTcpy, &min_mn, &n, &nbA, &nbA, &rsrcA, &csrcA, &ictxt, &itemp, &info );

        ivtcpy = 1; jvtcpy = 1; 
        VTcpy = (float *)calloc(mpVT*nqVT,sizeof(float)) ;
        if (VTcpy==NULL){ printf("error of memory allocation VTcpy on proc %dx%d\n",myrow,mycol); exit(0); }

        psgemm_( "T", "N", &min_mn, &n, &m, &tpone, U, &iu, &ju, descU, A, &ia, &ja, descA,
                        &tzero, VTcpy, &ivtcpy, &jvtcpy, descVTcpy ); 

        for (i=1;i<min_mn+1;i++){
                prow = indxg2p_( &i, &nbA, &izero, &izero, &nprow );
                localcol = indxg2l_( &i, &nbA, &izero, &izero, &nprow );
                invStemp = 1/S[i-1];
                if( myrow==prow )
                        sscal_( &nqA, &invStemp, &(VTcpy[localcol-1]), &mpVT );
        }

        verif_repres_VN = verif_orthogonality(min_mn,n,VTcpy, ivtcpy, jvtcpy, descVTcpy);

        free(VTcpy);

        return verif_repres_VN;
}
예제 #8
0
float verif_repres_NV(int m, int n, float *A, int ia, int ja, int *descA,
                                float *VT, int ivt, int jvt, int *descVT,
                                float *S){

        float *Ucpy=NULL;
        int nprow, npcol, myrow, mycol;
        int min_mn, max_mn, mpA, pcol, localcol, i, nqA;
        int ictxt, nbA, rsrcA, csrcA, mpU, nqU, descUcpy[9], itemp, iucpy, jucpy;
        int ctxt_ = 1, nb_ = 5, rsrc_ = 6, csrc_ = 7;
        int izero = 0, ione = 1, info;
        float tpone= +1.0e+00,  tzero= +0.0e+00;
        float verif_repres_NV, invStemp;

        min_mn = min(m,n);
        max_mn = max(m,n);
        ictxt = descA[ctxt_];
        Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol );

        nbA = descA[nb_]; rsrcA = descA[rsrc_] ; csrcA = descA[csrc_] ;

        mpA    = numroc_( &m     , &nbA, &myrow, &rsrcA, &nprow );
        nqA    = numroc_( &n     , &nbA, &mycol, &csrcA, &npcol );

        itemp = max( 1, mpA );
        descinit_( descUcpy,  &m, &min_mn, &nbA, &nbA, &rsrcA, &csrcA, &ictxt, &itemp, &info );

        iucpy = 1; jucpy = 1;   
        mpU    = numroc_( &m     , &nbA, &myrow, &rsrcA, &nprow );
        nqU    = numroc_( &min_mn, &nbA, &mycol, &csrcA, &npcol );
        Ucpy = (float *)calloc(mpU*nqU,sizeof(float)) ;
        if (Ucpy==NULL){ printf("error of memory allocation Ucpy on proc %dx%d\n",myrow,mycol); exit(0); }

        psgemm_( "N", "T", &m, &min_mn, &n, &tpone, A, &ia, &ja, descA, VT, &ivt, &jvt, descVT,
                        &tzero, Ucpy, &iucpy, &jucpy, descUcpy ); 

        for (i=1;i<min_mn+1;i++){
                pcol = indxg2p_( &i, &(descUcpy[5]), &izero, &izero, &npcol );
                localcol = indxg2l_( &i, &(descUcpy[5]), &izero, &izero, &npcol );
                invStemp = 1/S[i-1];
                if( mycol==pcol )
                        sscal_( &mpA, &invStemp, &(Ucpy[ ( localcol-1 )*descUcpy[8] ]), &ione );
        }

        verif_repres_NV = verif_orthogonality(m,min_mn,Ucpy, iucpy, jucpy, descUcpy);

        free(Ucpy);

        return verif_repres_NV;
}
예제 #9
0
BlacsSystem::BlacsSystem(int nprow, int npcol,int mb, int nb) :
                mb_(mb), nb_(nb)
{

        // Initialize Scalapack
        sl_init_(&ictxt_,&nprow,&npcol);


        // Fill the context information
        MPI_Comm_size(MPI_COMM_WORLD,&mpiprocs_);
        MPI_Comm_rank(MPI_COMM_WORLD,&mpirank_);

        Cblacs_gridinfo(ictxt_,&nprow_,&npcol_,&myrow_,&mycol_);

}
예제 #10
0
void PXERBLA( MKL_INT *ICTXT, char *SRNAME, MKL_INT *INFO, MKL_INT lenSRNAME ) {
    MKL_INT myrow, mycol, nprow, npcol, i;
    int     INFO_int, myrow_int, mycol_int;

    Cblacs_gridinfo( *ICTXT, &nprow, &npcol, &myrow, &mycol );
    
    INFO_int = (int) *INFO;
    myrow_int = (int) myrow;
    mycol_int = (int) mycol;
    printf( "{%5i,%5i}:  On entry to ", myrow_int, mycol_int );
    for( i = 0; i < lenSRNAME; i++ ) printf( "%c", SRNAME[ i ] );
    printf( " parameter number %4i had an illegal value\n", INFO_int );

    return;
}
예제 #11
0
int
pxerbla(int *ictxt, char *srname, int *info) {
    /* Format strings */
    char fmt_9999[] = "{ %5d, %5d }:  On entry "
	    "to %s() parameter number %4d had an illegal value"
	    "\n";

    int npcol, mycol, nprow, myrow;

/*  -- ScaLAPACK auxiliary routine (version 2.0) -- */
/*     University of Tennessee, Knoxville, Oak Ridge National Laboratory, */
/*     and University of California, Berkeley. */
/*     April 1, 1998 */

/*  Purpose */

/*  PXERBLA is an error handler for the ScaLAPACK routines.  It is called */
/*  by a ScaLAPACK routine if an input parameter has an invalid value.  A */
/*  message is printed. Installers may consider modifying this routine in */
/*  order to call system-specific exception-handling facilities. */

/*  Arguments */

/*  ICTXT   (local input) INTEGER */
/*          On entry,  ICTXT  specifies the BLACS context handle, indica- */
/*          ting the global  context of the operation. The context itself */
/*          is global, but the value of ICTXT is local. */

/*  SRNAME  (global input) CHARACTER*(*) */
/*          On entry, SRNAME specifies the name of the routine which cal- */
/*          ling PXERBLA. */

/*  INFO    (global input) INTEGER */
/*          On entry, INFO  specifies the position of the invalid parame- */
/*          ter in the parameter list of the calling routine. */

/*  -- Written on April 1, 1998 by */
/*     Antoine Petitet, University  of  Tennessee, Knoxville 37996, USA. */

    Cblacs_gridinfo(*ictxt, &nprow, &npcol, &myrow, &mycol);

    printf( fmt_9999, myrow, mycol, srname, *info );
    fflush( stdout );
    return 0;
} /* pxerbla */
예제 #12
0
float verif_orthogonality(int m, int n, float *U, int iu, int ju, int *descU){

        float *R=NULL;
        int nprow, npcol, myrow, mycol;
        int mpR, nqR, nb, itemp, descR[9], ictxt, info, min_mn, max_mn;
        int ctxt_ = 1, nb_ = 5;
        int izero = 0, ione = 1;
        float *wwork=NULL;
        float tmone= -1.0e+00,  tpone= +1.0e+00,  tzero= +0.0e+00;
        float orthU;

        min_mn = min(m,n);
        max_mn = max(m,n);
        ictxt = descU[ctxt_];
        nb = descU[nb_];
        Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol );

        mpR    = numroc_( &min_mn, &nb, &myrow, &izero, &nprow );
        nqR    = numroc_( &min_mn, &nb, &mycol, &izero, &npcol );
        R = (float *)calloc(mpR*nqR,sizeof(float)) ;
        if (R==NULL){ printf("error of memory allocation R on proc %dx%d\n",myrow,mycol); exit(0); }
        itemp = max( 1, mpR );
        descinit_( descR,  &min_mn, &min_mn, &nb, &nb, &izero, &izero, &ictxt, &itemp, &info );

        pslaset_( "F", &min_mn, &min_mn, &tzero, &tpone, R, &ione, &ione, descR );
        if (m>n)
                psgemm_( "T", "N", &min_mn, &min_mn, &m, &tpone, U, &iu, &ju, descU, U, 
                        &iu, &ju, descU, &tmone, R, &ione, &ione, descR );
        else
                psgemm_( "N", "T", &min_mn, &min_mn, &n, &tpone, U, &iu, &ju, descU, U, 
                        &iu, &ju, descU, &tmone, R, &ione, &ione, descR );
        orthU = pslange_( "F", &min_mn, &min_mn, R, &ione, &ione, descR, wwork );
        orthU = orthU / ((float) max_mn);
        free(R);

        return orthU;

}
예제 #13
0
파일: redist.c 프로젝트: snoweye/pbdBASE
void dmat_ldimget(int *desc, int* nrows, int* ncols)
{
  int M = desc[2];
  int N = desc[3];
  int Mb = desc[4];
  int Nb = desc[5];
  
  int ictxt = desc[1];
  int rsrc = desc[6];
  int csrc = desc[7];
  
  int nprow, npcol, myprow, mypcol;
  Cblacs_gridinfo(ictxt, &nprow, &npcol, &myprow, &mypcol);
  
  *nrows = numroc_(&M, &Mb, &myprow, &rsrc, &nprow);
  *ncols = numroc_(&N, &Nb, &mypcol, &csrc, &npcol);
  
  if (*nrows < 1 || *ncols < 1)
  {
    *nrows = 0;
    *ncols = 0;
  }
}
void Cpcswap_gpu( int n, cuComplex *A, int ia,int ja,int *descA, int incA,
                    cuComplex *B, int ib,int jb,int *descB, int incB )
{
/*
 perform pcswap operation when
 both distributed arrays A and B are in device memory
 */



/*
 * allocate temporary space on host
 * then use pcswap for communication
 */

const int use_MallocHost = FALSE;

cublasStatus cu_status;
size_t nbytes;
int elemSize = sizeof( cuComplex );

float *Atmp = 0;
float *Btmp = 0;

int descAtmp[DLEN_];
int descBtmp[DLEN_];
int ldA, ldB, ldAtmp, ldBtmp;

int nprow,npcol,myprow,mypcol;
int Locp, Locq, lrindx, lcindx, mm,nn;
int LocpA, LocqA, lrindxA, lcindxA;
int LocpB, LocqB, lrindxB, lcindxB;
int isizeA, isizeB, rsrc, csrc;

int iia,jja, iib, jjb;
int incAtmp, incBtmp;
int lrA1,lcA1, lrA2,lcA2;
int lrB1,lcB1, lrB2,lcB2;

Cblacs_gridinfo( descA[CTXT_], &nprow, &npcol, &myprow, &mypcol );


/*
 * allocate storage for vector from A
 */

if (incA == 1) {
   /*
    *  This is a column vector
    */
   mm = n; nn = 1;
   }
else {
  /*
   * This is a row vector
   */
   mm = 1; nn = n;
   };
setup_desc(  mm,nn, ia,ja, descA,   &isizeA, descAtmp );

nbytes = elemSize;
nbytes *= isizeA;
if (use_MallocHost) {
  Atmp = (float *) MallocHost( nbytes );
  }
else {
  Atmp = (float *) malloc( nbytes );
  };
assert( Atmp != 0 );


/*
 * copy vector from A
 */

PROFSTART("swap:GetMatrix");

local_extent( mm,nn,ia,ja,descA,  &LocpA, &LocqA, &lrA1,&lcA1, &lrA2,&lcA2 );

lrindxA = lrA1;
lcindxA = lcA1;

ldA = descA[LLD_];
ldAtmp = descAtmp[LLD_];
if ( (LocpA >= 1) && (LocqA >= 1)) {
  /*
   * copy from GPU device to host CPU
   */
  cu_status = cublasGetMatrix( LocpA,LocqA, elemSize,
             dA(lrindxA,lcindxA), ldA,  Atmp, ldAtmp );

  CHKERR(cu_status);
  };



/*
 * allocate storage for vector from B
 */

Cblacs_gridinfo( descB[CTXT_], &nprow, &npcol, &myprow, &mypcol );

if (incB == 1) {
   /*
    *  This is a column vector
    */
   mm = n; nn = 1;
   }
else {
  /*
   * This is a row vector
   */
   mm = 1; nn = n;
   };
setup_desc( mm,nn, ib,jb,descB, &isizeB, descBtmp );

ldBtmp = descBtmp[LLD_];
ldB = descB[LLD_];

nbytes = elemSize;
nbytes *= isizeB;
if (use_MallocHost) {
  Btmp = (float *) MallocHost( nbytes );
  }
else {
  Btmp = (float *) malloc( nbytes );
  };
assert( Btmp != 0 );



/*
 * copy vector from B
 */

local_extent( mm,nn,ib,jb,descB,  &LocpB, &LocqB, &lrB1,&lcB1,  &lrB2,&lcB2 );

lrindxB = lrB1;
lcindxB = lcB1;



ldB = descB[LLD_];
ldBtmp = descBtmp[LLD_];
if ((LocpB >= 1) && (LocqB >= 1)) {
  /*
   * Copy from GPU to CPU host
   */
  cu_status = cublasGetMatrix(LocpB,LocqB,elemSize,
         dB(lrindxB,lcindxB), ldB, Btmp, ldBtmp );
  CHKERR(cu_status );
  };

PROFEND("swap:GetMatrix");

iia = 1; jja = 1;
iib = 1; jjb = 1;
if (incA == 1) {
   incAtmp = 1;
   }
else {
  incAtmp = descAtmp[M_];
};

if (incB == 1) {
   incBtmp = 1;
    }
else {
   incBtmp = descBtmp[M_];
};


PROFSTART("swap:pcswap");
scalapack_pcswap( &n, Atmp, &iia, &jja, descAtmp, &incAtmp,
                      Btmp, &iib, &jjb, descBtmp, &incBtmp );
PROFEND("swap:pcswap");


/*
 * copy from host CPU back to GPU
 */

PROFSTART("swap:SetMatrix");

if ((LocpA >= 1) && (LocqA >= 1)) {
  /*
   * Copy from CPU host to GPU device
   */
  cu_status = cublasSetMatrix( LocpA, LocqA, elemSize,
              Atmp, ldAtmp, dA(lrindxA,lcindxA), ldA );
  CHKERR(cu_status);
  };


if ((LocpB >= 1) && (LocqB >= 1)) {
  /*
   * Copy from CPU host to GPU device
   */
  cu_status = cublasSetMatrix( LocpB, LocqB, elemSize,
                 Btmp, ldBtmp, dB(lrindxB,lcindxB), ldB );
  CHKERR(cu_status);
  };

PROFEND("swap:SetMatrix");

/*
 * clean up
 */


if (Atmp != 0) {
  if (use_MallocHost) {
    FreeHost(Atmp);
    }
  else {
    free(Atmp); 
    };
  Atmp = 0;
  };

if (Btmp != 0) {
  if (use_MallocHost) {
    FreeHost(Btmp);
     }
  else {
    free(Btmp); 
  };
  Btmp = 0;
 };



return;


}
예제 #15
0
void test_gemr2d(int M, int N)
{
    int repeat = 10;
    
    int32_t one = 1;
    int32_t isrc = 0;

    int32_t num_ranks;
    MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);
    int32_t rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    int32_t bs_row_A = M / num_ranks + std::min(1, M % num_ranks);
    int32_t bs_col_A = 1;

    int32_t bs_row_B = 1;
    int32_t bs_col_B = 1;

    int32_t blacs_handler = Csys2blacs_handle(MPI_COMM_WORLD);
    int32_t context1 = blacs_handler;
    int32_t context2 = blacs_handler;

    /* create BLACS context */
    Cblacs_gridinit(&context1, "C", num_ranks, 1);
    /* get row and column ranks */
    int32_t rank_row1, rank_col1;
    Cblacs_gridinfo(context1, &num_ranks, &one, &rank_row1, &rank_col1);
    /* get local number of rows and columns of a matrix */
    int32_t num_rows_local1, num_cols_local1;
    num_rows_local1 = numroc_(&M, &bs_row_A, &rank_row1, &isrc, &num_ranks);
    num_cols_local1 = numroc_(&N, &bs_col_A, &rank_col1, &isrc, &one);


    Cblacs_gridinit(&context2, "C", 1, num_ranks);
    int32_t rank_row2, rank_col2;
    Cblacs_gridinfo(context2, &one, &num_ranks, &rank_row2, &rank_col2);
    int32_t num_rows_local2, num_cols_local2;
    num_rows_local2 = numroc_(&M, &bs_row_B, &rank_row2, &isrc, &one);
    num_cols_local2 = numroc_(&N, &bs_col_B, &rank_col2, &isrc, &num_ranks);

    if (rank == 0)
    {
        printf("local dimensions of A: %i x %i\n", num_rows_local1, num_cols_local1);
        printf("local dimensions of B: %i x %i\n", num_rows_local2, num_cols_local2);
    } 

    int32_t descA[9], descB[9], info;
    descinit_(descA, &M, &N, &bs_row_A, &bs_col_A, &isrc, &isrc, &context1, &num_rows_local1, &info);
    descinit_(descB, &M, &N, &bs_row_B, &bs_col_B, &isrc, &isrc, &context2, &num_rows_local2, &info);

    std::vector<double_complex> A(num_rows_local1 * num_cols_local1);
    std::vector<double_complex> B(num_rows_local2 * num_cols_local2, double_complex(0, 0));
    std::vector<double_complex> C(num_rows_local1 * num_cols_local1);

    for (int i = 0; i < num_rows_local1 * num_cols_local1; i++)
    {
        A[i] = double_complex(double(rand()) / RAND_MAX, double(rand()) / RAND_MAX);
        C[i] = A[i];
    }

    double time = -MPI_Wtime();
    for (int i = 0; i < repeat; i++)
    {
        pzgemr2d_(&M, &N, &A[0], &one, &one, descA, &B[0], &one, &one, descB, &context1);
    }
    time += MPI_Wtime();

    if (rank == 0)
    {
        printf("average time %.4f sec, swap speed: %.4f GB/sec\n", time / repeat,
               sizeof(double_complex) * repeat * M * N / double(1 << 30) / time);
    }

    /* check correctness */
    pzgemr2d_(&M, &N, &B[0], &one, &one, descB, &A[0], &one, &one, descA, &context1);
    for (int i = 0; i < num_rows_local1 * num_cols_local1; i++)
    {
        if (std::abs(A[i] - C[i]) > 1e-14)
        {
            printf("Fail.\n");
            exit(0);
        }
    }

    Cblacs_gridexit(context1);
    Cblacs_gridexit(context2);
    Cfree_blacs_system_handle(blacs_handler);
}
예제 #16
0
void pdgemm(char *TRANSA, char *TRANSB,
            int *M, int *N, int *K,
            double *ALPHA,
            double *a, int *IA, int *JA, int *DESCA,
            double *b, int *IB, int *JB, int *DESCB,
            double *BETA,
            double *c, int *IC, int *JC, int *DESCC)
{
  //   int my_row, my_column;
   int bk = BLOCK_SIZE;
   int n;
   int i, j, k, round;
   int r_rank, c_rank, g_rank, g_size;
   double *tempA, *tempB;
   double *tmpA, *tmpB;
   double t_bc, t_calc, t_tot, t_bar;
   double tt_bc, tt_calc, tt_tot, tt_bar;
   double sum, g_sum;
   int num_bk, loops, roots;
   int pk, last=0;
   int nprow, npcol, myrow, mycol;
   int *position, *coordinate;
   MPI_Comm my_row_comm, my_column_comm;

   /* Get the rank of the global communicator */
   MPI_Comm_rank(MPI_COMM_WORLD, &g_rank);
   MPI_Comm_size(MPI_COMM_WORLD, &g_size);

   Cblacs_gridinfo( (ctxt = DESCC[DCTXT]), &nprow, &npcol, &myrow, &mycol);

   //   printf("nprow = %d, npcol = %d, myrow = %d, mycol = %d\n", nprow, npcol, myrow, mycol);

   pk = PIPE_SIZE;
   if(nprow == npcol)
      n = DESCA[DN]/nprow; // local size
   else 
   { 
     printf("The grid of process should be square.\n");
     exit(-1);
   }

   coordinate = (int *)malloc(2*sizeof(int));
   position = (int *)malloc(g_size*sizeof(int));
   

   //MPI_Allgather(pos, 1, MPI_INT, position, g_size, MPI_INT, MPI_COMM_WORLD);
   for(i=0; i<g_size; i++)
     position[i]=i;   
   MPI_Barrier(MPI_COMM_WORLD);
   //  CreatTiling(position);
   //CreateReduceCommunicator(position, coordinate, &my_row_comm, &my_column_comm);
   //my_row = coordinate[0];
   //my_column = coordinate[1];

   /* Create my row and column communicators */   
   MPI_Comm_split(MPI_COMM_WORLD, mycol, myrow, &my_column_comm);
   MPI_Comm_split(MPI_COMM_WORLD, myrow, mycol, &my_row_comm);

   /* Get the rank of the local row communicator */
   //MPI_Comm_rank(ctxtr->scp->comm, &r_rank);
   MPI_Comm_rank(my_row_comm, &r_rank);
   /* Get the rank of the local column communicator */
   MPI_Comm_rank(my_column_comm, &c_rank);
   //MPI_Comm_rank(ctxtc->scp->comm, &c_rank);
   if(r_rank == 0 )
     printf("my c_rank = %d\n", c_rank);


   tempA = (double *) malloc (n*bk * sizeof(double));
   tempB = (double *) malloc (bk*n * sizeof(double));

   //const enum CBLAS_ORDER Order=CblasRowMajor;
   //const enum CBLAS_TRANSPOSE TA=CblasNoTrans;
   //const enum CBLAS_TRANSPOSE TB=CblasNoTrans;

   sum = g_sum = 0.0;
   tt_bc = tt_calc = tt_tot = t_bar = 0.0;
   tt_tot = tt_bc = tt_calc = tt_bar = 0.0;
   num_bk =(n/bk);
   last = n%bk;

   tmpA = (double *) malloc (n*last * sizeof(double));
   tmpB = (double *) malloc (last*n * sizeof(double));

   if(last==0)
      loops = rows*num_bk;
   else 
   {
      num_bk++;
      loops = rows*num_bk;
   }
   

   if(TIMER)
      t_tot = MPI_Wtime();

      for(round=0; round<loops; round++)
      {
        k = (round%num_bk)*bk;
        roots = round/num_bk;


	if(*TRANSA == 'N')
	{
         if(r_rank == roots)
         {
           if(last!=0 && (round+1)%num_bk==0) // the last block of this process
	   {
              for(i=0; i<last; i++)
   	        for(j=0; j<n; j++)
		{
		  tmpA[i*n+j] = a[(k+i)*n+j];
		}
	   }
           else 
	   {
              for(i=0; i<bk; i++)
	        for(j=0; j<n; j++)
		{
		  tempA[i*n+j] = a[(k+i)*n+j];
		}
           }
	 }

         if(c_rank == roots)
         {
           if(last!=0 && (round+1)%num_bk==0) // the last block of this process
	   {
              for(i=0; i<n; i++)
	        for(j=0; j<last; j++)
		{
		  tmpB[i*last+j] = b[i*n+(k+j)];
		}
	   }
           else
	   {
              for(i=0; i<n; i++)
	        for(j=0; j<bk; j++)
		{
		  tempB[i*bk+j] = b[i*n+(k+j)];
		}
	   }
         }
	}
        else if( *TRANSA == 'T')
	{
         if(r_rank == roots)
         {
           if(last!=0 && (round+1)%num_bk==0) // the last block of this process
	   {
              for(i=0; i<n; i++)
   	        for(j=0; j<last; j++)
		{
		   tmpA[i*last+j] = a[i*n+(k+j)];
       		}
	   }
	   else
	   {
              for(i=0; i<n; i++)
	        for(j=0; j<bk; j++)
		{
		   tempA[i*bk+j] = a[i*n+(k+j)];
       		}
	   }
         }
         if(c_rank == roots)
         {
           if(last!=0 && (round+1)%num_bk==0) // the last block of this process
	   {       
              for(i=0; i<last; i++)
	        for(j=0; j<n; j++)
		{
		  tmpB[i*n+j] = b[(k+i)*n+j];
       		}
	   }
	   else
	   {
             for(i=0; i<bk; i++)
	        for(j=0; j<n; j++)
		{
		  tempB[i*n+j] = b[(k+i)*n+j];
       		}
	   }
         }
	}
   
         if(TIMER)
            t_bc = MPI_Wtime();

         /* Broadcast to right */
         if(PIPE)
	 {
           if(last!=0 && (round+1)%num_bk==0) // the last block of this process
              MY_Bcast(tmpA, n*last,  MPI_DOUBLE, roots, my_row_comm, r_rank, columns, pk);
	   else
              MY_Bcast(tempA, n*bk,  MPI_DOUBLE, roots, my_row_comm, r_rank, columns, pk);
	 }
         else
         {
           if(last!=0 && (round+1)%num_bk==0) // the last block of this process
              MPI_Bcast(tmpA, last*n, MPI_DOUBLE, roots, my_row_comm);
	   else
              MPI_Bcast(tempA, bk*n, MPI_DOUBLE, roots, my_row_comm);
	 }
	 
         /* Broadcast below */
         
         if(PIPE)
	 {
           if(last!=0 && (round+1)%num_bk==0) // the last block of this process
	      MY_Bcast(tmpB, last*n, MPI_DOUBLE, roots, my_column_comm, c_rank, rows, pk);
	   else
              MY_Bcast(tempB, bk*n, MPI_DOUBLE, roots, my_column_comm, c_rank, rows, pk);
	 }
         else
	 {
            if(last!=0 && (round+1)%num_bk==0) // the last block of this process
               MPI_Bcast(tmpB, n*last, MPI_DOUBLE, roots, my_column_comm);
	    else
               MPI_Bcast(tempB, n*bk, MPI_DOUBLE, roots, my_column_comm);
         }
	 
         if(TIMER)
         {
            t_bc = MPI_Wtime() - t_bc;
            tt_bc += t_bc;
            t_calc = MPI_Wtime();
         }

         if(DVFS_ENABLE) 
            mapping(g_rank%8, DVFS_HIGH);



         /* Do the multiplication */
         if(*TRANSA == 'N') 
         {
           if(last!=0 && (round+1)%num_bk==0) // the last block of this process
              dgemm_(TRANSA, TRANSB, &n, &n, &last, ALPHA, tmpA, &n, tmpB, &last, BETA, c, &n);
	   else
              dgemm_(TRANSA, TRANSB, &n, &n, &bk, ALPHA, tempA, &n, tempB, &bk, BETA, c, &n);
	 }
         else if(*TRANSA == 'T') 
         {
           if(last!=0 && (round+1)%num_bk==0) // the last block of this process              
              dgemm_(TRANSA, TRANSB, &n, &n, &last, ALPHA, tmpA, &last, tmpB, &n, BETA, c, &n);
	   else
              dgemm_(TRANSA, TRANSB, &n, &n, &bk, ALPHA, tempA, &bk, tempB, &n, BETA, c, &n);
	 }

	 //	          dgemm_(TRANSA, TRANSB, &n, &n, &bk, ALPHA, tempA, &n, tempB, &bk, BETA, c, &n);
	 //	 cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, bk, alpha, tempA, bk, tempB, n, beta, c, n);

         /* check the calculate results*/
     
         if(DVFS_ENABLE) 
            mapping(g_rank%8, DVFS_LOW);

         if(TIMER)
         { 
            t_calc = MPI_Wtime() - t_calc;
            tt_calc += t_calc;
         }
      }
         
   if(TIMER)
   {
      t_tot = MPI_Wtime() - t_tot;
      tt_tot += t_tot;
   
      MPI_Reduce(&tt_tot, &t_tot, 1, MPI_DOUBLE, MPI_MAX,  0, MPI_COMM_WORLD);
      MPI_Reduce(&tt_bc, &t_bc, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
      MPI_Reduce(&tt_calc, &t_calc, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

   
            if(g_rank == 0)
         printf("tot = %10.6f, bc = %10.6f, calc = %10.6f\n", t_tot, t_bc, t_calc);
   }

   MPI_Barrier(MPI_COMM_WORLD);

   free(tmpA);free(tmpB);
   //   free(a); free(b); free(c);
   free(tempA); free(tempB);
}
void
Cpclaswp_gpu( char direc, char rowcol, int n, 
              cuComplex *A, int ia, int ja, int *descA, 
              int k1, int k2, int *ipiv_ )
{
  /*
   * Note vector ipiv(:) is tied to the distribution of A
   * dimension ipiv is   Locr(M_A) + MB_A
   */

  int nprow,npcol,myprow,mypcol;
  int ip,jp;
  int i,j, iia,jja, icurrow,icurcol;

  int is_forward;
  int is_row;

  if (n <= 0) {
    return;
  };

  is_row = (rowcol == 'R') || (rowcol == 'r');
  is_forward =  (direc == 'F') || (direc == 'f');

  Cblacs_gridinfo( descA[CTXT_], &nprow,&npcol, &myprow,&mypcol );
  assert( nprow >= 1);
  assert( npcol >= 1);
  assert( (0 <= myprow) && (myprow < nprow));
  assert( (0 <= mypcol) && (mypcol < npcol));
/*
!      IF( LSAME( ROWCOL, 'R' ) ) THEN
!         IF( LSAME( DIREC, 'F' ) ) THEN
!            CALL INFOG2L( K1, JA, DESCA, NPROW, NPCOL, myprow, mypcol,
!     $                    IIA, JJA, ICURROW, ICURCOL )
!            DO 10 I = K1, K2
!               IP = IPIV( IIA+I-K1 )
!               IF( IP.NE.I )
!     $            CALL PZSWAP( N, A, I, JA, DESCA, DESCA( M_ ), A, IP,
!     $                         JA, DESCA, DESCA( M_ ) )
!   10       CONTINUE
*/
      if (is_row && is_forward ) {
             Cinfog2l(k1,ja,descA,nprow,npcol,myprow,mypcol,
                 &iia,&jja,   &icurrow,&icurcol );
             for(i=k1; i <= k2; i++) {
               ip = ipiv( iia + i - k1 );
               if (ip != i) {
                  Cpcswap_gpu(n, A, ia,ja,descA, descA[M_], A, ip,
                               ja, descA, descA[M_] );
               };
             }; /* end for i */
         };
/*
!         ELSE
!            CALL INFOG2L( K2, JA, DESCA, NPROW, NPCOL, myprow, mypcol,
!     $                    IIA, JJA, ICURROW, ICURCOL )
!            DO 20 I = K2, K1, -1
!               IP = IPIV( IIA+I-K1 )
!               IF( IP.NE.I )
!     $            CALL PZSWAP( N, A, I, JA, DESCA, DESCA( M_ ), A, IP,
!     $                         JA, DESCA, DESCA( M_ ) )
!   20       CONTINUE
!         END IF
*/
         if (is_row && (!is_forward)) {
           Cinfog2l(k2,ja,descA,nprow,npcol,myprow,mypcol,
                       &iia,&jja,  &icurrow,&icurcol );
           for(i=k2; i >= k1; i-- ) {
             ip = ipiv( iia + i - k1 );
             if (ip != i) {
               Cpcswap_gpu( n,A,i,ja,descA, descA[M_],  A, ip,
                            ja, descA, descA[M_] );
             };
           }; /* end for i */
         };
/*
!      ELSE
!         IF( LSAME( DIREC, 'F' ) ) THEN
!            CALL INFOG2L( IA, K1, DESCA, NPROW, NPCOL, myprow, mypcol,
!     $                    IIA, JJA, ICURROW, ICURCOL )
!            DO 30 J = K1, K2
!               JP = IPIV( JJA+J-K1 )
!               IF( JP.NE.J )
!     $            CALL PZSWAP( N, A, IA, J, DESCA, 1, A, IA, JP,
!     $                         DESCA, 1 )
!   30       CONTINUE
*/
         if ( (!is_row) && is_forward) {
               Cinfog2l(ia,k1,descA,  nprow,npcol,myprow,mypcol,
                           &iia, &jja,  &icurrow, &icurcol );
               for(j=k1; j <= k2; j++ ) {
                 jp = ipiv( jja+j-k1 );
                 if (jp != j) {
                   Cpcswap_gpu( n,A,ia,j,descA,1,  A,ia,jp,
                                descA, 1 );
                 };
               }; /* end for j */
           };
/*
!         ELSE
!            CALL INFOG2L( IA, K2, DESCA, NPROW, NPCOL, myprow, mypcol,
!     $                    IIA, JJA, ICURROW, ICURCOL )
!            DO 40 J = K2, K1, -1
!               JP = IPIV( JJA+J-K1 )
!               IF( JP.NE.J )
!     $            CALL PZSWAP( N, A, IA, J, DESCA, 1, A, IA, JP,
!     $                         DESCA, 1 )
!   40       CONTINUE
!         END IF
*/
           if ( (!is_row) && (!is_forward))  {
             Cinfog2l( ia,k2,descA, nprow,npcol,myprow,mypcol,
                          &iia, &jja, &icurrow, &icurcol );
             for( j=k2; j >= k1; j--) {
               jp = ipiv( jja + j-k1);
               if (jp != j) {
                 Cpcswap_gpu( n,A,ia,j,descA, 1, A, ia,jp, 
                              descA, 1 );
               };
             }; /* end for j */
           };
/*
!      END IF
*/
         return;
}
예제 #18
0
int
pdtrans(char *trans, int *m, int *n, int * mb, int *nb, double *a, int *lda, double *beta,
	double *c__, int *ldc, int *imrow, int *imcol, double *work, int *iwork) {
    /* System generated locals */
    long a_dim1, a_offset, c_dim1, c_offset;
    int i__1, i__2, i__3, i__4;

    /* Local variables */
    int j1, k1, k2, ml, nl, mp, mq, np, nq, mb0, mb1, mb2, nb0,
	    nb1, nb2, kia, kja, kic, kjc, lbm, lbn, lcm, ldt, lbm0, lbm1,
	     lbm2, lbn0, lbn1, lbn2, igcd;
    long ipt;
    int mcol, info, lcmp, lcmq, item, ncol, kmod1, kmod2;
    double tbeta;
    int kpcol, mpcol, npcol, mrcol, mycol, kprow, mprow, nprow, mrrow, myrow;

/*  -- PUMMA Package routine (version 2.1) -- */
/*     Jaeyoung Choi, Oak Ridge National Laboratory. */
/*     Jack Dongarra, Univ. of Tennessee, Oak Ridge National Laboratory. */
/*     David Walker,  Oak Ridge National Laboratory. */
/*     October 31, 1994. */

/*  Purpose */

/*  PDTRANS routine is one of the PUMMA package based on block cyclic */
/*  data distribution on 2-D process configuration. */

/*  It is used for the following matrix transposition, */

/*     Form  C := A' + beta*C */

/*  where beta is a scalar, and A and C are matrices, with A an M by N */
/*  matrix (globally), and C an N by M matrix (globally). */

/*  Parameters */

/*  TRANS  - (input) CHARACTER*1 */
/*           TRANS specifies whether A is transposed or conjugate */
/*           transposed. */

/*              TRANS = 'T',   transpose; */

/*              TRANS = 'C',   conjugate transpose. */

/*  M      - (input) INTEGER */
/*           M specifies the (global) number of rows of the matrix A and */
/*           the (global) number of rows of the matrix C.  M >= 0. */

/*  N      - (input) INTEGER */
/*           N specifies the (global) number of columns of the matrix A */
/*           and columns of the matrix B.  N >= 0. */

/*  MB     - (input) INTEGER */
/*           MB specifies the row block size of the matrix A and the */
/*           column block of the matrix C.  MB >= 1. */

/*  NB     - (input) INTEGER */
/*           NB specifies the column block size of the matrix A and the */
/*           row block size of the matrix C.  NB >= 1. */

/*  A      - (input) DOUBLE PRECISION array of DIMENSION ( LDA, Nq ). */
/*           The leading Mp by Nq part of the array A must contain the */
/*           local matrix  A.  Mp and Nq are local variables */
/*           (see description of local parameters). */

/*  LDA    - (input) INTEGER */
/*           The leading dimension of the (local) array A. */
/*           LDA >= MAX( 1, Mp ). */

/*  BETA   - (input) DOUBLE PRECISION */
/*           BETA  specifies the scalar beta.  When BETA is supplied as */
/*           zero then C need not be set on input. */

/*  C      - (input/ouput) DOUBLE PRECISION array of DIMENSION (LDC, Mq). */
/*           On entry the leading Np by Mq part of the array C must */
/*           contain the local matrix C, except when beta is zero, */
/*           in which case C need not be set on entry. */
/*           On exit, the array C is overwritten by the Np by Mq matrix */
/*           (A'+bata*C).  Np and Mq are local variables */
/*           (see description of local parameters). */

/*  LDC    - (input) INTEGER */
/*           The leading dimension of the (local) array C. */
/*           LDC >= MAX( 1, Np ). */

/*  IMROW  - (input) INTEGER */
/*           IMROW specifies a row of the process template, which holds */
/*           the first block of the matrices.  0 <= IMROW < NPROW. */

/*  IMCOL  - (input) INTEGER */
/*           IMCOL specifies a column of the process template, which */
/*           holds the first block of the matrices.  0 <= IMCOL < NPCOL. */

/*  WORK   - (workspace) DOUBLE PRECISION array */
/*           See requirements. */

/*  IWORK  - (workspace) INTEGER array */
/*           See requirements. */

/*  Local  Parameters */

/*  LCM   =  the lowest common multiple of P and Q */
/*  LCMP  =  LCM/P = number of template rows in LCM block */
/*  LCMQ  =  LCM/Q = number of template columns in LCM block */
/*  IGCD   =  the greatest common divisor (GCD) of P and Q */
/*  MpxNq =  size of (local) matrix A in the process, iam */
/*  NpxMq =  size of (local) matrix C in the process, iam */
/*  KMOD  =  Define Group I.D. */
/*  item  =  temporal integer parameter */

/*    Two buffers for storing A' and T(= subblock of A') */
/*       WORK       <== A' */
/*       WORK(IPT)  <== T */

/*    Three interger buffers */
/*       IWORK(1,k) <== starting point of row subblock of A  to send and */
/*                      C to receive in K2 loop (rowwise communication) */
/*       IWORK(2,k) <== starting point of column subblock of A to send in */
/*                      J1 loop (columnwise communication) */
/*       IWORK(3,k) <== starting point of column subblock of C to receive */
/*                      in J1 loop (columnwise communication) */

/*  Requirements (approximate) */

/*   Size(IWORK) = 3 x MAX(P, Q) */
/*   Size(WORK)  = 2 x Ceil(Ceil(M,MB),LCM)xMB x Ceil(Ceil(N,NB),LCM)xNB */

/*     Get grid parameters */

    /* Parameter adjustments */
    a_dim1 = *lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    c_dim1 = *ldc;
    c_offset = 1 + c_dim1;
    c__ -= c_offset;
    --work;
    --iwork;

    /* Function Body */
    Cblacs_gridinfo(context_1.ictxt, &nprow, &npcol, &myrow, &mycol);

/*     Test for the input parameters. */

    info = 0;
    if (*trans != 'T' && *trans != 'C') {
	info = 1;
    } else if (*m < 0) {
	info = 2;
    } else if (*n < 0) {
	info = 3;
    } else if (*mb < 1) {
	info = 4;
    } else if (*nb < 1) {
	info = 5;
    } else if (*lda < 1) {
	info = 7;
    } else if (*ldc < 1) {
	info = 10;
    } else if (*imrow < 0 || *imrow >= nprow) {
	info = 11;
    } else if (*imcol < 0 || *imcol >= npcol) {
	info = 12;
    }

L10:
    if (info != 0) {
	pxerbla( &context_1.ictxt, "PDTRANS", &info );
	return 0;
    }

/*     Initialize parameters */

    mprow = nprow + myrow;
    mpcol = npcol + mycol;
    mrrow = (mprow - *imrow) % nprow;
    mrcol = (mpcol - *imcol) % npcol;

    lcm = ilcm_(&nprow, &npcol);
    lcmp = lcm / nprow;
    lcmq = lcm / npcol;
    igcd = nprow / lcmq;

    mp = numroc_(m, mb, &mrrow, &c__0, &nprow);
    mq = numroc_(m, mb, &mrcol, &c__0, &npcol);
    np = numroc_(n, nb, &mrrow, &c__0, &nprow);
    nq = numroc_(n, nb, &mrcol, &c__0, &npcol);

    i__1 = iceil_(m, mb);
    lbm = iceil_(&i__1, &lcm);
    i__1 = iceil_(n, nb);
    lbn = iceil_(&i__1, &lcm);

/*     Test for the input parameters again with local parameters */

    if (*lda < mp) {
	info = 7;
    } else if (*ldc < np) {
	info = 10;
    }
    if (info != 0) {
	goto L10;
    }

/*     Quick return if possible. */

    if (*m == 0 || *n == 0) {
	return 0;
    }

/*     At first, scale C with beta if beta != 0.0 & beta != 1.0 */

    tbeta = *beta;
    if (*beta != 0. && *beta != 1.) {
	i__1 = mq;
	for (j1 = 1; j1 <= i__1; ++j1) {
	    HPL_dscal( np, *beta, &c__[j1 * c_dim1 + 1], 1 );
/* L20: */
	}
	tbeta = 1.;
    }

    commtrb_1.iaz = lcmp * *mb;
    commtrb_1.jaz = lcmq * *nb;
    commtrb_1.itz = lcmp * *nb;
    commtrb_1.jtz = lcmq * *mb;

    ml = lbm * *mb;
    nl = lbn * *nb;
    ipt = (long)ml * (long)nl + 1;
    ldt = nl;
    kprow = mrrow + nprow;
    kpcol = mrcol + npcol;

/*     Initialize Parameters -- Compute the positions of subblocks */

    i__1 = npcol - 1;
    for (k1 = 0; k1 <= i__1; ++k1) {
	ncol = (kpcol - k1) % npcol;
	i__2 = lcmq - 1;
	for (j1 = 0; j1 <= i__2; ++j1) {
	    item = npcol * j1 + ncol;
	    if (item % nprow == mrrow) {
		iwork[ncol * 3 + 1] = item / nprow;
	    }
/* L30: */
	}
    }

    i__2 = lcmq - 1;
    for (j1 = 0; j1 <= i__2; ++j1) {
	item = (npcol * j1 + mrcol) % nprow;
	iwork[item * 3 + 2] = j1;
	iwork[item * 3 + 3] = j1;
	i__1 = igcd - 1;
	for (k1 = 1; k1 <= i__1; ++k1) {
	    iwork[(item + nprow - k1) % nprow * 3 + 2] = j1;
	    iwork[(item + k1) % nprow * 3 + 3] = j1;
/* L40: */
	}
    }

/*     Set parameters for efficient copying */

    lbm0 = lbm;
    lbm1 = lbm;
    lbm2 = lbm;
    lbn0 = lbn;
    lbn1 = lbn;
    lbn2 = lbn;
    mb0 = *mb;
    mb1 = *mb;
    mb2 = *mb;
    nb0 = *nb;
    nb1 = *nb;
    nb2 = *nb;

    if (nprow == npcol) {
	lbm0 = 1;
	lbn0 = 1;
	mb0 = mp;
	nb0 = nq;
    }
    if (nprow == lcm) {
	lbm1 = 1;
	lbn2 = 1;
	mb1 = mp;
	nb2 = np;
    }
    if (npcol == lcm) {
	lbn1 = 1;
	lbm2 = 1;
	nb1 = nq;
	mb2 = mq;
    }

/*     For each K2 loop (rowwise), Copy A' to WORK & Send it to KTPROC */
/*                                 then, Receive WORK and Copy WORK to C */

    kmod1 = (nprow + mrcol - mrrow) % igcd;
    kmod2 = (igcd - kmod1) % igcd;

    i__1 = lcmp - 1;
    for (k2 = 0; k2 <= i__1; ++k2) {

/*        Copy A' to WORK in the appropriate order & Send it */

	k1 = k2 * igcd + kmod1;
	mcol = (kpcol - k1) % npcol;
	kia = iwork[mcol * 3 + 1] * *mb;
	mcol = (mcol + *imcol) % npcol;
	ncol = (mrcol + k2 * igcd + kmod2) % npcol;
	kic = iwork[ncol * 3 + 1] * *nb;
	ncol = (ncol + *imcol) % npcol;

	i__2 = lcmq - 1;
	for (j1 = 0; j1 <= i__2; ++j1) {
	    kja = iwork[(mrrow + igcd * j1) % nprow * 3 + 2] * *nb;

	    if (myrow == (myrow + igcd * j1 + kmod1) % nprow && mycol == mcol)
		     {
		kjc = iwork[(kprow - igcd * j1) % nprow * 3 + 3] * *mb;
		i__3 = mp - kia;
		i__4 = nq - kja;
		dtr2mx_(&a[kia + 1 + (kja + 1) * a_dim1], lda, &tbeta, &c__[
			kic + 1 + (kjc + 1) * c_dim1], ldc, &lbm0, &lbn0, &
			mb0, &nb0, &i__3, &i__4);

	    } else {
		i__3 = mp - kia;
		i__4 = nq - kja;
		dtr2bf_(&a[kia + 1 + (kja + 1) * a_dim1], lda, &work[1], &ldt,
			 &lbm1, &lbn1, &mb1, &nb1, &i__3, &i__4);

		if (nprow == npcol && *beta == 0. && *ldc == ldt) {
		    i__3 = (myrow + igcd * j1 + kmod1) % nprow;
		    i__4 = (mprow - igcd * j1 - kmod2) % nprow;
		    kjc = iwork[(kprow - igcd * j1) % nprow * 3 + 3] * *mb;
#if 0
		    Cdgesd2d(context_1.ictxt,nl,ml,&work[1],nl,i__3,mcol);
		    Cdgerv2d(context_1.ictxt,nl,ml,&c__[(kjc + 1) * c_dim1 + 1],*ldc,i__4,ncol);
#else
		    Cblacs_dSendrecv( context_1.ictxt,
                          nl, ml, &work[1], nl, i__3, mcol,
                          nl, ml, &c__[(kjc + 1) * c_dim1 + 1], *ldc, i__4, ncol );
#endif

		} else {
		    i__3 = (myrow + igcd * j1 + kmod1) % nprow;
		    i__4 = (mprow - igcd * j1 - kmod2) % nprow;
#if 0
		    Cdgesd2d(context_1.ictxt,nl,ml,&work[1],nl,i__3,mcol);
		    Cdgerv2d(context_1.ictxt,nl,ml,&work[ipt],nl, i__4,ncol);
#else
        Cblacs_dSendrecv( context_1.ictxt,
                          nl, ml, &work[1],   nl, i__3, mcol,
                          nl, ml, &work[ipt], nl, i__4, ncol );
#endif

		    kjc = iwork[(kprow - igcd * j1) % nprow * 3 + 3] * *mb;
		    i__3 = np - kic;
		    i__4 = mq - kjc;
		    dmv2mx_(&work[ipt], &ldt, &tbeta, &c__[kic + 1 + (kjc + 1)
			     * c_dim1], ldc, &lbn2, &lbm2, &nb2, &mb2, &i__3,
			    &i__4);
		}
	    }
	}
    }

    return 0;
} /* pdtrans_ */
예제 #19
0
void pdgemm(char *TRANSA, char *TRANSB,
            int *M, int *N, int *K,
            double *ALPHA,
            double *a, int *IA, int *JA, int *DESCA,
            double *b, int *IB, int *JB, int *DESCB,
            double *BETA,
            double *c, int *IC, int *JC, int *DESCC)
{
   int m, n, k;
   int i, j, round;
   int r_rank, c_rank, g_rank, g_size;
   double *tempA, *tempB;
   double t_bc, t_calc, t_tot, t_bar;
   double tt_bc, tt_calc, tt_tot, tt_bar;
   double sum, g_sum;
   int roots;
   int pk;
   int nprow, npcol, myrow, mycol;
   MPI_Comm my_row_comm, my_column_comm;

   /* Get the rank of the global communicator */
   MPI_Comm_rank(MPI_COMM_WORLD, &g_rank);
   MPI_Comm_size(MPI_COMM_WORLD, &g_size);

   Cblacs_gridinfo( (ctxt = DESCC[DCTXT]), &nprow, &npcol, &myrow, &mycol);

   rows = nprow;
   columns = npcol;

   pk = PIPE_SIZE;

      m = DESCA[DM]/nprow;
      n = DESCB[DN]/nprow;
      k = DESCA[DN]/npcol; // local m n k size

      //      printf("nprow = %d, npcol = %d\n\n",nprow, npcol);

   MPI_Barrier(MPI_COMM_WORLD);
   //         printf("---step 1:myrow = %d, mycol = %d\n\n",myrow, mycol);
   
   /* Create my row and column communicators */   
   MPI_Comm_split(MPI_COMM_WORLD, mycol, myrow, &my_column_comm);
   MPI_Comm_split(MPI_COMM_WORLD, myrow, mycol, &my_row_comm);

   /* Get the rank of the local row communicator */
   MPI_Comm_rank(my_row_comm, &r_rank);
   /* Get the rank of the local column communicator */
   MPI_Comm_rank(my_column_comm, &c_rank);
   
   tempA = (double *) malloc (m*k * sizeof(double));
   tempB = (double *) malloc (k*n * sizeof(double));

   //const enum CBLAS_ORDER Order=CblasRowMajor;
   //const enum CBLAS_TRANSPOSE TA=CblasNoTrans;
   //const enum CBLAS_TRANSPOSE TB=CblasNoTrans;

   sum = g_sum = 0.0;
   tt_bc = tt_calc = tt_tot = t_bar = 0.0;
   tt_tot = tt_bc = tt_calc = tt_bar = 0.0;

   if(TIMER)
      t_tot = MPI_Wtime();

   /* FT part, sum of A columnwise and B rowwise */
   // A part //
   if(mycol < 7)
   {
     if(myrow < 7)
     {
       for(i=0; i<m; i++)
	 for(j=0; j<k; j++)
	   {
	     tempA[i*k+j] = a[i*k+j];
	   }
     }
     //     if(mycol == 6)
     //  printf("a = %f\n", a[0]);
     MPI_Reduce(tempA, a, m*k, MPI_DOUBLE, MPI_SUM, 7, my_column_comm);
     //if(mycol == 6 && myrow == 7)
     //  printf("A = %f\n", a[0]);
   }

   if(myrow < 7)
   {
     if(mycol < 7)
       {
	 for(i=0; i<k; i++)
	   for(j=0; j<n; j++)
	     {
	       tempB[i*n+j] = b[i*n+j];
	     }
       }
     MPI_Reduce(tempB, b, k*n, MPI_DOUBLE, MPI_SUM, 7, my_row_comm);
   }
				

   //   printf("rows = %d, columns = %d\n",rows, columns);
   //   printf("---step 3:myrow = %d, mycol = %d\n\n",myrow, mycol);         
     for(round=0; round<nprow; round++)
      {
     
	//	if(myrow == mycol)
	//	printf("round = %d\n", round);
	//      if(myrow == 5 || mycol == 5)
           

	roots = round;

         if(r_rank == roots)
         {
              for(i=0; i<m; i++)
	        for(j=0; j<k; j++)
		{
		  tempA[i*k+j] = a[i*k+j];
		}
	 }

         if(c_rank == roots)
         {
              for(i=0; i<k; i++)
	        for(j=0; j<n; j++)
		{
		  tempB[i*n+j] = b[i*n+j];
		}
         }


         if(TIMER)
            t_bc = MPI_Wtime();

         /* Broadcast to right */
         if(PIPE)
	 {
              MY_Bcast(tempA, m*k,  MPI_DOUBLE, roots, my_row_comm, r_rank, columns, pk);
	 }
         else
         {
              MPI_Bcast(tempA, k*m, MPI_DOUBLE, roots, my_row_comm);
	 }

	 //   if(c_rank == 5)
	 //    printf("round = %d, :myrow = %d, mycol = %d\n\n", round, myrow, mycol);         
	 
         /* Broadcast below */
         
         if(PIPE)
	 {
              MY_Bcast(tempB, k*n, MPI_DOUBLE, roots, my_column_comm, c_rank, rows, pk);
	 }
         else
	 {
               MPI_Bcast(tempB, n*k, MPI_DOUBLE, roots, my_column_comm);
         }
	 
         if(TIMER)
         {
            t_bc = MPI_Wtime() - t_bc;
            tt_bc += t_bc;
            t_calc = MPI_Wtime();
         }

         if(DVFS_ENABLE) 
            mapping(g_rank%rows, DVFS_HIGH);

         /* Do the multiplication */
         dgemm_(TRANSA, TRANSB, &m, &n, &k, ALPHA, tempA, &m, tempB, &k, BETA, c, &m);

         if(DVFS_ENABLE) 
            mapping(g_rank%rows, DVFS_LOW);

         if(TIMER)
         { 
            t_calc = MPI_Wtime() - t_calc;
            tt_calc += t_calc;
         }
      }
      

   if(TIMER)
   {
      t_tot = MPI_Wtime() - t_tot;
      tt_tot += t_tot;
   
      MPI_Reduce(&tt_tot, &t_tot, 1, MPI_DOUBLE, MPI_MAX,  0, MPI_COMM_WORLD);
      MPI_Reduce(&tt_bc, &t_bc, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
      MPI_Reduce(&tt_calc, &t_calc, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

   
      if(g_rank == 0)
         printf("tot = %10.6f, bc = %10.6f, calc = %10.6f\n", t_tot, t_bc, t_calc);
   }

   //	if(myrow==0 && mycol==0) 
   //   	  printf("myrow = %d, mycol = %d\n\n",myrow, mycol);

   //   printf("Test0\n");
   MPI_Barrier(MPI_COMM_WORLD);

   free(tempA); free(tempB);
   MPI_Comm_free(&my_row_comm); 
   MPI_Comm_free(&my_column_comm);

   //   printf("pdgemm done!\n");

   return;
}
예제 #20
0
void pdgemm(char *TRANSA, char *TRANSB,
            int *M, int *N, int *K,
            double *ALPHA,
            double *a, int *IA, int *JA, int *DESCA,
            double *b, int *IB, int *JB, int *DESCB,
            double *BETA,
            double *c, int *IC, int *JC, int *DESCC)
{
   // M = *, N = *, K = 32 //
   int m, n, k;
   int i, j, round;
   int r_rank, c_rank, g_rank, g_size;
   double *tempA, *tempB;
   double t_bc, t_calc, t_tot, t_bar;
   double tt_bc, tt_calc, tt_tot, tt_bar;
   double sum, g_sum;
   int roots;
   int pk;
   int nprow, npcol, myrow, mycol;
   MPI_Comm my_row_comm, my_column_comm;

   /* Get the rank of the global communicator */
   MPI_Comm_rank(MPI_COMM_WORLD, &g_rank);
   MPI_Comm_size(MPI_COMM_WORLD, &g_size);

   Cblacs_gridinfo( (ctxt = DESCC[DCTXT]), &nprow, &npcol, &myrow, &mycol);

   rows = nprow;
   columns = npcol;

   pk = PIPE_SIZE;

      m = DESCA[DM]/nprow;
      n = DESCB[DN]/nprow;
   // only first column and first row has data, block = 32
      k = DESCA[DN]; // k = 32 or 0 for others

   MPI_Barrier(MPI_COMM_WORLD);

   /* Create my row and column communicators */   
   MPI_Comm_split(MPI_COMM_WORLD, mycol, myrow, &my_column_comm);
   MPI_Comm_split(MPI_COMM_WORLD, myrow, mycol, &my_row_comm);

   /* Get the rank of the local row communicator */
   MPI_Comm_rank(my_row_comm, &r_rank);
   /* Get the rank of the local column communicator */
   MPI_Comm_rank(my_column_comm, &c_rank);
   
   tempA = (double *) malloc (m*k * sizeof(double));
   tempB = (double *) malloc (k*n * sizeof(double));

   //const enum CBLAS_ORDER Order=CblasRowMajor;
   //const enum CBLAS_TRANSPOSE TA=CblasNoTrans;
   //const enum CBLAS_TRANSPOSE TB=CblasNoTrans;

   sum = g_sum = 0.0;
   tt_bc = tt_calc = tt_tot = t_bar = 0.0;
   tt_tot = tt_bc = tt_calc = tt_bar = 0.0;

   if(TIMER)
      t_tot = MPI_Wtime();

      round = 0; // only first round has data to calculate
      //for(round=0; round<nprow; round++)
      {
         roots = round;

         if(r_rank == roots)
         {
              for(i=0; i<m; i++)
	        for(j=0; j<k; j++)
		{
		  tempA[i*k+j] = a[i*k+j];
		}
	 }

         if(c_rank == roots)
         {
              for(i=0; i<k; i++)
	        for(j=0; j<n; j++)
		{
		  tempB[i*n+j] = b[i*n+j];
		}
         }


         if(TIMER)
            t_bc = MPI_Wtime();

         /* Broadcast to right */
         if(PIPE)
	 {
              MY_Bcast(tempA, m*k,  MPI_DOUBLE, roots, my_row_comm, r_rank, columns, pk);
	 }
         else
         {
              MPI_Bcast(tempA, k*m, MPI_DOUBLE, roots, my_row_comm);
	 }
	 
         /* Broadcast below */
         
         if(PIPE)
	 {
              MY_Bcast(tempB, k*n, MPI_DOUBLE, roots, my_column_comm, c_rank, rows, pk);
	 }
         else
	 {
               MPI_Bcast(tempB, n*k, MPI_DOUBLE, roots, my_column_comm);
         }
	 
         if(TIMER)
         {
            t_bc = MPI_Wtime() - t_bc;
            tt_bc += t_bc;
            t_calc = MPI_Wtime();
         }

         if(DVFS_ENABLE) 
            mapping(g_rank%8, DVFS_HIGH);

         /* Do the multiplication */
         dgemm_(TRANSA, TRANSB, &m, &n, &k, ALPHA, tempA, &m, tempB, &k, BETA, c, &m);

         if(DVFS_ENABLE) 
            mapping(g_rank%8, DVFS_LOW);

         if(TIMER)
         { 
            t_calc = MPI_Wtime() - t_calc;
            tt_calc += t_calc;
         }
      }
         
   if(TIMER)
   {
      t_tot = MPI_Wtime() - t_tot;
      tt_tot += t_tot;
   
      MPI_Reduce(&tt_tot, &t_tot, 1, MPI_DOUBLE, MPI_MAX,  0, MPI_COMM_WORLD);
      MPI_Reduce(&tt_bc, &t_bc, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
      MPI_Reduce(&tt_calc, &t_calc, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

   
      if(g_rank == 0)
         printf("tot = %10.6f, bc = %10.6f, calc = %10.6f\n", t_tot, t_bc, t_calc);
   }

   MPI_Barrier(MPI_COMM_WORLD);

   free(tempA); free(tempB);
   MPI_Comm_free(&my_row_comm); 
   MPI_Comm_free(&my_column_comm);
}
예제 #21
0
파일: fnlsdp.c 프로젝트: operedo/fnlsdp
int main(int argc, char *argv[]){
	gettimeofday(&tp, NULL);
  	starttime=(double)tp.tv_sec+(1.e-6)*tp.tv_usec;

	int id, np, ret;
	/* dirs, files, tags ...*/
	const char* dir="../data/compleib_data/";
	const char* dirinit="../data/initial_points/";
	const char* code="REA1";
	const char* tag="01";
	char* solutionQP="solutionQP.dat-s";


	/*algorithm vars*/
	int max_iter=10000,step_1_fail=20;	
	double romax=10.0,beta=0.9,gamma=0.1,sigma=0.5,tolerancia_ro=10e-4;
	
	/* compleib data matrices */
	genmatrix A, B1, B,C1, C,D11,D12,D21;
	
	/* compleib initial point */
	genmatrix F0, Q0, V0;
	
	/* compleib matrix dimensions*/
	int nx,nw,nu,nz,ny;	
	int i,n,k;
	double rho=100.0;
	
	/* auxiliar matrices */
	genmatrix AF, CF, OUT, MF1, MF2;
	struct blockmatrix diagMF1, diagMF2;
	
	/* filter SDP */
	filter Fil;
	//double beta=0.9;
	//double gamma=0.1;

	/* sdp problem data */
	struct blockmatrix Csdp;
  	double *asdp;
  	struct constraintmatrix *constraintssdp;
  	
	/* sdp variables */
	struct blockmatrix X,Z;
  	double *y;
	
	/* sdp value of objectives functions */
  	double pobj,dobj;
	genmatrix *null=NULL;
	
	 /* Initialize the process grid */
	struct scalapackpar scapack;
  	struct paramstruc params;
  	int printlevel;

	/*load compleib data*/
	//int size=10974;
	//load_genmatrix("testeigenvalue/bcsstk17.dat",&A,size,size,0);
	//load_compleib(code, dir, &A, &B1, &B, &C1, &C, &D11, &D12, &D21, &nx, &nw, &nu, &nz, &ny, id);
	//load_initial_point(code,tag,dirinit,&F0,&Q0,&V0,nx,nu,ny, id);
	
	
	//initialize_filter(&Fil,500.0,500.0);
	
	
	MPI_Init(&argc,&argv);
  	MPI_Comm_rank (MPI_COMM_WORLD,&id);
  	MPI_Comm_size (MPI_COMM_WORLD,&np);
	scapack.id = id;
  	scapack.np = np;

  	switch (scapack.np)
        {
        	case 1: scapack.nprow=1; scapack.npcol=1;
          		break;
        	case 2: scapack.nprow=2; scapack.npcol=1;
          		break;
        	case 4: scapack.nprow=2; scapack.npcol=2;
          		break;
        	case 8: scapack.nprow=4; scapack.npcol=2;
          		break;
        	case 16: scapack.nprow=4; scapack.npcol=4;
          		break;
        	case 32: scapack.nprow=8; scapack.npcol=4;
          		break;
        	case 64: scapack.nprow=8; scapack.npcol=8;
          		break;
        	default:
          		if (scapack.id==0)
              			printf("Can not setup %d processors to a grid.\nPlease use 1,2,4,8,9,16,32 or 64 nodes to run or modify fnlsdp.c file. \n",scapack.np);
          		MPI_Finalize();
          		return(1);
	};
   	
	sl_init_(&scapack.ic,&scapack.nprow,&scapack.npcol);
	Cblacs_gridinfo(scapack.ic,&scapack.nprow,&scapack.npcol,&scapack.myrow,&scapack.mycol);
	
	/*
	if(id==0)print_filter(&Fil);
	if(acceptable(&Fil,10.0,3.0,beta,gamma)) {printf("acceptable!\n");add(&Fil,10.0,3.0);}
	if(id==0)print_filter(&Fil);
	if(acceptable(&Fil,8.0,3.1,beta,gamma)) {printf("acceptable!\n");add(&Fil,8.0,3.1);}
	if(id==0)print_filter(&Fil);
	if(acceptable(&Fil,6.0,3.2,beta,gamma)) {printf("acceptable!\n");add(&Fil,6.0,3.2);}
	if(id==0)print_filter(&Fil);
	extract(&Fil,10.0,3.0);
	if(id==0)print_filter(&Fil);
	extract(&Fil,8.0,3.1);
	if(id==0)print_filter(&Fil);
	*/	


//printf("scapack: %d,%d,%d,%d,%d\n",scapack.ic,scapack.npcol,scapack.nprow,scapack.mycol,scapack.myrow);

	
	//printf("f=%f\ntheta=%f\n",eval_f(&F0,&Q0, &V0,rho,&A,&B1,&B,&C1,&C,&D11,&D12,&D21,nx,nw,nu,ny,nz,scapack,params,printlevel,id),eval_theta(&F0,&Q0, &V0,rho,&A,&B1,&B,&C1,&C,&D11,&D12,&D21,nx,nw,nu,ny,nz,scapack,params,printlevel,id));
//printf("scapack: %d,%d,%d,%d,%d\n",scapack.ic,scapack.npcol,scapack.nprow,scapack.mycol,scapack.myrow);

	//test_nelmin(&F0,&Q0, &V0,rho,&A,&B1,&B,&C1,&C,&D11,&D12,&D21,nx,nw,nu,ny,nz,scapack,params,printlevel,id);

	//fix_genmatrix(&Q0);
	//print_genmatrix(&V0);

	/*double ll=lambda1(&A,size,scapack,params,printlevel,id);	
	if(id==0)printf("lambda1=%f\n",ll);
	MPI_Barrier(MPI_COMM_WORLD);
	free_mat_gen(&A,0);*/

//printf("scapack: %d,%d,%d,%d,%d\n",scapack.ic,scapack.npcol,scapack.nprow,scapack.mycol,scapack.myrow);


	algorithm(code,tag,dir,dirinit,max_iter,romax,beta,gamma,sigma,tolerancia_ro,step_1_fail,scapack,params,printlevel,id);


	/*
	double *dx=(double *)calloc(nu*ny+nx*(nx+1),sizeof(double));	
	double *x_current=(double *)calloc(nu*ny+nx*(nx+1),sizeof(double));	

	printf("holaaaa\n");

	mats2vec(dx,&F0,&Q0,&V0,nu,ny,nx);
	
	for(i=0;i<nu*ny+nx*(nx+1);i++){
		printf("dx[%d]=%f\n",i,dx[i]);
	}

	mats2vec(x_current,&F0,&Q0,&V0,nu,ny,nx);
	
	for(i=0;i<nu*ny+nx*(nx+1);i++){
		printf("x_current[%d]=%f\n",i,x_current[i]);
	}
	
	printf("fobj=%f\n",eval_nabla_f_vec(dx,x_current, rho,&A,&B1,&B,&C1,&C,&D11,&D12,&D21,nx,nw,nu,ny,nz,scapack,params,printlevel,id));
	
	free(dx);
	free(x_current);
	*/

	/*solve qp*/
	//ret=0;
	//fix_genmatrix(&Q0);
	//ret=solve_qp(code,"01", &F0,&Q0, &V0,rho,&A,&B1,&B,&C1,&C,&D11,&D12,&D21,nx,nw,nu,ny,nz,scapack,params,printlevel,id,solutionQP);
//printf("scapack: %d,%d,%d,%d,%d\n",scapack.ic,scapack.npcol,scapack.nprow,scapack.mycol,scapack.myrow);
	
	/*if(DEBUG_FNLSDP && id==0){
		printf("F1:\n");
		print_genmatrix(&F0);
		printf("Q1:\n");
		print_genmatrix(&Q0);
		printf("V1:\n");
		print_genmatrix(&V0);
	}*/
	
	
	//free_filter(&Fil);
	//free_initial_point(&F0,&Q0,&V0, np);
	//free_compleib(&A, &B1, &B, &C1, &C, &D11, &D12, &D21, np);
  	
	Cblacs_gridexit(scapack.ic);
	MPI_Finalize();
        gettimeofday(&tp, NULL);
  	endtime=(double)tp.tv_sec+(1.e-6)*tp.tv_usec;
  	totaltime=endtime-starttime;
  	othertime=totaltime-opotime-factortime;
	if(id==0){
  		printf("Elements time: %f \n",opotime);
  		printf("Factor time: %f \n",factortime);
  		printf("Other time: %f \n",othertime);
  		printf("Total time: %f \n",totaltime);
	}
	return ret;
}
예제 #22
0
파일: mat_redist.c 프로젝트: operedo/fnlsdp
void 
ddistr(int ictxt, int n, int numc, int nb, double *A , double *A_d, int *descAd ){

int RootNodeic, ione=1, izero=0, isRootNode=0, nru, info;
int nprow, npcol, myrow, mycol,descA[9], itemp;
int i,k;

/*
#ifdef NOUNDERLAPACK
 sl_init__(&RootNodeic,&ione, &ione);
#else
  sl_init__(&RootNodeic,&ione, &ione);
#endif
*/



   
  sl_init_(&RootNodeic,&ione, &ione);
  
  Cblacs_gridinfo(ictxt, &nprow, &npcol, &myrow, &mycol);  


	//printf("nprow=%d, npcol=%d, myrow=%d, mycol=%d\n",nprow,npcol,myrow,mycol);
	//printf("nb=%d\n",nb);


  if (myrow==0 && mycol==0) { isRootNode=1;}

  if (isRootNode){
     //printf("root entro aca...\n");
	nru = numroc_(&n, &n, &myrow,&izero, &nprow );
     //printf("root paso numroc\n");
	itemp = max(1, nru);
     descinit_(descA, &n, &numc, &n, &n, &izero, &izero, &RootNodeic, &itemp, &info);
  	//printf("root paso descinit\n");
  } 
  else{
     //printf("yo entre aca\n");
     k=0;
     for(i=0;i<9;i++){ 
       descA[k]=0;
       k++;
     }
     descA[1]=-1;
  }

  //printf("inicio de cosas para todos\n");
  nru = numroc_(&n, &nb, &myrow, &izero, &nprow);
  //printf("todos pasan numroc\n");
  itemp = max(1,nru);
  descinit_(descAd, &n, &numc, &nb, &nb, &izero, &izero, &ictxt,&itemp, &info);  
  //printf("todos pasan descinit\n");

  pdgemr2d_( &n, &numc, A, &ione, &ione, descA, A_d, &ione, &ione, descAd, &ictxt);
  //printf("todos pasan pdgemr2d\n");
  
 if (isRootNode){ 
     	//printf("RootNodeic=%d\n",RootNodeic);
	Cblacs_gridexit(RootNodeic);
	//printf("root paso gridexit\n");
  }
}
예제 #23
0
파일: cosdist.c 프로젝트: kouamano/utility
int MAIN__(int argc, char** argv) {
    int num;  // number of data
    int dim;  // dimension of each data
    int nprow=4; // number of row
    int npcol=1;  // number of columnn
    int zero=0, one=1; // constant value
    int ictxt,myrow,mycol,pnum,pdim,info;
    char ifilename[LEN_FILENAME];
    char ofilename[LEN_FILENAME];

    int myproc, nprocs;
    Cblacs_pinfo(&myproc, &nprocs);
    Cblacs_setup(&myproc, &nprocs);
    Cblacs_get(-1,0,&ictxt);
    nprow = nprocs;
    npcol = 1; // fixed

    char order[] = "Row";
    Cblacs_gridinit(&ictxt, order, nprow, npcol);
    Cblacs_gridinfo(ictxt, &nprow, &npcol, &myrow, &mycol);

    if (DEBUG_MODE) {
        printf("ConTxt = %d\n", ictxt);
        printf("nprocs=%d, nprow=%d, npcol=%d\n", nprocs, nprow, npcol);
        printf("nprocs=%d, myrow=%d, mycol=%d\n", nprocs, myrow, mycol);
    }

    get_option(argc, argv, ifilename, ofilename, &num, &dim);

    // 0. cosinedist(ij) = 1 - V(i)V(j)/(Length(V(i))*Length(V(j)))

    // 1. calculate submatrix size
    int bsize = num / nprow; // blocking factor
    pnum = num / nprow;
    pdim = dim;
    if ( myrow < (num/bsize)%nprow) {
        pnum += bsize;
    }
    else if ( myrow == (num/bsize)%nprow) {
        pnum += (num % bsize);
    }
    else {
    }
    if(DEBUG_MODE)
        printf("myproc=%d: pnum=%d, pdim=%d, bsize=%d\n", myproc, pnum, pdim, bsize);

    int desc_input[9], desc_v[9], desc_ip[9], desc_n[9], desc_result[9];
    descinit_(desc_input,  &num, &dim, &num,   &dim,  &zero, &zero, &ictxt, &num,  &info);
    descinit_(desc_v,      &num, &dim, &bsize, &pdim, &zero, &zero, &ictxt, &pnum, &info);
    descinit_(desc_ip,     &num, &num, &bsize, &num,  &zero, &zero, &ictxt, &pnum, &info);
    descinit_(desc_n,      &num, &one, &bsize, &one,  &zero, &zero, &ictxt, &pnum, &info);
    descinit_(desc_result, &num, &num, &num,   &num,  &zero, &zero, &ictxt, &num,  &info);

    // 2. read input data
    double* input;
    if (myproc == 0) {
        input = (double*)malloc(sizeof(double)*num*dim);
        memset(input, 0, sizeof(double)*num*dim);
        read_data(ifilename, num, dim, input);
        printArray("input", myproc, input, num, dim);
    }

    // 3. distribute input data array
    double* V = (double*)malloc(sizeof(double)*pnum*pdim);
    memset(V, 0, sizeof(double)*pnum*pdim);
    Cpdgemr2d(num, dim, input, 1, 1, desc_input, V, 1, 1, desc_v, ictxt);
    printArray("V", myproc, V, pnum, pdim);

    // 4. InnerProduct = VV'
    double* InnerProduct = (double*)malloc(sizeof(double)*pnum*num);
    memset(InnerProduct, 0, sizeof(double)*pnum*num);
    char transa = 'N', transb = 'T';
    int m = num, n = num, k = dim;
    int lda = num, ldb = num, ldc = num;
    double alpha = 1.0f, beta = 0.0f;
    pdgemm_(&transa, &transb, &m, &n, &k, &alpha, V, &one, &one, desc_v, V, &one, &one, desc_v, &beta, InnerProduct, &one, &one, desc_ip);
    printArray("InnerProduct", myproc, InnerProduct, pnum, num);

    // 5. Norm of each vector
    double* Norm = (double*)malloc(sizeof(double)*pnum);
    for (int i = 0; i < pnum; i++) {
        int n = ((myproc*bsize)+(i/bsize)*(nprocs-1)*bsize+i)*pnum + i;
        Norm[i] = sqrt(InnerProduct[n]);
    }
    printArray("Norm", myproc, Norm, 1, pnum);

    // 6. Norm product matrix
    double* NormProduct = (double*)malloc(sizeof(double)*pnum*num);
    memset(NormProduct, 0, sizeof(double)*pnum*num);
    char uplo = 'U';
    n = num;
    alpha = 1.0f;
    int incx = 1;
    lda = num;
    pdsyr_(&uplo, &n, &alpha, Norm, &one, &one, desc_n, &incx, NormProduct, &one, &one, desc_ip);
    printArray("NormProduct", myproc, NormProduct, pnum, num);

    // 7. CosineDistance(ij) = 1-InnerProduct(ij)/NormProduct(ij)
    double* CosineDistance = (double*)malloc(sizeof(double)*pnum*num);
    memset(CosineDistance, 0, sizeof(double)*pnum*num);
    for (int j = 0; j < num; j++) {
        for (int i = 0; i < pnum; i++) {
            int n = ((myproc*bsize)+i+(i/bsize)*(nprocs-1)*bsize)*pnum+i;
            int p = i+j*pnum;
            if (p<=n) {
                CosineDistance[p] = 0.0;
            }
            else {
                CosineDistance[p] = 1 - InnerProduct[p]/NormProduct[p];
            }
        }
    }
    printArray("CosineDistance", myproc, CosineDistance, pnum, num);

    // 8. gather result
    double* result;
    if ( myproc == 0 ) {
        result = (double*)malloc(sizeof(double)*num*num);
        memset(result, 0, sizeof(double)*num*num);
    }
    Cpdgemr2d(num, num, CosineDistance, 1, 1, desc_ip, result, 1, 1, desc_result, ictxt);

    // 9. output to file
    if ( myproc == 0 ) {
        output_results(ofilename, result, num, num);
    }

    // a. cleanup memory
    free(V);
    free(InnerProduct);
    free(Norm);
    free(NormProduct);
    free(CosineDistance);
    if ( myproc == 0 ) {
        free(input);
        free(result);
    }

    blacs_exit_(&zero);

    return 0;
}
예제 #24
0
int main(int argc, char **argv) {
        int iam, nprocs;
        int myrank_mpi, nprocs_mpi;
        int ictxt, nprow, npcol, myrow, mycol;
        int nb, m, n;
        int mpA, nqA, mpU, nqU, mpVT, nqVT;
        int i, j, k, itemp, min_mn;
        int descA[9], descU[9], descVT[9];
        float *A=NULL;
        int info, infoNN, infoVV, infoNV, infoVN;
        float *U_NN=NULL,  *U_VV=NULL,  *U_NV=NULL,  *U_VN=NULL;
        float *VT_NN=NULL, *VT_VV=NULL, *VT_NV=NULL, *VT_VN=NULL;
        float *S_NN=NULL,  *S_VV=NULL, *S_NV=NULL, *S_VN=NULL;
        float *S_res_NN=NULL;
        float orthU_VV, residF, orthVT_VV;
        float orthU_VN, orthVT_NV;
        float  residS_NN, eps;
        float  res_repres_NV, res_repres_VN;
/**/
        int izero=0,ione=1;
        float rtmone=-1.0e+00;
/**/
        double MPIelapsedVV, MPIelapsedNN, MPIelapsedVN, MPIelapsedNV;
        char jobU, jobVT;
        int nbfailure=0, nbtestcase=0,inputfromfile, nbhetereogeneity=0;
        float threshold=100e+00;
        char buf[1024];
        FILE *fd;       
        char *c;
        char *t_jobU, *t_jobVT;
        int *t_m, *t_n, *t_nb, *t_nprow, *t_npcol;
        int nb_expe, expe;
        char hetereogeneityVV, hetereogeneityNN, hetereogeneityVN, hetereogeneityNV;
        int iseed[4], idist;
/**/
        MPI_Init( &argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &myrank_mpi);
        MPI_Comm_size(MPI_COMM_WORLD, &nprocs_mpi);
/**/
        m = 100; n = 100; nprow = 1; npcol = 1; nb = 64; jobU='A'; jobVT='A'; inputfromfile = 0;
        for( i = 1; i < argc; i++ ) {
                if( strcmp( argv[i], "-f" ) == 0 ) {
                        inputfromfile = 1;
                }
                if( strcmp( argv[i], "-jobvt" ) == 0 ) {
                        if (i+1<argc) {
                                if( strcmp( argv[i+1], "V" ) == 0 ){ jobVT = 'V'; i++; }
                                else if( strcmp( argv[i+1], "N" ) == 0 ){ jobVT = 'N'; i++; }
                                else if( strcmp( argv[i+1], "A" ) == 0 ){ jobVT = 'A'; i++; }
                                else printf(" ** warning: jobvt should be set to V, N or A in the command line ** \n");
                        }
                        else    
                                printf(" ** warning: jobvt should be set to V, N or A in the command line ** \n");
                }
                if( strcmp( argv[i], "-jobu" ) == 0 ) {
                        if (i+1<argc) {
                                if( strcmp( argv[i+1], "V" ) == 0 ){ jobU = 'V'; i++; }
                                else if( strcmp( argv[i+1], "N" ) == 0 ){ jobU = 'N'; i++; }
                                else if( strcmp( argv[i+1], "A" ) == 0 ){ jobU = 'A'; i++; }
                                else printf(" ** warning: jobu should be set to V, N or A in the command line ** \n");
                        }
                        else    
                                printf(" ** warning: jobu should be set to V, N or A in the command line ** \n");
                }
                if( strcmp( argv[i], "-m" ) == 0 ) {
                        m      = atoi(argv[i+1]);
                        i++;
                }
                if( strcmp( argv[i], "-n" ) == 0 ) {
                        n      = atoi(argv[i+1]);
                        i++;
                }
                if( strcmp( argv[i], "-p" ) == 0 ) {
                        nprow  = atoi(argv[i+1]);
                        i++;
                }
                if( strcmp( argv[i], "-q" ) == 0 ) {
                        npcol  = atoi(argv[i+1]);
                        i++;
                }
                if( strcmp( argv[i], "-nb" ) == 0 ) {
                        nb     = atoi(argv[i+1]);
                        i++;
                }
        }
/**/
        if (inputfromfile){
                nb_expe = 0;
                fd = fopen("svd.dat", "r");
                if (fd == NULL) { printf("File failed to open svd.dat from processor mpirank(%d/%d): \n",myrank_mpi,nprocs_mpi); exit(-1); }
                do {    
                        c = fgets(buf, 1024, fd);  /* get one line from the file */
                        if (c != NULL)
                                if (c[0] != '#')
                                        nb_expe++;
                } while (c != NULL);              /* repeat until NULL          */
                fclose(fd);
                t_jobU  = (char *)calloc(nb_expe,sizeof(char)) ;
                t_jobVT = (char *)calloc(nb_expe,sizeof(char)) ;
                t_m     = (int  *)calloc(nb_expe,sizeof(int )) ;
                t_n     = (int  *)calloc(nb_expe,sizeof(int )) ;
                t_nb    = (int  *)calloc(nb_expe,sizeof(int )) ;
                t_nprow = (int  *)calloc(nb_expe,sizeof(int )) ;
                t_npcol = (int  *)calloc(nb_expe,sizeof(int )) ;
                fd = fopen("svd.dat", "r");
                expe=0;
                do {    
                        c = fgets(buf, 1024, fd);  /* get one line from the file */
                        if (c != NULL)
                                if (c[0] != '#'){
                                        //printf("NBEXPE = %d\n",expe);
                                        sscanf(c,"%c %c %d %d %d %d %d",
                                                &(t_jobU[expe]),&(t_jobVT[expe]),&(t_m[expe]),&(t_n[expe]),
                                                &(t_nb[expe]),(&t_nprow[expe]),&(t_npcol[expe]));
                                        expe++;
                                }
                } while (c != NULL);              /* repeat until NULL          */
                fclose(fd);
        }
        else {
                nb_expe = 1;
                t_jobU  = (char *)calloc(nb_expe,sizeof(char)) ;
                t_jobVT = (char *)calloc(nb_expe,sizeof(char)) ;
                t_m     = (int  *)calloc(nb_expe,sizeof(int )) ;
                t_n     = (int  *)calloc(nb_expe,sizeof(int )) ;
                t_nb    = (int  *)calloc(nb_expe,sizeof(int )) ;
                t_nprow = (int  *)calloc(nb_expe,sizeof(int )) ;
                t_npcol = (int  *)calloc(nb_expe,sizeof(int )) ;
                t_jobU[0]  = jobU;
                t_jobVT[0] = jobVT;
                t_m[0]     = m;
                t_n[0]     = n;
                t_nb[0]    = nb;
                t_nprow[0] = nprow;
                t_npcol[0] = npcol;
        }

        if (myrank_mpi==0){
                printf("\n");
                printf("--------------------------------------------------------------------------------------------------------------------\n");
                                printf("                            Testing psgsevd -- float precision SVD ScaLAPACK routine                \n");
                printf("jobU jobVT    m     n     nb   p   q   || info   heter   resid     orthU    orthVT   |SNN-SVV|    time(s)   cond(A) \n");
                printf("--------------------------------------------------------------------------------------------------------------------\n");
        }
/**/
        for (expe = 0; expe<nb_expe; expe++){

        jobU  = t_jobU[expe]  ; 
        jobVT = t_jobVT[expe] ; 
        m     = t_m[expe]     ; 
        n     = t_n[expe]     ; 
        nb    = t_nb[expe]    ; 
        nprow = t_nprow[expe] ; 
        npcol = t_npcol[expe] ; 

        if (nb>n)
                nb = n;
        if (nprow*npcol>nprocs_mpi){
                if (myrank_mpi==0)
                        printf(" **** ERROR : we do not have enough processes available to make a p-by-q process grid ***\n");
                        printf(" **** Bye-bye                                                                         ***\n");
                MPI_Finalize(); exit(1);
        }
/**/
        Cblacs_pinfo( &iam, &nprocs ) ;
        Cblacs_get( -1, 0, &ictxt );
        Cblacs_gridinit( &ictxt, "Row", nprow, npcol );
        Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol );
/**/
        min_mn = min(m,n);
/**/
        //if (iam==0)
                //printf("\tm=%d\tn = %d\t\t(%d,%d)\t%dx%d\n",m,n,nprow,npcol,nb,nb);
        //printf("Hello World, I am proc %d over %d for MPI, proc %d over %d for BLACS in position (%d,%d) in the process grid\n", 
                        //myrank_mpi,nprocs_mpi,iam,nprocs,myrow,mycol);
/*
*
*     Work only the process in the process grid
*
*/
        //if ((myrow < nprow)&(mycol < npcol)){
        if ((myrow>-1)&(mycol>-1)&(myrow<nprow)&(mycol<npcol)){

/*
*
*     Compute the size of the local matrices (thanks to numroc)
*
*/ 
                mpA    = numroc_( &m     , &nb, &myrow, &izero, &nprow );
                nqA    = numroc_( &n     , &nb, &mycol, &izero, &npcol );
                mpU    = numroc_( &m     , &nb, &myrow, &izero, &nprow );
                nqU    = numroc_( &min_mn, &nb, &mycol, &izero, &npcol );
                mpVT   = numroc_( &min_mn, &nb, &myrow, &izero, &nprow );
                nqVT   = numroc_( &n     , &nb, &mycol, &izero, &npcol );
/*
*
*     Allocate and fill the matrices A and B
*
*/ 
                A = (float *)calloc(mpA*nqA,sizeof(float)) ;
                if (A==NULL){ printf("error of memory allocation A on proc %dx%d\n",myrow,mycol); exit(0); }
/**/            
//              seed = iam*(mpA*nqA*2); srand(seed);
                idist = 2;
                iseed[0] = mpA%4096;
                iseed[1] = iam%4096;
                iseed[2] = nqA%4096;
                iseed[3] = 23;
/**/            
                k = 0;
                for (i = 0; i < mpA; i++) {
                        for (j = 0; j < nqA; j++) {
                                slarnv_( &idist, iseed, &ione, &(A[k]) );
                                k++;    
                        }
                }
/*
*
*     Initialize the array descriptor for the distributed matrices xA, U and VT
*
*/ 
                itemp = max( 1, mpA );
                descinit_( descA,  &m, &n, &nb, &nb, &izero, &izero, &ictxt, &itemp, &info );
                itemp = max( 1, mpA );
                descinit_( descU,  &m, &min_mn, &nb, &nb, &izero, &izero, &ictxt, &itemp, &info );
                itemp = max( 1, mpVT );
                descinit_( descVT, &min_mn, &n, &nb, &nb, &izero, &izero, &ictxt, &itemp, &info );
/**/
                eps = pslamch_( &ictxt, "Epsilon" );
/**/
                if ( ((jobU=='V')&(jobVT=='N')) ||(jobU == 'A' )||(jobVT=='A')){
                nbtestcase++;   
                U_VN = (float *)calloc(mpU*nqU,sizeof(float)) ;
                if (U_VN==NULL){ printf("error of memory allocation U_VN on proc %dx%d\n",myrow,mycol); exit(0); }
                S_VN = (float *)calloc(min_mn,sizeof(float)) ;
                if (S_VN==NULL){ printf("error of memory allocation S_VN on proc %dx%d\n",myrow,mycol); exit(0); }
                infoVN = driver_psgesvd( 'V', 'N', m, n, A, 1, 1, descA,
                        S_VN, U_VN, 1, 1, descU, VT_VN, 1, 1, descVT,
                        &MPIelapsedVN);
                orthU_VN  = verif_orthogonality(m,min_mn,U_VN , 1, 1, descU);
                res_repres_VN = verif_repres_VN( m, n, A, 1, 1, descA, U_VN, 1, 1, descU, S_VN);
                if (infoVN==min_mn+1) hetereogeneityVN = 'H'; else hetereogeneityVN = 'N';
                if ( iam==0 )
                        printf(" V    N   %6d %6d  %3d  %3d %3d  ||  %3d     %c   %7.1e   %7.1e                        %8.2f    %7.1e\n",
                                m,n,nb,nprow,npcol,infoVN,hetereogeneityVN,res_repres_VN/(S_VN[0]/S_VN[min_mn-1]),
                                orthU_VN,MPIelapsedVN,S_VN[0]/S_VN[min_mn-1]);
                if (infoVN==min_mn+1) nbhetereogeneity++ ;
                else if ((res_repres_VN/eps/(S_VN[0]/S_VN[min_mn-1])>threshold)||(orthU_VN/eps>threshold)||(infoVN!=0)) nbfailure++;
                }
/**/
                if (((jobU=='N')&(jobVT=='V'))||(jobU == 'A' )||(jobVT=='A')){
                nbtestcase++;   
                VT_NV = (float *)calloc(mpVT*nqVT,sizeof(float)) ;
                if (VT_NV==NULL){ printf("error of memory allocation VT_NV on proc %dx%d\n",myrow,mycol); exit(0); }
                S_NV = (float *)calloc(min_mn,sizeof(float)) ;
                if (S_NV==NULL){ printf("error of memory allocation S_NV on proc %dx%d\n",myrow,mycol); exit(0); }
                infoNV = driver_psgesvd( 'N', 'V', m, n, A, 1, 1, descA,
                        S_NV, U_NV, 1, 1, descU, VT_NV, 1, 1, descVT,
                        &MPIelapsedNV);
                orthVT_NV = verif_orthogonality(min_mn,n,VT_NV, 1, 1, descVT);
                res_repres_NV = verif_repres_NV( m, n, A, 1, 1, descA, VT_NV, 1, 1, descVT, S_NV);
                if (infoNV==min_mn+1) hetereogeneityNV = 'H'; else hetereogeneityNV = 'N';
                if ( iam==0 )
                        printf(" N    V   %6d %6d  %3d  %3d %3d  ||  %3d     %c   %7.1e             %7.1e              %8.2f    %7.1e\n",
                                m,n,nb,nprow,npcol,infoNV,hetereogeneityNV,res_repres_NV/(S_NV[0]/S_NV[min_mn-1]),
                                orthVT_NV,MPIelapsedNV,S_NV[0]/S_NV[min_mn-1]);
                if (infoNV==min_mn+1) nbhetereogeneity++ ;
                else if ((res_repres_NV/eps/(S_NV[0]/S_NV[min_mn-1])>threshold)||(orthVT_NV/eps>threshold)||(infoNV!=0)) nbfailure++;
                }
/**/
                if ( ((jobU=='N')&(jobVT=='N')) || ((jobU=='V')&(jobVT=='V')) || (jobU == 'A' ) || (jobVT=='A') ) {
                nbtestcase++;   
                U_VV = (float *)calloc(mpU*nqU,sizeof(float)) ;
                if (U_VV==NULL){ printf("error of memory allocation U_VV on proc %dx%d\n",myrow,mycol); exit(0); }
                VT_VV = (float *)calloc(mpVT*nqVT,sizeof(float)) ;
                if (VT_VV==NULL){ printf("error of memory allocation VT_VV on proc %dx%d\n",myrow,mycol); exit(0); }
                S_VV = (float *)calloc(min_mn,sizeof(float)) ;
                if (S_VV==NULL){ printf("error of memory allocation S_VV on proc %dx%d\n",myrow,mycol); exit(0); }
                infoVV = driver_psgesvd( 'V', 'V', m, n, A, 1, 1, descA,
                        S_VV, U_VV, 1, 1, descU, VT_VV, 1, 1, descVT,
                        &MPIelapsedVV);
                orthU_VV  = verif_orthogonality(m,min_mn,U_VV , 1, 1, descU);
                orthVT_VV = verif_orthogonality(min_mn,n,VT_VV, 1, 1, descVT);
                residF =  verif_representativity( m, n,     A, 1, 1, descA,
                                                         U_VV, 1, 1, descU,
                                                        VT_VV, 1, 1, descVT,
                                                         S_VV);
                if (infoVV==min_mn+1) hetereogeneityVV = 'H'; else hetereogeneityVV = 'N';
                if ( iam==0 )
                        printf(" V    V   %6d %6d  %3d  %3d %3d  ||  %3d     %c   %7.1e   %7.1e   %7.1e              %8.2f    %7.1e\n",
                                m,n,nb,nprow,npcol,infoVV,hetereogeneityVV,residF,orthU_VV,orthVT_VV,MPIelapsedVV,S_VV[0]/S_VV[min_mn-1]);
                if (infoVV==min_mn+1) nbhetereogeneity++ ;
                else if ((residF/eps>threshold)||(orthU_VV/eps>threshold)||(orthVT_VV/eps>threshold)||(infoVV!=0)) nbfailure++;
                }
/**/
                if (((jobU=='N')&(jobVT=='N'))||(jobU == 'A' )||(jobVT=='A')){
                nbtestcase++;   
                S_NN = (float *)calloc(min_mn,sizeof(float)) ;
                if (S_NN==NULL){ printf("error of memory allocation S_NN on proc %dx%d\n",myrow,mycol); exit(0); }
                infoNN = driver_psgesvd( 'N', 'N', m, n, A, 1, 1, descA,
                        S_NN, U_NN, 1, 1, descU, VT_NN, 1, 1, descVT,
                        &MPIelapsedNN);
                S_res_NN = (float *)calloc(min_mn,sizeof(float)) ;
                if (S_res_NN==NULL){ printf("error of memory allocation S on proc %dx%d\n",myrow,mycol); exit(0); }
                scopy_(&min_mn,S_VV,&ione,S_res_NN,&ione);
                saxpy_ (&min_mn,&rtmone,S_NN,&ione,S_res_NN,&ione);
                residS_NN = snrm2_(&min_mn,S_res_NN,&ione) / snrm2_(&min_mn,S_VV,&ione);
                free(S_res_NN);
                if (infoNN==min_mn+1) hetereogeneityNN = 'H'; else hetereogeneityNN = 'N';
                if ( iam==0 )
                        printf(" N    N   %6d %6d  %3d  %3d %3d  ||  %3d     %c                                  %7.1e   %8.2f    %7.1e\n",
                                m,n,nb,nprow,npcol,infoNN,hetereogeneityNN,residS_NN,MPIelapsedNN,S_NN[0]/S_NN[min_mn-1]);
                if (infoNN==min_mn+1) nbhetereogeneity++ ;
                else if ((residS_NN/eps>threshold)||(infoNN!=0)) nbfailure++;
                }
/**/
                if (((jobU=='V')&(jobVT=='N'))||(jobU == 'A' )||(jobVT=='A')){ free(S_VN); free(U_VN); }
                if (((jobU=='N')&(jobVT=='V'))||(jobU == 'A' )||(jobVT=='A')){ free(VT_NV); free(S_NV); }
                if (((jobU=='N')&(jobVT=='N'))||(jobU == 'A' )||(jobVT=='A')){ free(S_NN); }
                if (((jobU=='N')&(jobVT=='N'))||((jobU=='V')&(jobVT=='V'))||(jobU == 'A' )||(jobVT=='A')){ free(U_VV); free(S_VV); free(VT_VV);}
                free(A);
                Cblacs_gridexit( 0 );
        }
/*
*     Print ending messages
*/
        }
        if ( iam==0 ){
                printf("--------------------------------------------------------------------------------------------------------------------\n");
                printf("               [ nbhetereogeneity = %d / %d ]\n",nbhetereogeneity, nbtestcase);
                printf("               [ nbfailure        = %d / %d ]\n",nbfailure, nbtestcase-nbhetereogeneity);
                printf("--------------------------------------------------------------------------------------------------------------------\n");
                printf("\n");
        }
/**/
        free(t_jobU  );
        free(t_jobVT );
        free(t_m     );
        free(t_n     );
        free(t_nb    );
        free(t_nprow );
        free(t_npcol );
        MPI_Finalize();
        exit(0);
}
void pzgecopy_hd( F_CHAR_T TRANS, int *m_in, int *n_in,
    double *A, int *ia_in, int *ja_in, int *descA,
    double *dB, int *ib_in, int *jb_in, int *descB )
{
/*
 Copy m by n distributed submatrix  from host to GPU
 */

int m  = *m_in;
int n  = *n_in;

int ia = *ia_in;
int ja = *ja_in;
int ib = *ib_in;
int jb = *jb_in;

int nprow = 0;
int npcol = 0;
int myprow = 0;
int mypcol = 0;
int mmb = 0;
int nnb = 0;

int istart = 0;
int iend = 0;
int isize = 0;

int Locp = 0;
int Locq = 0;
int lld = 0;

int jstart = 0;
int jend = 0;
int jsize = 0;

int iib = 0;
int jjb = 0;

cuDoubleComplex *dBptr = 0;
double *Btmp = 0;
F_CHAR_T NoTrans = "N";

int descBtmp[DLEN_];
int elmSize = sizeof(cuDoubleComplex);

double z_one[2];
double z_zero[2];

/*
 Tuneable parameters
 */
int mfactor = 4;
int nfactor = 4;


int rsrc = 0;
int csrc = 0;
int irsrc = 0;
int jcsrc = 0;
int info = 0;

int notran = 0;
int TrA = 0;

int lrindx = 0;
int lcindx = 0;
int nrow = 0;
int ncol = 0;
int mm = 0;
int nn = 0;
int iia = 0;
int jja = 0;

cublasStatus cu_status;

z_one[REAL_PART] = 1;
z_one[IMAG_PART] = 0;
z_zero[REAL_PART] = 0;
z_zero[IMAG_PART] = 0;

Cblacs_gridinfo( descA[CTXT_], &nprow, &npcol, &myprow, &mypcol );

notran = ( ( TrA = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN );


/*
 * check arguments
 */
if ((m <= 0) || (n <= 0)) {
   return;
   };

assert( (1 <= ia) && ((ia + m-1) <= descA[M_] ) );
assert( (1 <= ja) && ((ja + n-1) <= descA[N_] ) );
assert( (1 <= ib) && ((ib + m-1) <= descB[M_] ) );
assert( (1 <= jb) && ((jb + n-1) <= descB[N_] ) );

/*
 * Create a temp matrix that is aligned to descB.
 * Assume size is   mmb by nnb
 */

if (notran) {
  mmb = MIN( m, descB[MB_] * nprow * mfactor);
  nnb = MIN( n, descB[NB_] * npcol * nfactor);
  }
else {
  nnb = MIN( m, descB[MB_] * nprow * mfactor);
  mmb = MIN( n, descB[NB_] * npcol * nfactor);
  };
  

mmb = MAX( mmb,1);
nnb = MAX( nnb,1);

rsrc = indxg2p_( &ib, &descB[MB_], &myprow, &descB[RSRC_], &nprow);
csrc = indxg2p_( &jb, &descB[NB_], &mypcol, &descB[CSRC_], &npcol);


Locp = numroc_( &mmb, &descB[MB_], &myprow, &rsrc, &nprow );
Locq = numroc_( &nnb, &descB[NB_], &mypcol, &csrc, &npcol );

Btmp = (double*) malloc( MAX(1,(Locp * Locq))*elmSize );
assert( Btmp != 0 );


lld = MAX(1, Locp);
descinit_( descBtmp, &mmb, &nnb, &descB[MB_], &descB[NB_],
            &rsrc, &csrc, &descB[CTXT_], &lld, &info );

assert( info == 0);



for( jstart=ja; jstart <= ja + n-1; jstart = jend + 1) {
  jend = MIN( ja + n -1, jstart + nnb - 1);
  jsize = jend - jstart + 1;

  for( istart=ia; istart <= ia + m-1; istart = iend + 1) {
     iend = MIN( ia + m-1, istart + mmb -1);
     isize = iend - istart + 1;

     iia = ia + (istart-1);
     jja = ja + (jstart-1);


     iib = 1;
     jjb = 1;
     

     if (notran) {
        mm = isize;
        nn = jsize;
        }
    else {
        mm = jsize;
        nn = isize;
        };


     pzgeadd_( TRAN,  &mm, &nn,   z_one, A, &iia, &jja, descA,  
                         z_zero, Btmp, &iib, &jjb, descBtmp );

     /* 
      * find local extent
      */

     
     if (notran) {
       iib = ib + (istart-1);
       jjb = jb + (jstart-1);
       }
     else {
       iib = ib + (jstart-1);
       jjb = jb + (istart-1);
       };

     if (notran) {
       nrow = numroc_( &isize, &descB[MB_], &myprow, &rsrc, &nprow);
       ncol = numroc_( &jsize, &descB[NB_], &mypcol, &csrc, &npcol);
       }
     else {
       nrow = numroc_( &jsize, &descB[MB_], &myprow, &rsrc, &nprow);
       ncol = numroc_( &isize, &descB[NB_], &mypcol, &csrc, &npcol);

     };

     /*
      Perform global
      dB( iib:(iib+isize-1), jjb:(jjb+jsize-1)) <- B(1:isize,1:jsize)

      Perform local
      dB( lrindx:(lrindx+nrow-1), lcindx:(lcindx+ncol-1)) <-
              B(1:nrow, 1:ncol)
      */


     infog2l_( &iib, &jjb, descB, &nprow, &npcol, &myprow, &mypcol,
               &lrindx, &lcindx,  &irsrc, &jcsrc );


     dBptr = (cuDoubleComplex *) dB;
     dBptr = dBptr + INDX2F( lrindx,lcindx, descB[LLD_]);


     cu_status = cublasSetMatrix( nrow,ncol,elmSize,
               (cuDoubleComplex *) Btmp,  descBtmp[LLD_],
                                   dBptr, descB[LLD_] );
     assert( cu_status == CUBLAS_STATUS_SUCCESS );

     };
  };

  free( Btmp );

  return;
}
void pdcopymatrix(double *  	A, int   	ia, int   	ja, int *  	descA)
{
    int myid, numproc, i, icontxt;;
    int Locp, Locq, lld, info;
    int mb,nb, ic_proc, jc_proc;
    int ctxt, nprow,npcol,myprow,mypcol;
    char scope,top;
    int ldAtmp;
    int isizedAtmp;
    double *dAtmp, *hAtmp, *hcopydAtmp;
    int cuerr,hosterr;
    const int elementSize = sizeof(double);
    const int nelements = 100;

    MPI_Comm comm;
       
    comm=MPI_COMM_WORLD;
    icontxt = descA[CTXT_];
    Cblacs_gridinfo( icontxt, &nprow,&npcol,&myprow,&mypcol);
    assert( nprow >= 1);
    assert( npcol >= 1);
    assert( (0 <= myprow) && (myprow < nprow));
    assert( (0 <= mypcol) && (mypcol < npcol));

/**************  Set up device matrix *****************/
    jc_proc = 0;
    isizedAtmp = nelements ;
    isizedAtmp *= elementSize;

    cuerr = cudaMalloc((void**) &dAtmp, isizedAtmp);
    hAtmp = (double*)malloc(isizedAtmp);
    assert( dAtmp !=0 );

/**  assign value to the matrix on device */
    if ( mypcol == jc_proc ) {
      for (i=0;i<nelements;i++) {
        hAtmp[i]=1.02;
      }
    } else {
      for (i=0;i<nelements;i++) {
        hAtmp[i]=2.0;
      }
    }
    for (i=0;i<nelements;i++) {
        hcopydAtmp[i]=0.001;
    }
    cudaMemcpy (dAtmp, hAtmp, isizedAtmp, cudaMemcpyHostToDevice);
    free (hAtmp);

//    printf("after cudaMemset,isizedAtmp=%d\n",isizedAtmp);

/*****************Set up device matrix *****************/          
            scope = 'R';
            top =' ';
//           Locp = isizedAtmp;
            Locp = nelements;
            Locq = 1;
            lld = Locp;

            printf("before dgebs/r2d: icontxt=%d,Locp=%d,Locq=%d,lld=%d,\n",
            icontxt,Locp,Locq,lld);

//            MPI_Bcast(dAtmp, nelements, MPI_DOUBLE, 0, comm); 


            if (mypcol == jc_proc) {
                scalapack_dgebs2d(&icontxt, &scope, &top,
                                  &Locp, &Locq, dAtmp, &lld );
            } else {
                scalapack_dgebr2d(&icontxt,&scope,&top,
                                  &Locp,&Locq, dAtmp, &lld,
                                  &myprow, &jc_proc );
            };


     cudaMemcpy (hcopydAtmp, dAtmp, isizedAtmp, cudaMemcpyDeviceToHost);
     if (mypcol == 1 && myprow == 1) {
     printf( "mypcol %d, myprow %d \n", mypcol, myprow);
            for (int i=0; i<nelements; i=i+4){
              printf( "%f, %f, %f, %f \n", 
//                      dAtmp[i],dAtmp[i+1],dAtmp[i+2],dAtmp[i+3]);
                      hcopydAtmp[i],hcopydAtmp[i+1],hcopydAtmp[i+2],hcopydAtmp[i+3]);
            }
     }
    cudaFree(dAtmp);
}
void Cpsgecopy_general_async(int m, int n, 
        void *A, int ia, int ja, int *descA,
        void *B, int ib, int jb, int *descB, int is_device_to_host)
{
#define dA(i,j)  (((float*)A) + IDX2F(i,j,descA[LLD_]))
#define dT(i,j) (((float *)T) + IDX2F(i,j,descT[LLD_]))

#define dB(i,j)  (((float *)B) + IDX2F(i,j,descB[LLD_]))
/*
  perform    copy

  B( ib:(ib+m-1), jb:(jb+n-1)) <-  A( ia:(ia+m-1),ja:(ja+n-1))

 */

const int use_MallocHost = FALSE;
const int use_igsum2d = FALSE;

cublasStatus cu_status;

cudaError_t cuda_status;

char notrans[] = "NoTrans";

int descT[DLEN_];

int ldA,ldB,ldT;

int is_same_context, is_same_mb, is_same_nb;
int is_same_p, is_same_q;
int is_same_offset;
int is_same_Locp, is_same_Locq;
int is_aligned;

int lrA1,lcA1, lrA2,lcA2;
int lrT1,lcT1, lrT2,lcT2;
int lrB1,lcB1, lrB2,lcB2;
int rsrc,csrc;
int rsrcA1,csrcA1,  rsrcA2, csrcA2;
int rsrcB1,csrcB1,  rsrcB2, csrcB2;
int iia,jja, iib,jjb;
int icontxt, nprow,npcol, myprow,mypcol;
int LocpA,LocqA,  LocpB,LocqB, LocpT,LocqT;
int mm,nn, lmm,lnn;
size_t nbytes;

float one_[REAL_PART+IMAG_PART+1];
float *one = &(one_[0]);

float zero_[REAL_PART+IMAG_PART+1];
float *zero = &(zero_[0]);


float alpha_[REAL_PART+IMAG_PART+1];
float *alpha = &(alpha_[0]);

float beta_[REAL_PART+IMAG_PART+1];
float *beta = &(beta_[0]);

int isize, isizeT;
float *T = 0;

int elemSize = sizeof(float);
int nnb, jstart,jend,jsize;
int is_ok;

int nmax;
const int bufsize =  1024*1024;
const int use_simple = FALSE;;

one[REAL_PART] = 1.0;
one[IMAG_PART] = 0.0;
zero[REAL_PART] = 0.0;
zero[IMAG_PART] = 0.0;

 if ((m <= 0) || (n <= 0)) {
   return;
 };

 T = 0; 

 ldA = descA[LLD_];
 ldB = descB[LLD_];

  icontxt = descA[CTXT_];
  Cblacs_gridinfo( icontxt, &nprow,&npcol, &myprow, &mypcol);
  assert( nprow >= 1);
  assert( npcol >= 1);
  assert( (0 <= myprow) && (myprow < nprow));
  assert( (0 <= mypcol) && (mypcol < npcol));

  is_ok = (1 <= ia) && (ia + m-1 <= descA[M_]);
  if (!is_ok) {
    printf("Cpsgecopy (%d,%d) :ia %d m %d descA[M_] %d  \n",
            myprow,mypcol,     ia,   m,   descA[M_] );
    printf("Cpsgecopy (%d,%d) :ja %d n %d descA[N_] %d \n",
            myprow,mypcol,     ja,   n,   descA[N_] );
    printf("Cpsgecopy (%d,%d) :ib %d jb %d descB[M_] %d descB[N_] %d\n",
            myprow,mypcol,     ib,   jb,   descB[M_],   descB[N_] );
  };
  assert( (1 <= ia) && (ia + m-1 <= descA[M_]));
  assert( (1 <= ja) && (ja + n-1 <= descA[N_]));
  assert( (1 <= ib) && (ib + m-1 <= descB[M_]));
  assert( (1 <= jb) && (jb + n-1 <= descB[N_]));


  is_same_context = (descA[CTXT_] == descB[CTXT_]);
  is_same_mb = (descA[MB_] == descB[MB_]);
  is_same_nb = (descA[NB_] == descB[NB_]);

  is_same_p = (Cindxg2p(ia,descA[MB_], myprow, descA[RSRC_],nprow) ==
               Cindxg2p(ib,descB[MB_], myprow, descB[RSRC_],nprow) );

  is_same_q = (Cindxg2p(ja,descA[NB_], mypcol, descA[CSRC_],npcol) ==
               Cindxg2p(jb,descB[NB_], mypcol, descB[CSRC_],npcol) );

  is_same_offset = (MOD(ia,descA[MB_]) == MOD(ib,descB[MB_])) &&
                   (MOD(ja,descA[NB_]) == MOD(jb,descB[NB_]));


  local_extent( m,n, ia,ja,descA, &LocpA,&LocqA, &lrA1,&lcA1, &lrA2,&lcA2 );


  local_extent( m,n, ib,jb,descB, &LocpB,&LocqB,&lrB1,&lcB1, &lrB2,&lcB2 );



  /*
  if ((LocpA >= 1) || (LocpB >= 1)) {
     is_same_Locp = (LocpA == LocpB);
  };
  if ((LocqA >= 1) || (LocqB >= 1)) {
     is_same_Locq = (LocqA == LocqB);
  };
  */

  is_same_Locq = (LocqA == LocqB);

  is_same_Locp = (LocpA == LocpB);

  is_aligned = is_same_context &&
               is_same_mb && is_same_nb &&
               is_same_p && is_same_q &&
               is_same_offset &&
               is_same_Locp && is_same_Locq;

  assert( is_same_q );

  assert( is_same_p );

  assert( is_same_offset );

  assert( is_same_Locp );

  assert( is_same_Locq );

  assert( is_aligned );


        
       /*
        no communication required
        copy from device to host
        */

       ldA = descA[LLD_];
       ldB = descB[LLD_];

       mm = LocpA;
       nn = LocqA;

       if (is_device_to_host) {
         /* 
          * transfer from device to host
          */
         if ( (mm >= 1) && (nn >= 1) ) {
#ifdef USE_CUBLASV2
           {
             cublasStatus_t istatus;
             istatus = cublasGetMatrixAsync(mm, nn, elemSize, 
                 (void *) dA(lrA1,lcA1), ldA, (void *) dB(lrB1,lcB1), ldB,
                 cublas_get_stream() );
             assert( istatus == CUBLAS_STATUS_SUCCESS );
           }
#else
           cu_status = cublasGetMatrix(mm,nn, elemSize,
               (void *) dA(lrA1,lcA1), ldA,  (void *) dB(lrB1,lcB1),ldB );
            CHKERR(cu_status);
#endif
           };
         }
       else {
         /* 
          * transfer from host to device
          */
         if ( (mm >= 1) && (nn >= 1) ) {
#ifdef USE_CUBLASV2
           {
             cublasStatus_t istatus;

             istatus = cublasSetMatrixAsync(mm,nn,elemSize,
               (void *) dA(lrA1,lcA1), ldA,  (void *) dB(lrB1,lcB1),ldB,
               cublas_get_stream()   );

             assert( istatus == CUBLAS_STATUS_SUCCESS );

           }
#else
            cu_status = cublasSetMatrix(mm,nn,elemSize,
               (void *) dA(lrA1,lcA1), ldA,  (void *) dB(lrB1,lcB1),ldB );
            CHKERR(cu_status);
#endif
           };
         };
               



  return;
}
void psgetrf_gpu(int *m_in, int *n_in, 
   float *A, int *ia_in, int *ja_in, int *descA, 
   int *ipiv_, int *info)
{

int m = *m_in;
int n = *n_in;
int ia = *ia_in;
int ja = *ja_in;

const int use_setup_desc = TRUE;
const int idebug = 0;
int use_replicated_storage = FALSE;
const int use_broadcast_triangular_matrix = TRUE;

int ia_proc, ja_proc;
int lrindx, lcindx, rsrc,csrc, irsrc,icsrc;
int ictxt, nprow,npcol, myprow,mypcol;
int is_root;

int minmn;
int k1,k2,incx,ip;
int mm, nn, kk, ii, jj, mtmp;
int mm_lu,nn_lu,ia_lu,ja_lu;

int elemSize = sizeof( float );
size_t nbytes;

int nnb, jstart,jend,jsize, isize, jb;
int icontxt, isizeAtmp;


int i,j, iia,jja, ldA, ldhA;
int iinfo = 0;
int iAtmp, jAtmp, iha,jha, iib,jjb,iic,jjc;
int ldAtmp, ldBtmp, lmm,lnn;
int lrA1,lcA1, lrA2,lcA2;

int desc_hA_[DLEN_];
int *desc_hA = &(desc_hA_[0]);

int *ipiv_hA_ = 0;
float *hA = 0;
float *Atmp = 0;
float *dAtmp = 0;

int *gipiv_ = 0;
int desc_Atmp_[DLEN_];
int *desc_Atmp = &(desc_Atmp_[0]);
cublasStatus cu_status;

int isok;
int use_delayed_left_interchange = 1;

int is_mine;
int i1,j1,inc1,  i2,j2,inc2;
int desc_ipiv_hA_[DLEN_];
int *desc_ipiv_hA = &(desc_ipiv_hA_[0]);

int desc_ipiv_[DLEN_];
int *desc_ipiv = &(desc_ipiv_[0]);

int desc_gipiv_[DLEN_];
int *desc_gipiv = &(desc_gipiv_[0]);
int mb,nb, Locp, Locq, lld;


char direc = 'F';
char rowcol = 'R';

char left[] = "Left";
char lower[] = "Lower";
char notrans[] = "NoTrans";
char unit[] = "Unit";

char *side = left;
char *uplo = lower;
char *trans = notrans;
char *diag = unit;

float zero_[REAL_PART+IMAG_PART+1];
float *zero = &(zero_[0]);

float one_[REAL_PART+IMAG_PART+1];
float *one = &(one_[0]);

float neg_one_[REAL_PART+IMAG_PART+1];
float *neg_one = &(neg_one_[0]);

float beta_[REAL_PART+IMAG_PART+1];
float *beta = &(beta_[0]);

float alpha_[REAL_PART+IMAG_PART+1];
float *alpha = &(alpha_[0]);
/*
 * A is a pointer to GPU device memory but conceptually associated
 * with a scalapack distributed matrix 

 * A is array of complex numbers
 */



*info = 0;

zero[REAL_PART] = 0.0;
zero[IMAG_PART] = 0.0;
one[REAL_PART] = 1.0;
one[IMAG_PART] = 0.0;
neg_one[REAL_PART] = -1.0;
neg_one[IMAG_PART] = 0.0;


/*
 * setup copy of distributed matrix on CPU host
 */

hA = 0;
Atmp = 0;

ictxt = descA[CTXT_];
icontxt = ictxt;

Cblacs_gridinfo( ictxt, &nprow, &npcol,  &myprow, &mypcol );
is_root = (myprow == 0) && (mypcol == 0);
if ((idebug >= 1) && (is_root)) {
  printf("pcgetrf_gpu: m %d n %d ia %d ja %d \n",
      m,n,   ia,ja );
};


ia_proc = Cindxg2p( ia, descA[MB_], myprow, descA[RSRC_], nprow);
ja_proc = Cindxg2p( ja, descA[NB_], mypcol, descA[CSRC_], npcol);


/*
 * setup global pivot vector
 */
lld = MIN(m,n) + descA[MB_];
nbytes = lld;
nbytes *= sizeof(int);
if (gipiv_ != 0) {
  free(gipiv_); gipiv_ = 0;
};
gipiv_ = (int *) malloc( nbytes );
assert( gipiv_ != 0 );


desc_gipiv[DTYPE_] = descA[DTYPE_];
desc_gipiv[CTXT_] = descA[CTXT_];
desc_gipiv[M_] = MIN(m,n);
desc_gipiv[N_] = 1;
desc_gipiv[MB_] = desc_gipiv[M_];
desc_gipiv[NB_] = desc_gipiv[N_];
desc_gipiv[LLD_] = lld;

desc_gipiv[RSRC_] = -1;
desc_gipiv[CSRC_] = -1;

  /*
   * setup distribute array hA on host
   */

/*
 * Note, optimal block size on GPU might not be
 * optimal block size on CPU, but assume to be
 * the same for simplicity for now
 */

/*
 * should nnb = descA[NB_] * npcol  ?
 */
nnb = descA[NB_];

minmn = MIN(m,n);
for( jstart=1; jstart <= minmn; jstart = jend + 1) {
  jend = MIN( minmn, jstart + nnb - 1);
  jsize = jend - jstart + 1;

  /*
   * setup matrix on host
   */

  /*
  was iia = (ia-1) + 1;
  */
  j = jstart;
  jb = jsize;

  iia = (ia-1) + jstart;
  jja = (ja-1) + jstart;
  mm = m - jstart + 1;
  nn = jsize;

  if (use_setup_desc) {
    setup_desc( mm,nn, iia,jja,descA, &isize, desc_hA );
    }
  else {
    irsrc = Cindxg2p( iia, descA[MB_], myprow, descA[RSRC_], nprow );
    icsrc = Cindxg2p( jja, descA[NB_], mypcol, descA[CSRC_], npcol );
  
    mb = descA[MB_];
    nb = descA[NB_];
    Locp = Cnumroc( mm, mb, 0,0,nprow );
    Locq = Cnumroc( nn, nb, 0,0,npcol );
    lld = MAX(1,Locp);
    isize = MAX(1,Locp) * MAX(1, Locq );
  
    ictxt = descA[CTXT_];
    iinfo = 0;
    Cdescinit( desc_hA, mm,nn,  mb,nb,  irsrc,icsrc, ictxt, lld, &iinfo);
    assert( iinfo == 0);
    };


  nbytes = isize;
  nbytes *= elemSize;
  if (hA != 0) { 
    free(hA); hA = 0;
  };
  hA = (float *) malloc( nbytes );
  assert( hA != 0 );

  /*
   * distribution of pivot vector is tied to distribution of matrix
   */
  Locp = Cnumroc( desc_hA[M_], desc_hA[MB_], myprow, desc_hA[RSRC_], nprow);
  lld = Locp + desc_hA[MB_];
  nbytes = lld;
  nbytes *= sizeof(int);
  if (ipiv_hA_ != 0) {
    free( ipiv_hA_ ); ipiv_hA_ = 0;
  };
  ipiv_hA_ = (int *) malloc( nbytes );
  assert( ipiv_hA_ != 0);

  Cdescset( desc_ipiv_hA, desc_hA[M_],  1,
              desc_hA[MB_], 1,
              desc_hA[RSRC_], icsrc,
              desc_hA[CTXT_], 
              lld );





  /*
   copy column panel back to CPU host
   to be factored using scalapack
   */ 

  jb = jsize;
  j = jstart;
  mm = m  - j + 1;
  nn = jb;



  /*
    hA(1:mm,1:nn) <-  dA(j:(j+mm-1), j:(j+nn-1) )
   */


  iia = (ia-1) + j;
  jja = (ja-1) + j;
  ii = 1;
  jj = 1;

  PROFSTART("gpu:hA <- dA");
  Cpsgecopy_d2h( mm,nn, A,iia,jja,descA,  hA, ii,jj, desc_hA );
  PROFEND("gpu:hA <- dA");



  /*
   * factor on host CPU using ScaLAPACK
   * Note the pivot vector is tied to the distribution of the matrix
   * Therefore, we need a different "ipiv_hA" pivot vector
   * that is tied the the distributed matrix hA
   */

  ii = 1;
  jj = 1;
  iinfo = 0;
  mm_lu = mm;
  nn_lu = nn;
  ia_lu = ii;
  ja_lu = jj;

  PROFSTART("gpu:psgetrf");
  scalapack_psgetrf( &mm_lu, &nn_lu, 
        hA, &ia_lu, &ja_lu,  desc_hA, &(ipiv_hA(1)), &iinfo );
  PROFEND("gpu:psgetrf");

  /*
   * broadcast pivot vector to global vector
   */



  i1 = 1;
  j1 = 1;
  inc1 = 1;

  i2 = jstart;
  j2 = 1;
  inc2 = 1;
  mtmp = MIN(mm,nn);
  desc_ipiv_hA[CSRC_] = icsrc;

  use_replicated_storage = FALSE;
  if (use_replicated_storage) {
    int ja_lu_proc;

    ja_lu_proc =   Cindxg2p(ja_lu,desc_hA[NB_],
        mypcol,desc_hA[CSRC_],npcol);

    desc_ipiv_hA[CSRC_] =  ja_lu_proc;

    desc_gipiv[RSRC_] = -1;
    desc_gipiv[CSRC_] = -1;
    scalapack_picopy( &mtmp, &(ipiv_hA(1)), &i1,&j1, desc_ipiv_hA, &inc1,
                        &(gipiv(1)), &i2,&j2, desc_gipiv, &inc2 );
    }
  else {
    /*
     * copy to 1 processors (rsrc,csrc), then
     * broadcast to all processors
     */
        int icontxt = desc_ipiv_hA[CTXT_];
        char scope = 'A'; 
        char top = ' ';
        int ntmp = 1;
        int lld; 

        int ia_lu_proc,ja_lu_proc;
        int rsrc, csrc;

        ia_lu_proc = Cindxg2p( ia_lu, desc_hA[MB_],
               myprow,desc_hA[RSRC_],nprow);
        ja_lu_proc = Cindxg2p( ja_lu, desc_hA[NB_],
               mypcol,desc_hA[CSRC_],npcol);

        rsrc = ia_lu_proc;
        csrc = ja_lu_proc;

        desc_gipiv[RSRC_] = rsrc;
        desc_gipiv[CSRC_] = csrc;
        desc_ipiv_hA[CSRC_] = csrc;

        mtmp = MIN( mm_lu, nn_lu);
        scalapack_picopy( &mtmp, &(ipiv_hA(1)), &i1,&j1,desc_ipiv_hA,&inc1,
                  &(gipiv(1)), &i2,&j2, desc_gipiv, &inc2 );

    if ((myprow == rsrc) && (mypcol == csrc)) {

        lld = mtmp;
        ntmp = 1;
        scalapack_igebs2d( &icontxt, &scope, &top,
            &mtmp, &ntmp, &(gipiv(i2)), &lld );
        }
    else {
      lld = mtmp;
      ntmp = 1;
      scalapack_igebr2d( &icontxt, &scope, &top,
            &mtmp, &ntmp, &(gipiv(i2)), &lld, 
            &rsrc,&csrc );
    };
  };

  if (idebug >= 1) {
    int desctmp[DLEN_];
    char name_ipiv_hA[] = "ipiv_hA";
    char name_gipiv[] = "gipiv";

    if (is_root) {
    printf("jstart %d jend %d \n", jstart,jend);
    printf("mm_lu %d nn_lu %d ia_lu %d ja_lu %d\n",
            mm_lu,   nn_lu,   ia_lu,   ja_lu );
    };

    Cdescset(desctmp, desc_hA[M_], npcol,
        desc_hA[MB_],1,
        desc_hA[RSRC_], desc_hA[CSRC_],
        desc_hA[CTXT_], desc_hA[LLD_] );

    Cpilaprnt( MIN(mm_lu,nn_lu), npcol, &(ipiv_hA(1)), 1,1,desctmp, name_ipiv_hA);

    Cdescset(desctmp, minmn*nprow, npcol,
        minmn, 1,    0,0,
        descA[CTXT_], minmn );
    Cpilaprnt( nprow*minmn, npcol, &(gipiv(1)),1,1,desctmp, name_gipiv);
  };


  /*
   * adjust pivot sequence from 1:min(mm,nn) in ipiv to 
   * jstart:(jstart+min(mm,nn)-1)
   */
    for(int i=1; i <= MIN(mm,nn); i++) {
      i2 = (jstart-1) + i;
      gipiv(i2) = gipiv(i2) + (jstart-1);
    };


  if (iinfo < 0) {
     *info = iinfo;
     return;
     };

  if ((*info == 0) && (iinfo > 0)) {
      *info = iinfo + (j-1);
      return;
      };


  /*
   * transfer factored panel back to GPU device
   */

  iia = (ia-1) + j;
  jja = (ja-1) + j;
  ii = 1;
  jj = 1;
  PROFSTART("gpu:A <- hA");
  Cpsgecopy_h2d(mm,nn, hA, ii,jj, desc_hA,
                       A, iia,jja, descA );
  PROFEND("gpu:A <- hA");





  if (use_delayed_left_interchange) {
    /*
     * do nothing for now
     */
    }
  else {
    /* 
     * apply interchanges to columns 1:(j-1)
     */

    nn = j-1;
    k1 = j;
    k2 = j + jb-1;
    incx = 1;


    PROFSTART("gpu:left swap");
    if (nn >= 1) {
         iia = (ia-1) + 1;
         jja = (ja-1) + 1;
         for(kk=k1; kk <= k2; kk++) {
           ip = gipiv(  kk);
           assert(ip >= kk );
           assert( ip <= m );

           if (kk != ip) {
               inc1 = descA[M_];
               inc2 = descA[M_];
               i1 = (iia-1) + kk;
               i2 = (iia-1) + ip;
               j1 = jja;
               j2 = jja;
               Cpsswap_gpu(nn, A,i1,j1,descA,inc1,
                               A,i2,j2,descA,inc2 );
                };
         };
      };
    PROFEND("gpu:left swap");
    };




  /*
   * apply interchanges to columns (j+jb):n
   */

   nn = n - (jend + 1) + 1;
   k1 = j;
   k2 = j + jb - 1;
   incx = 1;



   PROFSTART("gpu:right swap");
   if (nn >= 1) {
      iia = (ia-1) + 1;
      jja = (ja-1) + (jend+1);
      for(kk=k1; kk <= k2; kk++) {
        ip = gipiv(  kk );
        assert( ip >= kk );
        assert( ip <= m );

        if (ip != kk) {
           i1 = (iia-1) + kk;
           i2 = (iia-1) + ip;
           j1 = jja;
           j2 = jja;
           inc1 = descA[M_];
           inc2 = descA[M_];
           Cpsswap_gpu( nn, A, i1,j1, descA, inc1,
                            A, i2,j2, descA, inc2 );
        };
      };
   };
   PROFEND("gpu:right swap");


   PROFSTART("gpu:pTRSM");


   mm = jb;
   nn = n - (jend+1) + 1;
   if ( (1 <= mm) && (1 <= nn)) {
               /*
               cublasCtrsm('L','L','N','U', mm,nn,
                  alpha, dA(j,j), lddA, dA(j,j+jb), lddA );
               */

     if (use_broadcast_triangular_matrix) {
       /*
        * broadcast triangular part, then solve locally
        */
         char lscope = 'A';
         char ltop = ' ';
         int  msize, nsize, lr1,lc1, lr2,lc2;
         int ia_lu_proc, ja_lu_proc;

       /*
        * copy on local processor
        */

         ia_lu_proc = Cindxg2p(ia_lu, desc_hA[MB_], myprow,
                         desc_hA[RSRC_], nprow );
         ja_lu_proc = Cindxg2p(ja_lu, desc_hA[NB_], mypcol,
                         desc_hA[CSRC_], npcol );

       /*
        * complete mm by mm block on Atmp
        */
       ldAtmp = MAX(1,mm);
       Cdescset(desc_Atmp, mm,mm, mm,mm, 
           ia_lu_proc,ja_lu_proc, icontxt, ldAtmp);
       isizeAtmp = ldAtmp * MAX(1,mm);
       nbytes = isizeAtmp;
       nbytes *= elemSize;

       if (Atmp != 0) { free(Atmp); Atmp = 0; };
       Atmp = (float *) malloc( nbytes );
       assert( Atmp != 0);

#ifdef USE_CUBLASV2
       {
         cudaError_t ierr;
         size_t isize = isizeAtmp;
         isize *= elemSize;

         ierr = cudaMalloc( (void **) &dAtmp, isize );
         assert(ierr == cudaSuccess );
       }
#else
       cu_status = cublasAlloc(isizeAtmp, elemSize, (void **) &dAtmp );
       CHKERR(cu_status);
       assert( dAtmp != 0);
#endif

       ii = 1;
       jj = 1;
       scalapack_psgeadd( notrans, &mm, &mm, 
           one,   hA, &ia_lu, &ja_lu, desc_hA,
           zero,  Atmp, &ii, &jj, desc_Atmp );
                 
       rsrc = desc_Atmp[RSRC_];
       csrc = desc_Atmp[CSRC_];
       if ((myprow == rsrc) && (mypcol == csrc)) {
          scalapack_cgebs2d( &icontxt, &lscope, &ltop,   
              &mm, &mm,  Atmp, &ldAtmp );
          }
       else {
         scalapack_cgebr2d( &icontxt, &lscope, &ltop,
              &mm, &mm, Atmp, &ldAtmp,   &rsrc, &csrc );
       };

       inc1 = 1;
       inc2 = 1;
       cu_status = cublasSetVector(isizeAtmp, elemSize, Atmp, inc1, dAtmp, inc2 );
       CHKERR(cu_status);

       /*
        * perform local solve on GPU
        */
       iia = (ia-1) + j;
       jja = (ja-1) + (j+jb);
       local_extent( mm,nn, iia,jja,descA,  
                    &msize,&nsize, &lr1,&lc1, &lr2,&lc2 );
       if (msize >= 1) {
         assert( msize == mm );
       };

       if ((msize >= 1) && (nsize >= 1)) {
         char lside = 'L';
         char luplo = 'L';
         char ltrans = 'N';
         char ldiag = 'U';

         float zalpha;


         zalpha = (float)1.0;//make_float(1.0,0.0);

         CUBLAS_STRSM( 
             ((lside == 'l')||(lside == 'L')) ?
                CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT, 
             ((luplo == 'l')||(luplo == 'L')) ? 
                CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER,
             ((ltrans == 'c')||(ltrans == 'C')) ?
               CUBLAS_OP_C :
                 ((ltrans == 't')||(ltrans == 'T')) ?
                    CUBLAS_OP_T : CUBLAS_OP_N, 
             ((ldiag == 'u')||(ldiag == 'U')) ?
                CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT,
              mm, nsize, zalpha,
              (float *) dAtmp, ldAtmp,
              dA(lr1,lc1), descA[LLD_] );

       };



       if (Atmp != 0) {
         free(Atmp); Atmp = 0;
       };

#ifdef USE_CUBLASV2
       {
         cudaError_t ierr;
         ierr = cudaFree( (void *) dAtmp );
         assert(ierr == cudaSuccess );
         dAtmp  = 0;
       }
#else
       cu_status = cublasFree( dAtmp );
       CHKERR(cu_status );
#endif


     }
     else {
         /*
          * perform triangular solve using scalapack
          */
         iia = (ia-1) + j;
         jja = (ja-1) + (j+jb);
        setup_desc(mm,nn,iia,jja,descA,  &isize, desc_Atmp );

        nbytes = elemSize;
        nbytes *= isize;
        if (Atmp != 0) {
          free(Atmp); Atmp = 0;
        };
        Atmp = (float *) malloc( nbytes );
        assert( Atmp != 0 );



         /*
          * copy to Atmp(1:mm,1:nn) <- dA(j:(j+mm-1),(j+jb):((j+jb)+nn-1))
          */


         ii = 1; jj = 1;
         PROFSTART("gpu:Atmp <- dA");
         Cpsgecopy_d2h( mm,nn,A,iia,jja,descA,
                           Atmp, ii,jj, desc_Atmp );
         PROFEND("gpu:Atmp <- dA");



         /*
          * perform triangular solve using scalapack
          */

          side = left;
          uplo = lower;
          trans = notrans;
          diag = unit;

          alpha = one;

          iha = 1; 
          jha = 1;
          ii = 1; 
          jj = 1;

          PROFSTART("gpu:pstrsm")
          scalapack_pstrsm( side, uplo, trans, diag, 
              &mm,&nn, alpha,    
              hA, &iha,&jha, desc_hA,
              Atmp,&ii,&jj,  desc_Atmp );
          PROFEND("gpu:pstrsm")
          

          /*
           * copy back to GPU
           */

          iia = (ia-1) + j;
          jja = (ja-1) + (j+jb);
          ii = 1; 
          jj = 1;

          PROFSTART("gpu:A <- Atmp");
          Cpsgecopy_h2d( mm,nn, Atmp,ii,jj,desc_Atmp,
                             A, iia,jja, descA );
          PROFEND("gpu:A <- Atmp");
     };
                           



     };
   PROFEND("gpu:pTRSM");


    /*
     * update trailing submatrix
     */


	alpha = neg_one;
	beta = one;
	mm = m-(jend+1) + 1;
	nn = n-(jend+1) + 1;
	kk = jb;

 
      if ((1 <= mm) && (1 <= nn) && (1 <= kk)) {
        
        /*
	 cublasSgemm('N','N',mm,nn,kk,
            alpha, dA(j+jb,j),lddA, dA(j,j+jb),lddA,
            beta, dA(j+jb,j+jb), lddA );
         */

        if (use_broadcast_triangular_matrix) {
          /*
           * Copy from GPU to Atmp
           */
          iia = (ia-1) + j;
          jja = (ja-1) + (j+jb);

          setup_desc( kk,nn, iia,jja, descA, &isizeAtmp, desc_Atmp);
          nbytes = isizeAtmp;
          nbytes *= elemSize;
          if (Atmp != 0) { free(Atmp); Atmp = 0; };
          Atmp = (float *) malloc( nbytes );
          assert( Atmp != 0);

          PROFSTART("gpu:Atmp <- A");
          Cpsgecopy_d2h( kk,nn, A,iia,jja,descA, 
                                Atmp,1,1,desc_Atmp );
          PROFEND("gpu:Atmp <- A");
        };


        iic = (ia-1) + (jend+1);
        jjc = (ja-1) + (jend+1);


       iha = jsize+1;
       jha = 1;
       iAtmp = 1; 
       jAtmp = 1;
     

          {
          char transA = 'N';
          char transB = 'N';

          PROFSTART("zgetrf_gpu:psgemm");
          Cpsgemm_hhd( transA, transB, mm,nn,kk, 
           alpha, hA, iha,jha, desc_hA, 
                  Atmp, iAtmp,jAtmp, desc_Atmp, 
           beta,  A, iic,jjc, descA );

          PROFEND("zgetrf_gpu:psgemm");
           };
       };



    if (Atmp != 0) {
       free(Atmp); Atmp = 0;
       };

    if (ipiv_hA_ != 0) {
       free( ipiv_hA_ ); ipiv_hA_ = 0;
       };
    if (hA != 0) {
      free(hA); hA = 0;
      };

   }; /* for (jstart) */


   if (use_delayed_left_interchange) {

     PROFSTART("gpu:dleft swap");
    for(j=1; j <= minmn; j = jend + 1) {
        jend = MIN( minmn, j+nnb-1);
        jsize = jend - j + 1;
        jb = jsize;
        /*
         * apply interchanges to columns 1:(j-1)
         */
   
        nn = j-1;
        k1 = j;
        k2 = j+jb-1;
        incx = 1;
   
   
        if (nn >= 1) {
         iia = (ia-1) + 1; 
         jja = (ja-1) + 1;
         for(kk=k1; kk <= k2; kk++) {
             ip = gipiv(kk);
             assert( ip >= kk );

             if (ip != kk) {
               inc1 = descA[M_];
               inc2 = descA[M_];
               i1 = (iia-1) + kk;
               i2 = (iia-1) + ip;
               j1 = jja;
               j2 = jja;
               Cpsswap_gpu(nn, A, i1,j1,descA, inc1, 
                               A, i2,j2,descA, inc2 );
             };
         };
        };
     }; /* end for j */
     PROFEND("gpu:dleft swap");
   }; /* end if use delayed left interchange */


   /*
    * adjust global pivot from 1:MIN(m,n) to ia:(ia + MIN(m,n)-1)
    * copy global vector back to distributed pivot vector
    */

   for(int j=1; j <= minmn; j++) {
     gipiv(j) = (ia-1) + gipiv(j);
   };


   lld = descA[MB_] + 
         Cnumroc( descA[M_], descA[MB_], myprow, descA[RSRC_], nprow);

   Cdescset( desc_ipiv, 
              descA[M_],1, 
              descA[MB_], 1, 
              descA[RSRC_], -1, descA[CTXT_], lld );

   i1 = 1; j1 = 1; inc1 = 1;
   i2 = ia; j2 = 1; inc2 = 1;
   mtmp = MIN(m,n);

   PROFSTART("gpu:ipiv");
   use_replicated_storage = FALSE;
   if (use_replicated_storage) {
     int msize,nsize,lr1,lc1,lr2,lc2, lrindx,iia;

     local_extent(MIN(m,n),n,ia,ja,descA, &msize,&nsize, &lr1,&lc1, &lr2,&lc2);
     if (msize >= 1) {
       for(lrindx=lr1; lrindx <= lr2; lrindx++) {
         iia = Cindxl2g( lrindx, descA[MB_], myprow, descA[RSRC_], nprow);
         ipiv(lrindx) =  gipiv( (iia-ia) + 1 );
         };
       };
     }
   else  {
     /*
      * copy to a column, then broadcast
      */
     char scope = 'R';
     char top = ' ';
     int Locp, Locq;
     int lld;
     int icontxt = desc_ipiv[CTXT_];

     desc_ipiv[CSRC_] = ja_proc;
     desc_gipiv[RSRC_] = ia_proc;
     desc_gipiv[CSRC_] = ja_proc;

     mtmp = MIN(m,n);
     scalapack_picopy( &mtmp, &(gipiv(1)), &i1,&j1, desc_gipiv, &inc1,
             &(ipiv(1)), &i2, &j2, desc_ipiv, &inc2 );

     if (idebug >= 1) {
       char cmatnm[] = "ipiv after picopy";
       if (is_root) {
         printf("ia_proc %d ja_proc %d i2 %d j2 %d \n",ia_proc,ja_proc,i2,j2);
       };
       Cpilaprnt( mtmp,1, &(ipiv(1)), i2,j2,desc_ipiv, cmatnm);
     };


     Locp = Cnumroc( ia + MIN(m,n)-1, desc_ipiv[MB_], 
                     myprow, desc_ipiv[RSRC_], nprow);
     lld = MAX(1,Locp);
     Locq = 1;
     if (npcol > 1) {
      if (mypcol == ja_proc) {

       scalapack_igebs2d( &icontxt, &scope, &top, 
           &Locp, &Locq,  &(ipiv(1)), &lld );
      }
      else {
       rsrc = myprow;
       scalapack_igebr2d( &icontxt, &scope, &top,
           &Locp, &Locq, &(ipiv(1)), &lld, &rsrc, &ja_proc );
      };
     };

   };
   PROFEND("gpu:ipiv");

     if (idebug >= 1) {
       int desctmp[DLEN_];
       char cmatnm[] = "final ipiv";
       Cdescset( desctmp, 
           descA[M_],npcol,
           descA[MB_],1,
           descA[RSRC_], descA[CSRC_],
           descA[CTXT_], descA[LLD_]);
       Cpilaprnt( MIN(m,n),npcol, &(ipiv(1)), ia,1,desctmp, cmatnm);
     };





  /*
   * clean up
   */
  if (Atmp != 0) {
       free(Atmp); Atmp = 0;
       };
  if (hA != 0) {
       free(hA); hA = 0;
       };
  if (ipiv_hA_ != 0) {
      free( ipiv_hA_ ); ipiv_hA_ = 0;
      };

  if (gipiv_ != 0) {
     free(gipiv_); gipiv_ = 0;
     };


  return;
}
예제 #29
0
/* Test program 
 * created 23/09/2014
 * author Alex Bombrun
 * 
 * icc -O1  -o eigen.exe lapackReadStore.c mpiutil.c normals.c matrixBlockStore.c -mkl
 * ./eigen.exe 4 4
 *
 */
int main(int argc, char **argv) {
  
    FILE* store;
    FILE* scaStore;
    
    int N , M;
    int i, j;
    
    int n_blocks;
    int scalapack_size;
    int NB, MB;
    int i_block, j_block;
    int dim[4];
    double * mat;  // local matrix block use for reading
        
    int t, t_block;
    
    const char* profileG_file_name= "./data/NormalsG/profile.txt";
    const char* store_location = "./data/ReducedNormals";
    const char* scaStore_location ="./data/DiagCholeskyReducedNormals";
    
    int mp;	 // number of rows in the processor grid
    int mla;   // number of rows in the local array
    int mb;    // number of rows in a block
    int np;	 // number of columns in the processor grid
    int nla;   // number of columns in the local array
    int nb;    // number of columns in a block
    
    int mype,npe; // rank and total number of process
    
    int idescal[9]; // matrix descriptors
    double *la; // matrix values: al is the local array
    
    int idescbl[9];
    double *lb;
    double normb;
    
    int idescxl[9];
    double *lx;
    double normx;
    
    int idesczl[9]; // matrix descriptors
    double *lz; // matrix values: al is the local array
    
    double *w;
   
    int ierr; // error output 
    int mp_ret, np_ret, myrow, mycol; // to store grid info
    
    int zero=0; // value used for the descriptor initialization
    int one=1; // value used for the descriptor initialization
    
    int  m,n; // matrix A dimensions
    double norm, cond;
    double *work = NULL;
    double * work2 = NULL;
    int *iwork = NULL;
    int lwork, liwork;


     float ll,mm,cr,cc;
      int ii,jj,pr,pc,h,g; // ii,jj coordinates of local array element
      int rsrc=0,csrc=0; // assume that 0,0 element should be stored in the 0,0 process
      int n_b = 1;
      int index;
    int icon; // scalapack cblacs context
    char normJob, jobz, uplo, trans, diag;
    
    double MPIt1, MPIt2, MPIelapsed;
    
    jobz= 'N'; uplo='U';
    Cblacs_pinfo( &mype, &npe );
    
     if (argc == 3) {
	//printf("%s %s %s\n", argv[0], argv[1], argv[2]);
	n_blocks= (int) strtol(argv[1], NULL, 10);
	scalapack_size= (int) strtol(argv[2], NULL, 10);
     } else {
	printf("Usage: expect 2 integers \n");
	printf(" 1 : the number of diagonal blocks \n");
	printf(" 2 : scalapack number to define block size (assume n is divisible by sqrt(p) and that n/sqrt(p) is divisible by this number)\n");
	exit( -1);
     }
    
  

    printf("%d/%d: read store\n",mype,npe);
   
    N = getNumberOfLine(profileG_file_name); // the dimension of the matrix;
    M = N; // square matrix
    
    m=M; //mla*mp;
    n=N; //nla*np;
   
    np = isqrt(npe); // assume that the number of process is a square
    mp = np; // square grid
    
    mla = m/mp; // assume that the matrix dimension if a multiple of the process grid dimension
    nla = n/np;
    
    mb = mla/scalapack_size; // assume that the dimension of the matrix is a multiple of the number of the number of diagonal blocks
    nb = nla/scalapack_size;
    
    // init CBLACS
    Cblacs_get( -1, 0, &icon );
    Cblacs_gridinit( &icon,"c", mp, np ); 
    Cblacs_gridinfo( icon, &mp_ret, &np_ret, &myrow, &mycol);
    
   

    // allocate local matrix
    la=malloc(sizeof(double)*mla*nla);
    printf("%d/%d: full matrix (%d,%d), local matrix (%d,%d), processor grid (%d,%d), block (%d,%d) \n", mype, npe, m, n, mla, nla, np, mp, mb, nb);

    // set identity matrix
    for(i = 0;i<M;i++){
	for(j = i;j<i+1;j++){
	    cr = (float)( i/mb );
	    h = rsrc+(int)(cr);
	    pr = h%np;
	    cc = (float)( j/mb );
	    g = csrc+(int)(cc);
	    pc = g%mp;
	    // check if process should get this element
	    if (myrow == pr && mycol==pc){
		// ii = x + l*mb
		// jj = y + m*nb
		ll = (float)( ( i/(np*mb) ) );  // thinks seems to be mixed up does not matter as long as the matrix, the block and the grid is symmetric
		mm = (float)( ( j/(mp*nb) ) );
		ii = i%mb + (int)(ll)*mb;
		jj = j%nb + (int)(mm)*nb;
		index=jj*mla+ii;   // seems to be the transpose !?
		//if(index<0) printf("%d/%d: negative index (%d,%d) \n",mype,npe,i,j);
		//if(index>=mla*nla) printf("%d/%d: too large index (%d,%d) \n",mype,npe,i,j);
		la[index] = 1;
	    }
	}
    }



/*
    for(i_block=0;i_block<n_blocks;i_block++){
      printf("%d/%d: process store block %d \n", mype, npe, i_block);
      readStore(&store,i_block,store_location);
      t_block = 0;
      while(readNextBlockDimension(dim,store)!=-1) { // loop B over all block tasks
	j_block = mpi_get_diag_block_id(i_block, t_block, n_blocks);
	mat = malloc((dim[1]-dim[0])*(dim[3]-dim[2]) * sizeof(double));         
	    
	readNextBlock(dim[0],dim[1],dim[2],dim[3],mat,store);
	if (dim[0]==dim[2]){ // process only the diagonal blocks

//	printf("%d/%d: read block (%d,%d) with global indices (%d,%d,%d,%d) \n",mype, npe, i_block,j_block,dim[0],dim[1],dim[2],dim[3]);
	
	NB = dim[1]-dim[0];
	MB = dim[3]-dim[2];
	for(i = dim[0];i<dim[1];i++){
	  for(j = dim[2];j<dim[3];j++){
	      //matA[i*M+j] = mat[(i-dim[0])*MB+(j-dim[2])];
	     // finding out which pe gets this i,j element
              cr = (float)( i/mb );
              h = rsrc+(int)(cr);
              pr = h%np;
              cc = (float)( j/mb );
              g = csrc+(int)(cc);
              pc = g%mp;
	      // check if process should get this element
              if (myrow == pr && mycol==pc){
		  // ii = x + l*mb
		  // jj = y + m*nb
                  ll = (float)( ( i/(np*mb) ) );  // thinks seems to be mixed up does not matter as long as the matrix, the block and the grid is symmetric
                  mm = (float)( ( j/(mp*nb) ) );
                  ii = i%mb + (int)(ll)*mb;
                  jj = j%nb + (int)(mm)*nb;
                  index=jj*mla+ii;   // seems to be the transpose !?
		  //if(index<0) printf("%d/%d: negative index (%d,%d) \n",mype,npe,i,j);
		  //if(index>=mla*nla) printf("%d/%d: too large index (%d,%d) \n",mype,npe,i,j);
                  la[index] = mat[(i-dim[0])*MB+(j-dim[2])];
              }
	  }
	}
	// transpose
	if(j_block != i_block){
	  for(i = dim[0];i<dim[1];i++){
	    for(j = dim[2];j<dim[3];j++){
	      //matA[j*M+i] = mat[(i-dim[0])*MB+(j-dim[2])];
	       // finding out which pe gets this j,i element
              cr = (float)( j/mb );
              h = rsrc+(int)(cr);
              pr = h%np;
              cc = (float)( i/mb );
              g = csrc+(int)(cc);
              pc = g%mp;
	      // check if process should get this element
              if (myrow == pr && mycol==pc){
		  // ii = x + l*mb
		  // jj = y + m*nb
                  ll = (float)( ( j/(np*mb) ) );  // thinks seems to be mixed up does not matter as long as the matrix, the block and the grid is symmetric
                  mm = (float)( ( i/(mp*nb) ) );
                  ii = j%mb + (int)(ll)*mb;
                  jj = i%nb + (int)(mm)*nb;
                  index=jj*mla+ii;   // seems to be the transpose !?
		  //if(index<0) printf("%d/%d: negative index (%d,%d) \n",mype,npe,i,j);
		  //if(index>=mla*nla) printf("%d/%d: too large index (%d,%d) \n",mype,npe,i,j);

                  la[index] = mat[(i-dim[0])*MB+(j-dim[2])];
              }
	    }
	  } 
	}

	}
	
	free(mat);
	t_block++;
      }


      closeStore(store);
    }
*/    
    
    printf("%d/%d: finished scaterring the matrix \n",mype,npe);
    
    printf("%d/%d: start computing \n",mype,npe);
       // set the matrix descriptor
    ierr=0;
    descinit_(idescal, &m, &n  , &mb, &nb , &zero, &zero, &icon, &mla, &ierr); // processor grip id start at 0
    if (mype==0) saveMatrixDescriptor(idescal, scaStore_location);
    
    
    ierr=0;
    descinit_(idescbl, &m, &one  , &mb, &nb , &zero, &zero, &icon, &nla, &ierr); // processor grip id start at 0
    lb = calloc(sizeof(double),mla);
    
    ierr=0;
    // set x
    descinit_(idescxl, &n, &one  , &mb, &nb , &zero, &zero, &icon, &nla, &ierr); // processor grip id start at 0
    lx = calloc(sizeof(double),mla);
    for(i=0;i<mla;i++){
      lx[i] = 1.0/m;
    }
    pddot_(&n,&normx,lx,&one,&one,idescxl,&one,lx,&one,&one,idescxl,&one); // normx <- x'x
    if (mype==0) printf("%d/%d: normx2 %E \n",mype,npe,normx);  
    
    
    ierr=0;
    // set b
    double alpha =1.0;
    double beta =0.0;
    trans = 'N';
    pdgemv_(&trans,&m,&n,&alpha,la,&one,&one,idescal,lx,&one,&one,idescxl,&one,&beta,lb,&one,&one,idescbl,&one); // b <- A x
    pddot_(&n,&normb,lb,&one,&one,idescbl,&one,lb,&one,&one,idescbl,&one); // norm <- b'b
    if (mype==0) printf("%d/%d: normb2 %E \n",mype,npe,normb);  
    
    
    ierr = 0;
    // compute norm 1 of the reduced normal matrix
    /* DO NOT WORK
    lwork = 2*mla+2*nla;
    work = malloc(sizeof(double)*lwork);
    normJob = '1';
    norm = pdlansy_(&normJob, &uplo, &n, la, &one, &one, idescal, work);  // matrix index start at one 
    printf("%d/%d: norm %f \n",mype,npe,norm);
    free(work);
    */
    
    ierr = 0;
    // compute the cholesky decomposition 
    printf("%d/%d: start computing cholesky factor\n",mype,npe);  
    pdpotrf_(&uplo,&n,la,&one,&one,idescal,&ierr);
    printf("%d/%d: finish computing cholesky factor\n",mype,npe);
    openScalapackStore(&scaStore,myrow,mycol,scaStore_location);
    saveLocalMatrix(la,nla,mla,scaStore);
    
    double test=0.0;
    for(i=0;i<nla*mla;i++){
	test += la[i]*la[i];
    }
    printf("%d/%d: finished computing cholesky, test=%f \n",mype,npe,test);
    
    ierr =0;
    // assume x and b set
    // assume cholesky decomposition
    // compute the soluation A x = b
    diag = 'N';
    printf("%d/%d: start solving\n",mype,npe);  
    //pdpptrs_(&uplo, &trans , &diag , &n , &one , la , &one , &one , idescal , lb , &one , &one , idescbl , &ierr); // solve triangular system
    //pdtrtrs (&uplo, &trans , &diag , &n , &n , la , &one , &one , idescal , lb , &one , &one , idescbl , &ierr);
    pdpotrs_(&uplo, &n , &one , la , &one , &one , idescal , lb , &one , &one , idescbl , &ierr); // b<- A-1 b
    
    alpha = -1.0;
    normb=0;
    pdaxpy_(&n,&alpha,lx,&one,&one,idescxl,&one,lb,&one,&one,idescbl,&one); // b<-b-x
    pddot_(&n,&normb,lb,&one,&one,idescbl,&one,lb,&one,&one,idescbl,&one); // norm <- b'b
    if (mype==0) printf("%d/%d: finish solving, norm2(sol-true) %E \n",mype,npe,normb);  
    
    

    ierr = 0;
    /*
    // compute the eigen values
    jobz= 'N'; uplo='U'; // with N z is ignored
    descinit_(idesczl, &m, &n  , &mb, &nb , &zero, &zero, &icon, &mla, &ierr);
    lz = malloc(sizeof(double)*mla*nla);
    w = malloc(sizeof(double)*m);
    lwork = -1;
    work = malloc(sizeof(double)*2);
    pdsyev_( &jobz, &uplo, &n, la, &one, &one, idescal, w, lz, &one, &one, idesczl, work, &lwork, &ierr);   // only compute lwork
    //pdsyev_( &jobz, &uplo, &n, A, &ione, &ione, descA, W, Z, &ione, &ione, descZ, work, &lwork, &info );
    lwork= (int) work[0];
    free(work);
    work = (double *)calloc(lwork,sizeof(double)) ;
    //MPIt1 = MPI_Wtime();
    pdsyev_( &jobz, &uplo, &n, la, &one, &one, idescal, w, lz, &one, &one, idesczl, work, &lwork, &ierr);   // compute the eigen values
    //MPIt2 = MPI_Wtime();
    //MPIelapsed=MPIt2-MPIt1;
    
    if (mype == 0) {
	saveMatrix(n,w,"eigenvalues.txt");
	//printf("%d/%d: finished job in %8.2fs\n",mype,npe,MPIelapsed); // not working
    }
    */
    
    ierr = 0;
    // compute the conditioner number assume that the norm and the cholesky decomposition have been computed
    /* DO NOT WORK
    lwork = 2*mla+3*nla;
    printf("%d/%d: lwork=%d @%p\n",mype,npe,lwork,&lwork);
    work2 = malloc(sizeof(double)*lwork);
    liwork = 2*mla+3*nla;
    iwork = malloc(sizeof(int)*liwork);
    pdpocon_(&uplo,&n,la,&one,&one,idescal,&norm,&cond,work2,&lwork,iwork,&liwork,&ierr);
    printf("%d/%d: condition number %f \n",mype,npe,cond);
    */
    
    free(la);
    Cblacs_gridexit(icon);
    Cblacs_exit( 0 );
    return 0;
}
예제 #30
0
int GridCol( int context )
{
    int gridHeight, gridWidth, gridRow, gridCol;
    Cblacs_gridinfo( context, &gridHeight, &gridWidth, &gridRow, &gridCol );
    return gridCol;
}