void blacs_gridinit_nektar(int *BLACS_PARAMS, int *DESCA, int *DESCB){
  int i,j,k;

  /*
   BLACS_PARAMS:
   [0]  = ictxt;
   [1]  = my_proc;
   [2]  = total_procs;
   [3]  = Nproc_row;
   [4]  = Nproc_col;
   [5]  = my_row;
   [6]  = my_col;
   [7]  = Global_rows;
   [8]  = Global_columns;
   [9]  = Block_Size_row;
   [10] = Block_Size_col;
   [11] = LOC_rows;
   [12] = LOC_columns;
   */


   blacs_pinfo_( i, j);
   BLACS_PARAMS[1] = i;
   BLACS_PARAMS[2] = j;

   blacs_get_( -1, 0,  k);   // <- LG check the arguments of this call
   BLACS_PARAMS[0] = k;

   i = BLACS_PARAMS[3];
   j = BLACS_PARAMS[4];
   blacs_gridinit_( BLACS_PARAMS[0], "Row", i, j);

   blacs_gridinfo_( BLACS_PARAMS[0] ,BLACS_PARAMS[3], BLACS_PARAMS[4], i, j);
   BLACS_PARAMS[5] = i;
   BLACS_PARAMS[6] = j;

   i = 0;
   BLACS_PARAMS[11] = numroc_(BLACS_PARAMS[7],BLACS_PARAMS[9], BLACS_PARAMS[5],i,BLACS_PARAMS[3]);
   BLACS_PARAMS[12] = numroc_(BLACS_PARAMS[8],BLACS_PARAMS[10],BLACS_PARAMS[6],i,BLACS_PARAMS[4]);

   i = 0;
   j = 0;
   descinit_(DESCA, BLACS_PARAMS[7], BLACS_PARAMS[8],
                    BLACS_PARAMS[9], BLACS_PARAMS[10],
                    i, j,
                    BLACS_PARAMS[0],BLACS_PARAMS[11],
                    k);
   if (k != 0)
     fprintf(stderr,"blacs_gridinit_nektar: ERROR, descinit(info) = %d \n",k);

   descinit_(DESCB, BLACS_PARAMS[7], 1,
                    BLACS_PARAMS[9], 1,
                    i, j,
                    BLACS_PARAMS[0],BLACS_PARAMS[11],
                    k);
  if (k != 0)
     fprintf(stderr,"blacs_gridinit_nektar: ERROR, descinit(info) = %d \n",k);


}
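/* Illustrative usage sketch (not part of the original source): the caller is
   expected to fill in the grid shape ([3],[4]), the global matrix size ([7],[8])
   and the block sizes ([9],[10]); blacs_gridinit_nektar() then fills in the
   remaining entries and the two descriptors.  The numbers below are made up. */
void blacs_gridinit_nektar_example(void){
  int BLACS_PARAMS[13] = {0};
  int DESCA[9], DESCB[9];

  BLACS_PARAMS[3]  = 2;     /* Nproc_row      */
  BLACS_PARAMS[4]  = 2;     /* Nproc_col      */
  BLACS_PARAMS[7]  = 1024;  /* Global_rows    */
  BLACS_PARAMS[8]  = 1024;  /* Global_columns */
  BLACS_PARAMS[9]  = 64;    /* Block_Size_row */
  BLACS_PARAMS[10] = 64;    /* Block_Size_col */

  blacs_gridinit_nektar(BLACS_PARAMS, DESCA, DESCB);
  /* BLACS_PARAMS[11] and [12] now hold the local row/column counts from numroc_. */
}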
Example #2
        int driver_psgesvd( char jobU, char jobVT, int m, int n, float *A, int ia, int ja, int *descA,
                float *S_NN, float *U_NN, int iu, int ju, int *descU, float *VT_NN, int ivt, int jvt, int *descVT,
                double *MPIelapsedNN){

        float *Acpy=NULL, *work=NULL;
        int lwork;
/**/
        int ione=1;
/**/
        int nprow, npcol, myrow, mycol;
        int mpA, nqA;
        int ictxt, nbA, rsrcA, csrcA;
        int ctxt_ = 1, nb_ = 5, rsrc_ = 6, csrc_ = 7;
        int infoNN;

        double MPIt1, MPIt2;
/**/
        ictxt = descA[ctxt_];
        Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol );

        nbA = descA[nb_]; rsrcA = descA[rsrc_] ; csrcA = descA[csrc_] ;

        mpA    = numroc_( &m     , &nbA, &myrow, &rsrcA, &nprow );
        nqA    = numroc_( &n     , &nbA, &mycol, &csrcA, &npcol );

        Acpy = (float *)calloc(mpA*nqA,sizeof(float)) ;
        if (Acpy==NULL){ printf("error of memory allocation Acpy on proc %dx%d\n",myrow,mycol); exit(0); }

        pslacpy_( "All", &m, &n, A, &ione, &ione, descA, Acpy, &ione, &ione, descA );


        work = (float *)calloc(1,sizeof(float)) ;
        if (work==NULL){ printf("error of memory allocation for work on proc %dx%d (1st time)\n",myrow,mycol); exit(0); }

        lwork=-1;       /* workspace-size query: the first psgesvd_ call returns the required lwork in work[0] */

        psgesvd_( &jobU, &jobVT, &m, &n, Acpy, &ione, &ione, descA,
                S_NN, U_NN, &ione, &ione, descU, VT_NN, &ione, &ione, descVT,
                work, &lwork, &infoNN);

        lwork = (int) (work[0]);
        free(work);

        work = (float *)calloc(lwork,sizeof(float)) ;
        if (work==NULL){ printf("error of memory allocation work on proc %dx%d\n",myrow,mycol); exit(0); }
/**/            
        MPIt1 = MPI_Wtime();
/**/
        psgesvd_( &jobU, &jobVT, &m, &n, Acpy, &ione, &ione, descA,
                S_NN, U_NN, &ione, &ione, descU, VT_NN, &ione, &ione, descVT,
                work, &lwork, &infoNN);
/**/
        MPIt2 = MPI_Wtime();
        (*MPIelapsedNN)=MPIt2-MPIt1;
/**/
        free(work);
        free(Acpy);
        return infoNN;
}
Example #3
///
/// This code generates a PdgesvdArgs parameter block that can be used to drive
/// pdgesvdSlave2() when there is no SciDB application to provide the info.
/// It makes up parameters for a pdgesvd call that are appropriate to the
/// processor grid and order of matrix being decomposed
///
scidb::PdgesvdArgs pdgesvdGenTestArgs(slpp::int_t ICTXT, slpp::int_t NPROW, slpp::int_t NPCOL,
                                      slpp::int_t MYPROW, slpp::int_t MYPCOL, slpp::int_t MYPNUM,
                                      slpp::int_t order)
{
    scidb::PdgesvdArgs result;

    // hard-code a problem based on order and fixed block size
    const slpp::int_t M=order;
    const slpp::int_t N=order;
    const slpp::int_t MIN_MN=order;
    const slpp::int_t BLKSZ=slpp::SCALAPACK_EFFICIENT_BLOCK_SIZE; // we are making up an array descriptor, not receiving one
                                                                  // as is normal for functions in a xxxxSlave.cpp file.  It is only because
                                                                  // it's a test routine that SCALAPACK_EFFICIENT_BLOCK_SIZE is referenced here.
                                                                  // Normally it is only used at the xxxxxPhysical.cpp operator level.
    const slpp::int_t one = 1 ;
    const char jobU = 'V';
    const char jobVT = 'V';

    // create ScaLAPACK array descriptors
    const slpp::int_t RSRC = 0 ;
    // LLD(A)
    slpp::int_t LLD_A = std::max(one, numroc_( order, BLKSZ, MYPROW, RSRC, NPROW ));
    // LLD(VT)
    slpp::int_t LLD_VT = std::max(one, numroc_( order, BLKSZ, MYPROW, RSRC, NPROW ));

    // WARNING -- note I never checked INFO from descinits !!
    slpp::int_t INFO = 0;

    slpp::desc_t DESC_A;
    descinit_(DESC_A, order, order, BLKSZ, BLKSZ, 0, 0, ICTXT, LLD_A, INFO);
    if (INFO != 0) throw("pdgesvdGenTestArgs: unexpected runtime error");

    slpp::desc_t DESC_U;
    descinit_(DESC_U, order, order, BLKSZ, BLKSZ, 0, 0, ICTXT, LLD_A, INFO);
    if (INFO != 0) throw("pdgesvdGenTestArgs: unexpected runtime error");

    slpp::desc_t DESC_VT;
    descinit_(DESC_VT, order, order, BLKSZ, BLKSZ, 0, 0, ICTXT, LLD_VT, INFO);
    if (INFO != 0) throw("pdgesvdGenTestArgs: unexpected runtime error");

    // S is different: global, not distributed, so its LLD(S) == LEN(S)
    slpp::desc_t DESC_S;
    descinit_(DESC_S, MIN_MN, 1, BLKSZ, BLKSZ, 0, 0, ICTXT, MIN_MN, INFO);
    if (INFO != 0) throw("pdgesvdGenTestArgs: unexpected runtime error");

    pdgesvdMarshallArgs(&result, NPROW,  NPCOL, MYPROW, MYPCOL, MYPNUM,
                           jobU, jobVT, M, N,
                           NULL /*A*/,  one, one, DESC_A,
                           NULL /*S*/,
                           NULL /*U*/,  one, one, DESC_U,
                           NULL /*VT*/, one, one, DESC_VT);
    return result;
}
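// Illustrative usage sketch (not in the original source): a standalone test driver
// might build a process grid with the helpers used by the slave routines later in
// this collection and then generate arguments for a made-up 1000x1000 problem.
// The grid shape and order are arbitrary; sl_init_() and getSlaveBLACSInfo() are
// assumed to have the signatures shown in the pdgemmSlave example below.
void pdgesvdGenTestArgsExample()
{
    slpp::int_t ICTXT = -1;
    slpp::int_t NPROW = 2, NPCOL = 2;
    sl_init_(ICTXT/*out*/, NPROW/*in*/, NPCOL/*in*/);

    slpp::int_t MYPROW = -1, MYPCOL = -1, MYPNUM = -1;
    getSlaveBLACSInfo(ICTXT/*in*/, NPROW, NPCOL, MYPROW, MYPCOL, MYPNUM);

    scidb::PdgesvdArgs args =
        pdgesvdGenTestArgs(ICTXT, NPROW, NPCOL, MYPROW, MYPCOL, MYPNUM, /*order*/1000);
    (void)args; // args could now be used to drive pdgesvdSlave2(), as described above
}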
Example #4
float verif_representativity(int m, int n, float *A, int ia, int ja, int *descA,
                                              float *U, int iu, int ju, int *descU,
                                              float *VT, int ivt, int jvt, int *descVT,
                                              float *S){
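        /* Representativity check: forms U*diag(S)*VT - A in a scratch copy of A and
           returns ||U*diag(S)*VT - A||_F / ( ||A||_F * max(m,n) ). */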

        float *Acpy=NULL, *Ucpy=NULL;
        int nprow, npcol, myrow, mycol;
        int min_mn, max_mn, mpA, pcol, localcol, i, nqA;
        int ictxt, nbA, rsrcA, csrcA, nbU, rsrcU, csrcU, mpU, nqU;
        int ctxt_ = 1, nb_ = 5, rsrc_ = 6, csrc_ = 7;
        int izero = 0, ione = 1;
        float *wwork=NULL;
        float tmone= -1.0e+00, tpone= +1.0e+00;
        float residF, AnormF;

        min_mn = min(m,n);
        max_mn = max(m,n);
        ictxt = descA[ctxt_];
        Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol );

        nbA = descA[nb_]; rsrcA = descA[rsrc_] ; csrcA = descA[csrc_] ;
        mpA    = numroc_( &m     , &nbA, &myrow, &rsrcA, &nprow );
        nqA    = numroc_( &n     , &nbA, &mycol, &csrcA, &npcol );
        Acpy = (float *)calloc(mpA*nqA,sizeof(float)) ;
        if (Acpy==NULL){ printf("error of memory allocation Acpy on proc %dx%d\n",myrow,mycol); exit(0); }
        pslacpy_( "All", &m, &n, A, &ia, &ja, descA, Acpy, &ia, &ja, descA );

        nbU = descU[nb_]; rsrcU = descU[rsrc_] ; csrcU = descU[csrc_] ;
        mpU    = numroc_( &m     , &nbU, &myrow, &rsrcU, &nprow );
        nqU    = numroc_( &min_mn, &nbU, &mycol, &csrcU, &npcol );
        Ucpy = (float *)calloc(mpU*nqU,sizeof(float)) ;
        if (Ucpy==NULL){ printf("error of memory allocation Ucpy on proc %dx%d\n",myrow,mycol); exit(0); }
        pslacpy_( "All", &m, &min_mn, U, &iu, &ju, descU, Ucpy, &iu, &ju, descU );

        AnormF = pslange_( "F", &m, &n, A, &ia, &ja, descA, wwork);

        for (i=1;i<min_mn+1;i++){
                pcol = indxg2p_( &i, &(descU[5]), &izero, &izero, &npcol );
                localcol = indxg2l_( &i, &(descU[5]), &izero, &izero, &npcol );
                if( mycol==pcol )
                        sscal_( &mpA, &(S[i-1]), &(Ucpy[ ( localcol-1 )*descU[8] ]), &ione );
        }
        psgemm_( "N", "N", &m, &n, &min_mn, &tpone, Ucpy, &iu, &ju, descU, VT, &ivt, &jvt, descVT,
                        &tmone, Acpy, &ia, &ja, descA ); 
        residF = pslange_( "F", &m, &n, Acpy, &ione, &ione, descA, wwork);
        residF = residF/AnormF/((float) max_mn);

        free(Ucpy);
        free(Acpy);

        return residF;
}
Example #5
float verif_repres_VN(int m, int n, float *A, int ia, int ja, int *descA,
                                       float *U, int iu, int ju, int *descU,
                                       float *S){
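        /* Rebuilds VT as diag(1/S) * U' * A (for runs where VT itself was not
           computed) and returns its orthogonality residual via verif_orthogonality(). */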

        float *VTcpy=NULL;
        int nprow, npcol, myrow, mycol;
        int min_mn, max_mn, mpA, prow, localcol, i, nqA;
        int ictxt, nbA, rsrcA, csrcA, mpVT, nqVT, descVTcpy[9], itemp, ivtcpy, jvtcpy;
        int ctxt_ = 1, nb_ = 5, rsrc_ = 6, csrc_ = 7;
        int izero = 0, info;
        float tpone= +1.0e+00,  tzero= +0.0e+00;
        float verif_repres_VN, invStemp;

        min_mn = min(m,n);
        max_mn = max(m,n);
        ictxt = descA[ctxt_];
        Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol );

        nbA = descA[nb_]; rsrcA = descA[rsrc_] ; csrcA = descA[csrc_] ;

        mpA    = numroc_( &m     , &nbA, &myrow, &rsrcA, &nprow );
        nqA    = numroc_( &n     , &nbA, &mycol, &csrcA, &npcol );

        mpVT   = numroc_( &min_mn, &nbA, &myrow, &rsrcA, &nprow );
        nqVT   = numroc_( &n     , &nbA, &mycol, &csrcA, &npcol );

        itemp = max( 1, mpVT );
        descinit_( descVTcpy, &min_mn, &n, &nbA, &nbA, &rsrcA, &csrcA, &ictxt, &itemp, &info );

        ivtcpy = 1; jvtcpy = 1; 
        VTcpy = (float *)calloc(mpVT*nqVT,sizeof(float)) ;
        if (VTcpy==NULL){ printf("error of memory allocation VTcpy on proc %dx%d\n",myrow,mycol); exit(0); }

        psgemm_( "T", "N", &min_mn, &n, &m, &tpone, U, &iu, &ju, descU, A, &ia, &ja, descA,
                        &tzero, VTcpy, &ivtcpy, &jvtcpy, descVTcpy ); 

        for (i=1;i<min_mn+1;i++){
                prow = indxg2p_( &i, &nbA, &izero, &izero, &nprow );
                localcol = indxg2l_( &i, &nbA, &izero, &izero, &nprow );
                invStemp = 1/S[i-1];
                if( myrow==prow )
                        sscal_( &nqA, &invStemp, &(VTcpy[localcol-1]), &mpVT );
        }

        verif_repres_VN = verif_orthogonality(min_mn,n,VTcpy, ivtcpy, jvtcpy, descVTcpy);

        free(VTcpy);

        return verif_repres_VN;
}
Example #6
float verif_repres_NV(int m, int n, float *A, int ia, int ja, int *descA,
                                float *VT, int ivt, int jvt, int *descVT,
                                float *S){
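        /* Rebuilds U as A * VT' * diag(1/S) (for runs where U itself was not
           computed) and returns its orthogonality residual via verif_orthogonality(). */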

        float *Ucpy=NULL;
        int nprow, npcol, myrow, mycol;
        int min_mn, max_mn, mpA, pcol, localcol, i, nqA;
        int ictxt, nbA, rsrcA, csrcA, mpU, nqU, descUcpy[9], itemp, iucpy, jucpy;
        int ctxt_ = 1, nb_ = 5, rsrc_ = 6, csrc_ = 7;
        int izero = 0, ione = 1, info;
        float tpone= +1.0e+00,  tzero= +0.0e+00;
        float verif_repres_NV, invStemp;

        min_mn = min(m,n);
        max_mn = max(m,n);
        ictxt = descA[ctxt_];
        Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol );

        nbA = descA[nb_]; rsrcA = descA[rsrc_] ; csrcA = descA[csrc_] ;

        mpA    = numroc_( &m     , &nbA, &myrow, &rsrcA, &nprow );
        nqA    = numroc_( &n     , &nbA, &mycol, &csrcA, &npcol );

        itemp = max( 1, mpA );
        descinit_( descUcpy,  &m, &min_mn, &nbA, &nbA, &rsrcA, &csrcA, &ictxt, &itemp, &info );

        iucpy = 1; jucpy = 1;   
        mpU    = numroc_( &m     , &nbA, &myrow, &rsrcA, &nprow );
        nqU    = numroc_( &min_mn, &nbA, &mycol, &csrcA, &npcol );
        Ucpy = (float *)calloc(mpU*nqU,sizeof(float)) ;
        if (Ucpy==NULL){ printf("error of memory allocation Ucpy on proc %dx%d\n",myrow,mycol); exit(0); }

        psgemm_( "N", "T", &m, &min_mn, &n, &tpone, A, &ia, &ja, descA, VT, &ivt, &jvt, descVT,
                        &tzero, Ucpy, &iucpy, &jucpy, descUcpy ); 

        for (i=1;i<min_mn+1;i++){
                pcol = indxg2p_( &i, &(descUcpy[5]), &izero, &izero, &npcol );
                localcol = indxg2l_( &i, &(descUcpy[5]), &izero, &izero, &npcol );
                invStemp = 1/S[i-1];
                if( mycol==pcol )
                        sscal_( &mpA, &invStemp, &(Ucpy[ ( localcol-1 )*descUcpy[8] ]), &ione );
        }

        verif_repres_NV = verif_orthogonality(m,min_mn,Ucpy, iucpy, jucpy, descUcpy);

        free(Ucpy);

        return verif_repres_NV;
}
Example #7
void 
dgather(int ictxt, int n, int numc, int nb, double *A, double *A_d, int *descAd){
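   /* Gathers the distributed n x numc matrix A_d (descriptor descAd) onto the root
      process (0,0): a temporary grid context is created via sl_init_, a root-only
      descriptor descA is built there (ctxt = -1 on the other processes), and
      pdgemr2d_ copies A_d into A. */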

int RootNodeic, ione=1, izero=0, isRootNode=0, nru, info;
int nprow, npcol, myrow, mycol, descA[9], itemp;
int i,k;

   sl_init_(&RootNodeic, &ione, &ione);

   Cblacs_gridinfo(ictxt, &nprow,&npcol, &myrow, &mycol);

   if (myrow==0 && mycol ==0){ isRootNode = 1;}

   if(isRootNode){
      nru = numroc_(&n, &n, &myrow, &izero, &nprow);
      itemp = max(1,nru);
      descinit_(descA, &n, &numc, &n, &n, &izero, &izero, &RootNodeic, &itemp, &info );
   }
   else{
      k=0;
      for(i=0;i<9;i++){
          descA[k]=0;
          k++;
      }
      descA[1]=-1;
    }

   pdgemr2d_(&n,&numc,A_d,&ione, &ione, descAd, A, &ione, &ione, descA, &ictxt );

   if (isRootNode){
       Cblacs_gridexit(RootNodeic);
   }
  
}
Example #8
File: blacs.c Project: qsnake/gpaw
PyObject* get_blacs_local_shape(PyObject *self, PyObject *args)
{
  int ConTxt;
  int m, n, mb, nb, rsrc, csrc;
  int nprow, npcol, myrow, mycol;
  int locM, locN;

  if (!PyArg_ParseTuple(args, "iiiiiii", &ConTxt, &m, &n, &mb, 
			&nb, &rsrc, &csrc)){
    return NULL;
  }

  Cblacs_gridinfo_(ConTxt, &nprow, &npcol, &myrow, &mycol);
  locM = numroc_(&m, &mb, &myrow, &rsrc, &nprow);
  locN = numroc_(&n, &nb, &mycol, &csrc, &npcol);
  return Py_BuildValue("(ii)", locM, locN);
}
Example #9
void blacs_pdgetri_nektar(int *BLACS_PARAMS, int *DESCA, int *ipvt, double **inva_LOC){
  int row_start = 1, col_start = 1;
  int lwork,liwork,info = 0;
  double *work;
  int *iwork;
  int M;

  M = BLACS_PARAMS[7] + ((row_start-1) % BLACS_PARAMS[10]);
  lwork = BLACS_PARAMS[9]*numroc_( M, BLACS_PARAMS[10], BLACS_PARAMS[5], row_start, BLACS_PARAMS[3]);
  liwork = BLACS_PARAMS[7];

  work  = dvector(0,lwork-1);
  iwork = ivector(0,liwork-1);

  int i = -1, j = -1;   /* lwork = liwork = -1: workspace-size query */
  pdgetri_(BLACS_PARAMS[7],*inva_LOC,
             row_start,col_start,DESCA,ipvt,
             work,i,
             iwork,j,
             info);

  if (info != 0)
    fprintf(stderr,"blacs_pdgetri_nektar: ERROR - info = %d \n",info);


  if ( ((int) work[0]) > lwork){
    lwork = (int) work[0];
    free(work);
    work = dvector(0,lwork);
  }


  if ( iwork[0] > liwork){
    liwork = iwork[0];
    free(iwork);
    iwork = ivector(0,liwork);
  }

  pdgetri_(BLACS_PARAMS[7],*inva_LOC,
             row_start,col_start,DESCA,ipvt,
             work,lwork,
             iwork,liwork,
             info);


  if (info != 0)
    fprintf(stderr,"blacs_pdgetri_nektar: ERROR - info = %d \n",info);


  free(work);
  free(iwork);

}
Example #10
float verif_orthogonality(int m, int n, float *U, int iu, int ju, int *descU){
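        /* Returns ||U'*U - I||_F / max(m,n) when m > n (or ||U*U' - I||_F / max(m,n)
           otherwise), i.e. a scaled measure of how close U is to being orthonormal. */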

        float *R=NULL;
        int nprow, npcol, myrow, mycol;
        int mpR, nqR, nb, itemp, descR[9], ictxt, info, min_mn, max_mn;
        int ctxt_ = 1, nb_ = 5;
        int izero = 0, ione = 1;
        float *wwork=NULL;
        float tmone= -1.0e+00,  tpone= +1.0e+00,  tzero= +0.0e+00;
        float orthU;

        min_mn = min(m,n);
        max_mn = max(m,n);
        ictxt = descU[ctxt_];
        nb = descU[nb_];
        Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol );

        mpR    = numroc_( &min_mn, &nb, &myrow, &izero, &nprow );
        nqR    = numroc_( &min_mn, &nb, &mycol, &izero, &npcol );
        R = (float *)calloc(mpR*nqR,sizeof(float)) ;
        if (R==NULL){ printf("error of memory allocation R on proc %dx%d\n",myrow,mycol); exit(0); }
        itemp = max( 1, mpR );
        descinit_( descR,  &min_mn, &min_mn, &nb, &nb, &izero, &izero, &ictxt, &itemp, &info );

        pslaset_( "F", &min_mn, &min_mn, &tzero, &tpone, R, &ione, &ione, descR );
        if (m>n)
                psgemm_( "T", "N", &min_mn, &min_mn, &m, &tpone, U, &iu, &ju, descU, U, 
                        &iu, &ju, descU, &tmone, R, &ione, &ione, descR );
        else
                psgemm_( "N", "T", &min_mn, &min_mn, &n, &tpone, U, &iu, &ju, descU, U, 
                        &iu, &ju, descU, &tmone, R, &ione, &ione, descR );
        orthU = pslange_( "F", &min_mn, &min_mn, R, &ione, &ione, descR, wwork );
        orthU = orthU / ((float) max_mn);
        free(R);

        return orthU;

}
Example #11
void dmat_ldimget(int *desc, int* nrows, int* ncols)
{
  int M = desc[2];
  int N = desc[3];
  int Mb = desc[4];
  int Nb = desc[5];
  
  int ictxt = desc[1];
  int rsrc = desc[6];
  int csrc = desc[7];
  
  int nprow, npcol, myprow, mypcol;
  Cblacs_gridinfo(ictxt, &nprow, &npcol, &myprow, &mypcol);
  
  *nrows = numroc_(&M, &Mb, &myprow, &rsrc, &nprow);
  *ncols = numroc_(&N, &Nb, &mypcol, &csrc, &npcol);
  
  if (*nrows < 1 || *ncols < 1)
  {
    *nrows = 0;
    *ncols = 0;
  }
}
Example #12
int BlacsSystem::MSize(int m)
{
        int irsrc=0;
        return numroc_(&m,&mb_,&myrow_,&irsrc,&nprow_);
}
Example #13
int main (int argc, char *argv[]) {
  myscalar *A=NULL, *B=NULL, *Btrue=NULL;
  int descA[BLACSCTXTSIZE], descB[BLACSCTXTSIZE];
  int n;
  int nb;
  int locr, locc;
  int i, j, ii, jj;
  int *I, *J;
  int nI, nJ;
  int ierr;
  int dummy;
  int myid, np;
  int myrow, mycol, nprow, npcol;
  int ctxt;
  myreal err;

  n=1024; /* Size of the problem */
  nb=16;  /* Blocksize for the 2D block-cyclic distribution */

  /* Initialize MPI */
  if((ierr=MPI_Init(&argc,&argv)))
    return 1;
  myid=-1;
  if((ierr=MPI_Comm_rank(MPI_COMM_WORLD,&myid)))
    return 1;
  np=-1;
  if((ierr=MPI_Comm_size(MPI_COMM_WORLD,&np)))
    return 1;

  /* Initialize the BLACS grid */
  nprow=floor(sqrt((float)np));
  npcol=np/nprow;
  blacs_get_(&IZERO,&IZERO,&ctxt);
  blacs_gridinit_(&ctxt,"R",&nprow,&npcol);
  blacs_gridinfo_(&ctxt,&nprow,&npcol,&myrow,&mycol);

  /* A is a dense n x n distributed Toeplitz matrix */
  if(myid<nprow*npcol) {
    locr=numroc_(&n,&nb,&myrow,&IZERO,&nprow);
    locc=numroc_(&n,&nb,&mycol,&IZERO,&npcol);
    A=new myscalar[locr*locc];
    dummy=std::max(1,locr);
    descinit_(descA,&n,&n,&nb,&nb,&IZERO,&IZERO,&ctxt,&dummy,&ierr);

    for(i=1;i<=locr;i++)
      for(j=1;j<=locc;j++) {
        ii=indxl2g_(&i,&nb,&myrow,&IZERO,&nprow);
        jj=indxl2g_(&j,&nb,&mycol,&IZERO,&npcol);
        // Toeplitz matrix from Quantum Chemistry.
        myreal pi=3.1416, d=0.1;
        A[locr*(j-1)+(i-1)]=ii==jj?std::pow(pi,2)/6.0/std::pow(d,2):std::pow(-1.0,ii-jj)/std::pow((myreal)ii-jj,2)/std::pow(d,2);
      }
  } else {
    descset_(descA,&n,&n,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE);
  }

  /* Initialize the solver and set parameters */
  StrumpackDensePackage<myscalar,myreal> sdp(MPI_COMM_WORLD);
  sdp.use_HSS=true;
  sdp.levels_HSS=4;
  sdp.min_rand_HSS=64;
  sdp.lim_rand_HSS=0;
  sdp.tol_HSS=1e-6;

  /* Compression */
  sdp.compress(A,descA);

  /* Accuracy checking */
  sdp.check_compression(A,descA);

  /* Element extraction: a bunch of random indices.
   * Note that duplicates do not matter (the code works).
   */
  nI=1+rand()%n;
  MPI_Bcast((void*)&nI,IONE,MPI_INTEGER,IZERO,MPI_COMM_WORLD);
  I=new int[nI];
  if(!myid)
    for(i=0;i<nI;i++)
      I[i]=1+rand()%n;
  MPI_Bcast((void*)I,nI,MPI_INTEGER,IZERO,MPI_COMM_WORLD);
  nJ=1+rand()%n;
  MPI_Bcast((void*)&nJ,IONE,MPI_INTEGER,IZERO,MPI_COMM_WORLD);
  J=new int[nJ];
  if(!myid)
    for(j=0;j<nJ;j++)
      J[j]=1+rand()%n;
  MPI_Bcast((void*)J,nJ,MPI_INTEGER,IZERO,MPI_COMM_WORLD);

  /* Extraction for the HSS form */
  if(myid<nprow*npcol) {
    locr=numroc_(&nI,&nb,&myrow,&IZERO,&nprow);
    locc=numroc_(&nJ,&nb,&mycol,&IZERO,&npcol);
    B=new myscalar[locr*locc]();
    dummy=std::max(1,locr);
    descinit_(descB,&nI,&nJ,&nb,&nb,&IZERO,&IZERO,&ctxt,&dummy,&ierr);
  } else
    descset_(descB,&nI,&nJ,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE);
  sdp.extract(A,descA,B,descB,I,nI,J,nJ);

  /* Extraction from the original matrix just to compare */
  sdp.use_HSS=false;
  if(myid<nprow*npcol)
    Btrue=new myscalar[locr*locc];
  sdp.extract(A,descA,Btrue,descB,I,nI,J,nJ);

  /* Comparison with elements of input matrix */
  if(myid<nprow*npcol){
    err=plange('M',nI,nJ,Btrue,IONE,IONE,descB,(myreal*)NULL);
    for(i=0;i<locr*locc;i++)
      Btrue[i]-=B[i];
    err=plange('M',nI,nJ,Btrue,IONE,IONE,descB,(myreal*)NULL)/err;
  }
  if(!myid)
    std::cout << "Element extraction (" << nI << "x" << nJ << " submatrix): maximum relative error max ||A(I,J)-HSS(I,J)||//||A(I,J)|| = " << err << std::endl << std::endl;

  /* Statistics */
  sdp.print_statistics();

  /* Clean-up */
  delete[] A;
  delete[] B;
  delete[] Btrue;
  delete[] I;
  delete[] J;

  /* The end */
  MPI_Finalize();
  return 0;

}
Example #14
void test_gemr2d(int M, int N)
{
    int repeat = 10;
    
    int32_t one = 1;
    int32_t isrc = 0;

    int32_t num_ranks;
    MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);
    int32_t rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    int32_t bs_row_A = M / num_ranks + std::min(1, M % num_ranks);
    int32_t bs_col_A = 1;

    int32_t bs_row_B = 1;
    int32_t bs_col_B = 1;

    int32_t blacs_handler = Csys2blacs_handle(MPI_COMM_WORLD);
    int32_t context1 = blacs_handler;
    int32_t context2 = blacs_handler;

    /* create BLACS context */
    Cblacs_gridinit(&context1, "C", num_ranks, 1);
    /* get row and column ranks */
    int32_t rank_row1, rank_col1;
    Cblacs_gridinfo(context1, &num_ranks, &one, &rank_row1, &rank_col1);
    /* get local number of rows and columns of a matrix */
    int32_t num_rows_local1, num_cols_local1;
    num_rows_local1 = numroc_(&M, &bs_row_A, &rank_row1, &isrc, &num_ranks);
    num_cols_local1 = numroc_(&N, &bs_col_A, &rank_col1, &isrc, &one);


    Cblacs_gridinit(&context2, "C", 1, num_ranks);
    int32_t rank_row2, rank_col2;
    Cblacs_gridinfo(context2, &one, &num_ranks, &rank_row2, &rank_col2);
    int32_t num_rows_local2, num_cols_local2;
    num_rows_local2 = numroc_(&M, &bs_row_B, &rank_row2, &isrc, &one);
    num_cols_local2 = numroc_(&N, &bs_col_B, &rank_col2, &isrc, &num_ranks);

    if (rank == 0)
    {
        printf("local dimensions of A: %i x %i\n", num_rows_local1, num_cols_local1);
        printf("local dimensions of B: %i x %i\n", num_rows_local2, num_cols_local2);
    } 

    int32_t descA[9], descB[9], info;
    descinit_(descA, &M, &N, &bs_row_A, &bs_col_A, &isrc, &isrc, &context1, &num_rows_local1, &info);
    descinit_(descB, &M, &N, &bs_row_B, &bs_col_B, &isrc, &isrc, &context2, &num_rows_local2, &info);

    std::vector<double_complex> A(num_rows_local1 * num_cols_local1);
    std::vector<double_complex> B(num_rows_local2 * num_cols_local2, double_complex(0, 0));
    std::vector<double_complex> C(num_rows_local1 * num_cols_local1);

    for (int i = 0; i < num_rows_local1 * num_cols_local1; i++)
    {
        A[i] = double_complex(double(rand()) / RAND_MAX, double(rand()) / RAND_MAX);
        C[i] = A[i];
    }

    double time = -MPI_Wtime();
    for (int i = 0; i < repeat; i++)
    {
        pzgemr2d_(&M, &N, &A[0], &one, &one, descA, &B[0], &one, &one, descB, &context1);
    }
    time += MPI_Wtime();

    if (rank == 0)
    {
        printf("average time %.4f sec, swap speed: %.4f GB/sec\n", time / repeat,
               sizeof(double_complex) * repeat * M * N / double(1 << 30) / time);
    }

    /* check correctness */
    pzgemr2d_(&M, &N, &B[0], &one, &one, descB, &A[0], &one, &one, descA, &context1);
    for (int i = 0; i < num_rows_local1 * num_cols_local1; i++)
    {
        if (std::abs(A[i] - C[i]) > 1e-14)
        {
            printf("Fail.\n");
            exit(0);
        }
    }

    Cblacs_gridexit(context1);
    Cblacs_gridexit(context2);
    Cfree_blacs_system_handle(blacs_handler);
}
Example #15
///
/// @return INFO = the status of the pdgemm_()
///
slpp::int_t pdgemmSlave(void* bufs[], size_t sizes[], unsigned count)
{
    enum dummy {BUF_ARGS=0, BUF_A, BUF_B, BUF_C, NUM_BUFS };

    for(size_t i=0; i < count; i++) {
        if(DBG) {
            std::cerr << "pdgemmSlave: buffer at:"<< bufs[i] << std::endl;
            std::cerr << "pdgemmSlave: bufsize =" << sizes[i] << std::endl;
        }
    }

    if(count < NUM_BUFS) {
        std::cerr << "pdgemmSlave: master sent " << count << " buffers, but " << NUM_BUFS << " are required." << std::endl;
        ::exit(99); // something that does not look like a signal
    }

    // take a COPY of args (because we will have to patch DESC.CTXT)
    scidb::PdgemmArgs args = *reinterpret_cast<PdgemmArgs*>(bufs[BUF_ARGS]) ;
    if(DBG) {
        std::cerr << "pdgemmSlave: args {" << std::endl ;
        std::cerr << args << std::endl;
        std::cerr << "}" << std::endl ;
    }

    // set up the scalapack grid
    if(DBG) std::cerr << "pdgemmSlave: NPROW:"<<args.NPROW<<" NPCOL:"<<args.NPCOL<<std::endl;
    slpp::int_t ICTXT=-1; // will be overwritten by sl_init

    // call scalapack tools routine to initialize a scalapack grid and give us its
    // context
    sl_init_(ICTXT/*out*/, args.NPROW/*in*/, args.NPCOL/*in*/);
    slpp::int_t NPROW=-1, NPCOL=-1, MYPROW=-1, MYPCOL=-1, MYPNUM=-1; // illegal vals
    getSlaveBLACSInfo(ICTXT/*in*/, NPROW, NPCOL, MYPROW, MYPCOL, MYPNUM);

    if(NPROW != args.NPROW || NPCOL != args.NPCOL ||
       MYPROW != args.MYPROW || MYPCOL != args.MYPCOL || MYPNUM != args.MYPNUM){
        if(DBG) {
            std::cerr << "scalapack general parameter mismatch" << std::endl;
            std::cerr << "args NPROW:"<<args.NPROW<<" NPCOL:"<<args.NPCOL
                      << "MYPROW:"<<args.MYPROW<<" MYPCOL:"<<args.MYPCOL<<"MYPNUM:"<<MYPNUM
                      << std::endl;
            std::cerr << "ScaLAPACK NPROW:"<<NPROW<<" NPCOL:"<<NPCOL
                      << "MYPROW:"<<MYPROW<<" MYPCOL:"<<MYPCOL<<"MYPNUM:"<<MYPNUM
                      << std::endl;
        }
    }

    const slpp::int_t one = 1 ;
    const slpp::int_t  LTD_A = std::max(one, numroc_( args.A.DESC.N, args.A.DESC.NB, MYPCOL, /*CSRC_A*/0, NPCOL ));
    const slpp::int_t  LTD_B = std::max(one, numroc_( args.B.DESC.N, args.B.DESC.NB, MYPCOL, /*CSRC_B*/0, NPCOL ));
    const slpp::int_t  LTD_C = std::max(one, numroc_( args.C.DESC.N, args.C.DESC.NB, MYPCOL, /*CSRC_C*/0, NPCOL ));

    if(DBG) {
        std::cerr << "##################################################" << std::endl;
        std::cerr << "####pdgemmSlave##################################" << std::endl;
        std::cerr << "one:" << one << std::endl;
        std::cerr << "args.A.DESC.MB:" << args.A.DESC.MB << std::endl;
        std::cerr << "MYPROW:" << MYPROW << std::endl;
        std::cerr << "NPROW:" << NPROW << std::endl;
    }

    // size check args
    SLAVE_ASSERT_ALWAYS( sizes[BUF_ARGS] >= sizeof(PdgemmArgs));

    // size check A,B,C -- debugs first
    slpp::int_t SIZE_A = args.A.DESC.LLD * LTD_A ;
    slpp::int_t SIZE_B = args.B.DESC.LLD * LTD_B ;
    slpp::int_t SIZE_C = args.C.DESC.LLD * LTD_C ;
    if(DBG) {
        if(sizes[BUF_A] != SIZE_A *sizeof(double)) {
            std::cerr << "sizes[BUF_A]:" << sizes[BUF_A]
                      << " != args.A.DESC.LLD:" << args.A.DESC.LLD
                      << "* LTD_A" << LTD_A << "*" << sizeof(double) << std::endl;
        }
        if(sizes[BUF_B] != SIZE_B *sizeof(double)) {
            std::cerr << "sizes[BUF_B]:" << sizes[BUF_B]
                      << " != args.B.DESC.LLD:" << args.B.DESC.LLD
                      << "* LTD_B" << LTD_B << "*" << sizeof(double) << std::endl;
        }
        if(sizes[BUF_C] != SIZE_C *sizeof(double)) {
            std::cerr << "sizes[BUF_C]:" << sizes[BUF_C]
                      << " != args.C.DESC.LLD:" << args.C.DESC.LLD
                      << "* LTD_C" << LTD_C << "*" << sizeof(double) << std::endl;
        }
    }
    SLAVE_ASSERT_ALWAYS(sizes[BUF_A] >= SIZE_A * sizeof(double));
    SLAVE_ASSERT_ALWAYS(sizes[BUF_B] >= SIZE_B * sizeof(double));
    SLAVE_ASSERT_ALWAYS(sizes[BUF_C] >= SIZE_C * sizeof(double));

    // sizes are correct, give the pointers their names
    double* A = reinterpret_cast<double*>(bufs[BUF_A]) ;
    double* B = reinterpret_cast<double*>(bufs[BUF_B]) ;
    double* C = reinterpret_cast<double*>(bufs[BUF_C]) ;

    // debug that the input is readable and show its contents
    if(DBG) {
        for(int ii=0; ii < SIZE_A; ii++) {
            std::cerr << "Pgrid("<< MYPROW << "," << MYPCOL << ") A["<<ii<<"] = " << A[ii] << std::endl;
        }
        for(int ii=0; ii < SIZE_B; ii++) {
            std::cerr << "Pgrid("<< MYPROW << "," << MYPCOL << ") B["<<ii<<"] = " << B[ii] << std::endl;
        }
        for(int ii=0; ii < SIZE_C; ii++) {
            std::cerr << "Pgrid("<< MYPROW << "," << MYPCOL << ") C["<<ii<<"] = " << C[ii] << std::endl;
        }
    }


    // ScaLAPACK: the DESCS are complete except for the correct context
    args.A.DESC.CTXT= ICTXT ;
    // (no DESC for S)
    args.B.DESC.CTXT= ICTXT ;
    args.C.DESC.CTXT= ICTXT ;

    if(true || DBG) {    // we'll leave this on in Cheshire.0 and re-evaluate later
        std::cerr << "pdgemmSlave: argsBuf is: {" << std::endl;
        std::cerr << args << std::endl;
        std::cerr << "}" << std::endl << std::endl;

        std::cerr << "pdgemmSlave: calling pdgemm_ for computation, with args:" << std::endl ;
        std::cerr << "TRANSA: " << args.TRANSA
                  << ", TRANSB: " << args.TRANSB
                  << ", M: " << args.M
                  << ", N: " << args.N
                  << ", K: " << args.K << std::endl;

        std::cerr << "ALPHA: " << args.ALPHA << std::endl;

        std::cerr << "A: " <<  (void*)(A)
                  << ", A.I: " << args.A.I
                  << ", A.J: " << args.A.J << std::endl;
        std::cerr << ", A.DESC: " << args.A.DESC << std::endl;

        std::cerr << "B: " <<  (void*)(B)
                  << ", B.I: " << args.B.I
                  << ", B.J: " << args.B.J << std::endl;
        std::cerr << ", B.DESC: " << args.B.DESC << std::endl;

        std::cerr << "BETA: " << args.BETA << std::endl;

        std::cerr << "C: " <<  (void*)(C)
                  << ", C.I: " << args.C.I
                  << ", C.J: " << args.C.J << std::endl;
        std::cerr << ", C.DESC: " << args.C.DESC << std::endl;
    }

    //////////////////////////////////////////////////////////////////////
    //////////////////////////////////////////////////////////////////////
    //////////////////////////////////////////////////////////////////////
    pdgemm_( args.TRANSA, args.TRANSB, args.M, args.N, args.K,
             &args.ALPHA,
             A,  args.A.I,  args.A.J,  args.A.DESC,
             B,  args.B.I,  args.B.J,  args.B.DESC,
             &args.BETA,
             C, args.C.I, args.C.J, args.C.DESC);

    if(true || DBG) {    // we'll leave this on in Cheshire.0 and re-evaluate later
        std::cerr << "pdgemmSlave: pdgemm_ complete (pdgemm_ has no result INFO)" << std::endl;
    }

    if (DBG) {
        std::cerr << "pdgemmSlave outputs: {" << std::endl;
        // debug prints of the outputs:
        for(int ii=0; ii < SIZE_C; ii++) {
            std::cerr << " C["<<ii<<"] = " << C[ii] << std::endl;
        }
        std::cerr << "}" << std::endl;
    }

    // TODO: what is the check on the pdgemm_ (pblas call) for successful completion?
    if (DBG) std::cerr << "pdgemmSlave returning successfully:" << std::endl;
    slpp::int_t INFO = 0 ;
    return INFO ;
}
Example #16
int
pdtrans(char *trans, int *m, int *n, int * mb, int *nb, double *a, int *lda, double *beta,
	double *c__, int *ldc, int *imrow, int *imcol, double *work, int *iwork) {
    /* System generated locals */
    long a_dim1, a_offset, c_dim1, c_offset;
    int i__1, i__2, i__3, i__4;

    /* Local variables */
    int j1, k1, k2, ml, nl, mp, mq, np, nq, mb0, mb1, mb2, nb0,
	    nb1, nb2, kia, kja, kic, kjc, lbm, lbn, lcm, ldt, lbm0, lbm1,
	     lbm2, lbn0, lbn1, lbn2, igcd;
    long ipt;
    int mcol, info, lcmp, lcmq, item, ncol, kmod1, kmod2;
    double tbeta;
    int kpcol, mpcol, npcol, mrcol, mycol, kprow, mprow, nprow, mrrow, myrow;

/*  -- PUMMA Package routine (version 2.1) -- */
/*     Jaeyoung Choi, Oak Ridge National Laboratory. */
/*     Jack Dongarra, Univ. of Tennessee, Oak Ridge National Laboratory. */
/*     David Walker,  Oak Ridge National Laboratory. */
/*     October 31, 1994. */

/*  Purpose */

/*  PDTRANS is one of the PUMMA package routines, based on block-cyclic */
/*  data distribution on a 2-D process configuration. */

/*  It is used for the following matrix transposition, */

/*     Form  C := A' + beta*C */

/*  where beta is a scalar, and A and C are matrices, with A an M by N */
/*  matrix (globally), and C an N by M matrix (globally). */

/*  Parameters */

/*  TRANS  - (input) CHARACTER*1 */
/*           TRANS specifies whether A is transposed or conjugate */
/*           transposed. */

/*              TRANS = 'T',   transpose; */

/*              TRANS = 'C',   conjugate transpose. */

/*  M      - (input) INTEGER */
/*           M specifies the (global) number of rows of the matrix A and */
/*           the (global) number of rows of the matrix C.  M >= 0. */

/*  N      - (input) INTEGER */
/*           N specifies the (global) number of columns of the matrix A */
/*           and the (global) number of rows of the matrix C.  N >= 0. */

/*  MB     - (input) INTEGER */
/*           MB specifies the row block size of the matrix A and the */
/*           column block of the matrix C.  MB >= 1. */

/*  NB     - (input) INTEGER */
/*           NB specifies the column block size of the matrix A and the */
/*           row block size of the matrix C.  NB >= 1. */

/*  A      - (input) DOUBLE PRECISION array of DIMENSION ( LDA, Nq ). */
/*           The leading Mp by Nq part of the array A must contain the */
/*           local matrix  A.  Mp and Nq are local variables */
/*           (see description of local parameters). */

/*  LDA    - (input) INTEGER */
/*           The leading dimension of the (local) array A. */
/*           LDA >= MAX( 1, Mp ). */

/*  BETA   - (input) DOUBLE PRECISION */
/*           BETA  specifies the scalar beta.  When BETA is supplied as */
/*           zero then C need not be set on input. */

/*  C      - (input/output) DOUBLE PRECISION array of DIMENSION (LDC, Mq). */
/*           On entry the leading Np by Mq part of the array C must */
/*           contain the local matrix C, except when beta is zero, */
/*           in which case C need not be set on entry. */
/*           On exit, the array C is overwritten by the Np by Mq matrix */
/*           (A'+beta*C).  Np and Mq are local variables */
/*           (see description of local parameters). */

/*  LDC    - (input) INTEGER */
/*           The leading dimension of the (local) array C. */
/*           LDC >= MAX( 1, Np ). */

/*  IMROW  - (input) INTEGER */
/*           IMROW specifies a row of the process template, which holds */
/*           the first block of the matrices.  0 <= IMROW < NPROW. */

/*  IMCOL  - (input) INTEGER */
/*           IMCOL specifies a column of the process template, which */
/*           holds the first block of the matrices.  0 <= IMCOL < NPCOL. */

/*  WORK   - (workspace) DOUBLE PRECISION array */
/*           See requirements. */

/*  IWORK  - (workspace) INTEGER array */
/*           See requirements. */

/*  Local  Parameters */

/*  LCM   =  the lowest common multiple of P and Q */
/*  LCMP  =  LCM/P = number of template rows in LCM block */
/*  LCMQ  =  LCM/Q = number of template columns in LCM block */
/*  IGCD   =  the greatest common divisor (GCD) of P and Q */
/*  MpxNq =  size of (local) matrix A in the process, iam */
/*  NpxMq =  size of (local) matrix C in the process, iam */
/*  KMOD  =  Define Group I.D. */
/*  item  =  temporary integer parameter */

/*    Two buffers for storing A' and T(= subblock of A') */
/*       WORK       <== A' */
/*       WORK(IPT)  <== T */

/*    Three integer buffers */
/*       IWORK(1,k) <== starting point of row subblock of A  to send and */
/*                      C to receive in K2 loop (rowwise communication) */
/*       IWORK(2,k) <== starting point of column subblock of A to send in */
/*                      J1 loop (columnwise communication) */
/*       IWORK(3,k) <== starting point of column subblock of C to receive */
/*                      in J1 loop (columnwise communication) */

/*  Requirements (approximate) */

/*   Size(IWORK) = 3 x MAX(P, Q) */
/*   Size(WORK)  = 2 x Ceil(Ceil(M,MB),LCM)xMB x Ceil(Ceil(N,NB),LCM)xNB */
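/*  Worked example (added for illustration, not part of the original comments): */
/*  for M = N = 1000, MB = NB = 64 on a P x Q = 2 x 3 grid, LCM = 6, so */
/*  Ceil(M,MB) = Ceil(1000,64) = 16 and Ceil(16,LCM) = 3, giving */
/*  Size(WORK)  = 2 x (3 x 64) x (3 x 64) = 73728 elements and */
/*  Size(IWORK) = 3 x MAX(2,3) = 9. */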

/*     Get grid parameters */

    /* Parameter adjustments */
    a_dim1 = *lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    c_dim1 = *ldc;
    c_offset = 1 + c_dim1;
    c__ -= c_offset;
    --work;
    --iwork;

    /* Function Body */
    Cblacs_gridinfo(context_1.ictxt, &nprow, &npcol, &myrow, &mycol);

/*     Test for the input parameters. */

    info = 0;
    if (*trans != 'T' && *trans != 'C') {
	info = 1;
    } else if (*m < 0) {
	info = 2;
    } else if (*n < 0) {
	info = 3;
    } else if (*mb < 1) {
	info = 4;
    } else if (*nb < 1) {
	info = 5;
    } else if (*lda < 1) {
	info = 7;
    } else if (*ldc < 1) {
	info = 10;
    } else if (*imrow < 0 || *imrow >= nprow) {
	info = 11;
    } else if (*imcol < 0 || *imcol >= npcol) {
	info = 12;
    }

L10:
    if (info != 0) {
	pxerbla( &context_1.ictxt, "PDTRANS", &info );
	return 0;
    }

/*     Initialize parameters */

    mprow = nprow + myrow;
    mpcol = npcol + mycol;
    mrrow = (mprow - *imrow) % nprow;
    mrcol = (mpcol - *imcol) % npcol;

    lcm = ilcm_(&nprow, &npcol);
    lcmp = lcm / nprow;
    lcmq = lcm / npcol;
    igcd = nprow / lcmq;

    mp = numroc_(m, mb, &mrrow, &c__0, &nprow);
    mq = numroc_(m, mb, &mrcol, &c__0, &npcol);
    np = numroc_(n, nb, &mrrow, &c__0, &nprow);
    nq = numroc_(n, nb, &mrcol, &c__0, &npcol);

    i__1 = iceil_(m, mb);
    lbm = iceil_(&i__1, &lcm);
    i__1 = iceil_(n, nb);
    lbn = iceil_(&i__1, &lcm);

/*     Test for the input parameters again with local parameters */

    if (*lda < mp) {
	info = 7;
    } else if (*ldc < np) {
	info = 10;
    }
    if (info != 0) {
	goto L10;
    }

/*     Quick return if possible. */

    if (*m == 0 || *n == 0) {
	return 0;
    }

/*     At first, scale C with beta if beta != 0.0 & beta != 1.0 */

    tbeta = *beta;
    if (*beta != 0. && *beta != 1.) {
	i__1 = mq;
	for (j1 = 1; j1 <= i__1; ++j1) {
	    HPL_dscal( np, *beta, &c__[j1 * c_dim1 + 1], 1 );
/* L20: */
	}
	tbeta = 1.;
    }

    commtrb_1.iaz = lcmp * *mb;
    commtrb_1.jaz = lcmq * *nb;
    commtrb_1.itz = lcmp * *nb;
    commtrb_1.jtz = lcmq * *mb;

    ml = lbm * *mb;
    nl = lbn * *nb;
    ipt = (long)ml * (long)nl + 1;
    ldt = nl;
    kprow = mrrow + nprow;
    kpcol = mrcol + npcol;

/*     Initialize Parameters -- Compute the positions of subblocks */

    i__1 = npcol - 1;
    for (k1 = 0; k1 <= i__1; ++k1) {
	ncol = (kpcol - k1) % npcol;
	i__2 = lcmq - 1;
	for (j1 = 0; j1 <= i__2; ++j1) {
	    item = npcol * j1 + ncol;
	    if (item % nprow == mrrow) {
		iwork[ncol * 3 + 1] = item / nprow;
	    }
/* L30: */
	}
    }

    i__2 = lcmq - 1;
    for (j1 = 0; j1 <= i__2; ++j1) {
	item = (npcol * j1 + mrcol) % nprow;
	iwork[item * 3 + 2] = j1;
	iwork[item * 3 + 3] = j1;
	i__1 = igcd - 1;
	for (k1 = 1; k1 <= i__1; ++k1) {
	    iwork[(item + nprow - k1) % nprow * 3 + 2] = j1;
	    iwork[(item + k1) % nprow * 3 + 3] = j1;
/* L40: */
	}
    }

/*     Set parameters for efficient copying */

    lbm0 = lbm;
    lbm1 = lbm;
    lbm2 = lbm;
    lbn0 = lbn;
    lbn1 = lbn;
    lbn2 = lbn;
    mb0 = *mb;
    mb1 = *mb;
    mb2 = *mb;
    nb0 = *nb;
    nb1 = *nb;
    nb2 = *nb;

    if (nprow == npcol) {
	lbm0 = 1;
	lbn0 = 1;
	mb0 = mp;
	nb0 = nq;
    }
    if (nprow == lcm) {
	lbm1 = 1;
	lbn2 = 1;
	mb1 = mp;
	nb2 = np;
    }
    if (npcol == lcm) {
	lbn1 = 1;
	lbm2 = 1;
	nb1 = nq;
	mb2 = mq;
    }

/*     For each K2 loop (rowwise), Copy A' to WORK & Send it to KTPROC */
/*                                 then, Receive WORK and Copy WORK to C */

    kmod1 = (nprow + mrcol - mrrow) % igcd;
    kmod2 = (igcd - kmod1) % igcd;

    i__1 = lcmp - 1;
    for (k2 = 0; k2 <= i__1; ++k2) {

/*        Copy A' to WORK in the appropriate order & Send it */

	k1 = k2 * igcd + kmod1;
	mcol = (kpcol - k1) % npcol;
	kia = iwork[mcol * 3 + 1] * *mb;
	mcol = (mcol + *imcol) % npcol;
	ncol = (mrcol + k2 * igcd + kmod2) % npcol;
	kic = iwork[ncol * 3 + 1] * *nb;
	ncol = (ncol + *imcol) % npcol;

	i__2 = lcmq - 1;
	for (j1 = 0; j1 <= i__2; ++j1) {
	    kja = iwork[(mrrow + igcd * j1) % nprow * 3 + 2] * *nb;

	    if (myrow == (myrow + igcd * j1 + kmod1) % nprow && mycol == mcol)
		     {
		kjc = iwork[(kprow - igcd * j1) % nprow * 3 + 3] * *mb;
		i__3 = mp - kia;
		i__4 = nq - kja;
		dtr2mx_(&a[kia + 1 + (kja + 1) * a_dim1], lda, &tbeta, &c__[
			kic + 1 + (kjc + 1) * c_dim1], ldc, &lbm0, &lbn0, &
			mb0, &nb0, &i__3, &i__4);

	    } else {
		i__3 = mp - kia;
		i__4 = nq - kja;
		dtr2bf_(&a[kia + 1 + (kja + 1) * a_dim1], lda, &work[1], &ldt,
			 &lbm1, &lbn1, &mb1, &nb1, &i__3, &i__4);

		if (nprow == npcol && *beta == 0. && *ldc == ldt) {
		    i__3 = (myrow + igcd * j1 + kmod1) % nprow;
		    i__4 = (mprow - igcd * j1 - kmod2) % nprow;
		    kjc = iwork[(kprow - igcd * j1) % nprow * 3 + 3] * *mb;
#if 0
		    Cdgesd2d(context_1.ictxt,nl,ml,&work[1],nl,i__3,mcol);
		    Cdgerv2d(context_1.ictxt,nl,ml,&c__[(kjc + 1) * c_dim1 + 1],*ldc,i__4,ncol);
#else
		    Cblacs_dSendrecv( context_1.ictxt,
                          nl, ml, &work[1], nl, i__3, mcol,
                          nl, ml, &c__[(kjc + 1) * c_dim1 + 1], *ldc, i__4, ncol );
#endif

		} else {
		    i__3 = (myrow + igcd * j1 + kmod1) % nprow;
		    i__4 = (mprow - igcd * j1 - kmod2) % nprow;
#if 0
		    Cdgesd2d(context_1.ictxt,nl,ml,&work[1],nl,i__3,mcol);
		    Cdgerv2d(context_1.ictxt,nl,ml,&work[ipt],nl, i__4,ncol);
#else
        Cblacs_dSendrecv( context_1.ictxt,
                          nl, ml, &work[1],   nl, i__3, mcol,
                          nl, ml, &work[ipt], nl, i__4, ncol );
#endif

		    kjc = iwork[(kprow - igcd * j1) % nprow * 3 + 3] * *mb;
		    i__3 = np - kic;
		    i__4 = mq - kjc;
		    dmv2mx_(&work[ipt], &ldt, &tbeta, &c__[kic + 1 + (kjc + 1)
			     * c_dim1], ldc, &lbn2, &lbm2, &nb2, &mb2, &i__3,
			    &i__4);
		}
	    }
	}
    }

    return 0;
} /* pdtrans_ */
Example #17
int main()
{
	const MKL_INT m = 1000;
	const MKL_INT k = 100000;
	const MKL_INT n = 1000;
	const MKL_INT nb = 100;
	const MKL_INT nprow = 2;
	const MKL_INT npcol = 2;

    MKL_INT iam, nprocs, ictxt, myrow, mycol;
    MDESC   descA, descB, descC, descA_local, descB_local, descC_local;
	MKL_INT info;
	MKL_INT a_m_local, a_n_local, b_m_local, b_n_local, c_m_local, c_n_local;
	MKL_INT a_lld, b_lld, c_lld;

    blacs_pinfo_( &iam, &nprocs );
    blacs_get_( &i_negone, &i_zero, &ictxt );
    blacs_gridinit_( &ictxt, "R", &nprow, &npcol );
    blacs_gridinfo_( &ictxt, &nprow, &npcol, &myrow, &mycol );

    double *a = 0;
    double *b = 0;
	double *c = 0;

    if (iam==0)
    {
		a = gen_a(m, k);
		b = gen_b(k, n);
		c = (double*)calloc(m*n, sizeof(double));

		puts("a=");
		print(a, m, k);

		puts("b=");
		print(b, k, n);
    }

    a_m_local = numroc_( &m, &nb, &myrow, &i_zero, &nprow );
    a_n_local = numroc_( &k, &nb, &mycol, &i_zero, &npcol );

	b_m_local = numroc_( &k, &nb, &myrow, &i_zero, &nprow );
	b_n_local = numroc_( &n, &nb, &mycol, &i_zero, &npcol );

    c_m_local = numroc_( &m, &nb, &myrow, &i_zero, &nprow );
	c_n_local = numroc_( &n, &nb, &mycol, &i_zero, &npcol );

    double *A = (double*) calloc( a_m_local * a_n_local, sizeof( double ) );
    double *B = (double*) calloc( b_m_local * b_n_local, sizeof( double ) );
    double *C = (double*) calloc( c_m_local * c_n_local, sizeof( double ) );

    a_lld = MAX( a_m_local, 1 );
	b_lld = MAX( b_m_local, 1 );
	c_lld = MAX( c_m_local, 1 );

	if (iam==0)
	{
			printf("a_m_local = %d\ta_n_local = %d\tb_m_local = %d\tb_n_local = %d\tc_m_local = %d\tc_n_local = %d\n", a_m_local, a_n_local, b_m_local, b_n_local,
							c_m_local, c_n_local);
			printf("a_lld = %d\tb_lld = %d\tc_lld = %d\n", a_lld, b_lld, c_lld);
	}
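    /* descA_local/descB_local/descC_local describe each matrix as one single block
       owned by process (0,0) (block sizes equal to the full matrix dimensions),
       while descA/descB/descC describe the nb x nb block-cyclic layout; the
       pdgeadd_ calls below redistribute the data between the two layouts. */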

    descinit_( descA_local, &m, &k, &m, &k, &i_zero, &i_zero, &ictxt, &m, &info );
    descinit_( descB_local, &k, &n, &k, &n, &i_zero, &i_zero, &ictxt, &k, &info );
    descinit_( descC_local, &m, &n, &m, &n, &i_zero, &i_zero, &ictxt, &m, &info );

    descinit_( descA, &m, &k, &nb, &nb, &i_zero, &i_zero, &ictxt, &a_lld, &info );
    descinit_( descB, &k, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &b_lld, &info );
    descinit_( descC, &m, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &c_lld, &info );

	printf("Rank %d: start distribute data\n", iam);
    pdgeadd_( &trans, &m, &k, &one, a, &i_one, &i_one, descA_local, &zero, A, &i_one, &i_one, descA );
    pdgeadd_( &trans, &k, &n, &one, b, &i_one, &i_one, descB_local, &zero, B, &i_one, &i_one, descB );
	printf("Rank %d: finished distribute data\n", iam);

	if (iam==0)
	{
			puts("a");
			print(A, a_m_local, a_n_local);
			puts("b");
			print(B, b_m_local, b_n_local);
	}

    pdgemm_( "N", "N", &m, &n, &k, &one, A, &i_one, &i_one, descA, B, &i_one, &i_one, descB,
             &zero, C, &i_one, &i_one, descC );
	printf("Rank %d: finished dgemm\n", iam);
	if (iam == 0)
	{
			puts("c");
			print(C, c_m_local, c_n_local);
	}

	pdgeadd_( &trans, &m, &n, &one, C, &i_one, &i_one, descC, &zero, c, &i_one, &i_one, descC_local);

	if (iam==0)
	{
			puts("global c");
			print(c, m, n);
	}

	free(A);
	free(B);
	free(C);
	if (iam==0)
	{
			free(a);
			free(b);
			free(c);
	}

    blacs_gridexit_( &ictxt );
    blacs_exit_( &i_zero );
}
Example #18
void 
ddistr(int ictxt, int n, int numc, int nb, double *A , double *A_d, int *descAd ){
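  /* The inverse of the dgather() example above: distributes the n x numc matrix A,
     held only on the root process (0,0), into the block-cyclic matrix A_d
     (descriptor descAd, block size nb) using pdgemr2d_. */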

int RootNodeic, ione=1, izero=0, isRootNode=0, nru, info;
int nprow, npcol, myrow, mycol,descA[9], itemp;
int i,k;

/*
#ifdef NOUNDERLAPACK
 sl_init__(&RootNodeic,&ione, &ione);
#else
  sl_init__(&RootNodeic,&ione, &ione);
#endif
*/



   
  sl_init_(&RootNodeic,&ione, &ione);
  
  Cblacs_gridinfo(ictxt, &nprow, &npcol, &myrow, &mycol);  


	//printf("nprow=%d, npcol=%d, myrow=%d, mycol=%d\n",nprow,npcol,myrow,mycol);
	//printf("nb=%d\n",nb);


  if (myrow==0 && mycol==0) { isRootNode=1;}

  if (isRootNode){
     //printf("root entro aca...\n");
	nru = numroc_(&n, &n, &myrow,&izero, &nprow );
     //printf("root paso numroc\n");
	itemp = max(1, nru);
     descinit_(descA, &n, &numc, &n, &n, &izero, &izero, &RootNodeic, &itemp, &info);
  	//printf("root paso descinit\n");
  } 
  else{
     //printf("yo entre aca\n");
     k=0;
     for(i=0;i<9;i++){ 
       descA[k]=0;
       k++;
     }
     descA[1]=-1;
  }

  //printf("inicio de cosas para todos\n");
  nru = numroc_(&n, &nb, &myrow, &izero, &nprow);
  //printf("todos pasan numroc\n");
  itemp = max(1,nru);
  descinit_(descAd, &n, &numc, &nb, &nb, &izero, &izero, &ictxt,&itemp, &info);  
  //printf("todos pasan descinit\n");

  pdgemr2d_( &n, &numc, A, &ione, &ione, descA, A_d, &ione, &ione, descAd, &ictxt);
  //printf("todos pasan pdgemr2d\n");
  
 if (isRootNode){ 
     	//printf("RootNodeic=%d\n",RootNodeic);
	Cblacs_gridexit(RootNodeic);
	//printf("root paso gridexit\n");
  }
}
Example #19
static int
CheckNode(int imrow, int imcol, int nmat, int *mval, int *nval, int nbmat, int *mbval, int *nbval,
          int myrow, int mycol, int nprow, int npcol, long *maxMem) {
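  /* For every combination of matrix size (mval[i], nval[i]) and block size
     (mbval[ii], nbval[ii]), estimates the per-process storage needed for the local
     pieces of A and C plus a PDTRANS-style work buffer, and returns the largest
     such requirement (in elements) in *maxMem. */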
  int i__, ii, m, n, mb, nb, ierr[1];
  int lcm, np0, nq0, mp0, mq0, mg, ng, np, nq, mp, mq;
  long isw, ipw, ipiw, ipa, ipc;

  *maxMem = 0;
  for (i__ = 0; i__ < nmat; ++i__) {
    m = mval[i__];
    n = nval[i__];

/*           Make sure matrix information is correct */

      ierr[0] = 0;
      if (m < 1) {
        ierr[0] = 1;
      } else if (n < 1) {
        ierr[0] = 1;
      }

      if (ierr[0] > 0) {
        continue;
      }

      for (ii = 0; ii < nbmat; ++ii) { /* Loop over different block sizes */

        mb = mbval[ii];
        nb = nbval[ii];

/*              Make sure blocking sizes are legal */
        ierr[0] = 0;
        if (mb < 1) {
          ierr[0] = 1;
        } else if (nb < 1) {
          ierr[0] = 1;
        }

/*              Make sure no one had error */

        if (ierr[0] > 0) {
          continue;
        }

        mp = numroc_(&m, &mb, &myrow, &imrow, &nprow);
        mq = numroc_(&m, &mb, &mycol, &imcol, &npcol);
        np = numroc_(&n, &nb, &myrow, &imrow, &nprow);
        nq = numroc_(&n, &nb, &mycol, &imcol, &npcol);

        mg = iceil_(&m, &mb);
        ng = iceil_(&n, &nb);

        mp0 = iceil_(&mg, &nprow) * mb;
        mq0 = iceil_(&mg, &npcol) * mb;
        np0 = iceil_(&ng, &nprow) * nb;
        nq0 = iceil_(&ng, &npcol) * nb;

        lcm = ilcm_(&nprow, &npcol);
        ipc = 1;
        ipa = ipc + (long)np0 * (long)mq0;
        ipiw = (long)mp0 * (long)nq0 + ipa;
        ipw = ipiw;
        isw = ipw + (long)(iceil_(&mg, &lcm) << 1) * (long)mb * (long)iceil_(&ng, &lcm) * (long)nb;

        if (*maxMem < isw) *maxMem = isw;
      }
  }
  return 0;
}
Example #20
  // torus-wrap distribution used in EigenK and Elemental
  distributed_matrix(int m_global, int n_global, const grid<rokko::eigen_sx>& g_in)
    : m_global(m_global), n_global(n_global), g(g_in), myrank(g_in.myrank), nprocs(g_in.nprocs), myrow(g_in.myrow), mycol(g_in.mycol), nprow(g_in.nprow), npcol(g_in.npcol)
  {
    int n = m_global;
    int nx = (n-1)/nprow+1;
    int i1 = 6, i2 = 16 * 2, i3 = 16 * 4, nm;
    CSTAB_get_optdim(nx, i1, i2, i3, nm);  // returns an optimized (possibly) leading dimension of the local block-cyclic matrix in nm.
    //int para_int = 0;   eigen_free_wrapper(para_int);

    int NB  = 64 + 32;
    int nmz = ((n-1)/nprow+1);
    nmz = ((nmz-1)/NB+1)*NB+1;
    int nmw = ((n-1)/npcol+1);
    nmw = ((nmw-1)/NB+1)*NB+1;
    nme = ((n-1)/2+1) * 2;
    int nh = (n-1)/4+1;
    i1 = 4;
    int nnh;
    CSTAB_get_optdim(nh, i1, i2, i3, nnh);  // returns an optimized (possibly) leading dimension of the local block-cyclic matrix in nnh.
    nnh = 4 * nnh;
    cout << "nnh=" << nnh << endl;

    cout << "nm=" << nm << endl;
    cout << "nmz=" << nmz << endl;
    cout << "nmw=" << nmw << endl;
    cout << "nme=" << nme << endl;
    cout << "nnh=" << nnh << endl;

    int n1x = ((n-1)/nprocs + 1);
    int larray = std::max(std::max(std::max(nmz, nm), nnh) * nmw, n*n1x);

    cout << "n1x=" << n1x << endl;
    cout << "larray=" << larray << endl;

    // calculate sizes of my proc's local part of distributed matrix
    mb = 1;
    nb = 1;
    const int ZERO=0, ONE=1;
    lld = nm;
    m_local = numroc_( m_global, mb, myrow, ZERO, nprow );  //(m_global + nprow - myrow) / nprow;
    n_local = numroc_( n_global, nb, mycol, ZERO, npcol );  //(n_global + npcol - mycol) / npcol;
    //m_local = nm;
    //n_local = (larray + (nm-1)) / nm;

    for (int proc=0; proc<nprocs; ++proc) {
      if (proc == g.myrank) {
	cout << "proc=" << proc << endl;
	//cout << "  mb=" << mb << "  nb=" << nb << endl;
	//cout << "  mA=" << m_local << "  nprow=" << g.nprow << endl;
	//cout << "  nA=" << n_local << "  npcol=" << g.npcol << endl;
	cout << "nm(lld)=" << lld << endl;
	cout << " m_local=" << m_local << " n_local=" << n_local << endl;
     }
      MPI_Barrier(MPI_COMM_WORLD);
    }

    array = new double[larray];  //m_local * n_local];
    if (array == NULL) {
      cerr << "failed to allocate array." << endl;
      MPI_Abort(MPI_COMM_WORLD, 3);
    }
    for (int ii=0; ii<larray; ++ii)
      array[ii] = -3;
  }
Example #21
sl_int_t mpiCopySlave(void* bufs[], size_t sizes[], unsigned count)
{
    enum dummy  {DBG=0};
    enum dummy2 {BUF_ARGS=0, BUF_IN, BUF_OUT, NUM_BUFS };

    if(DBG) {
        std::cerr << "mpiCopySlave(): entered" << std::endl;
        for(size_t i=0; i < count; i++) {
            std::cerr << "mpiCopySlave: buffer at:"<< bufs[i] << std::endl;
            std::cerr << "mpiCopySlave: bufsize =" << sizes[i] << std::endl;
        }
    }

    if(count < NUM_BUFS) {
        std::cerr << "mpiCopySlave: master sent " << count << " buffers, but " << NUM_BUFS << " are required." << std::endl;
        ::exit(99); // something that does not look like a signal
    }

    // take a COPY of args (because we will have to patch DESC.CTXT)
    scidb::MPICopyArgs args = *reinterpret_cast<MPICopyArgs*>(bufs[BUF_ARGS]) ;
    if(DBG) {
        std::cerr << "mpiCopySlave: args --------------------------" << std::endl ;
        std::cerr << args << std::endl;
        std::cerr << "mpiCopySlave: args end ----------------------" << std::endl ;
    }

    // set up the scalapack grid
    if(DBG) std::cerr << "##### sl_init() NPROW:"<<args.NPROW<<" NPCOL:"<<args.NPCOL<<std::endl;
    slpp::int_t ICTXT=-1; // will be overwritten by sl_init

    // call scalapack tools routine to initialize a scalapack grid and give us its
    // context
    sl_init_(ICTXT/*out*/, args.NPROW/*in*/, args.NPCOL/*in*/); 

    sl_int_t NPROW=-1, NPCOL=-1, MYPROW=-1, MYPCOL=-1, MYPNUM=-1; // illegal vals
    getSlaveBLACSInfo(ICTXT/*in*/, NPROW, NPCOL, MYPROW, MYPCOL, MYPNUM);

    if(NPROW  != args.NPROW  || NPCOL  != args.NPCOL ||
       MYPROW != args.MYPROW || MYPCOL != args.MYPCOL ||
       MYPNUM != args.MYPNUM){
        std::cerr << "scalapack general parameter mismatch:" << std::endl;
        std::cerr << "args:" << std::endl;
        std::cerr << "NP=("<<args.NPROW<<", "<<args.NPCOL <<")"<< std::endl;
        std::cerr << "MYP("<<args.MYPROW<<", "<<args.MYPCOL<<")"<< std::endl;
        std::cerr << "MYPNUM" <<args.MYPNUM << std::endl;
        std::cerr << "ScaLAPACK:" << std::endl;
        std::cerr << "NP=("<<NPROW<<", "<<NPCOL <<")"<< std::endl;
        std::cerr << "MYP("<<MYPROW<<", "<<MYPCOL<<")"<< std::endl;
        std::cerr << "MYPNUM" <<MYPNUM << std::endl;
        ::exit(99); // something that does not look like a signal
    }

    const sl_int_t& LLD_IN = args.IN.DESC.LLD ;
    const sl_int_t one = 1 ;
    const sl_int_t  LTD_IN = std::max(one, numroc_( args.IN.DESC.N, args.IN.DESC.NB, MYPCOL, /*CSRC_IN*/0, NPCOL )); 
    const sl_int_t& MP = LLD_IN ;
    const sl_int_t& NQ = LTD_IN ;

    // size check args
    if( sizes[BUF_ARGS] != sizeof(MPICopyArgs)) {
        assert(false); // TODO: correct way to fail
        ::exit(99); // something that does not look like a signal
    }

    // size check IN
    sl_int_t SIZE_IN = MP*NQ ;
    if( sizes[BUF_IN] != SIZE_IN * sizeof(double)) {
        std::cerr << "slave: error size mismatch:" << std::endl;
        std::cerr << "sizes[BUF_IN]" << sizes[BUF_IN] << std::endl;
        std::cerr << "MP * NQ = " << MP <<"*"<<NQ<<"="<< MP*NQ << std::endl;
        assert(false); // TODO: correct way to fail
        ::exit(99); // something that does not look like a signal
    }

    // size check OUT
    sl_int_t SIZE_OUT = SIZE_IN;
    if( sizes[BUF_OUT] != SIZE_OUT *sizeof(double)) {
        std::cerr << "sizes[BUF_OUT]:"<<sizes[BUF_OUT];
        std::cerr << "MP * NQ = " << MP <<"*"<<NQ<<"="<< MP*NQ << std::endl;
        assert(false); // TODO: correct way to fail
        ::exit(99); // something that does not look like a signal
    }

    // sizes are correct, give the pointers their names
    double* IN = reinterpret_cast<double*>(bufs[BUF_IN]) ;
    double* OUT = reinterpret_cast<double*>(bufs[BUF_OUT]) ;

    // here's the whole thing: copy IN to OUT
    // TODO: use memcpy instead
    for(int ii=0; ii < SIZE_OUT; ii++) {
        OUT[ii] = IN[ii] ;
    }

    return 0;
}
Example #22
int BlacsSystem::NSize(int n)
{
        int icsrc=0;
        return numroc_(&n,&nb_,&mycol_,&icsrc,&npcol_);
}
Example #23
void pzgecopy_hd( F_CHAR_T TRANS, int *m_in, int *n_in,
    double *A, int *ia_in, int *ja_in, int *descA,
    double *dB, int *ib_in, int *jb_in, int *descB )
{
/*
 Copy m by n distributed submatrix  from host to GPU
 */

int m  = *m_in;
int n  = *n_in;

int ia = *ia_in;
int ja = *ja_in;
int ib = *ib_in;
int jb = *jb_in;

int nprow = 0;
int npcol = 0;
int myprow = 0;
int mypcol = 0;
int mmb = 0;
int nnb = 0;

int istart = 0;
int iend = 0;
int isize = 0;

int Locp = 0;
int Locq = 0;
int lld = 0;

int jstart = 0;
int jend = 0;
int jsize = 0;

int iib = 0;
int jjb = 0;

cuDoubleComplex *dBptr = 0;
double *Btmp = 0;
F_CHAR_T NoTrans = "N";

int descBtmp[DLEN_];
int elmSize = sizeof(cuDoubleComplex);

double z_one[2];
double z_zero[2];

/*
 Tuneable parameters
 */
int mfactor = 4;
int nfactor = 4;
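/*
 mmb/nnb below are derived from these factors: each pass of the copy loop stages
 at most (descB[MB_]*nprow*mfactor) x (descB[NB_]*npcol*nfactor) entries of A
 through the host buffer Btmp, so larger factors mean a bigger staging buffer
 but fewer cublasSetMatrix transfers.
 */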


int rsrc = 0;
int csrc = 0;
int irsrc = 0;
int jcsrc = 0;
int info = 0;

int notran = 0;
int TrA = 0;

int lrindx = 0;
int lcindx = 0;
int nrow = 0;
int ncol = 0;
int mm = 0;
int nn = 0;
int iia = 0;
int jja = 0;

cublasStatus cu_status;

z_one[REAL_PART] = 1;
z_one[IMAG_PART] = 0;
z_zero[REAL_PART] = 0;
z_zero[IMAG_PART] = 0;

Cblacs_gridinfo( descA[CTXT_], &nprow, &npcol, &myprow, &mypcol );

notran = ( ( TrA = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN );


/*
 * check arguments
 */
if ((m <= 0) || (n <= 0)) {
   return;
   };

assert( (1 <= ia) && ((ia + m-1) <= descA[M_] ) );
assert( (1 <= ja) && ((ja + n-1) <= descA[N_] ) );
assert( (1 <= ib) && ((ib + m-1) <= descB[M_] ) );
assert( (1 <= jb) && ((jb + n-1) <= descB[N_] ) );

/*
 * Create a temp matrix that is aligned to descB.
 * Assume size is   mmb by nnb
 */

if (notran) {
  mmb = MIN( m, descB[MB_] * nprow * mfactor);
  nnb = MIN( n, descB[NB_] * npcol * nfactor);
  }
else {
  nnb = MIN( m, descB[MB_] * nprow * mfactor);
  mmb = MIN( n, descB[NB_] * npcol * nfactor);
  };
  

mmb = MAX( mmb,1);
nnb = MAX( nnb,1);

rsrc = indxg2p_( &ib, &descB[MB_], &myprow, &descB[RSRC_], &nprow);
csrc = indxg2p_( &jb, &descB[NB_], &mypcol, &descB[CSRC_], &npcol);


Locp = numroc_( &mmb, &descB[MB_], &myprow, &rsrc, &nprow );
Locq = numroc_( &nnb, &descB[NB_], &mypcol, &csrc, &npcol );

Btmp = (double*) malloc( MAX(1,(Locp * Locq))*elmSize );
assert( Btmp != 0 );


lld = MAX(1, Locp);
descinit_( descBtmp, &mmb, &nnb, &descB[MB_], &descB[NB_],
            &rsrc, &csrc, &descB[CTXT_], &lld, &info );

assert( info == 0);



for( jstart=ja; jstart <= ja + n-1; jstart = jend + 1) {
  jend = MIN( ja + n -1, jstart + nnb - 1);
  jsize = jend - jstart + 1;

  for( istart=ia; istart <= ia + m-1; istart = iend + 1) {
     iend = MIN( ia + m-1, istart + mmb -1);
     isize = iend - istart + 1;

     iia = ia + (istart-1);
     jja = ja + (jstart-1);


     iib = 1;
     jjb = 1;
     

     if (notran) {
        mm = isize;
        nn = jsize;
        }
    else {
        mm = jsize;
        nn = isize;
        };


     pzgeadd_( TRANS,  &mm, &nn,   z_one, A, &iia, &jja, descA,  
                         z_zero, Btmp, &iib, &jjb, descBtmp );

     /* 
      * find local extent
      */

     
     if (notran) {
       iib = ib + (istart-1);
       jjb = jb + (jstart-1);
       }
     else {
       iib = ib + (jstart-1);
       jjb = jb + (istart-1);
       };

     if (notran) {
       nrow = numroc_( &isize, &descB[MB_], &myprow, &rsrc, &nprow);
       ncol = numroc_( &jsize, &descB[NB_], &mypcol, &csrc, &npcol);
       }
     else {
       nrow = numroc_( &jsize, &descB[MB_], &myprow, &rsrc, &nprow);
       ncol = numroc_( &isize, &descB[NB_], &mypcol, &csrc, &npcol);

     };

     /*
      Perform global
      dB( iib:(iib+isize-1), jjb:(jjb+jsize-1)) <- B(1:isize,1:jsize)

      Perform local
      dB( lrindx:(lrindx+nrow-1), lcindx:(lcindx+ncol-1)) <-
              B(1:nrow, 1:ncol)
      */


     infog2l_( &iib, &jjb, descB, &nprow, &npcol, &myprow, &mypcol,
               &lrindx, &lcindx,  &irsrc, &jcsrc );


     dBptr = (cuDoubleComplex *) dB;
     dBptr = dBptr + INDX2F( lrindx,lcindx, descB[LLD_]);


     cu_status = cublasSetMatrix( nrow,ncol,elmSize,
               (cuDoubleComplex *) Btmp,  descBtmp[LLD_],
                                   dBptr, descB[LLD_] );
     assert( cu_status == CUBLAS_STATUS_SUCCESS );

     };
  };

  free( Btmp );

  return;
}
Example #24
void CG(StrumpackDensePackage<myscalar,myreal> *sdp, myscalar *X, myscalar *B, int *descVec, int n, int nrhs, int niter, myreal threshold) {
  /* Conjugate Gradients. Calls sdp.schur_product for matvecs. Fills only the bottom part of X. */
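  /* Reference recurrence implemented below (textbook CG for an SPD operator,
   * here the Schur complement applied via sdp->schur_product):
   *   alpha_k  = (r_k' r_k) / (p_k' A p_k)
   *   x_{k+1}  = x_k + alpha_k p_k
   *   r_{k+1}  = r_k - alpha_k A p_k
   *   beta_k   = (r_{k+1}' r_{k+1}) / (r_k' r_k)
   *   p_{k+1}  = r_{k+1} + beta_k p_k
   */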
  int printit=250;
  int IA;
  int ctxt;
  int nprow, npcol;
  int myrow, mycol;
  int rsrc, csrc;
  int nb;
  int locr, locc;
  int i, j;
  int idummy, ierr;
  int neff;
  int it;
  int desc[BLACSCTXTSIZE], descScal[BLACSCTXTSIZE];
  bool ingrid;
  myreal res;
  myscalar *x=NULL, *b=NULL;
  myscalar *r=NULL, *p=NULL, *Ap=NULL;
  myscalar alpha, rrprev, rrnext;

  IA=sdp->split_HSS+1;
  neff=n-IA+1;

  ctxt=descVec[BLACSctxt];
  rsrc=descVec[BLACSrsrc];
  csrc=descVec[BLACScsrc];
  nb=descVec[BLACSmb];
  blacs_gridinfo_(&ctxt,&nprow,&npcol,&myrow,&mycol);
  ingrid=myrow>=0 && mycol>=0;

  /* X and B have n rows. We create versions with
   * the last neff rows only that we pass to SDP.
   */
  if(ingrid) {
    locr=numroc_(&neff,&nb,&myrow,&rsrc,&nprow);
    locc=numroc_(&IONE,&nb,&mycol,&csrc,&npcol);
    idummy=locr>1?locr:1;
    descinit_(desc,&neff,&IONE,&nb,&nb,&rsrc,&csrc,&ctxt,&idummy,&ierr);
    x=new myscalar[locr*locc]();
    b=new myscalar[locr*locc]();
    r=new myscalar[locr*locc]();
    p=new myscalar[locr*locc]();
    Ap=new myscalar[locr*locc]();
  } else {
    locr=0;
    locc=0;
    descset_(desc,&neff,&IONE,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE);
  }

  /* Descriptor for the scalar containing the result of the dot product */
  if(ingrid)
    descinit_(descScal,&IONE,&IONE,&nb,&nb,&IZERO,&IZERO,&ctxt,&IONE,&ierr);
  else
    descset_(descScal,&IONE,&IONE,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE);

  for(j=1;j<=nrhs;j++) {
    /* One RHS at a time */
    if(ingrid) {
      pgeadd('N',neff,IONE,ONE,B,IA,j,descVec,ZERO,b,IONE,IONE,desc);
      pgeadd('N',neff,IONE,ONE,X,IA,j,descVec,ZERO,x,IONE,IONE,desc);
    }

    /* r = b - A x
     * If the Schur complement were stored explicitly in matrix A, the BLAS call would be
     * pgemm('N','N',neff,IONE,neff,NONE,A,IA,IA,descA,x,IONE,IONE,desc,ONE,r,IONE,IONE,desc);
     *
     */
    if(ingrid)
      placpy('N',neff,IONE,b,IONE,IONE,desc,r,IONE,IONE,desc);
    sdp->schur_product('N',NONE,x,desc,ONE,r,desc);

    /* p = r */
    if(ingrid)
      placpy('N',neff,IONE,r,IONE,IONE,desc,p,IONE,IONE,desc);

    /* "Previous" r' * r */
    rrprev=ZERO;
    if(ingrid)
      pgemm('C','N',IONE,IONE,neff,ONE,r,IONE,IONE,desc,r,IONE,IONE,desc,ZERO,&rrprev,IONE,IONE,descScal);
    MPI_Bcast((void *)&rrprev,IONE,MY_MPI_REAL,IZERO,MPI_COMM_WORLD);

    it=1;
    while(it<=niter) {
      /* Ap = A * p
       * If the Schur complement were stored explicitly in matrix A, the BLAS call would be
       * pgemm('N','N',neff,IONE,neff,ONE,A,IA,IA,descA,p,IONE,IONE,desc,ZERO,Ap,IONE,IONE,desc);
       *
       */
      sdp->schur_product('N',ONE,p,desc,ZERO,Ap,desc);

      /* alpha = r'*r / (p' * A * p) = rrprev/(p' * Ap) */
      alpha=ZERO;
      if(ingrid)
        pgemm('C','N',IONE,IONE,neff,ONE,p,IONE,IONE,desc,Ap,IONE,IONE,desc,ZERO,&alpha,IONE,IONE,descScal);
      MPI_Bcast((void *)&alpha,IONE,MY_MPI_REAL,IZERO,MPI_COMM_WORLD);
      alpha=rrprev/alpha;

      /* x = x + alpha * p */
      for(i=0;i<locr*locc;i++)
        x[i]+=alpha*p[i];

      /* r = r - alpha * Ap */
      for(i=0;i<locr*locc;i++)
        r[i]-=alpha*Ap[i];

      /* "Next" r' * r */
      rrnext=ZERO;
      if(ingrid)
        pgemm('C','N',IONE,IONE,neff,ONE,r,IONE,IONE,desc,r,IONE,IONE,desc,ZERO,&rrnext,IONE,IONE,descScal);
      MPI_Bcast((void *)&rrnext,IONE,MY_MPI_REAL,IZERO,MPI_COMM_WORLD);

      /* Residual */
      res=sqrt(rrnext.real());
      if(it%printit==0)
        if(!myrow && !mycol)
          std::cout << "RHS " << j << ": iteration " << it << ", ||Ax-b||/||b||=" << res << std::endl;
      if(res<threshold)
        break;

      /* p = r + rrnext/rrprev * p */
      for(i=0;i<locr*locc;i++)
        p[i]=r[i]+rrnext/rrprev*p[i];

      /* "Previous" r' * r */
      rrprev=rrnext;

      it++;
    }

    if(it>niter)
      it=niter;

    if(it%printit)
      if(!myrow && !mycol)
        std::cout << "RHS " << j << ": iteration " << it << ", ||Ax-b||/||b||=" << res << std::endl << std::endl;

    /* Back to n-sized vector */
    if(ingrid)
      pgeadd('N',neff,IONE,ONE,x,IONE,IONE,desc,ZERO,X,IA,j,descVec);

  }

  delete[] b;
  delete[] x;
  delete[] r;
  delete[] p;
  delete[] Ap;
}
Example #25
int Amesos_Scalapack::RedistributeA( ) {

  if( debug_ == 1 ) std::cout << "Entering `RedistributeA()'" << std::endl;

  Time_->ResetStartTime();
  
  Epetra_RowMatrix *RowMatrixA = dynamic_cast<Epetra_RowMatrix *>(Problem_->GetOperator());
  EPETRA_CHK_ERR( RowMatrixA == 0 ) ; 

  const Epetra_Map &OriginalMap = RowMatrixA->RowMatrixRowMap() ; 
  int NumberOfProcesses = Comm().NumProc() ; 

  //
  //  Compute a uniform distribution as ScaLAPACK would want it
  //    MyFirstElement - The first element which this processor would have
  //    NumExpectedElements - The number of elements which this processor would have
  //

  int NumRows_ = RowMatrixA->NumGlobalRows() ; 
  int NumColumns_  = RowMatrixA->NumGlobalCols() ; 
  if ( MaxProcesses_ > 0 ) {
    NumberOfProcesses = EPETRA_MIN( NumberOfProcesses, MaxProcesses_ ) ; 
  }
  else {
    int ProcessNumHeuristic = (1+NumRows_/200)*(1+NumRows_/200);
    NumberOfProcesses = EPETRA_MIN( NumberOfProcesses,  ProcessNumHeuristic );
  }
  
  if ( debug_ == 1) std::cout  << "iam_ = " << iam_  << "Amesos_Scalapack.cpp:171" << std::endl;
  //
  // Create the ScaLAPACK data distribution.
  // The TwoD data distribution is created in a completely different
  // manner and is not transposed (whereas the ScaLAPACK 1D data
  // distribution was transposed) 
  //
  if ( debug_ == 1) std::cout  << "iam_ = " << iam_  << "Amesos_Scalapack.cpp:163" << std::endl;
  Comm().Barrier(); 
  if ( TwoD_distribution_ ) { 
    if ( debug_ == 1) std::cout  << "iam_ = " << iam_  << "Amesos_Scalapack.cpp:166" << std::endl;
    Comm().Barrier(); 
    npcol_ = EPETRA_MIN( NumberOfProcesses, 
			 EPETRA_MAX ( 2, (int) sqrt( NumberOfProcesses * 0.5 ) ) ) ; 
    nprow_ = NumberOfProcesses / npcol_ ;

    //
    //  Create the map for FatA - our first intermediate matrix
    //
    int NumMyElements = RowMatrixA->RowMatrixRowMap().NumMyElements() ;
    std::vector<int> MyGlobalElements( NumMyElements );
    RowMatrixA->RowMatrixRowMap().MyGlobalElements( &MyGlobalElements[0] ) ;

    int NumMyColumns = RowMatrixA->RowMatrixColMap().NumMyElements() ;
    std::vector<int> MyGlobalColumns( NumMyColumns );
    RowMatrixA->RowMatrixColMap().MyGlobalElements( &MyGlobalColumns[0] ) ;

    if ( debug_ == 1) std::cout  << "iam_ = " << iam_  << "Amesos_Scalapack.cpp:194" << std::endl;

    std::vector<int> MyFatElements( NumMyElements * npcol_ ); 
    
    for( int LocalRow=0; LocalRow<NumMyElements; LocalRow++ ) {
      for (int i = 0 ; i < npcol_; i++ ){
	MyFatElements[LocalRow*npcol_+i] = MyGlobalElements[LocalRow]*npcol_+i;
      } 
    }
    
    Epetra_Map FatInMap( npcol_*NumRows_, NumMyElements*npcol_, 
			 &MyFatElements[0], 0, Comm() ); 
    
    //
    //  Create FatIn, our first intermediate matrix
    //
    Epetra_CrsMatrix FatIn( Copy, FatInMap, 0 );
    
    
    std::vector<std::vector<int> > FatColumnIndices(npcol_,std::vector<int>(1));
    std::vector<std::vector<double> > FatMatrixValues(npcol_,std::vector<double>(1));
    std::vector<int> FatRowPtrs(npcol_);  // FatRowPtrs[i] = the number 
    // of entries in local row LocalRow*npcol_ + i 
    
    if ( debug_ == 1) std::cout  << "iam_ = " << iam_  << "Amesos_Scalapack.cpp:219" << std::endl;
    //
    mypcol_ = iam_%npcol_;
    myprow_ = (iam_/npcol_)%nprow_;
    if ( iam_ >= nprow_ * npcol_ ) {
      myprow_ = nprow_; 
      mypcol_ = npcol_; 
    }
    //  Each row is split into npcol_ rows, with each of the 
    //  new rows containing only those elements belonging to 
    //  its process column (in the ScaLAPACK 2D process grid)
    //
    int MaxNumIndices = RowMatrixA->MaxNumEntries();
    int NumIndices;
    std::vector<int> ColumnIndices(MaxNumIndices);
    std::vector<double> MatrixValues(MaxNumIndices); 
    
    if ( debug_ == 1) std::cout  << "iam_ = " << iam_  << "Amesos_Scalapack.cpp:232 NumMyElements = " 
			    << NumMyElements 
			    << std::endl;
    
    nb_ = grid_nb_;
    
    for( int LocalRow=0; LocalRow<NumMyElements; ++LocalRow ) {
      
      RowMatrixA->ExtractMyRowCopy( LocalRow, 
				    MaxNumIndices,
				    NumIndices, 
				    &MatrixValues[0],
				    &ColumnIndices[0] );
      
      for (int i=0; i<npcol_; i++ )  FatRowPtrs[i] = 0 ; 

      //
      //  Deal the individual matrix entries out to the row owned by
      //  the process to which this matrix entry will belong.
      //
      for( int i=0 ; i<NumIndices ; ++i ) {
	int GlobalCol = MyGlobalColumns[ ColumnIndices[i] ];
	int pcol_i = pcolnum( GlobalCol, nb_, npcol_ ) ;
	if ( FatRowPtrs[ pcol_i ]+1 >= FatColumnIndices[ pcol_i ].size() ) {
	  FatColumnIndices[ pcol_i ]. resize( 2 * FatRowPtrs[ pcol_i ]+1 );
	  FatMatrixValues[ pcol_i ]. resize( 2 * FatRowPtrs[ pcol_i ]+1 );
	}
	FatColumnIndices[pcol_i][FatRowPtrs[pcol_i]] = GlobalCol ;
	FatMatrixValues[pcol_i][FatRowPtrs[pcol_i]] = MatrixValues[i];
	
	FatRowPtrs[ pcol_i ]++;
      }
      
      //
      //  Insert each of the npcol_ rows individually
      //
      for ( int pcol_i = 0 ; pcol_i < npcol_ ; pcol_i++ ) { 
	FatIn.InsertGlobalValues( MyGlobalElements[LocalRow]*npcol_ + pcol_i, 
				  FatRowPtrs[ pcol_i ],
				  &FatMatrixValues[ pcol_i ][0], 
				  &FatColumnIndices[ pcol_i ][0] );
      }
    }
    FatIn.FillComplete( false );
    
    if (  debug_ == 1) std::cout  << "iam_ = " << iam_  << "Amesos_Scalapack.cpp:260" << std::endl;
    if (  debug_ == 1) std::cout  << "Amesos_Scalapack.cpp:265B" 
			     << " iam_ = " << iam_ 
			     << " nb_ = " << nb_ 
			     << " nprow_ = " << nprow_ 
			     << " npcol_ = " << npcol_ 
			     << std::endl;
    
    //
    //  Compute the map for our second intermediate matrix, FatOut
    //
    //  Compute directly
    int UniformRows =  ( NumRows_ / ( nprow_ * nb_ ) ) * nb_ ; 
    int AllExcessRows = NumRows_ - UniformRows * nprow_ ; 
    int OurExcessRows = EPETRA_MIN( nb_, AllExcessRows - ( myprow_ * nb_ ) ) ; 
    OurExcessRows = EPETRA_MAX( 0, OurExcessRows );
    NumOurRows_ = UniformRows + OurExcessRows ; 
    
    if (  debug_ == 1) std::cout  << "iam_ = " << iam_  << "Amesos_Scalapack.cpp:277" << std::endl;
    int UniformColumns =  ( NumColumns_ / ( npcol_ * nb_ ) ) * nb_ ; 
    int AllExcessColumns = NumColumns_ - UniformColumns * npcol_ ; 
    int OurExcessColumns = EPETRA_MIN( nb_, AllExcessColumns - ( mypcol_ * nb_ ) ) ; 
    if (  debug_ == 1) std::cout  << "iam_ = " << iam_  << "Amesos_Scalapack.cpp:281" << std::endl;
    OurExcessColumns = EPETRA_MAX( 0, OurExcessColumns );
    NumOurColumns_ = UniformColumns + OurExcessColumns ; 
    
    if ( iam_ >= nprow_ * npcol_ ) {
      UniformRows = 0;
      NumOurRows_ = 0;
      NumOurColumns_ = 0;
    }
    
    Comm().Barrier(); 
    
    if (  debug_ == 1) std::cout  << "iam_ = " << iam_  << "Amesos_Scalapack.cpp:295" << std::endl;
#if 0
    //  Compute using ScaLAPACK's numroc routine, assert agreement  
    int izero = 0; // All matrices start at process 0
    int NumRocSays = numroc_( &NumRows_, &nb_, &myprow_, &izero, &nprow_ );
    assert( NumOurRows_ == NumRocSays );
#endif
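    //  For reference, numroc_'s block-cyclic count reduces to (a sketch, with the
    //  distribution starting on process row 0 as it does here):
    //    WholeBlocks = NumRows_ / nb_ ;
    //    RowsFromNumroc = ( WholeBlocks / nprow_ ) * nb_
    //                   + ( myprow_ <  WholeBlocks % nprow_ ? nb_ : 0 )
    //                   + ( myprow_ == WholeBlocks % nprow_ ? NumRows_ % nb_ : 0 ) ;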
    //
    //  Compute the rows which this process row owns in the ScaLAPACK 2D
    //  process grid.
    //
    std::vector<int> AllOurRows(NumOurRows_);
    
    int RowIndex = 0 ; 
    int BlockRow = 0 ;
    for ( ; BlockRow < UniformRows / nb_ ; BlockRow++ ) {
      for ( int RowOffset = 0; RowOffset < nb_ ; RowOffset++ ) {
	AllOurRows[RowIndex++] = BlockRow*nb_*nprow_  + myprow_*nb_ + RowOffset ;
      } 
    }
    Comm().Barrier(); 
    if (  debug_ == 1) std::cout  << "iam_ = " << iam_  << "Amesos_Scalapack.cpp:315" << std::endl;
    assert ( BlockRow == UniformRows / nb_ ) ; 
    for ( int RowOffset = 0; RowOffset < OurExcessRows ; RowOffset++ ) {
      AllOurRows[RowIndex++] = BlockRow*nb_*nprow_ + myprow_*nb_ + RowOffset ;
    } 
    assert( RowIndex == NumOurRows_ );
    //
    //  Distribute those rows amongst all the processes in that process row
    //  This is an artificial distribution with the following properties:
    //  1)  It is a 1D data distribution (each row belongs entirely to
    //      a single process)
    //  2)  All data which will eventually belong to a given process row, 
    //      is entirely contained within the processes in that row.
    //
    
    Comm().Barrier(); 
    if (  debug_ == 1) std::cout  << "iam_ = " << iam_  << "Amesos_Scalapack.cpp:312" << std::endl;
    //
    //  Compute MyRows directly
    //
    std::vector<int>MyRows(NumOurRows_);
    RowIndex = 0 ; 
    BlockRow = 0 ;
    for ( ; BlockRow < UniformRows / nb_ ; BlockRow++ ) {
      for ( int RowOffset = 0; RowOffset < nb_ ; RowOffset++ ) {
	MyRows[RowIndex++] = BlockRow*nb_*nprow_*npcol_  + 
	  myprow_*nb_*npcol_ + RowOffset*npcol_  + mypcol_ ;
      } 
    }
    
    Comm().Barrier(); 
    if (  debug_ == 1) std::cout  << "iam_ = " << iam_  << "Amesos_Scalapack.cpp:326" << std::endl;
    
    assert ( BlockRow == UniformRows / nb_ ) ; 
    for ( int RowOffset = 0; RowOffset < OurExcessRows ; RowOffset++ ) {
      MyRows[RowIndex++] = BlockRow*nb_*nprow_*npcol_  + 
	myprow_*nb_*npcol_ + RowOffset*npcol_  + mypcol_ ;
    } 
    
    Comm().Barrier(); 
    if (  debug_ == 1) std::cout  << "iam_ = " << iam_  << "Amesos_Scalapack.cpp:334" << std::endl;
    Comm().Barrier(); 
    
    for (int i=0; i < NumOurRows_; i++ ) { 
      assert( MyRows[i] == AllOurRows[i]*npcol_+mypcol_ );
    } 
    
    Comm().Barrier(); 
    if (  debug_ == 1) std::cout  << "Amesos_Scalapack.cpp:340" 
			     << " iam_ = " << iam_ 
			     << " myprow_ = " << myprow_ 
			     << " mypcol_ = " << mypcol_ 
			     << " NumRows_ = " << NumRows_ 
			     << " NumOurRows_ = " << NumOurRows_ 
			     << std::endl;
    
    Comm().Barrier(); 
    Epetra_Map FatOutMap( npcol_*NumRows_, NumOurRows_, &MyRows[0], 0, Comm() ); 
    
    if (  debug_ == 1) std::cout  << "iam_ = " << iam_  << "Amesos_Scalapack.cpp:344" << std::endl;
    Comm().Barrier(); 
    
    if ( FatOut_ ) delete FatOut_ ; 
    FatOut_ = new Epetra_CrsMatrix( Copy, FatOutMap, 0 ) ;
    
    if (  debug_ == 1) std::cout  << "iam_ = " << iam_  << "Amesos_Scalapack.cpp:348" << std::endl;
    
    Epetra_Export ExportToFatOut( FatInMap, FatOutMap ) ;
    
    if (  debug_ == 1) std::cout  << "iam_ = " << iam_  << "Amesos_Scalapack.cpp:360" << std::endl;
    
    FatOut_->Export( FatIn, ExportToFatOut, Add );
    FatOut_->FillComplete( false );
    
    //
    //  Create a map to allow us to redistribute the vectors X and B 
    //
    Epetra_RowMatrix *RowMatrixA = dynamic_cast<Epetra_RowMatrix *>(Problem_->GetOperator());
    const Epetra_Map &OriginalMap = RowMatrixA->RowMatrixRowMap() ; 
    assert( NumGlobalElements_ == OriginalMap.NumGlobalElements() ) ;
    int NumMyVecElements = 0 ;
    if ( mypcol_ == 0 ) { 
      NumMyVecElements = NumOurRows_;
    }
    
    if (  debug_ == 1) std::cout  << "iam_ = " << iam_  << "Amesos_Scalapack.cpp:385" << std::endl;
    
    if (VectorMap_) { delete VectorMap_ ; VectorMap_ = 0 ; } 
    VectorMap_ = new Epetra_Map( NumGlobalElements_, 
				 NumMyVecElements, 
				 &AllOurRows[0], 
				 0, 
				 Comm() );
    if (  debug_ == 1) std::cout  << "iam_ = " << iam_  << " Amesos_Scalapack.cpp:393 debug_ = "
			     << debug_ << std::endl;
    
  } else {
    nprow_ = 1 ;
    npcol_ = NumberOfProcesses / nprow_ ;
    assert ( nprow_ * npcol_ == NumberOfProcesses ) ; 
    
    m_per_p_ = ( NumRows_ + NumberOfProcesses - 1 ) / NumberOfProcesses ;
    int MyFirstElement = EPETRA_MIN( iam_ * m_per_p_, NumRows_ ) ;
    int MyFirstNonElement = EPETRA_MIN( (iam_+1) * m_per_p_, NumRows_ ) ;
    int NumExpectedElements = MyFirstNonElement - MyFirstElement ; 
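    //  (Illustration only, not from the code: with NumRows_ = 10 on 4 processes,
    //   m_per_p_ = 3 and the processes receive 3, 3, 3 and 1 rows respectively.)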
    
    assert( NumRows_ ==  RowMatrixA->NumGlobalRows() ) ; 
    if ( ScaLAPACK1DMap_ ) delete( ScaLAPACK1DMap_ ) ; 
    ScaLAPACK1DMap_ = new Epetra_Map( NumRows_, NumExpectedElements, 0, Comm() );
    if ( ScaLAPACK1DMatrix_ ) delete( ScaLAPACK1DMatrix_ ) ; 
    ScaLAPACK1DMatrix_ = new Epetra_CrsMatrix(Copy, *ScaLAPACK1DMap_, 0);
    Epetra_Export ExportToScaLAPACK1D_( OriginalMap, *ScaLAPACK1DMap_);
    
    ScaLAPACK1DMatrix_->Export( *RowMatrixA, ExportToScaLAPACK1D_, Add ); 
    
    ScaLAPACK1DMatrix_->FillComplete( false ) ; 
  }
  if (  debug_ == 1) std::cout  << "iam_ = " << iam_  << " Amesos_Scalapack.cpp:417 debug_ = "
			   << debug_ << std::endl;
  if (  debug_ == 1) std::cout  << "iam_ = " << iam_  << "Amesos_Scalapack.cpp:402"
			   << " nprow_ = " << nprow_
			   << " npcol_ = " << npcol_ << std::endl ; 
  int info; 
  const int zero = 0 ; 
  if ( ictxt_ == -1313 ) {
    ictxt_ = 0 ; 
    if (  debug_ == 1) std::cout  << "iam_ = " << iam_  << "Amesos_Scalapack.cpp:408" << std::endl;
    SL_INIT_F77(&ictxt_, &nprow_, &npcol_) ; 
  }
  if (  debug_ == 1) std::cout  << "iam_ = " << iam_  << "Amesos_Scalapack.cpp:410A" << std::endl;
  
  int nprow;
  int npcol;
  int myrow;
  int mycol;
  BLACS_GRIDINFO_F77(&ictxt_, &nprow, &npcol, &myrow, &mycol) ; 
  if (  debug_ == 1) std::cout  << "iam_ = " << iam_  << "iam_ = " << iam_ << " Amesos_Scalapack.cpp:410" << std::endl;
  if ( iam_ < nprow_ * npcol_ ) { 
    assert( nprow == nprow_ ) ; 
    if ( npcol != npcol_ ) std::cout << "Amesos_Scalapack.cpp:430 npcol = " << 
      npcol << " npcol_ = " << npcol_ << std::endl ; 
    assert( npcol == npcol_ ) ; 
    if ( TwoD_distribution_ ) {
      assert( myrow == myprow_ ) ; 
      assert( mycol == mypcol_ ) ; 
      lda_ = EPETRA_MAX(1,NumOurRows_) ;
    } else { 
      assert( myrow == 0 ) ; 
      assert( mycol == iam_ ) ; 
      nb_ = m_per_p_;
      lda_ = EPETRA_MAX(1,NumGlobalElements_);
    }
    if (  debug_ == 1) std::cout  << "iam_ = " << iam_  
			     << "Amesos_Scalapack.cpp: " << __LINE__ 
			     << " TwoD_distribution_ = "  << TwoD_distribution_ 
			     << " NumGlobalElements_ = "  << NumGlobalElements_ 
			     << " debug_ = "  << debug_ 
			     << " nb_ = "  << nb_ 
			     << " lda_ = "  << lda_ 
			     << " nprow_ = "  << nprow_ 
			     << " npcol_ = "  << npcol_ 
			     << " myprow_ = "  << myprow_ 
			     << " mypcol_ = "  << mypcol_ 
			     << " iam_ = "  << iam_ << std::endl ;
    AMESOS_PRINT( myprow_ );
    DESCINIT_F77(DescA_, 
		 &NumGlobalElements_, 
		 &NumGlobalElements_, 
		 &nb_,
		 &nb_,
		 &zero,
		 &zero,
		 &ictxt_,
		 &lda_,
		 &info) ;
    if (  debug_ == 1) std::cout  << "iam_ = " << iam_  << "Amesos_Scalapack.cpp:441" << std::endl;
    assert( info == 0 ) ; 
  } else {
    DescA_[0] = -13;
    if (  debug_ == 1) std::cout  << "iam_ = " << iam_  << "Amesos_Scalapack.cpp:458 nprow = " << nprow << std::endl;
    assert( nprow == -1 ) ; 
  }
  
  if (  debug_ == 1) std::cout  << "Amesos_Scalapack.cpp:446" << std::endl;
  MatTime_ += Time_->ElapsedTime();
  
  return 0;
}
Example #26
int main (int argc, char *argv[]) {
  myscalar *A=NULL, *X=NULL, *B=NULL, *Y=NULL, *Xtrue=NULL;
  myscalar elem;
  int descA[BLACSCTXTSIZE], descVec[BLACSCTXTSIZE], descelem[BLACSCTXTSIZE];
  int n;
  int nrhs;
  int nb;
  int locr, locc;
  int i, j, ii, jj;
  int ierr;
  int dummy;
  int myid, np;
  int myrow, mycol, nprow, npcol;
  int ctxt;
  myreal rdummy, res;

  n=1024; /* Size of the problem */
  nrhs=3; /* Number of RHS */
  nb=16;  /* Blocksize for the 2D block-cyclic distribution */

  /* Initialize MPI */
  if((ierr=MPI_Init(&argc,&argv)))
    return 1;
  myid=-1;
  if((ierr=MPI_Comm_rank(MPI_COMM_WORLD,&myid)))
    return 1;
  np=-1;
  if((ierr=MPI_Comm_size(MPI_COMM_WORLD,&np)))
    return 1;

  /* Initialize the BLACS grid */
  nprow=floor(sqrt((float)np));
  npcol=np/nprow;
  blacs_get_(&IZERO,&IZERO,&ctxt);
  blacs_gridinit_(&ctxt,"R",&nprow,&npcol);
  blacs_gridinfo_(&ctxt,&nprow,&npcol,&myrow,&mycol);

  /* A is a dense n x n distributed Toeplitz matrix */
  if(myid<nprow*npcol) {
    locr=numroc_(&n,&nb,&myrow,&IZERO,&nprow);
    locc=numroc_(&n,&nb,&mycol,&IZERO,&npcol);
    A=new myscalar[locr*locc];
    dummy=std::max(1,locr);
    descinit_(descA,&n,&n,&nb,&nb,&IZERO,&IZERO,&ctxt,&dummy,&ierr);

    myreal pi=3.1416, d=0.1;
    for(i=1;i<=locr;i++) {
      for(j=1;j<=locc;j++) {
        ii=indxl2g_(&i,&nb,&myrow,&IZERO,&nprow);
        jj=indxl2g_(&j,&nb,&mycol,&IZERO,&npcol);
        // Toeplitz matrix from Quantum Chemistry.
        A[locr*(j-1)+(i-1)]=ii==jj?std::pow(pi,2)/6.0/std::pow(d,2):std::pow(-1.0,ii-jj)/std::pow((myreal)ii-jj,2)/std::pow(d,2);
      }
    }
  } else {
    descset_(descA,&n,&n,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE);
  }

  /* Initializing solution */
  if(myid<nprow*npcol) {
    locr=numroc_(&n,&nb,&myrow,&IZERO,&nprow);
    locc=numroc_(&nrhs,&nb,&mycol,&IZERO,&npcol);
    dummy=std::max(1,locr);
    descinit_(descVec,&n,&nrhs,&nb,&nb,&IZERO,&IZERO,&ctxt,&dummy,&ierr);
    Xtrue=new myscalar[locr*locc]();
    for(i=0;i<locr*locc;i++)
      Xtrue[i]=static_cast<myscalar>(rand())/(static_cast<myscalar>(RAND_MAX));
  } else {
    descset_(descVec,&n,&nrhs,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE);
  }

  /* Initializing solution and intermediate vector space */
  if(myid<nprow*npcol) {
    X=new myscalar[locr*locc]();
    Y=new myscalar[locr*locc]();
  }

  /* Initializing RHS as A*Xtrue */
  if(myid<nprow*npcol) {
    B=new myscalar[locr*locc]();
    pgemm('N','N',n,nrhs,n,ONE,A,IONE,IONE,descA,Xtrue,IONE,IONE,descVec,ZERO,B,IONE,IONE,descVec);
  }

  /* Initialize the solver and set parameters */
  StrumpackDensePackage<myscalar,myreal> sdp(MPI_COMM_WORLD);
  sdp.use_HSS=true;
  sdp.levels_HSS=4;
  sdp.min_rand_HSS=64;
  sdp.lim_rand_HSS=0;
  sdp.tol_HSS=1e-12;
  sdp.split_HSS=768; /* Size of A11 */

  /* Compression */
  sdp.compress(A,descA);

  /* Accuracy checking */
  sdp.check_compression(A,descA);

  /* Factorization */
  sdp.partially_factor(A,descA,sdp.split_HSS);

  /* Schur complement update */
  sdp.compute_schur();

  /* Extracting a random element from the Schur complement */
  if(myid<nprow*npcol) {
    descinit_(descelem,&IONE,&IONE,&nb,&nb,&IZERO,&IZERO,&ctxt,&dummy,&ierr);
  } else {
    descset_(descelem,&IONE,&IONE,&nb,&nb,&IZERO,&IZERO,&INONE,&IONE);
  }
  i=1+rand()%(n-sdp.split_HSS);
  j=1+rand()%(n-sdp.split_HSS);
  MPI_Bcast((void *)&i,IONE,MPI_INTEGER,IZERO,MPI_COMM_WORLD);
  MPI_Bcast((void *)&j,IONE,MPI_INTEGER,IZERO,MPI_COMM_WORLD);
  sdp.extract_schur(&elem,descelem,&i,IONE,&j,IONE);
  if(!myid)
    std::cout << "Element (" << i << "," << j << ") of Schur complement = " << elem << std::endl << std::endl;

  /* Condensation */
  sdp.reduce_RHS(Y,descVec,B,descVec);

  /* Solve the Schur complement system (touches only the bottom part of X)*/
  sdp.verbose=false;
  if(!myid)
    std::cout << "Solving Schur complement system with Conjugate Gradient..." << std::endl << std::endl;
  CG(&sdp,X,Y,descVec,n,nrhs,3000,1e-14);
  sdp.verbose=true;

  /* Expansion (touches only the top part of X) */
  sdp.expand_solution(X,descVec,Y,descVec);

  /* Accuracy checking */
  sdp.check_solution(A,descA,X,descVec,B,descVec);

  /* Forward error */
  if(myid<nprow*npcol) {
    for(i=0;i<locr*locc;i++)
      Y[i]=X[i]-Xtrue[i];
    res=plange('F',n,nrhs,Y,IONE,IONE,descVec,&rdummy);
    res/=plange('F',n,nrhs,Xtrue,IONE,IONE,descVec,&rdummy);
    if(!myid)
      std::cout << "Forward error = " << res << std::endl;
  }

  /* Statistics */
  sdp.print_statistics();

  /* Clean-up */
  delete[] A;
  delete[] B;
  delete[] X;
  delete[] Y;
  delete[] Xtrue;

  /* The end */
  MPI_Finalize();
  return 0;

}
Example #27
///
/// This is the new standard style for implementing a slave routine for a ScaLAPACK operator,
/// in this case, pdgesvd_().  The difference from the old style is that the new style
/// requires that the ScaLAPACK context, ICTXT, be provided.  Until that requirement can be pushed
/// up into the "mpi_slave_xxx" files, the existing pdgesvdSlave() routine will create the context
/// and then call this routine.
///
slpp::int_t pdgesvdSlave2(const slpp::int_t ICTXT, PdgesvdArgs args, void* bufs[], size_t sizes[], unsigned count)
{
    // find out where I am in the scalapack grid
    slpp::int_t NPROW, NPCOL, MYPROW, MYPCOL, MYPNUM;
    getSlInfo(ICTXT/*in*/, NPROW/*in*/, NPCOL/*in*/, MYPROW/*out*/, MYPCOL/*out*/, MYPNUM/*out*/);

    if(NPROW != args.NPROW || NPCOL != args.NPCOL ||
       MYPROW != args.MYPROW || MYPCOL != args.MYPCOL || MYPNUM != args.MYPNUM){
        if(DBG) {
            std::cerr << "scalapack general parameter mismatch" << std::endl;
            std::cerr << "args NPROW:"<<args.NPROW<<" NPCOL:"<<args.NPCOL
                      << "MYPROW:"<<args.MYPROW<<" MYPCOL:"<<args.MYPCOL<<"MYPNUM:"<<MYPNUM
                      << std::endl;
            std::cerr << "ScaLAPACK NPROW:"<<NPROW<<" NPCOL:"<<NPCOL
                      << "MYPROW:"<<MYPROW<<" MYPCOL:"<<MYPCOL<<"MYPNUM:"<<MYPNUM
                      << std::endl;
        }
    }

    // setup MB,NB
    const slpp::int_t& M = args.A.DESC.M ;
    const slpp::int_t& N = args.A.DESC.N ;
    const slpp::int_t& MB = args.A.DESC.MB ;
    const slpp::int_t& NB = args.A.DESC.NB ;

    const slpp::int_t& LLD_A = args.A.DESC.LLD ;
    const slpp::int_t one = 1 ;
    const slpp::int_t  LTD_A = std::max(one, numroc_( N, NB, MYPCOL, /*CSRC_A*/0, NPCOL ));

    const slpp::int_t& MP = LLD_A ;
    const slpp::int_t& NQ = LTD_A ;


    // size check A, S, U, VT
    slpp::int_t SIZE_A = MP * NQ ;
    slpp::int_t SIZE_S = std::min(M, N);
    slpp::int_t size_p = std::max(one, numroc_( SIZE_S, MB, MYPROW, /*RSRC_A*/0, NPROW ));
    slpp::int_t size_q = std::max(one, numroc_( SIZE_S, NB, MYPCOL, /*CSRC_A*/0, NPCOL ));
    slpp::int_t SIZE_U = MP * size_q;
    slpp::int_t SIZE_VT= size_p * NQ;
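    // Local sizes follow the block-cyclic rule: U is M x min(M,N), so it needs MP
    // local rows by numroc_(SIZE_S, NB, MYPCOL, 0, NPCOL) local columns (size_q);
    // VT is min(M,N) x N, hence size_p local rows by NQ local columns.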

    if(DBG) {
        std::cerr << "##################################################" << std::endl;
        std::cerr << "####pdgesvdSlave##################################" << std::endl;
        std::cerr << "one:" << one << std::endl;
        std::cerr << "SIZE_S:" << SIZE_S << std::endl;
        std::cerr << "MB:" << MB << std::endl;
        std::cerr << "MYPROW:" << MYPROW << std::endl;
        std::cerr << "NPROW:" << NPROW << std::endl;
    }

    // TODO: >= because the master is permitted to use a larger buffer,
    //       to allow us to see whether rounding up to chunksize eliminates some errors
    //       before we put the roundUp formula everywhere
    SLAVE_ASSERT_ALWAYS(sizes[BUF_A] >= SIZE_A * sizeof(double));
    SLAVE_ASSERT_ALWAYS(sizes[BUF_S] >= SIZE_S * sizeof(double));
    if (args.jobU == 'V') {
        SLAVE_ASSERT_ALWAYS( sizes[BUF_U] >= SIZE_U *sizeof(double));
    }
    if (args.jobVT == 'V') {
        SLAVE_ASSERT_ALWAYS( sizes[BUF_VT] >= SIZE_VT *sizeof(double));
    }

    // sizes are correct, give the pointers their names
    double* A = reinterpret_cast<double*>(bufs[BUF_A]) ;
    double* S = reinterpret_cast<double*>(bufs[BUF_S]) ;
    double* U = reinterpret_cast<double*>(bufs[BUF_U]) ;
    double* VT = reinterpret_cast<double*>(bufs[BUF_VT]) ;

    // debug that the input is readable and show its contents
    if(DBG) {
        for(int ii=0; ii < SIZE_A; ii++) {
            std::cerr << "("<< MYPROW << "," << MYPCOL << ") A["<<ii<<"] = " << A[ii] << std::endl;
        }
    }

    if(false) {
        // debug that outputs are writeable:
        for(int ii=0; ii < SIZE_S; ii++) {
            S[ii] = -9999.0 ;
        }
        if (args.jobU == 'V') {
            for(int ii=0; ii < SIZE_U; ii++) {
                U[ii] = -9999.0 ;
            }
        }
        if (args.jobVT == 'V') {
            for(int ii=0; ii < SIZE_VT; ii++) {
                VT[ii] = -9999.0 ;
            }
        }
    }

    // ScaLAPACK: the DESCS are complete except for the correct context
    args.A.DESC.CTXT= ICTXT ;  // note: no DESC for S, it is not distributed, all have a copy
    args.U.DESC.CTXT= ICTXT ;
    args.VT.DESC.CTXT= ICTXT ;

    if(DBG) {
        std::cerr << "pdgesvdSlave: argsBuf is: {" << std::endl;
        std::cerr << args << std::endl;
        std::cerr << "}" << std::endl << std::endl;

        std::cerr << "pdgesvdSlave: calling pdgesvd_ for computation, with args:" << std::endl ;
        std::cerr << "jobU: " << args.jobU
                  << ", jobVT: " << args.jobVT
                  << ", M: " << args.M
                  << ", N: " << args.N << std::endl;

        std::cerr << "A: " <<  (void*)(A)
                  << ", A.I: " << args.A.I
                  << ", A.J: " << args.A.J << std::endl;
        std::cerr << ", A.DESC: " << args.A.DESC << std::endl;

        std::cerr << "S: " << (void*)(S) << std::endl;

        std::cerr << "U: " <<  (void*)(U)
                  << ", U.I: " << args.U.I
                  << ", U.J: " << args.U.J << std::endl;
        std::cerr << ", U.DESC: " << args.U.DESC << std::endl;

        std::cerr << "VT: " <<  (void*)(VT)
                  << ", VT.I: " << args.VT.I
                  << ", VT.J: " << args.VT.J << std::endl;
        std::cerr << ", VT.DESC: " << args.VT.DESC << std::endl;
    }


    if(DBG) std::cerr << "pdgesvdSlave calling PDGESVD to get work size" << std:: endl;
    slpp::int_t INFO = 0;
    double LWORK_DOUBLE;
    pdgesvd_(args.jobU, args.jobVT, args.M, args.N,
             A,  args.A.I,  args.A.J,  args.A.DESC, S,
             U,  args.U.I,  args.U.J,  args.U.DESC,
             VT, args.VT.I, args.VT.J, args.VT.DESC,
             &LWORK_DOUBLE, -1, INFO);

    if(INFO < 0) {
        // argument error
        std::cerr << "pdgesvdSlave(r:"<<MYPNUM<<"): "
                  << "ERROR: pdgesvd_() for work size, argument error, argument # " << -INFO << std::endl;
    } else if(INFO == (std::min(args.M, args.N)+1)) {
        // should not happen when checking work size
        // heterogeneity detected (eigenvalues did not match on all nodes)
        std::cerr << "pdgesvdSlave(r:"<<MYPNUM<<"): "
                  << "WARNING: pdgesvd_() for work size, eigenvalues did not match across all instances" << std::endl;
    } else if (INFO > 0) { // other + value of INFO
        // should not  happen when checking work size
        // DBDSQR did not converge
        std::cerr << "pdgesvdSlave(r:"<<MYPNUM<<"): "
                  << "ERROR: pdgesvd_() for work size, DBDSQR did not converge: " << INFO << std::endl;
    }

    if ( LWORK_DOUBLE < 0.0 ||
         LWORK_DOUBLE > double(numeric_limits<slpp::int_t>::max())) {
        // Houston, we have a problem ... the user wants to do more than one instance
        // can handle through slpp::int_t, whose size is determined by which ScaLAPACK/BLAS
        // binary we are using: 32-bit or 64-bit FORTRAN INTEGER.  Note that 32-bit INTEGER
        // is what ships with RHEL, CentOS, etc., even on 64-bit systems.
        std::cerr << "pdgesvdSlave(r:"<<MYPNUM<<"): "
                  << "ERROR: LWORK_DOUBLE, " << LWORK_DOUBLE << ", is too large for the ScaLAPACK API to accept" << INFO << std::endl;
        if (INFO >= 0) {
            // make up our own argument error... -22 (there are 20 arguments)
            INFO = -22;
        }
        return INFO;
    }

    slpp::int_t LWORK = int(LWORK_DOUBLE); // get the cast from SVDPhysical.cpp
    std::cerr << "pdgesvdSlave(): info: LWORK is " << LWORK << std::endl;

    // ALLOCATE an array WORK size LWORK
    boost::scoped_array<double> WORKtmp(new double[LWORK]);
    double* WORK = WORKtmp.get();

    //////////////////////////////////////////////////////////////////////
    //////////////////////////////////////////////////////////////////////
    //////////////////////////////////////////////////////////////////////
    if(DBG) std::cerr << "pdgesvdSlave: calling pdgesvd_ for computation." << std::endl ;
    INFO=0;
    pdgesvd_(args.jobU, args.jobVT, args.M, args.N,
             A,  args.A.I,  args.A.J,  args.A.DESC, S,
             U,  args.U.I,  args.U.J,  args.U.DESC,
             VT, args.VT.I, args.VT.J, args.VT.DESC,
             WORK, LWORK, INFO);


    slpp::int_t numToPrintAtStart=4 ;
    if (MYPNUM==0 && DBG) {
        int ii; // used in 2nd loop
        for(ii=0; ii < std::min(SIZE_S, numToPrintAtStart); ii++) {
            std::cerr << "pdgesvdSlave: S["<<ii<<"] = " << S[ii] << std::endl;
        }
        // now skip to numToPrintAtStart before the end, (without repeating) and print to the end
        // to see if the test cases are producing zero eigenvalues (don't want that)
        slpp::int_t numToPrintAtEnd=4;
        for(ii=std::max(ii, SIZE_S-numToPrintAtEnd); ii < SIZE_S; ii++) {
            std::cerr << "pdgesvdSlave: S["<<ii<<"] = " << S[ii] << std::endl;
        }
    }
    if (DBG) {
        if (args.jobU == 'V') {
            for(int ii=0; ii < std::min(SIZE_U, numToPrintAtStart); ii++) {
                std::cerr << "pdgesvdSlave: U["<<ii<<"] = " << U[ii] << std::endl;
            }
        }
        if (args.jobVT == 'V') {
            for(int ii=0; ii < std::min(SIZE_VT, numToPrintAtStart); ii++) {
                std::cerr << "pdgesvdSlave: VT["<<ii<<"] = " << VT[ii] << std::endl;
            }
        }
    }

    if(MYPNUM == 0) {
        if(INFO < 0) {
            // argument error
            std::cerr << "pdgesvdSlave(r:"<<MYPNUM<<"): "
                      << "ERROR: argument error, argument # " << -INFO << std::endl;
        } else if(INFO == (std::min(args.M, args.N)+1)) {
            // heterogeneity detected (eigenvalues did not match on all nodes)
            std::cerr << "pdgesvdSlave(r:"<<MYPNUM<<"): "
                      << "WARNING: eigenvalues did not match across all instances" << std::endl;
        } else if (INFO > 0) { // other + value of INFO
            // DBDSQR did not converge
            std::cerr << "pdgesvdSlave(r:"<<MYPNUM<<"): "
                      << "ERROR: DBDSQR did not converge: " << INFO << std::endl;
        }
    }
    return INFO ;
}
Example #28
/*==== MAIN FUNCTION =================================================*/
int main( int argc, char *argv[] ){

/*  ==== Declarations =================================================== */

/*  File variables */
    FILE    *fin;

/*  Matrix descriptors */
    MDESC   descA, descB, descC, descA_local, descB_local;

/*  Local scalars */
    MKL_INT iam, nprocs, ictxt, myrow, mycol, nprow, npcol;
    MKL_INT n, nb, mp, nq, lld, lld_local;
    MKL_INT i, j, info;
    int     n_int, nb_int, nprow_int, npcol_int;
    double  thresh, diffnorm, anorm, bnorm, residual, eps;

/*  Local arrays */
    double  *A_local, *B_local, *A, *B, *C, *work;
    MKL_INT iwork[ 4 ];


/*  ==== Executable statements ========================================== */

/*  Get information about how many processes are used for program execution
    and number of current process */
    blacs_pinfo_( &iam, &nprocs );

/*  Init temporary 1D process grid */
    blacs_get_( &i_negone, &i_zero, &ictxt );
    blacs_gridinit_( &ictxt, "C", &nprocs, &i_one );

/*  Open input file */
    if ( iam == 0 ) {
        fin = fopen( "../in/pblas3ex.in", "r" );
        if ( fin == NULL ) {
            printf( "Error while open input file." );
            return 2;
        }
    }

/*  Read data and send it to all processes */
    if ( iam == 0 ) {

/*      Read parameters */
        fscanf( fin, "%d n, dimension of vectors, must be > 0 ", &n_int );
        fscanf( fin, "%d nb, size of blocks, must be > 0 ", &nb_int );
        fscanf( fin, "%d p, number of rows in the process grid, must be > 0", &nprow_int );
        fscanf( fin, "%d q, number of columns in the process grid, must be > 0, p*q = number of processes", &npcol_int );
        fscanf( fin, "%lf threshold for residual check (to switch off check set it < 0.0) ", &thresh );
        n = (MKL_INT) n_int;
        nb = (MKL_INT) nb_int;
        nprow = (MKL_INT) nprow_int;
        npcol = (MKL_INT) npcol_int;

/*      Check if all parameters are correct */
        if( ( n<=0 )||( nb<=0 )||( nprow<=0 )||( npcol<=0 )||( nprow*npcol != nprocs ) ) {
            printf( "One or several input parameters has incorrect value. Limitations:\n" );
            printf( "n > 0, nb > 0, p > 0, q > 0 - integer\n" );
            printf( "p*q = number of processes\n" );
            printf( "threshold - double (set to negative to swicth off check)\n");
            return 2;
        }

/*      Pack data into array and send it to other processes */
        iwork[ 0 ] = n;
        iwork[ 1 ] = nb;
        iwork[ 2 ] = nprow;
        iwork[ 3 ] = npcol;
        igebs2d_( &ictxt, "All", " ", &i_four, &i_one, iwork, &i_four );
        dgebs2d_( &ictxt, "All", " ", &i_one, &i_one, &thresh, &i_one );
    } else {

/*      Receive and unpack data */
        igebr2d_( &ictxt, "All", " ", &i_four, &i_one, iwork, &i_four, &i_zero, &i_zero );
        dgebr2d_( &ictxt, "All", " ", &i_one, &i_one, &thresh, &i_one, &i_zero, &i_zero );
        n = iwork[ 0 ];
        nb = iwork[ 1 ];
        nprow = iwork[ 2 ];
        npcol = iwork[ 3 ];
    }
    if ( iam == 0 ) { fclose( fin ); }

/*  Destroy temporary process grid */
    blacs_gridexit_( &ictxt );

/*  Init working 2D process grid */
    blacs_get_( &i_negone, &i_zero, &ictxt );
    blacs_gridinit_( &ictxt, "R", &nprow, &npcol );
    blacs_gridinfo_( &ictxt, &nprow, &npcol, &myrow, &mycol );

/*  Create on process 0 two matrices: A - orthonormal, B -random */
    if ( ( myrow == 0 ) && ( mycol == 0 ) ){

/*      Allocate arrays */
        A_local = (double*) calloc( n*n, sizeof( double ) );
        B_local = (double*) calloc( n*n, sizeof( double ) );

/*      Set arrays */
        for ( i=0; i<n; i++ ){
            for ( j=0; j<n; j++ ){
                B_local[ i+n*j ] = one*rand()/RAND_MAX;
            }
            B_local[ i+n*i ] += two;
        }
        for ( j=0; j<n; j++ ){
            for ( i=0; i<n; i++ ){
                if ( j < n-1 ){
                    if ( i <= j ){
                        A_local[ i+n*j ] = one / sqrt( ( double )( (j+1)*(j+2) ) );
                    } else if ( i == j+1 ) {
                        A_local[ i+n*j ] = -one / sqrt( one + one/( double )(j+1) );
                    } else {
                        A_local[ i+n*j ] = zero;
                    }
                } else {
                    A_local[ i+n*(n-1) ] = one / sqrt( ( double )n );
                }
            }
        }

/*      Print information of task */
        printf( "=== START OF EXAMPLE ===================\n" );
        printf( "Matrix-matrix multiplication: A*B = C\n\n" );
        printf( "/  1/q_1 ........   1/q_n-1     1/q_n  \\ \n" );
        printf( "|        .                             | \n" );
        printf( "|         `.           :         :     | \n" );
        printf( "| -1/q_1    `.         :         :     | \n" );
        printf( "|        .    `.       :         :     |  =  A \n" );
        printf( "|   0     `.    `                      | \n" );
        printf( "|   : `.    `.      1/q_n-1     1/q_n  | \n" );
        printf( "|   :   `.    `.                       | \n" );
        printf( "\\   0 .... 0     -(n-1)/q_n-1   1/q_n  / \n\n" );
        printf( "q_i = sqrt( i^2 + i ), i=1..n-1, q_n = sqrt( n )\n\n" );
        printf( "A  -  n*n real matrix (orthonormal) \n" );
        printf( "B  -  random n*n real matrix\n\n" );
        printf( "n = %d, nb = %d; %dx%d - process grid\n\n", n, nb, nprow, npcol );
        printf( "=== PROGRESS ===========================\n" );
    } else {

/*      Other processes don't contain parts of initial arrays */
        A_local = NULL;
        B_local = NULL;
    }

/*  Compute precise length of local pieces and allocate array on
    each process for parts of distributed vectors */
    mp = numroc_( &n, &nb, &myrow, &i_zero, &nprow );
    nq = numroc_( &n, &nb, &mycol, &i_zero, &npcol );
    A = (double*) calloc( mp*nq, sizeof( double ) );
    B = (double*) calloc( mp*nq, sizeof( double ) );
    C = (double*) calloc( mp*nq, sizeof( double ) );

/*  Compute leading dimensions */
    lld_local = MAX( numroc_( &n, &n, &myrow, &i_zero, &nprow ), 1 );
    lld = MAX( mp, 1 );

/*  Initialize descriptors for initial arrays located on 0 process */
    descinit_( descA_local, &n, &n, &n, &n, &i_zero, &i_zero, &ictxt, &lld_local, &info );
    descinit_( descB_local, &n, &n, &n, &n, &i_zero, &i_zero, &ictxt, &lld_local, &info );

/*  Initialize descriptors for distributed arrays */
    descinit_( descA, &n, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &lld, &info );
    descinit_( descB, &n, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &lld, &info );
    descinit_( descC, &n, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &lld, &info );

/*  Distribute matrices from 0 process over process grid */
    pdgeadd_( &trans, &n, &n, &one, A_local, &i_one, &i_one, descA_local, &zero, A, &i_one, &i_one, descA );
    pdgeadd_( &trans, &n, &n, &one, B_local, &i_one, &i_one, descB_local, &zero, B, &i_one, &i_one, descB );
    if( iam == 0 ){ printf( ".. Arrays are distributed ( p?geadd ) ..\n" ); }

/*  Destroy arrays on 0 process - they are not necessary anymore */
    if( ( myrow == 0 ) && ( mycol == 0 ) ){
        free( A_local );
        free( B_local );
    }

/*  Compute norm of A and B */
    work = (double*) calloc( mp, sizeof( double ) );
    anorm = pdlange_( "I", &n, &n, A, &i_one, &i_one, descA, work );
    bnorm = pdlange_( "I", &n, &n, B, &i_one, &i_one, descB, work );
    if( iam == 0 ){ printf( ".. Norms of A and B are computed ( p?lange ) ..\n" ); }

/*  Compute product C = A*B */
    pdgemm_( "N", "N", &n, &n, &n, &one, A, &i_one, &i_one, descA, B, &i_one, &i_one, descB,
             &zero, C, &i_one, &i_one, descC );
    if( iam == 0 ){ printf( ".. Multiplication A*B=C is done ( p?gemm ) ..\n" ); }

/*  Compute difference  B - inv_A*C (inv_A = transpose(A) because A is orthonormal) */
    pdgemm_( "T", "N", &n, &n, &n, &one, A, &i_one, &i_one, descA, C, &i_one, &i_one, descC,
             &negone, B, &i_one, &i_one, descB );
    if( iam == 0 ){ printf( ".. Difference is computed ( p?gemm ) ..\n" ); }

/*  Compute norm of B - inv_A*C (which is contained in B) */
    diffnorm = pdlange_( "I", &n, &n, B, &i_one, &i_one, descB, work );
    free( work );
    if( iam == 0 ){ printf( ".. Norms of the difference B-inv_A*C is computed ( p?lange ) ..\n" ); }

/*  Print results */
    if( iam == 0 ){
        printf( ".. Solutions are compared ..\n" );
        printf( "== Results ==\n" );
        printf( "||A|| = %03.11f\n", anorm );
        printf( "||B|| = %03.11f\n", bnorm );
        printf( "=== END OF EXAMPLE =====================\n" );
    }

/*  Compute machine epsilon */
    eps = pdlamch_( &ictxt, "e" );

/*  Compute residual */
    residual = diffnorm /( two*anorm*bnorm*eps );

/*  Destroy arrays */
    free( A );
    free( B );
    free( C );

/*  Destroy process grid */    
    blacs_gridexit_( &ictxt );
    blacs_exit_( &i_zero );
    
/*  Check if residual passed or failed the threshold */
    if ( ( iam == 0 ) && ( thresh >= zero ) && !( residual <= thresh ) ){
        printf( "FAILED. Residual = %05.16f\n", residual );
        return 1;
    } else {
        return 0;
    }

/*========================================================================
  ====== End of PBLAS Level 3 example ====================================
  ======================================================================*/
}
Example #29
File: diag.c Project: qsnake/gpaw
int main(int argc, char *argv[]) {

     // Some constants
     int minusone = -1;
     int zero = 0;
     int one = 1;
     double dzero = 0.0; 

     // ConText
     int ConTxt = minusone;

     // order
     char order = 'R';
     char scope = 'A';

     // root process
     int root = zero;

     // BLACS/SCALAPACK parameters
     // the size of the blocks the distributed matrix is split into
     // (applies to both rows and columns)
     int mb = 32;
     int nb = mb; // PDSYEVxxx constraint

     // the number of rows and columns in the processor grid
     // only square processor grids due to C vs. Fortran ordering
     int nprow = 2;
     int npcol = nprow; // only square processor grids, 

     // starting row and column in grid, do not change
     int rsrc = zero; 
     int csrc = zero;

     // dimensions of the matrix to diagonalize
     int m = 1000;
     int n = m; // only square matrices

     int info = zero;

     // Rest of code will only work for:
     // nprow = npcol
     // mb = nb;
     // m = n;
     //      rsrc = csrc;

     // Parameters for Trivial Matrix
     double alpha = 0.1; // off-diagonal
     double beta = 75.0; // diagonal
     
     // For timing:
     double tdiag0, tdiag, ttotal0, ttotal;

     // BLACS Communicator
     MPI_Comm blacs_comm;
     int nprocs;
     int iam;
     int myrow, mycol;

     MPI_Init(&argc, &argv);
     MPI_Barrier(MPI_COMM_WORLD);
     ttotal0 = MPI_Wtime();
     MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
     MPI_Comm_rank(MPI_COMM_WORLD, &iam);

     if (argc > one) {
       nprow = strtod(argv[1],NULL);
       m = strtod(argv[2],NULL);
       npcol = nprow;
       n = m;
     }
    
     if (iam == root) {
       printf("world size %d \n",nprocs);
       printf("n %d \n", n);
       printf("nprow %d \n", nprow);
       printf("npcol %d \n", npcol);
     }

     // We can do this on any subcommunicator.
#ifdef CartComm
     int dim[2];
     int pbc[2];
     dim[0] = nprow;
     dim[1] = npcol;
     pbc[0] = 0;
     pbc[1] = 0;
     MPI_Cart_create(MPI_COMM_WORLD, 2, dim, pbc, 1, &blacs_comm);
#else
     blacs_comm = MPI_COMM_WORLD;
#endif

     // initialize the grid
     // (obtain a BLACS handle for the MPI communicator, then initialize the process grid on it)
     if (blacs_comm != MPI_COMM_NULL) {
       ConTxt = Csys2blacs_handle_(blacs_comm);
       Cblacs_gridinit_(&ConTxt, &order, nprow, npcol);

       // get information back about the grid
       Cblacs_gridinfo_(ConTxt, &nprow, &npcol, &myrow, &mycol);
     }

     if (ConTxt != minusone) {

          int desc[9];

          // get the size of the distributed matrix
          int locM = numroc_(&m, &mb, &myrow, &rsrc, &nprow);
          int locN = numroc_(&n, &nb, &mycol, &csrc, &npcol);

	  // printf ("locM = %d \n", locM);
	  // printf ("locN = %d \n", locN);

          int lld = MAX(one,locM);

          // build the descriptor
          descinit_(desc, &m, &n, &mb, &nb, &rsrc, &csrc, &ConTxt, &lld, &info);
          // Allocate arrays
	  // eigenvalues
	  double* eigvals = malloc(n * sizeof(double));

          // allocate the distributed matrices
          double* mata = malloc(locM*locN * sizeof(double));
          // allocate the distributed matrix of eigenvectors
          double* z = malloc(locM*locN * sizeof(double));

          // Eigensolver parameters
          int ibtype = one;
          char jobz = 'V'; // eigenvectors also
          char range = 'A'; // all eigenvalues
          char uplo = 'L'; // work with the lower triangle

          double vl, vu;
          int il, iu;

          char cmach = 'U';

          double abstol = 2.0 * pdlamch_(&ConTxt, &cmach);

          int eigvalm, nz;

          double orfac = -1.0;
          //double orfac = 0.001;

          int* ifail;
          ifail = malloc(m * sizeof(int));

          int* iclustr;
          iclustr =  malloc(2*nprow*npcol * sizeof(int));

          double* gap;
          gap =  malloc(nprow*npcol * sizeof(double));

          double* work;
          work = malloc(3 * sizeof(double));
          int querylwork = minusone;
          int* iwork;
          iwork = malloc(1 * sizeof(int));
          int queryliwork = minusone;

          // Build a trivial distributed matrix: Diagonal matrix
	  pdlaset_(&uplo, &m, &n, &alpha, &beta, mata, &one, &one, desc);

	  // First there is a workspace query

          // pdsyevx_(&jobz, &range, &uplo, &n, mata, &one, &one, desc, &vl,
          //          &vu, &il, &iu, &abstol, &eigvalm, &nz, eigvals, &orfac, z, &one,
          //          &one, desc, work, &querylwork, iwork, &queryliwork, ifail, iclustr, gap, &info);
          pdsyevd_(&jobz, &uplo, &n, mata, &one, &one, desc, eigvals,
		   z, &one, &one, desc,
		   work, &querylwork, iwork, &queryliwork, &info);
          //pdsyev_(&jobz, &uplo, &m, mata, &one, &one, desc, eigvals,
          //        z, &one, &one, desc, work, &querylwork, &info);

          int lwork = (int)work[0];
          //printf("lwork %d\n", lwork);
          free(work);
          int liwork = (int)iwork[0];
          //printf("liwork %d\n", liwork);
          free(iwork);

          work = (double*)malloc(lwork * sizeof(double));
          iwork = (int*)malloc(liwork * sizeof(int));

	  // This actually diagonalizes the matrix
          // pdsyevx_(&jobz, &range, &uplo, &n, mata, &one, &one, desc, &vl,
          //          &vu, &il, &iu, &abstol, &eigvalm, &nz, eigvals, &orfac, z, &one,
          //          &one, desc, work, &lwork, iwork, &liwork, ifail, iclustr, gap, &info);
  
          Cblacs_barrier(ConTxt, &scope);
          tdiag0 = MPI_Wtime();
          pdsyevd_(&jobz, &uplo, &n, mata, &one, &one, desc, eigvals,
                   z, &one, &one, desc,
                   work, &lwork, iwork, &liwork, &info);

          //pdsyev_(&jobz, &uplo, &m, mata, &one, &one, desc, eigvals,
          //        z, &one, &one, desc, work, &lwork, &info);
          Cblacs_barrier(ConTxt, &scope);
          tdiag = MPI_Wtime() - tdiag0;

          free(work);
          free(iwork);
          free(gap);
          free(iclustr);
          free(ifail);
          free(z);
          free(mata);

          // Destroy BLACS grid
          Cblacs_gridexit_(ConTxt);

	  // Check eigenvalues
	  if (myrow == zero && mycol == zero) {
	    for (int i = 0; i < n; i++)
	      {
                if (fabs(eigvals[i] - beta) > 0.0001) 
		    printf("Problem: eigval %d != %f5.2 but %f\n", 
                            i, beta, eigvals[i]);
	      }
	    
	    if (info != zero) {
	      printf("info = %d \n", info);
	    }
	    
	    printf("Time (s) diag: %f\n", tdiag);
	  }

	  free(eigvals);
     }

     MPI_Barrier(MPI_COMM_WORLD);
     ttotal = MPI_Wtime() - ttotal0;
     if (iam == 0)
          printf("Time (s) total: %f\n", ttotal);
     MPI_Finalize();
}
Example #30
int main(int argc, char **argv) {
        int iam, nprocs;
        int myrank_mpi, nprocs_mpi;
        int ictxt, nprow, npcol, myrow, mycol;
        int nb, m, n;
        int mpA, nqA, mpU, nqU, mpVT, nqVT;
        int i, j, k, itemp, min_mn;
        int descA[9], descU[9], descVT[9];
        float *A=NULL;
        int info, infoNN, infoVV, infoNV, infoVN;
        float *U_NN=NULL,  *U_VV=NULL,  *U_NV=NULL,  *U_VN=NULL;
        float *VT_NN=NULL, *VT_VV=NULL, *VT_NV=NULL, *VT_VN=NULL;
        float *S_NN=NULL,  *S_VV=NULL, *S_NV=NULL, *S_VN=NULL;
        float *S_res_NN=NULL;
        float orthU_VV, residF, orthVT_VV;
        float orthU_VN, orthVT_NV;
        float  residS_NN, eps;
        float  res_repres_NV, res_repres_VN;
/**/
        int izero=0,ione=1;
        float rtmone=-1.0e+00;
/**/
        double MPIelapsedVV, MPIelapsedNN, MPIelapsedVN, MPIelapsedNV;
        char jobU, jobVT;
        int nbfailure=0, nbtestcase=0,inputfromfile, nbhetereogeneity=0;
        float threshold=100e+00;
        char buf[1024];
        FILE *fd;       
        char *c;
        char *t_jobU, *t_jobVT;
        int *t_m, *t_n, *t_nb, *t_nprow, *t_npcol;
        int nb_expe, expe;
        char hetereogeneityVV, hetereogeneityNN, hetereogeneityVN, hetereogeneityNV;
        int iseed[4], idist;
/**/
        MPI_Init( &argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &myrank_mpi);
        MPI_Comm_size(MPI_COMM_WORLD, &nprocs_mpi);
/**/
        m = 100; n = 100; nprow = 1; npcol = 1; nb = 64; jobU='A'; jobVT='A'; inputfromfile = 0;
        for( i = 1; i < argc; i++ ) {
                if( strcmp( argv[i], "-f" ) == 0 ) {
                        inputfromfile = 1;
                }
                if( strcmp( argv[i], "-jobvt" ) == 0 ) {
                        if (i+1<argc) {
                                if( strcmp( argv[i+1], "V" ) == 0 ){ jobVT = 'V'; i++; }
                                else if( strcmp( argv[i+1], "N" ) == 0 ){ jobVT = 'N'; i++; }
                                else if( strcmp( argv[i+1], "A" ) == 0 ){ jobVT = 'A'; i++; }
                                else printf(" ** warning: jobvt should be set to V, N or A in the command line ** \n");
                        }
                        else    
                                printf(" ** warning: jobvt should be set to V, N or A in the command line ** \n");
                }
                if( strcmp( argv[i], "-jobu" ) == 0 ) {
                        if (i+1<argc) {
                                if( strcmp( argv[i+1], "V" ) == 0 ){ jobU = 'V'; i++; }
                                else if( strcmp( argv[i+1], "N" ) == 0 ){ jobU = 'N'; i++; }
                                else if( strcmp( argv[i+1], "A" ) == 0 ){ jobU = 'A'; i++; }
                                else printf(" ** warning: jobu should be set to V, N or A in the command line ** \n");
                        }
                        else    
                                printf(" ** warning: jobu should be set to V, N or A in the command line ** \n");
                }
                if( strcmp( argv[i], "-m" ) == 0 ) {
                        m      = atoi(argv[i+1]);
                        i++;
                }
                if( strcmp( argv[i], "-n" ) == 0 ) {
                        n      = atoi(argv[i+1]);
                        i++;
                }
                if( strcmp( argv[i], "-p" ) == 0 ) {
                        nprow  = atoi(argv[i+1]);
                        i++;
                }
                if( strcmp( argv[i], "-q" ) == 0 ) {
                        npcol  = atoi(argv[i+1]);
                        i++;
                }
                if( strcmp( argv[i], "-nb" ) == 0 ) {
                        nb     = atoi(argv[i+1]);
                        i++;
                }
        }
/**/
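        /* When -f is given, read the test cases from svd.dat: a first pass counts the
           non-comment lines, a second pass parses them. Each line that does not start
           with '#' is expected to hold: jobU jobVT m n nb nprow npcol
           (see the sscanf format below). */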
        if (inputfromfile){
                nb_expe = 0;
                fd = fopen("svd.dat", "r");
                if (fd == NULL) { printf("Failed to open file svd.dat on process mpirank(%d/%d)\n",myrank_mpi,nprocs_mpi); exit(-1); }
                do {    
                        c = fgets(buf, 1024, fd);  /* get one line from the file */
                        if (c != NULL)
                                if (c[0] != '#')
                                        nb_expe++;
                } while (c != NULL);              /* repeat until NULL          */
                fclose(fd);
                t_jobU  = (char *)calloc(nb_expe,sizeof(char)) ;
                t_jobVT = (char *)calloc(nb_expe,sizeof(char)) ;
                t_m     = (int  *)calloc(nb_expe,sizeof(int )) ;
                t_n     = (int  *)calloc(nb_expe,sizeof(int )) ;
                t_nb    = (int  *)calloc(nb_expe,sizeof(int )) ;
                t_nprow = (int  *)calloc(nb_expe,sizeof(int )) ;
                t_npcol = (int  *)calloc(nb_expe,sizeof(int )) ;
                fd = fopen("svd.dat", "r");
                expe=0;
                do {    
                        c = fgets(buf, 1024, fd);  /* get one line from the file */
                        if (c != NULL)
                                if (c[0] != '#'){
                                        //printf("NBEXPE = %d\n",expe);
                                        sscanf(c,"%c %c %d %d %d %d %d",
                                                &(t_jobU[expe]),&(t_jobVT[expe]),&(t_m[expe]),&(t_n[expe]),
                                                &(t_nb[expe]),(&t_nprow[expe]),&(t_npcol[expe]));
                                        expe++;
                                }
                } while (c != NULL);              /* repeat until NULL          */
                fclose(fd);
        }
        else {
                nb_expe = 1;
                t_jobU  = (char *)calloc(nb_expe,sizeof(char)) ;
                t_jobVT = (char *)calloc(nb_expe,sizeof(char)) ;
                t_m     = (int  *)calloc(nb_expe,sizeof(int )) ;
                t_n     = (int  *)calloc(nb_expe,sizeof(int )) ;
                t_nb    = (int  *)calloc(nb_expe,sizeof(int )) ;
                t_nprow = (int  *)calloc(nb_expe,sizeof(int )) ;
                t_npcol = (int  *)calloc(nb_expe,sizeof(int )) ;
                t_jobU[0]  = jobU;
                t_jobVT[0] = jobVT;
                t_m[0]     = m;
                t_n[0]     = n;
                t_nb[0]    = nb;
                t_nprow[0] = nprow;
                t_npcol[0] = npcol;
        }

        if (myrank_mpi==0){
                printf("\n");
                printf("--------------------------------------------------------------------------------------------------------------------\n");
                                printf("                            Testing psgsevd -- float precision SVD ScaLAPACK routine                \n");
                printf("jobU jobVT    m     n     nb   p   q   || info   heter   resid     orthU    orthVT   |SNN-SVV|    time(s)   cond(A) \n");
                printf("--------------------------------------------------------------------------------------------------------------------\n");
        }
/**/
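        /* Run each requested test case; a fresh BLACS process grid is created per case. */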
        for (expe = 0; expe<nb_expe; expe++){

        jobU  = t_jobU[expe]  ; 
        jobVT = t_jobVT[expe] ; 
        m     = t_m[expe]     ; 
        n     = t_n[expe]     ; 
        nb    = t_nb[expe]    ; 
        nprow = t_nprow[expe] ; 
        npcol = t_npcol[expe] ; 

        if (nb>n)
                nb = n;
        if (nprow*npcol>nprocs_mpi){
                if (myrank_mpi==0){
                        printf(" **** ERROR : we do not have enough processes available to make a p-by-q process grid ***\n");
                        printf(" **** Bye-bye                                                                         ***\n");
                }
                MPI_Finalize(); exit(1);
        }
/**/
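        /* Create the BLACS context, initialize a nprow-by-npcol process grid
           (row-major ordering), and query this process's coordinates in the grid. */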
        Cblacs_pinfo( &iam, &nprocs ) ;
        Cblacs_get( -1, 0, &ictxt );
        Cblacs_gridinit( &ictxt, "Row", nprow, npcol );
        Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol );
/**/
        min_mn = min(m,n);
/**/
        //if (iam==0)
                //printf("\tm=%d\tn = %d\t\t(%d,%d)\t%dx%d\n",m,n,nprow,npcol,nb,nb);
        //printf("Hello World, I am proc %d over %d for MPI, proc %d over %d for BLACS in position (%d,%d) in the process grid\n", 
                        //myrank_mpi,nprocs_mpi,iam,nprocs,myrow,mycol);
/*
*
*     Work only the process in the process grid
*
*/
        //if ((myrow < nprow)&(mycol < npcol)){
        if ((myrow>-1)&(mycol>-1)&(myrow<nprow)&(mycol<npcol)){

/*
*
*     Compute the size of the local matrices (thanks to numroc)
*
*/ 
                mpA    = numroc_( &m     , &nb, &myrow, &izero, &nprow );
                nqA    = numroc_( &n     , &nb, &mycol, &izero, &npcol );
                mpU    = numroc_( &m     , &nb, &myrow, &izero, &nprow );
                nqU    = numroc_( &min_mn, &nb, &mycol, &izero, &npcol );
                mpVT   = numroc_( &min_mn, &nb, &myrow, &izero, &nprow );
                nqVT   = numroc_( &n     , &nb, &mycol, &izero, &npcol );
/*
*
*     Allocate and fill the matrices A and B
*
*/ 
                A = (float *)calloc(mpA*nqA,sizeof(float)) ;
                if (A==NULL){ printf("error of memory allocation A on proc %dx%d\n",myrow,mycol); exit(0); }
/**/            
//              seed = iam*(mpA*nqA*2); srand(seed);
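                /* Seed for slarnv_: four integers in [0,4095] with the last one odd;
                   the first three depend on mpA, iam and nqA so that each process
                   fills its local block with different random entries. */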
                idist = 2;
                iseed[0] = mpA%4096;
                iseed[1] = iam%4096;
                iseed[2] = nqA%4096;
                iseed[3] = 23;
/**/            
                k = 0;
                for (i = 0; i < mpA; i++) {
                        for (j = 0; j < nqA; j++) {
                                slarnv_( &idist, iseed, &ione, &(A[k]) );
                                k++;    
                        }
                }
/*
*
*     Initialize the array descriptor for the distributed matrices xA, U and VT
*
*/ 
                itemp = max( 1, mpA );
                descinit_( descA,  &m, &n, &nb, &nb, &izero, &izero, &ictxt, &itemp, &info );
                itemp = max( 1, mpA );
                descinit_( descU,  &m, &min_mn, &nb, &nb, &izero, &izero, &ictxt, &itemp, &info );
                itemp = max( 1, mpVT );
                descinit_( descVT, &min_mn, &n, &nb, &nb, &izero, &izero, &ictxt, &itemp, &info );
/**/
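                /* Single-precision machine epsilon, used to scale the residual and
                   orthogonality checks against the acceptance threshold. */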
                eps = pslamch_( &ictxt, "Epsilon" );
/**/
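                /* Case jobU='V', jobVT='N': singular values and left singular vectors only;
                   checks the orthogonality of U and a representativity residual (verif_repres_VN). */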
                if ( ((jobU=='V')&(jobVT=='N')) ||(jobU == 'A' )||(jobVT=='A')){
                nbtestcase++;   
                U_VN = (float *)calloc(mpU*nqU,sizeof(float)) ;
                if (U_VN==NULL){ printf("error of memory allocation U_VN on proc %dx%d\n",myrow,mycol); exit(0); }
                S_VN = (float *)calloc(min_mn,sizeof(float)) ;
                if (S_VN==NULL){ printf("error of memory allocation S_VN on proc %dx%d\n",myrow,mycol); exit(0); }
                infoVN = driver_psgesvd( 'V', 'N', m, n, A, 1, 1, descA,
                        S_VN, U_VN, 1, 1, descU, VT_VN, 1, 1, descVT,
                        &MPIelapsedVN);
                orthU_VN  = verif_orthogonality(m,min_mn,U_VN , 1, 1, descU);
                res_repres_VN = verif_repres_VN( m, n, A, 1, 1, descA, U_VN, 1, 1, descU, S_VN);
                if (infoVN==min_mn+1) hetereogeneityVN = 'H'; else hetereogeneityVN = 'N';
                if ( iam==0 )
                        printf(" V    N   %6d %6d  %3d  %3d %3d  ||  %3d     %c   %7.1e   %7.1e                        %8.2f    %7.1e\n",
                                m,n,nb,nprow,npcol,infoVN,hetereogeneityVN,res_repres_VN/(S_VN[0]/S_VN[min_mn-1]),
                                orthU_VN,MPIelapsedVN,S_VN[0]/S_VN[min_mn-1]);
                if (infoVN==min_mn+1) nbhetereogeneity++ ;
                else if ((res_repres_VN/eps/(S_VN[0]/S_VN[min_mn-1])>threshold)||(orthU_VN/eps>threshold)||(infoVN!=0)) nbfailure++;
                }
/**/
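                /* Case jobU='N', jobVT='V': singular values and right singular vectors (V^T) only;
                   checks the orthogonality of V^T and a representativity residual (verif_repres_NV). */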
                if (((jobU=='N')&(jobVT=='V'))||(jobU == 'A' )||(jobVT=='A')){
                nbtestcase++;   
                VT_NV = (float *)calloc(mpVT*nqVT,sizeof(float)) ;
                if (VT_NV==NULL){ printf("error of memory allocation VT_NV on proc %dx%d\n",myrow,mycol); exit(0); }
                S_NV = (float *)calloc(min_mn,sizeof(float)) ;
                if (S_NV==NULL){ printf("error of memory allocation S_NV on proc %dx%d\n",myrow,mycol); exit(0); }
                infoNV = driver_psgesvd( 'N', 'V', m, n, A, 1, 1, descA,
                        S_NV, U_NV, 1, 1, descU, VT_NV, 1, 1, descVT,
                        &MPIelapsedNV);
                orthVT_NV = verif_orthogonality(min_mn,n,VT_NV, 1, 1, descVT);
                res_repres_NV = verif_repres_NV( m, n, A, 1, 1, descA, VT_NV, 1, 1, descVT, S_NV);
                if (infoNV==min_mn+1) hetereogeneityNV = 'H'; else hetereogeneityNV = 'N';
                if ( iam==0 )
                        printf(" N    V   %6d %6d  %3d  %3d %3d  ||  %3d     %c   %7.1e             %7.1e              %8.2f    %7.1e\n",
                                m,n,nb,nprow,npcol,infoNV,hetereogeneityNV,res_repres_NV/(S_NV[0]/S_NV[min_mn-1]),
                                orthVT_NV,MPIelapsedNV,S_NV[0]/S_NV[min_mn-1]);
                if (infoNV==min_mn+1) nbhetereogeneity++ ;
                else if ((res_repres_NV/eps/(S_NV[0]/S_NV[min_mn-1])>threshold)||(orthVT_NV/eps>threshold)||(infoNV!=0)) nbfailure++;
                }
/**/
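                /* Case jobU='V', jobVT='V': full SVD. This branch also runs when both jobs
                   are 'N', so that S_VV is available as a reference for the N/N case below. */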
                if ( ((jobU=='N')&(jobVT=='N')) || ((jobU=='V')&(jobVT=='V')) || (jobU == 'A' ) || (jobVT=='A') ) {
                nbtestcase++;   
                U_VV = (float *)calloc(mpU*nqU,sizeof(float)) ;
                if (U_VV==NULL){ printf("error of memory allocation U_VV on proc %dx%d\n",myrow,mycol); exit(0); }
                VT_VV = (float *)calloc(mpVT*nqVT,sizeof(float)) ;
                if (VT_VV==NULL){ printf("error of memory allocation VT_VV on proc %dx%d\n",myrow,mycol); exit(0); }
                S_VV = (float *)calloc(min_mn,sizeof(float)) ;
                if (S_VV==NULL){ printf("error of memory allocation S_VV on proc %dx%d\n",myrow,mycol); exit(0); }
                infoVV = driver_psgesvd( 'V', 'V', m, n, A, 1, 1, descA,
                        S_VV, U_VV, 1, 1, descU, VT_VV, 1, 1, descVT,
                        &MPIelapsedVV);
                orthU_VV  = verif_orthogonality(m,min_mn,U_VV , 1, 1, descU);
                orthVT_VV = verif_orthogonality(min_mn,n,VT_VV, 1, 1, descVT);
                residF =  verif_representativity( m, n,     A, 1, 1, descA,
                                                         U_VV, 1, 1, descU,
                                                        VT_VV, 1, 1, descVT,
                                                         S_VV);
                if (infoVV==min_mn+1) hetereogeneityVV = 'H'; else hetereogeneityVV = 'N';
                if ( iam==0 )
                        printf(" V    V   %6d %6d  %3d  %3d %3d  ||  %3d     %c   %7.1e   %7.1e   %7.1e              %8.2f    %7.1e\n",
                                m,n,nb,nprow,npcol,infoVV,hetereogeneityVV,residF,orthU_VV,orthVT_VV,MPIelapsedVV,S_VV[0]/S_VV[min_mn-1]);
                if (infoVV==min_mn+1) nbhetereogeneity++ ;
                else if ((residF/eps>threshold)||(orthU_VV/eps>threshold)||(orthVT_VV/eps>threshold)||(infoVV!=0)) nbfailure++;
                }
/**/
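                /* Case jobU='N', jobVT='N': singular values only, compared against the
                   S_VV reference computed above. */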
                if (((jobU=='N')&(jobVT=='N'))||(jobU == 'A' )||(jobVT=='A')){
                nbtestcase++;   
                S_NN = (float *)calloc(min_mn,sizeof(float)) ;
                if (S_NN==NULL){ printf("error of memory allocation S_NN on proc %dx%d\n",myrow,mycol); exit(0); }
                infoNN = driver_psgesvd( 'N', 'N', m, n, A, 1, 1, descA,
                        S_NN, U_NN, 1, 1, descU, VT_NN, 1, 1, descVT,
                        &MPIelapsedNN);
                S_res_NN = (float *)calloc(min_mn,sizeof(float)) ;
                if (S_res_NN==NULL){ printf("error of memory allocation S on proc %dx%d\n",myrow,mycol); exit(0); }
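                /* residS_NN = ||S_VV - S_NN||_2 / ||S_VV||_2, computed with BLAS scopy/saxpy/snrm2. */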
                scopy_(&min_mn,S_VV,&ione,S_res_NN,&ione);
                saxpy_ (&min_mn,&rtmone,S_NN,&ione,S_res_NN,&ione);
                residS_NN = snrm2_(&min_mn,S_res_NN,&ione) / snrm2_(&min_mn,S_VV,&ione);
                free(S_res_NN);
                if (infoNN==min_mn+1) hetereogeneityNN = 'H'; else hetereogeneityNN = 'N';
                if ( iam==0 )
                        printf(" N    N   %6d %6d  %3d  %3d %3d  ||  %3d     %c                                  %7.1e   %8.2f    %7.1e\n",
                                m,n,nb,nprow,npcol,infoNN,hetereogeneityNN,residS_NN,MPIelapsedNN,S_NN[0]/S_NN[min_mn-1]);
                if (infoNN==min_mn+1) nbhetereogeneity++ ;
                else if ((residS_NN/eps>threshold)||(infoNN!=0)) nbfailure++;
                }
/**/
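                /* Free the buffers allocated for this test case and release the process grid. */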
                if (((jobU=='V')&(jobVT=='N'))||(jobU == 'A' )||(jobVT=='A')){ free(S_VN); free(U_VN); }
                if (((jobU=='N')&(jobVT=='V'))||(jobU == 'A' )||(jobVT=='A')){ free(VT_NV); free(S_NV); }
                if (((jobU=='N')&(jobVT=='N'))||(jobU == 'A' )||(jobVT=='A')){ free(S_NN); }
                if (((jobU=='N')&(jobVT=='N'))||((jobU=='V')&(jobVT=='V'))||(jobU == 'A' )||(jobVT=='A')){ free(U_VV); free(S_VV); free(VT_VV);}
                free(A);
                Cblacs_gridexit( ictxt );
        }
/*
*     Print ending messages
*/
        }
        if ( iam==0 ){
                printf("--------------------------------------------------------------------------------------------------------------------\n");
                printf("               [ nbhetereogeneity = %d / %d ]\n",nbhetereogeneity, nbtestcase);
                printf("               [ nbfailure        = %d / %d ]\n",nbfailure, nbtestcase-nbhetereogeneity);
                printf("--------------------------------------------------------------------------------------------------------------------\n");
                printf("\n");
        }
/**/
        free(t_jobU  );
        free(t_jobVT );
        free(t_m     );
        free(t_n     );
        free(t_nb    );
        free(t_nprow );
        free(t_npcol );
        MPI_Finalize();
        exit(0);
}