slpp::int_t pdgesvdSlave(void* bufs[], size_t sizes[], unsigned count, bool debugOverwriteArgs) { // TODO: exit()S and SLAVE_ASSERT()s need to use MPI_abort() / blacs_abort() instead for(size_t i=0; i < count; i++) { if(DBG) { std::cerr << "doPdgesvd: buffer at:"<< bufs[i] << std::endl; std::cerr << "doPdgesvd: bufsize =" << sizes[i] << std::endl; } } if(count < NUM_BUFS) { std::cerr << "pdgesvdSlave: master sent " << count << " buffers, but " << NUM_BUFS << " are required." << std::endl; ::abort(); } // size check and get reference to args enum dummy {BUF_ARGS=0}; // NOTE bufs[BUF_ARGS] should not be referenced by pdgesvdSlave2 SLAVE_ASSERT_ALWAYS( sizes[BUF_ARGS] >= sizeof(PdgesvdArgs)); scidb::PdgesvdArgs args = *reinterpret_cast<PdgesvdArgs*>(bufs[BUF_ARGS]) ; // set up the scalapack grid, this has to be done before we generate the fake // problem slpp::int_t ICTXT=-1; // will be overwritten by sl_init sl_init_(ICTXT/*out*/, args.NPROW/*in*/, args.NPCOL/*in*/); // sl_init calls blacs_grid_init if(DBG) std::cerr << "pdgesvdSlave: sl_init(NPROW: "<<args.NPROW<<", NPCOL:"<<args.NPCOL<<") -> ICTXT: " << ICTXT << std::endl; // blacs_grid_info is legal after this // take a COPY of args, because we may have to overwrite it (for debug) when overwriteArgs is set // NOTE bufs[BUF_ARGS] should not be referenced after this point if(debugOverwriteArgs) { slpp::int_t NPROW, NPCOL, MYPROW, MYPCOL, MYPNUM; getSlInfo(ICTXT/*in*/, NPROW/*in*/, NPCOL/*in*/, MYPROW/*out*/, MYPCOL/*out*/, MYPNUM/*out*/); size_t matrixCells = sizes[1]/sizeof(double); size_t matrixOrder = floor(sqrt(matrixCells)); // TODO: should be multiplied by NPROW*NPCOL args = pdgesvdGenTestArgs(ICTXT, NPROW, NPCOL, MYPROW, MYPCOL, MYPNUM, matrixOrder); } return pdgesvdSlave2(ICTXT, args, bufs, sizes, count); }
/// /// This is the new standard style for implementing a slave routine for a ScaLAPACK operator, /// in this case, pdgesvd_(). The difference from the old style is that the new style /// requires that the ScaLAPACK context, ICTXT, be provided. Until that requirement can be pushed /// up into the "mpi_slave_xxx" files, the existing pdgesvdSlave() routine will create the context /// and then call this routine. /// slpp::int_t pdgesvdSlave2(const slpp::int_t ICTXT, PdgesvdArgs args, void* bufs[], size_t sizes[], unsigned count) { // find out where I am in the scalapack grid slpp::int_t NPROW, NPCOL, MYPROW, MYPCOL, MYPNUM; getSlInfo(ICTXT/*in*/, NPROW/*in*/, NPCOL/*in*/, MYPROW/*out*/, MYPCOL/*out*/, MYPNUM/*out*/); if(NPROW != args.NPROW || NPCOL != args.NPCOL || MYPROW != args.MYPROW || MYPCOL != args.MYPCOL || MYPNUM != args.MYPNUM){ if(DBG) { std::cerr << "scalapack general parameter mismatch" << std::endl; std::cerr << "args NPROW:"<<args.NPROW<<" NPCOL:"<<args.NPCOL << "MYPROW:"<<args.MYPROW<<" MYPCOL:"<<args.MYPCOL<<"MYPNUM:"<<MYPNUM << std::endl; std::cerr << "ScaLAPACK NPROW:"<<NPROW<<" NPCOL:"<<NPCOL << "MYPROW:"<<MYPROW<<" MYPCOL:"<<MYPCOL<<"MYPNUM:"<<MYPNUM << std::endl; } } // setup MB,NB const slpp::int_t& M = args.A.DESC.M ; const slpp::int_t& N = args.A.DESC.N ; const slpp::int_t& MB = args.A.DESC.MB ; const slpp::int_t& NB = args.A.DESC.NB ; const slpp::int_t& LLD_A = args.A.DESC.LLD ; const slpp::int_t one = 1 ; const slpp::int_t LTD_A = std::max(one, numroc_( N, NB, MYPCOL, /*CSRC_A*/0, NPCOL )); const slpp::int_t& MP = LLD_A ; const slpp::int_t& NQ = LTD_A ; // size check A, S, U, VT slpp::int_t SIZE_A = MP * NQ ; slpp::int_t SIZE_S = std::min(M, N); slpp::int_t size_p = std::max(one, numroc_( SIZE_S, MB, MYPROW, /*RSRC_A*/0, NPROW )); slpp::int_t size_q = std::max(one, numroc_( SIZE_S, NB, MYPCOL, /*RSRC_A*/0, NPCOL )); slpp::int_t SIZE_U = MP * size_q; slpp::int_t SIZE_VT= size_p * NQ; if(DBG) { std::cerr << "##################################################" << std::endl; std::cerr << "####pdgesvdSlave##################################" << std::endl; std::cerr << "one:" << one << std::endl; std::cerr << "SIZE_S:" << SIZE_S << std::endl; std::cerr << "MB:" << MB << std::endl; std::cerr << "MYPROW:" << MYPROW << std::endl; std::cerr << "NPROW:" << NPROW << std::endl; } // TODO: >= because master is permitted to use a larger buffer // to allow to see if rounding up to chunksize eliminates some errors // before we put the roundUp formula everywhere SLAVE_ASSERT_ALWAYS(sizes[BUF_A] >= SIZE_A * sizeof(double)); SLAVE_ASSERT_ALWAYS(sizes[BUF_S] >= SIZE_S * sizeof(double)); if (args.jobU == 'V') { SLAVE_ASSERT_ALWAYS( sizes[BUF_U] >= SIZE_U *sizeof(double)); } if (args.jobVT == 'V') { SLAVE_ASSERT_ALWAYS( sizes[BUF_VT] >= SIZE_VT *sizeof(double)); } // sizes are correct, give the pointers their names double* A = reinterpret_cast<double*>(bufs[BUF_A]) ; double* S = reinterpret_cast<double*>(bufs[BUF_S]) ; double* U = reinterpret_cast<double*>(bufs[BUF_U]) ; double* VT = reinterpret_cast<double*>(bufs[BUF_VT]) ; // debug that the input is readable and show its contents if(DBG) { for(int ii=0; ii < SIZE_A; ii++) { std::cerr << "("<< MYPROW << "," << MYPCOL << ") A["<<ii<<"] = " << A[ii] << std::endl; } } if(false) { // debug that outputs are writeable: for(int ii=0; ii < SIZE_S; ii++) { S[ii] = -9999.0 ; } if (args.jobU == 'V') { for(int ii=0; ii < SIZE_U; ii++) { U[ii] = -9999.0 ; } } if (args.jobVT == 'V') { for(int ii=0; ii < SIZE_VT; ii++) { VT[ii] = -9999.0 ; } } } // ScaLAPACK: the DESCS are complete except for the correct context args.A.DESC.CTXT= ICTXT ; // note: no DESC for S, it is not distributed, all have a copy args.U.DESC.CTXT= ICTXT ; args.VT.DESC.CTXT= ICTXT ; if(DBG) { std::cerr << "pdgesvdSlave: argsBuf is: {" << std::endl; std::cerr << args << std::endl; std::cerr << "}" << std::endl << std::endl; std::cerr << "pdgesvdSlave: calling pdgesvd_ for computation, with args:" << std::endl ; std::cerr << "jobU: " << args.jobU << ", jobVT: " << args.jobVT << ", M: " << args.M << ", N: " << args.N << std::endl; std::cerr << "A: " << (void*)(A) << ", A.I: " << args.A.I << ", A.J: " << args.A.J << std::endl; std::cerr << ", A.DESC: " << args.A.DESC << std::endl; std::cerr << "S: " << (void*)(S) << std::endl; std::cerr << "U: " << (void*)(U) << ", U.I: " << args.U.I << ", U.J: " << args.U.J << std::endl; std::cerr << ", U.DESC: " << args.U.DESC << std::endl; std::cerr << "VT: " << (void*)(VT) << ", VT.I: " << args.VT.I << ", VT.J: " << args.VT.J << std::endl; std::cerr << ", VT.DESC: " << args.VT.DESC << std::endl; } if(DBG) std::cerr << "pdgesvdSlave calling PDGESVD to get work size" << std:: endl; slpp::int_t INFO = 0; double LWORK_DOUBLE; pdgesvd_(args.jobU, args.jobVT, args.M, args.N, A, args.A.I, args.A.J, args.A.DESC, S, U, args.U.I, args.U.J, args.U.DESC, VT, args.VT.I, args.VT.J, args.VT.DESC, &LWORK_DOUBLE, -1, INFO); if(INFO < 0) { // argument error std::cerr << "pdgesvdSlave(r:"<<MYPNUM<<"): " << "ERROR: pdgesvd_() for work size, argument error, argument # " << -INFO << std::endl; } else if(INFO == (std::min(args.M, args.N)+1)) { // should not happen when checking work size // heterogeneity detected (eigenvalues did not match on all nodes) std::cerr << "pdgesvdSlave(r:"<<MYPNUM<<"): " << "WARNING: pdgesvd_() for work size, eigenvalues did not match across all instances" << std::endl; } else if (INFO > 0) { // other + value of INFO // should not happen when checking work size // DBDSQR did not converge std::cerr << "pdgesvdSlave(r:"<<MYPNUM<<"): " << "ERROR: pdgesvd_() for work size, DBDSQR did not converge: " << INFO << std::endl; } if ( LWORK_DOUBLE < 0.0 || LWORK_DOUBLE > double(numeric_limits<slpp::int_t>::max())) { // Houston, we have a problem ... the user wants to do more than 1 instance // can handle through the slpp::int ... the size of which is determined // by which binary for ScaLAPACK/BLAS we are using .. .32-bit or 64-bit FORTRAN INTEGER // noting that 32-bit INTEGER is what is shipped with RHEL, CentOS, etc, even on // 64-bit systems, for some unknown reason. std::cerr << "pdgesvdSlave(r:"<<MYPNUM<<"): " << "ERROR: LWORK_DOUBLE, " << LWORK_DOUBLE << ", is too large for the ScaLAPACK API to accept" << INFO << std::endl; if (INFO >= 0) { // make up our own argument error... -22 (there are 20 arguments) INFO = -22; } return INFO; } slpp::int_t LWORK = int(LWORK_DOUBLE); // get the cast from SVDPhysical.cpp std::cerr << "pdgesvdSlave(): info: LWORK is " << LWORK << std::endl; // ALLOCATE an array WORK size LWORK boost::scoped_array<double> WORKtmp(new double[LWORK]); double* WORK = WORKtmp.get(); ////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////// if(DBG) std::cerr << "pdgesvdSlave: calling pdgesvd_ for computation." << std::endl ; INFO=0; pdgesvd_(args.jobU, args.jobVT, args.M, args.N, A, args.A.I, args.A.J, args.A.DESC, S, U, args.U.I, args.U.J, args.U.DESC, VT, args.VT.I, args.VT.J, args.VT.DESC, WORK, LWORK, INFO); slpp::int_t numToPrintAtStart=4 ; if (MYPNUM==0 && DBG) { int ii; // used in 2nd loop for(ii=0; ii < std::min(SIZE_S, numToPrintAtStart); ii++) { std::cerr << "pdgesvdSlave: S["<<ii<<"] = " << S[ii] << std::endl; } // now skip to numToPrintAtStart before the end, (without repeating) and print to the end // to see if the test cases are producing zero eigenvalues (don't want that) slpp::int_t numToPrintAtEnd=4; for(int ii=std::max(ii, SIZE_S-numToPrintAtEnd); ii < SIZE_S; ii++) { std::cerr << "pdgesvdSlave: S["<<ii<<"] = " << S[ii] << std::endl; } } if (DBG) { if (args.jobU == 'V') { for(int ii=0; ii < std::min(SIZE_U, numToPrintAtStart); ii++) { std::cerr << "pdgesvdSlave: U["<<ii<<"] = " << U[ii] << std::endl; } } if (args.jobVT == 'V') { for(int ii=0; ii < std::min(SIZE_VT, numToPrintAtStart); ii++) { std::cerr << "pdgesvdSlave: VT["<<ii<<"] = " << VT[ii] << std::endl; } } } if(MYPNUM == 0) { if(INFO < 0) { // argument error std::cerr << "pdgesvdSlave(r:"<<MYPNUM<<"): " << "ERROR: argument error, argument # " << -INFO << std::endl; } else if(INFO == (std::min(args.M, args.N)+1)) { // heterogeneity detected (eigenvalues did not match on all nodes) std::cerr << "pdgesvdSlave(r:"<<MYPNUM<<"): " << "WARNING: eigenvalues did not match across all instances" << std::endl; } else if (INFO > 0) { // other + value of INFO // DBDSQR did not converge std::cerr << "pdgesvdSlave(r:"<<MYPNUM<<"): " << "ERROR: DBDSQR did not converge: " << INFO << std::endl; } } return INFO ; }
/// /// @return INFO = the status of the psgemm_() /// slpp::int_t pdgemmSlave(void* bufs[], size_t sizes[], unsigned count) { enum dummy {BUF_ARGS=0, BUF_A, BUF_B, BUF_C, NUM_BUFS }; for(size_t i=0; i < count; i++) { if(DBG) { std::cerr << "pdgemmSlave: buffer at:"<< bufs[i] << std::endl; std::cerr << "pdgemmSlave: bufsize =" << sizes[i] << std::endl; } } if(count < NUM_BUFS) { std::cerr << "pdgemmSlave: master sent " << count << " buffers, but " << NUM_BUFS << " are required." << std::endl; ::exit(99); // something that does not look like a signal } // take a COPY of args (because we will have to patch DESC.CTXT) scidb::PdgemmArgs args = *reinterpret_cast<PdgemmArgs*>(bufs[BUF_ARGS]) ; if(DBG) { std::cerr << "pdgemmSlave: args {" << std::endl ; std::cerr << args << std::endl; std::cerr << "}" << std::endl ; } // set up the scalapack grid if(DBG) std::cerr << "pdgemmSlave: NPROW:"<<args.NPROW<<" NPCOL:"<<args.NPCOL<<std::endl; slpp::int_t ICTXT=-1; // will be overwritten by sl_init // call scalapack tools routine to initialize a scalapack grid and give us its // context sl_init_(ICTXT/*out*/, args.NPROW/*in*/, args.NPCOL/*in*/); slpp::int_t NPROW=-1, NPCOL=-1, MYPROW=-1, MYPCOL=-1, MYPNUM=-1; // illegal vals getSlaveBLACSInfo(ICTXT/*in*/, NPROW, NPCOL, MYPROW, MYPCOL, MYPNUM); if(NPROW != args.NPROW || NPCOL != args.NPCOL || MYPROW != args.MYPROW || MYPCOL != args.MYPCOL || MYPNUM != args.MYPNUM){ if(DBG) { std::cerr << "scalapack general parameter mismatch" << std::endl; std::cerr << "args NPROW:"<<args.NPROW<<" NPCOL:"<<args.NPCOL << "MYPROW:"<<args.MYPROW<<" MYPCOL:"<<args.MYPCOL<<"MYPNUM:"<<MYPNUM << std::endl; std::cerr << "ScaLAPACK NPROW:"<<NPROW<<" NPCOL:"<<NPCOL << "MYPROW:"<<MYPROW<<" MYPCOL:"<<MYPCOL<<"MYPNUM:"<<MYPNUM << std::endl; } } const slpp::int_t one = 1 ; const slpp::int_t LTD_A = std::max(one, numroc_( args.A.DESC.N, args.A.DESC.NB, MYPCOL, /*CSRC_A*/0, NPCOL )); const slpp::int_t LTD_B = std::max(one, numroc_( args.B.DESC.N, args.B.DESC.NB, MYPCOL, /*CSRC_B*/0, NPCOL )); const slpp::int_t LTD_C = std::max(one, numroc_( args.C.DESC.N, args.C.DESC.NB, MYPCOL, /*CSRC_C*/0, NPCOL )); if(DBG) { std::cerr << "##################################################" << std::endl; std::cerr << "####pdgemmSlave##################################" << std::endl; std::cerr << "one:" << one << std::endl; std::cerr << "args.A.DESC.MB:" << args.A.DESC.MB << std::endl; std::cerr << "MYPROW:" << MYPROW << std::endl; std::cerr << "NPROW:" << NPROW << std::endl; } // size check args SLAVE_ASSERT_ALWAYS( sizes[BUF_ARGS] >= sizeof(PdgemmArgs)); // size check A,B,C -- debugs first slpp::int_t SIZE_A = args.A.DESC.LLD * LTD_A ; slpp::int_t SIZE_B = args.B.DESC.LLD * LTD_B ; slpp::int_t SIZE_C = args.C.DESC.LLD * LTD_C ; if(DBG) { if(sizes[BUF_A] != SIZE_A *sizeof(double)) { std::cerr << "sizes[BUF_A]:" << sizes[BUF_A] << " != args.A.DESC.LLD:" << args.A.DESC.LLD << "* LTD_A" << LTD_A << "*" << sizeof(double) << std::endl; } if(sizes[BUF_B] != SIZE_B *sizeof(double)) { std::cerr << "sizes[BUF_B]:" << sizes[BUF_B] << " != args.B.DESC.LLD:" << args.B.DESC.LLD << "* LTD_B" << LTD_B << "*" << sizeof(double) << std::endl; } if(sizes[BUF_C] != SIZE_C *sizeof(double)) { std::cerr << "sizes[BUF_C]:" << sizes[BUF_C] << " != args.C.DESC.LLD:" << args.C.DESC.LLD << "* LTD_C" << LTD_C << "*" << sizeof(double) << std::endl; } } SLAVE_ASSERT_ALWAYS(sizes[BUF_A] >= SIZE_A * sizeof(double)); SLAVE_ASSERT_ALWAYS(sizes[BUF_B] >= SIZE_B * sizeof(double)); SLAVE_ASSERT_ALWAYS(sizes[BUF_C] >= SIZE_C * sizeof(double)); // sizes are correct, give the pointers their names double* A = reinterpret_cast<double*>(bufs[BUF_A]) ; double* B = reinterpret_cast<double*>(bufs[BUF_B]) ; double* C = reinterpret_cast<double*>(bufs[BUF_C]) ; // debug that the input is readable and show its contents if(DBG) { for(int ii=0; ii < SIZE_A; ii++) { std::cerr << "Pgrid("<< MYPROW << "," << MYPCOL << ") A["<<ii<<"] = " << A[ii] << std::endl; } for(int ii=0; ii < SIZE_B; ii++) { std::cerr << "Pgrid("<< MYPROW << "," << MYPCOL << ") B["<<ii<<"] = " << B[ii] << std::endl; } for(int ii=0; ii < SIZE_C; ii++) { std::cerr << "Pgrid("<< MYPROW << "," << MYPCOL << ") C["<<ii<<"] = " << C[ii] << std::endl; } } // ScaLAPACK: the DESCS are complete except for the correct context args.A.DESC.CTXT= ICTXT ; // (no DESC for S) args.B.DESC.CTXT= ICTXT ; args.C.DESC.CTXT= ICTXT ; if(true || DBG) { // we'll leave this on in Cheshire.0 and re-evaluate later std::cerr << "pdgemmSlave: argsBuf is: {" << std::endl; std::cerr << args << std::endl; std::cerr << "}" << std::endl << std::endl; std::cerr << "pdgemmSlave: calling pdgemm_ for computation, with args:" << std::endl ; std::cerr << "TRANSA: " << args.TRANSA << ", TRANSB: " << args.TRANSB << ", M: " << args.M << ", N: " << args.N << ", K: " << args.K << std::endl; std::cerr << "ALPHA: " << args.ALPHA << std::endl; std::cerr << "A: " << (void*)(A) << ", A.I: " << args.A.I << ", A.J: " << args.A.J << std::endl; std::cerr << ", A.DESC: " << args.A.DESC << std::endl; std::cerr << "B: " << (void*)(B) << ", B.I: " << args.B.I << ", B.J: " << args.B.J << std::endl; std::cerr << ", B.DESC: " << args.B.DESC << std::endl; std::cerr << "BETA: " << args.BETA << std::endl; std::cerr << "C: " << (void*)(C) << ", C.I: " << args.C.I << ", C.J: " << args.C.J << std::endl; std::cerr << ", C.DESC: " << args.C.DESC << std::endl; } ////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////// pdgemm_( args.TRANSA, args.TRANSB, args.M, args.N, args.K, &args.ALPHA, A, args.A.I, args.A.J, args.A.DESC, B, args.B.I, args.B.J, args.B.DESC, &args.BETA, C, args.C.I, args.C.J, args.C.DESC); if(true || DBG) { // we'll leave this on in Cheshire.0 and re-evaluate later std::cerr << "pdgemmSlave: pdgemm_ complete (pdgemm_ has no result INFO)" << std::endl; } if (DBG) { std::cerr << "pdgemmSlave outputs: {" << std::endl; // debug prints of the outputs: for(int ii=0; ii < SIZE_C; ii++) { std::cerr << " C["<<ii<<"] = " << C[ii] << std::endl; } std::cerr << "}" << std::endl; } // TODO: what is the check on the pdgemm_ (pblas call) for successful completion? if (DBG) std::cerr << "pdgemmSlave returning successfully:" << std::endl; slpp::int_t INFO = 0 ; return INFO ; }