void MpiLauncher::completeLaunch(pid_t pid, const std::string& pidFile, int status) { // rm args file boost::scoped_ptr<SharedMemoryIpc> shmIpc(mpi::newSharedMemoryIpc(_ipcName)); shmIpc->remove(); shmIpc.reset(); // rm pid file scidb::File::remove(pidFile.c_str(), false); // rm log file if (!logger->isTraceEnabled() && !_inError) { string logFileName = mpi::getLauncherLogFile(_installPath, _queryId, _launchId); scidb::File::remove(logFileName.c_str(), false); } if (WIFSIGNALED(status)) { LOG4CXX_ERROR(logger, "SciDB MPI launcher (pid="<<pid<<") terminated by signal = " << WTERMSIG(status) << (WCOREDUMP(status)? ", core dumped" : "")); throw SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_OPERATION_FAILED) << "MPI launcher process"; } else if (WIFEXITED(status)) { int rc = WEXITSTATUS(status); if (rc != 0) { LOG4CXX_ERROR(logger, "SciDB MPI launcher (pid="<<_pid<<") exited with status = " << rc); throw SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_OPERATION_FAILED) << "MPI launcher process"; } else { LOG4CXX_DEBUG(logger, "SciDB MPI launcher (pid="<<_pid<<") exited with status = " << rc); return; } } throw SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNREACHABLE_CODE); }
void MpiSlaveProxy::destroy(bool error) { QueryID queryIdForKill(INVALID_QUERY_ID); if (error) { _inError=true; queryIdForKill = _queryId; } const string clusterUuid = Cluster::getInstance()->getUuid(); // kill the slave proc and its parent orted for ( std::vector<pid_t>::const_iterator iter=_pids.begin(); iter!=_pids.end(); ++iter) { pid_t pid = *iter; //XXX TODO tigor: kill proceess group (-pid) ? LOG4CXX_DEBUG(logger, "MpiSlaveProxy::destroy: killing slave pid = "<<pid); MpiErrorHandler::killProc(_installPath, clusterUuid, pid, queryIdForKill); } std::string pidFile = mpi::getSlavePidFile(_installPath, _queryId, _launchId); MpiErrorHandler::cleanupSlavePidFile(_installPath, clusterUuid, pidFile, queryIdForKill); // rm log file if (!logger->isTraceEnabled() && !_inError) { string logFileName = mpi::getSlaveLogFile(_installPath, _queryId, _launchId); scidb::File::remove(logFileName.c_str(), false); } }
void init() { //calculate log directory #ifdef _MSC_VER std::string log_file_name_str = ros::file_log::getLogDirectory() + "/rosout.log"; LOG4CXX_DECODE_CHAR(log_file_name, log_file_name_str); // this instantiates log_file_name as type LogString as well std::string empty_str = ""; LOG4CXX_DECODE_CHAR(log_empty, empty_str); #else std::string log_file_name = ros::file_log::getLogDirectory() + "/rosout.log"; std::string log_empty = ""; #endif logger_ = log4cxx::Logger::getRootLogger(); log4cxx::LayoutPtr layout = new log4cxx::PatternLayout(log_empty); log4cxx::RollingFileAppenderPtr appender = new log4cxx::RollingFileAppender(layout, log_file_name, true); logger_->addAppender( appender ); appender->setMaximumFileSize(100*1024*1024); appender->setMaxBackupIndex(10); log4cxx::helpers::Pool pool; appender->activateOptions(pool); std::cout << "logging to " << log_file_name.c_str() << std::endl; LOG4CXX_INFO(logger_, "\n\n" << ros::Time::now() << " Node Startup\n"); agg_pub_ = node_.advertise<rosgraph_msgs::Log>("/rosout_agg", 0); std::cout << "re-publishing aggregated messages to /rosout_agg" << std::endl; rosout_sub_ = node_.subscribe("/rosout", 0, &Rosout::rosoutCallback, this); std::cout << "subscribed to /rosout" << std::endl; }
/**
 * Check whether the logger has a rolling file appender whose configured
 * file passes DirectoryExists().
 *
 * @param logger  logger whose appenders are inspected
 * @return true when path validation is disabled (activatePathValidation is
 *         false) or when at least one RollingFileAppender's file passes the
 *         check; false otherwise
 *
 * Each appender is downcast with dynamic_cast on the pointee. The previous
 * code cast the address of the smart pointer itself, which is undefined
 * behaviour, and leaked a temporary appender on the early-return path.
 */
bool LogImpl::validatePath(log4cxx::LoggerPtr logger)
{
    if (!activatePathValidation) {
        return true;
    }

    vector<AppenderPtr> appenders = logger->getAllAppenders();
    for (size_t i = 0; i < appenders.size(); ++i) {
        if (!appenders[i]) {
            continue;   // nothing to inspect
        }

        // &* yields the raw Appender* from the smart pointer
        RollingFileAppender* rolling = dynamic_cast<RollingFileAppender*>(&*appenders[i]);
        if (!rolling) {
            continue;   // not a rolling file appender
        }

        std::string currentFile;
        log4cxx::helpers::Transcoder::encode(rolling->getFile(), currentFile);
        if (DirectoryExists(currentFile)) {
            return true;
        }
    }
    return false;
}
// XXX TODO: consider returning std::vector<scidb::SharedMemoryPtr> // XXX TODO: which would require supporting different types of memory (double, char etc.) std::vector<MPIPhysical::SMIptr_t> MPIPhysical::allocateMPISharedMemory(size_t numBufs, size_t elemSizes[], size_t numElems[], string dbgNames[]) { LOG4CXX_DEBUG(logger, "MPIPhysical::allocateMPISharedMemory(numBufs "<<numBufs<<",,,)"); if(logger->isTraceEnabled()) { LOG4CXX_TRACE(logger, "MPIPhysical::allocateMPISharedMemory(): allocations are: "); for(size_t ii=0; ii< numBufs; ii++) { LOG4CXX_TRACE(logger, "MPIPhysical::allocateMPISharedMemory():" << " elemSizes["<<ii<<"] "<< dbgNames[ii] << " len " << numElems[ii]); } } std::vector<SMIptr_t> shmIpc(numBufs); bool preallocate = Config::getInstance()->getOption<bool>(CONFIG_PREALLOCATE_SHM); for(size_t ii=0; ii<numBufs; ii++) { std::stringstream suffix; suffix << "." << ii ; std::string ipcNameFull= _ipcName + suffix.str(); LOG4CXX_TRACE(logger, "IPC name = " << ipcNameFull); shmIpc[ii] = SMIptr_t(mpi::newSharedMemoryIpc(ipcNameFull, preallocate)); // can I get 'em off ctx instead? _ctx->addSharedMemoryIpc(_launchId, shmIpc[ii]); char* ptr = MpiLauncher::initIpcForWrite(shmIpc[ii].get(), (elemSizes[ii] * numElems[ii])); assert(ptr); ptr=ptr; } return shmIpc; }
/**
 * Log the [getStartMin(), getEndMax()] range of every dimension at DEBUG level.
 *
 * @param prefix  text placed at the start of every log line
 * @param dims    dimensions to describe
 *
 * Does nothing when DEBUG logging is disabled.
 */
void log4cxx_debug_dimensions(const std::string& prefix, const Dimensions& dims)
{
    if (!logger->isDebugEnabled()) {
        return;
    }

    for (size_t d = 0; d < dims.size(); ++d) {
        LOG4CXX_DEBUG(logger, prefix << " dims["<<d<<"] from " << dims[d].getStartMin()
                                     << " to " << dims[d].getEndMax());
    }
}
/**
 * printf-style logging front end for the "App" log4cxx logger.
 *
 * @param level   one of APP_LOG_DEBUG / APP_LOG_INFO / APP_LOG_WARN /
 *                APP_LOG_ERR; any other value is logged at debug level
 * @param format  printf-style format string
 * @param ...     arguments for the format string
 *
 * The message is formatted into a 4096-byte buffer; longer messages are
 * truncated. The buffer is always NUL-terminated before it is logged.
 */
void __cdecl AppLog(int level, const char *format, ...)
{
    static log4cxx::LoggerPtr logger(Logger::getLogger("App"));

    char szBuffer[4096] = "";

    va_list args;
    va_start(args, format);
#if defined(_WIN32)
    // _vsnprintf does not write a terminator when the output fills the
    // buffer, so reserve the last byte for the one added below.
    _vsnprintf(szBuffer, sizeof(szBuffer) - 1, format, args);
#else
    if (vsnprintf(szBuffer, sizeof(szBuffer), format, args) < 0) {
        szBuffer[0] = '\0';   // formatting failed: log an empty message
    }
#endif
    va_end(args);
    szBuffer[sizeof(szBuffer) - 1] = '\0';

    switch (level) {
    case APP_LOG_INFO:
        logger->info(szBuffer);
        break;
    case APP_LOG_WARN:
        logger->warn(szBuffer);
        break;
    case APP_LOG_ERR:
        logger->error(szBuffer);
        break;
    case APP_LOG_DEBUG:
    default:
        logger->debug(szBuffer);
        break;
    }
}
/**
 * Set a logger's level and attach its default appenders.
 *
 * @param logger  logger to configure
 * @param level   level to set on the logger
 * @param fname   log file name; empty means "no file"
 * @param dual    when true, log to both the file and std::cout
 *
 * A "COUT" appender is attached when there is no file name or dual mode is
 * requested; a "FILE" appender is attached when a file name is given.
 *
 * @throws std::logic_error if dual mode is requested without a file name
 */
void configure_default_logger(log4cxx::LoggerPtr logger, log4cxx::LevelPtr level, std::string fname, bool dual)
{
    const bool hasFile = !fname.empty();

    if (dual && !hasFile)
        throw std::logic_error("dual log mode requires a filename");

    logger->setLevel(level);

    const bool wantConsole = dual || !hasFile;
    if (wantConsole) {
        log4cxx::AppenderPtr consoleAppender = logger_write_to_cout(logger);
        consoleAppender->setName("COUT");
    }

    if (hasFile) {
        log4cxx::AppenderPtr fileAppender = logger_write_to_file(fname, logger);
        fileAppender->setName("FILE");
    }
}
void MpiSlaveProxy::destroy(bool error) { if (error) { _inError=true; } // kill the slave proc and its parent orted for ( std::vector<pid_t>::const_iterator iter=_pids.begin(); iter!=_pids.end(); ++iter) { pid_t pid = *iter; //XXX TODO tigor: kill proceess group (-pid) ? MpiErrorHandler::killProc(_installPath, pid); } // rm pid file std::string pidFile = mpi::getSlavePidFile(_installPath, _queryId, _launchId); scidb::File::remove(pidFile.c_str(), false); // rm log file if (!logger->isTraceEnabled() && !_inError) { string logFileName = mpi::getSlaveLogFile(_installPath, _queryId, _launchId); scidb::File::remove(logFileName.c_str(), false); } }
void log_interface(int severity, const char *msg) { static log4cxx::LoggerPtr logger(Logger::getLogger("Libevent")); printf("%s", msg); switch(severity) { case EVENT_LOG_DEBUG: logger->debug(msg); break; case EVENT_LOG_MSG: logger->info(msg); break; case EVENT_LOG_WARN: logger->warn(msg); break; case EVENT_LOG_ERR: logger->error(msg); break; default: logger->debug(msg); } }
void TracingDemoModel::populateDetectedBlobs_CentroidPerNeighbourBugsGroup(const std::vector<BugCreature>& bugs, std::vector<DetectedBlob>& resultBlobs) { const float CloseBlobsDist = 7; std::vector<uchar> processedBugs(bugs.size(), false); for (size_t i = 0; i < bugs.size(); ++i) { if (processedBugs[i]) continue; processedBugs[i] = true; auto& bug = bugs[i]; float centroidX = bug.Pos.x(); float centroidY = bug.Pos.y(); int neighboursCount = 1; for (size_t j = i + 1; j < bugs.size(); ++j) { if (processedBugs[j]) continue; auto& nghBug = bugs[j]; float len = PoolWatch::sqr(nghBug.Pos.x() - bug.Pos.x()) + PoolWatch::sqr(nghBug.Pos.y() - bug.Pos.y()); len = std::sqrtf(len); if (len < CloseBlobsDist) { centroidX += nghBug.Pos.x(); centroidY += nghBug.Pos.y(); neighboursCount++; processedBugs[j] = true; } } centroidX /= neighboursCount; centroidY /= neighboursCount; const int blobW = 10; const int blobH = 10; DetectedBlob blob; blob.Id = i + 1; blob.Centroid = cv::Point2f(centroidX,centroidY); blob.CentroidWorld = cv::Point3f(centroidX, centroidY, 0); blob.BoundingBox = cv::Rect2f(centroidX - 5, centroidY-5, blobW,blobH); blob.FilledImage = cv::Mat(blobW, blobH, CV_8UC1); blob.FilledImage.setTo(255); fixBlobFilledImageRgb(bug, blobW, blobH, blob); blob.AreaPix = blobW * blobH; resultBlobs.push_back(blob); } if (log_->isDebugEnabled()) { std::stringstream bld; bld << "Found " << resultBlobs.size() << " blobs" << std::endl; for (const auto& blob : resultBlobs) bld << " Id=" << blob.Id << " Centroid=" << blob.Centroid << std::endl; LOG4CXX_DEBUG(log_, bld.str()); } }
int main(int argc, char** argv) { log4cxx::PropertyConfigurator::configure("logger.conf"); Util::SyncHandler spipe(SIGPIPE, [](int, siginfo_t *, void *){;}); po::options_description desc("Program options"); desc.add_options() ("help,h", "show usage information") ("port", po::value<int>(&opt_port), "bind port (2080)") ("perf", po::bool_switch(&opt_perf), "turn off logging") ; //("coro", po::bool_switch(&opt_coro), "enable coro server") //("time", po::value<int>(&opt_time), "time to perform call, us (250000)") //("threads", po::value<int>(&opt_threads), "number of worker threads (no threads)") po::variables_map vm; try { po::store(po::parse_command_line(argc, argv, desc), vm); } catch (const std::exception& e) { ERROR(e.what()); return -1; } po::notify(vm); if (vm.count("help")) { std::cout << desc << std::endl; return 0; } Server s; RPC::Library impl(s); boost::asio::io_service io; Util::ThreadGroup tg; boost::asio::signal_set signals(io, SIGINT, SIGTERM); signals.async_wait([&io](auto error, auto number){ if (!error) { INFO("terminating server"); io.stop(); } }); Util::Server server(io); server.run(opt_port, [&impl](boost::asio::streambuf& data, Util::FramedSocket ptr) { cbor::binary result; impl.process(data.data(), result); Util::FramedMessage reply(std::move(result)); ptr->write(std::move(reply)); }); // start event loop tg.run([&io](){ log4cxx::NDC ndc("server"); TRACE("starting ... "); io.run(); TRACE("terminated"); }); if (opt_perf) logger->setLevel(log4cxx::Level::getFatal()); tg.wait(); return 0; }
void ProxyServer::run() { TRACE(std::cout, ""); zmq::context_t context(1); zmq::socket_t socket(context, ZMQ_REP); std::string addr = m_protocol + "://*:" + m_port; socket.bind(addr.c_str()); while (true) { TRACE(std::cout, ""); zmq::message_t z_req; socket.recv(&z_req); std::string s_req((char *)z_req.data(), z_req.size()); Request req; req.ParseFromString(s_req); // if trace is in Request_Header, then change log level of appender // to trace if (m_msgDriver->requestHasTrace(req)) { TRACE(std::cout, ""); logger->setLevel(log4cxx::Level::getTrace()); } if (m_reqPrinter) { TRACE(std::cout, ""); LOG4CXX_TRACE(logger, "Request: "<<(m_reqPrinter(req))); } Response *resp = NULL; // All the anlytics will be logged in this function switch (req.header().type()) { case Request_Header_Type_LOOKUP: { TRACE(std::cout, ""); try { m_worker->lookup(req); } catch (KeyNotFoundException &e) { TRACE(std::cout, ""); LOG4CXX_DEBUG(logger, "Key not found in store: " <<e.what()); } resp = m_worker->response(); break; } case Request_Header_Type_INSERT: { TRACE(std::cout, ""); try { m_worker->insert(req); } catch (KeyPresentException &e) { TRACE(std::cout, ""); LOG4CXX_DEBUG(logger, "Key already present in" " store: "<<e.what()); } catch (InsertionException &e) { TRACE(std::cout, ""); LOG4CXX_DEBUG(logger, "Insertion error: " <<e.what()); } resp = m_worker->response(); break; } case Request_Header_Type_REMOVE: { TRACE(std::cout, ""); try { m_worker->remove(req); } catch (KeyNotFoundException &e) { TRACE(std::cout, ""); LOG4CXX_DEBUG(logger, "Key not found in store: " <<e.what()); } catch (DeletionException &e) { TRACE(std::cout, ""); LOG4CXX_DEBUG(logger, "Deletion error: " <<e.what()); } resp = m_worker->response(); break; } default: { TRACE(std::cout, ""); break; } }; if (m_respPrinter) { TRACE(std::cout, ""); LOG4CXX_TRACE(logger, "Response : "<<(m_respPrinter(*resp))); } std::string s_resp; resp->SerializeToString(&s_resp); delete resp; zmq::message_t reply(s_resp.size()); memcpy((void 
*)reply.data(), (void *)s_resp.c_str(), s_resp.size()); socket.send(reply); } }
namespace scidb { using namespace scidb; using namespace boost; static log4cxx::LoggerPtr logger(log4cxx::Logger::getLogger("scidb.libdense_linear_algebra.ops.gemm")); static const bool DBG_CERR = false; static const bool DBG_REFORMAT = false; /** * A Physical multiply operator implemented using ScaLAPACK * The interesting work is done in invokeMPI(), above * */ class GEMMPhysical : public ScaLAPACKPhysical { public: GEMMPhysical(const std::string& logicalName, const std::string& physicalName, const Parameters& parameters, const ArrayDesc& schema) : ScaLAPACKPhysical(logicalName, physicalName, parameters, schema) { } std::shared_ptr<Array> invokeMPI(std::vector< std::shared_ptr<Array> >& inputArrays, const GEMMOptions options, std::shared_ptr<Query>& query, ArrayDesc& outSchema); virtual std::shared_ptr<Array> execute(std::vector< std::shared_ptr<Array> >& inputArrays, std::shared_ptr<Query> query); private: }; char getTransposeCode(bool transpose) { return transpose ? 'T' : 'N' ; } std::shared_ptr<Array> GEMMPhysical::invokeMPI(std::vector< std::shared_ptr<Array> >& inputArrays, const GEMMOptions options, std::shared_ptr<Query>& query, ArrayDesc& outSchema) { // // Everything about the execute() method concerning the MPI execution of the arrays // is factored into this method. This does not include the re-distribution of data // chunks into the ScaLAPACK distribution scheme, as the supplied inputArrays // must already be in that scheme. // // + intersects the array chunkGrids with the maximum process grid // + sets up the ScaLAPACK grid accordingly and if not participating, return early // + start and connect to an MPI slave process // + create ScaLAPACK descriptors for the input arrays // + convert the inputArrays into in-memory ScaLAPACK layout in shared memory // + call a "master" routine that passes the ScaLAPACK operator name, parameters, // and shared memory descriptors to the ScaLAPACK MPI process that will do the // actual computation. 
// + wait for successful completion // + construct an "OpArray" that make and Array API view of the output memory. // + return that output array. // enum dummy {R=0, C=1}; // row column enum dummy2 {AA=0, BB, CC, NUM_MATRICES}; // which matrix: alpha AA * BB + beta CC -> result LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI(): begin"); size_t numArray = inputArrays.size(); if (numArray != NUM_MATRICES) { // for now ... may make CC optional when beta is 0, later LOG4CXX_ERROR(logger, "GEMMPhysical::invokeMPI(): " << numArray << " != NUM_MATRICES " << size_t(NUM_MATRICES)); throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_OPERATION_FAILED) << "GEMMPhysical::invokeMPI(): requires 3 input Arrays/matrices."); } // // Initialize the (emulated) BLACS and get the proces grid info // blacs::context_t blacsContext = doBlacsInit(inputArrays, query, "GEMMPhysical"); bool isParticipatingInScaLAPACK = blacsContext.isParticipating(); if (isParticipatingInScaLAPACK) { checkBlacsInfo(query, blacsContext, "GEMMPhysical"); } blacs::int_t NPROW=-1, NPCOL=-1, MYPROW=-1 , MYPCOL=-1 ; scidb_blacs_gridinfo_(blacsContext, NPROW, NPCOL, MYPROW, MYPCOL); LOG4CXX_TRACE(logger, "GEMMPhysical::invokeMPI() NPROW="<<NPROW<<", NPCOL="<<NPCOL); // // launch MPISlave if we participate // TODO: move this down into the ScaLAPACK code ... 
something that does // the doBlacsInit, launchMPISlaves, and the check that they agree // bool isParticipatingInMPI = launchMPISlaves(query, NPROW*NPCOL); if (isParticipatingInScaLAPACK != isParticipatingInMPI) { LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI():" << " isParticipatingInScaLAPACK " << isParticipatingInScaLAPACK << " isParticipatingInMPI " << isParticipatingInMPI); throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_OPERATION_FAILED) << "GEMMPhysical::invokeMPI(): internal inconsistency in MPI slave launch."); } if (isParticipatingInMPI) { LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI(): participating in MPI"); } else { LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI(): not participating in MPI"); LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI(): only participating in redistribute of the input"); // redistribute to psScaLAPACK // NOTE: this must be kept in sync with the particpatingInMPI version of the redistribute, below // NOTE: this redistribution must be kept in sync with the particpatingInMPI redistributeInputArrays, above procRowCol_t firstChunkSize = { chunkRow(inputArrays[0]), chunkCol(inputArrays[0]) }; std::shared_ptr<PartitioningSchemaDataForScaLAPACK> schemeData = make_shared<PartitioningSchemaDataForScaLAPACK>(getBlacsGridSize(inputArrays, query, "GEMMPhysical"), firstChunkSize); for(size_t mat=0; mat < numArray; mat++ ) { std::stringstream labelStream; labelStream << "GEMMPhysical input[" << mat << "]"; std::shared_ptr<Array> tmpRedistedInput = redistributeInputArray(inputArrays[mat], schemeData, query, labelStream.str()); bool wasConverted = (tmpRedistedInput != inputArrays[mat]) ; // only when redistribute was actually done (sometimes optimize away) if (wasConverted) { SynchableArray* syncArray = safe_dynamic_cast<SynchableArray*>(tmpRedistedInput.get()); syncArray->sync(); } // free potentially large amount of memory, e.g. 
when inputArrays[mat] was significantly memory-materialized inputArrays[mat].reset(); // TODO: validate that the redistribute brought no chunks to the instance by // getting an array iterator and make sure it returns no chunks // (factor to ScaLAPACKPhysical.cpp) // after validating, we don't need tmpRedistedInput anymore, either tmpRedistedInput.reset(); } unlaunchMPISlavesNonParticipating(); return std::shared_ptr<Array>(new MemArray(_schema,query)); // NOTE: must not happen before redistribute is done. } // // get dimension information about the input arrays // TODO: REFACTOR, this is a pattern in DLAs // // matrix sizes from arrays A,B,C matSize_t size[NUM_MATRICES]; // does not change even after redistributeInputArray for(size_t i=0; i < numArray; i++ ) { size[i] = getMatSize(inputArrays[i]); LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI():" << " size["<<i<<"] " << size[i][R] << "," << size[i][C]); } // TODO JHM : convert 1d arrays to nrows x 1 so we can use vectors as input to GEMM without requiring // the user to add a dimension of size 1. for(size_t i=0; i < numArray; i++ ) { // TODO JHM : check inputArrays[i] to make sure we are only using 2D arrays, // that may or may not be done by checkInputArrays checkInputArray(inputArrays[i]); // check block size constraints, etc } // //.... Set up ScaLAPACK array descriptors ........................................ // // we would like to do the automatic repart() [not yet implemented] inside the same loop as the // redistribute() and extractToScaLAPACK() in order to release each array after it is consumed. // unfortunately, we have made some of the routines below dependent on the MB,NB we are going to use, // which has recently become determined by the chunkSize of the inputArrays[] since it is no longer // a fixed value, but may vary over a legal range. // but when automatic repart() is done, we will want to use the chunksize of the output of the repart(). 
// so we will need to decide by this point what the MB,NB is going to be, even if we haven't reparted // to it yet. // to make it clear we mean ScaLAPACK MB,NB // (which may become different from the inputArray[i] chunkSize in the future) // we will call the array of ScaLAPACK MB,NB pairs, MB_NB[]. matSize_t MB_NB[NUM_MATRICES]; // this one should be moved after redistributeInputArrays() for when it really reparts for(size_t i=0; i < numArray; i++ ) { MB_NB[i] = getMatChunkSize(inputArrays[i]); LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI():" << " using MB_NB["<<i<<"] " << MB_NB[i][R] << "," << MB_NB[i][C]); } // these formulas for LLD (local leading dimension) and LTD (local trailing dimension) // are found in the headers of the ScaLAPACK functions such as pdgemm_() const slpp::int_t one = 1 ; // TODO: turn these pairs into matSize_t matrixLocalSize[NUM_MATRICES]; slpp::int_t LLD[NUM_MATRICES]; // local leading dimension slpp::int_t LTD[NUM_MATRICES]; // local trailing dimension for(size_t i=0; i < numArray; i++ ) { slpp::int_t RSRC = 0 ; LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI():" << " M["<<i<<"][R]"<<size[i][R] <<" MB["<<i<<"][R]:"<<MB_NB[i][R] << " N["<<i<<"][R]"<<size[i][C] <<" NB["<<i<<"][R]:"<<MB_NB[i][C] << " MYPROW:"<<MYPROW << " NPROW:"<< NPROW); LLD[i] = std::max(one, scidb_numroc_( size[i][R], MB_NB[i][R], MYPROW, RSRC, NPROW )); LTD[i] = std::max(one, scidb_numroc_( size[i][C], MB_NB[i][C], MYPCOL, RSRC, NPCOL )); LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI():" << " LLD["<<i<<"] = " << LLD[i] << " LTD["<<i<<"] = " << LTD[i]); } // create ScaLAPACK array descriptors // TODO: lets factor this to a method on ScaLAPACKPhysical slpp::desc_t DESC[NUM_MATRICES]; for(size_t i=0; i < numArray; i++ ) { LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI():" << " scidb_descinit_(DESC["<<i<<"], M=" << size[i][R] << ", N=" << size[i][C] << ", MB=" << MB_NB[i][R] << ", NB=" << MB_NB[i][R] << ", IRSRC=" << 0 << ", ICSRC=" << 0 << ", LLD=" << LLD[i]); 
slpp::int_t descinitINFO = 0; // an output implemented as non-const ref (due to Fortran calling conventions) scidb_descinit_(DESC[i], size[i][R], size[i][C], MB_NB[i][R], MB_NB[i][C], 0, 0, blacsContext, LLD[i], descinitINFO); if (descinitINFO != 0) { LOG4CXX_ERROR(logger, "GEMMPhysical::invokeMPI(): scidb_descinit(DESC) failed, INFO " << descinitINFO << " DESC " << DESC); throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_OPERATION_FAILED) << "GEMMPhysical::invokeMPI(): scidb_descinit(DESC) failed"); } LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI():" << " scidb_descinit_() returned DESC["<<i<<"] " << DESC[i]); // debugging for #1986 ... when #instances is prime, process grid is a row. When small chunk sizes are used, // desc.LLD is being set to a number larger than the chunk size ... I don't understand or expect this. bool doDebugTicket1986=true; // remains on until fixed, can't ship with this not understood. if(doDebugTicket1986) { if (DESC[i].LLD > DESC[i].MB) { LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI(): ticket 1986 issue" << ", DESC["<<i<<"].LLD " << DESC[i].LLD << " > DESC["<<i<<"].MB: " << DESC[i].MB); } } } // matrix allocations are of local size, not global size size_t matrixLocalSize[NUM_MATRICES]; for(size_t i=0; i < numArray; i++ ) { matrixLocalSize[i] = size_t(LLD[i]) * LTD[i] ; LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI(): " << " LLD[" << i << "] ( " << LLD[i] << " ) x " << " LTD[" << i << "] ( " << LTD[i] << " ) = " << " matrixLocalSize[" << i << "] " << matrixLocalSize[i]); } // // Create IPC buffers // enum dummy3 {BUF_ARGS=0, BUF_MAT_AA, BUF_MAT_BB, BUF_MAT_CC, NUM_BUFS }; assert(numArray < NUM_BUFS); size_t bufElemBytes[NUM_BUFS]; size_t bufNumElem[NUM_BUFS]; std::string bufDbgNames[NUM_BUFS]; bufElemBytes[BUF_ARGS]= 1 ; bufNumElem[BUF_ARGS]= sizeof(scidb::PdgemmArgs) ; bufDbgNames[BUF_ARGS] = "PdgemmArgs"; bufElemBytes[BUF_MAT_AA]= sizeof(double) ; bufNumElem[BUF_MAT_AA]= matrixLocalSize[AA]; bufDbgNames[BUF_MAT_AA] = "A" ; 
bufElemBytes[BUF_MAT_BB]= sizeof(double) ; bufNumElem[BUF_MAT_BB]= matrixLocalSize[BB]; bufDbgNames[BUF_MAT_BB] = "B" ; bufElemBytes[BUF_MAT_CC]= sizeof(double) ; bufNumElem[BUF_MAT_CC]= matrixLocalSize[CC]; bufDbgNames[BUF_MAT_CC] = "C" ; typedef scidb::SharedMemoryPtr<double> shmSharedPtr_t ; for(size_t i=0; i < numArray; i++ ) { LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI(): " << " bufElemBytes[" << i << "] = " << bufElemBytes[i]); } std::vector<MPIPhysical::SMIptr_t> shmIpc = allocateMPISharedMemory(NUM_BUFS, bufElemBytes, bufNumElem, bufDbgNames); // the following used to determine the PDGEMM() "K" argument just prior to pdgemm, // but now it has to be done before inputArrays[AA] is .reset() in the following loop. // // Comments on PDGEMM input "K", taken from the netlib PDGEMM argument header: // If transa = 'T' or 'C'(true), it is the number of rows in submatrix A." // If transa = 'N'(false), it is the number of columns in submatrix A." slpp::int_t K = nCol(inputArrays[AA], options.transposeA); // the following also used to be done just prior to pdgemm, // but now must be done before inputArrays[CC] is .reset() in the following loop. // it must also now be a copy, and not a reference, for the same reason. Dimensions const dimsCC = inputArrays[CC]->getArrayDesc().getDimensions(); // now for each input matrix, do the following: // 1. redistribute to psScaLAPACK (when not already correct). // 2. if actual conversion happened, release the inputArray, which might be a lot of memory, e.g. when inputArray[i] is materialized. // 2. zero the ScaLAPACK local block-cyclic storage in shared mem. (so that empty cells will become zeros). // 3. extract the (redistributed array) where not-empty, into the ScaLAPACK local matrix memory. // 4. release the redistributed array, which might be a lot of memory since SG is currently materializing. 
// // The only caller of this routine is the execute() method, and neither the execute() method, nor the executor that calls it, // access the inputArrays after calling execute, which is why we can reset() the shared_ptrs to the arrays after consuming the // arrrays into the ScaLAPACK memory. // // redistribute to psScaLAPACK, and convert to ScaLAPACK format. // NOTE: this redistribution must be kept in sync with the particpatingInMPI redistributeInputArrays, above procRowCol_t firstChunkSize = { chunkRow(inputArrays[0]), chunkCol(inputArrays[0]) }; std::shared_ptr<PartitioningSchemaDataForScaLAPACK> schemeData = make_shared<PartitioningSchemaDataForScaLAPACK>(getBlacsGridSize(inputArrays, query, "GEMMPhysical"), firstChunkSize); double* asDoubles[NUM_MATRICES]; for(size_t mat=0; mat < numArray; mat++ ) { std::stringstream labelStream; labelStream << "GEMMPhysical input[" << mat << "]"; std::shared_ptr<Array> tmpRedistedInput = redistributeInputArray(inputArrays[mat], schemeData, query, labelStream.str()); bool wasConverted = (tmpRedistedInput != inputArrays[mat]) ; // only when redistribute was actually done (sometimes optimize away) // TODO would be nice if we could allocate the ScaLAPACK memory after dropping the input array // in case the physical memory for the shmem can be reclaimed from the reset inputArrays[mat] size_t buf= mat+1; // buffer 0 is command buffer, buffers[1..n] correspond to inputs[0..n-1] assert(buf < NUM_BUFS); asDoubles[mat] = reinterpret_cast<double*>(shmIpc[buf]->get()); setInputMatrixToAlgebraDefault(asDoubles[mat], bufNumElem[buf]); // note asDoubles[CC] is input and output to/from ScaLAPACK extractArrayToScaLAPACK(tmpRedistedInput, asDoubles[mat], DESC[mat],NPROW, NPCOL, MYPROW, MYPCOL, query); if(wasConverted) { SynchableArray* syncArray = safe_dynamic_cast<SynchableArray*>(tmpRedistedInput.get()); syncArray->sync(); } // free potentially large amount of memory, e.g. 
when inputArrays[mat] was significantly memory-materialized inputArrays[mat].reset(); tmpRedistedInput.reset(); // and drop this array before iterating on the loop to the next repart/redist if(DBG_REFORMAT) { // that the reformat worked correctly for(size_t ii=0; ii < matrixLocalSize[mat]; ii++) { LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI():" << " @myPPos("<< MYPROW << "," << MYPCOL << ")" << " array["<<mat<<"]["<<ii<<"] = " << asDoubles[mat][ii]); } } } size_t resultShmIpcIndx = BUF_MAT_CC; // by default, GEMM assumes it will return something for C // but this will change if find we don't particpate in the output shmSharedPtr_t Cx(shmIpc[resultShmIpcIndx]); // //.... Call pdgemm to compute the product of A and B ............................. // LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI(): calling pdgemm_ M,N,K:" << size[AA][R] << "," << size[BB][R] << "," << size[CC][C] << " MB,NB:" << MB_NB[AA][R] << "," << MB_NB[AA][C]); if(DBG_CERR) std::cerr << "GEMMPhysical::invokeMPI(): calling pdgemm to compute" << std:: endl; std::shared_ptr<MpiSlaveProxy> slave = _ctx->getSlave(_launchId); slpp::int_t MYPE = query->getInstanceID() ; // we map 1-to-1 between instanceID and MPI rank slpp::int_t INFO = DEFAULT_BAD_INFO ; pdgemmMaster(query.get(), _ctx, slave, _ipcName, shmIpc[BUF_ARGS]->get(), NPROW, NPCOL, MYPROW, MYPCOL, MYPE, getTransposeCode(options.transposeA), getTransposeCode(options.transposeB), size[CC][R], size[CC][C], K, &options.alpha, asDoubles[AA], one, one, DESC[AA], asDoubles[BB], one, one, DESC[BB], &options.beta, asDoubles[CC], one, one, DESC[CC], INFO); raiseIfBadResultInfo(INFO, "pdgemm"); boost::shared_array<char> resPtrDummy(reinterpret_cast<char*>(NULL)); typedef scidb::ReformatFromScalapack<shmSharedPtr_t> reformatOp_t ; if(logger->isTraceEnabled()) { LOG4CXX_TRACE(logger, "GEMMPhysical::invokeMPI():--------------------------------------"); LOG4CXX_TRACE(logger, "GEMMPhysical::invokeMPI(): sequential values from 'C' memory"); for(size_t 
ii=0; ii < matrixLocalSize[CC]; ii++) { LOG4CXX_TRACE(logger, "GEMMPhysical::invokeMPI(): ("<< MYPROW << "," << MYPCOL << ") C["<<ii<<"] = " << asDoubles[CC][ii]); } LOG4CXX_TRACE(logger, "GEMMPhysical::invokeMPI(): --------------------------------------"); LOG4CXX_TRACE(logger, "GEMMPhysical::invokeMPI(): using pdelgetOp to reformat Gemm left from memory to scidb array , start"); } // // an OpArray is a SplitArray that is filled on-the-fly by calling the operator // so all we have to do is create one with an upper-left corner equal to the // global position of the first local block we have. so we need to map // our "processor" coordinate into that position, which we do by multiplying // by the chunkSize // Coordinates first(2); first[R] = dimsCC[R].getStartMin() + MYPROW * MB_NB[CC][R]; first[C] = dimsCC[C].getStartMin() + MYPCOL * MB_NB[CC][C]; Coordinates last(2); last[R] = dimsCC[R].getStartMin() + size[CC][R] - 1; last[C] = dimsCC[C].getStartMin() + size[CC][C] - 1; std::shared_ptr<Array> result; // the process grid may be larger than the size of output in chunks... e.g multiplying A(1x100) * B(100x1) -> C(1x1) bool isParticipatingInOutput = first[R] <= last[R] && first[C] <= last[C] ; if (isParticipatingInOutput) { // there is in fact some output in our shared memory... 
hook it up to an OpArray Coordinates iterDelta(2); iterDelta[0] = NPROW * MB_NB[CC][R]; iterDelta[1] = NPCOL * MB_NB[CC][C]; LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI():Creating OpArray from ("<<first[R]<<","<<first[C]<<") to (" << last[R] <<"," <<last[C]<<") delta:"<<iterDelta[R]<<","<<iterDelta[C]); reformatOp_t pdelgetOp(Cx, DESC[CC], dimsCC[R].getStartMin(), dimsCC[C].getStartMin(), NPROW, NPCOL, MYPROW, MYPCOL); result = std::shared_ptr<Array>(new OpArray<reformatOp_t>(outSchema, resPtrDummy, pdelgetOp, first, last, iterDelta, query)); assert(resultShmIpcIndx == BUF_MAT_CC); } else { LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI(): instance participated, but does not output: creating empty MemArray: first ("<<first[R]<<","<<first[C]<<"), last(" << last[R] <<"," <<last[C]<<")"); result = std::shared_ptr<Array>(new MemArray(_schema,query)); // same as when we don't participate at all resultShmIpcIndx = shmIpc.size(); // indicate we don't want to hold on to buffer BUF_MAT_CC after all } // TODO: common pattern in ScaLAPACK operators: factor to base class releaseMPISharedMemoryInputs(shmIpc, resultShmIpcIndx); unlaunchMPISlaves(); LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI() end"); return result; } std::shared_ptr<Array> GEMMPhysical::execute(std::vector< std::shared_ptr<Array> >& inputArrays, std::shared_ptr<Query> query) { // // + converts inputArrays to psScaLAPACK distribution // + calls invokeMPI() // + returns the output OpArray. 
// LOG4CXX_DEBUG(logger, "GEMMPhysical::execute(): begin."); // TODO: make a GEMMLogical checkArgs(inputArrays, query); which asserts two or three arrays // get string of parameters from the optional 4th argument: // (TRANSA, TRANSB, ALPHA, BETA) std::string namedOptionStr; if (_parameters.size() >= 1) { assert(_parameters[0]->getParamType() == PARAM_PHYSICAL_EXPRESSION); typedef std::shared_ptr<OperatorParamPhysicalExpression> ParamType_t ; ParamType_t& paramExpr = reinterpret_cast<ParamType_t&>(_parameters[0]); assert(paramExpr->isConstant()); namedOptionStr = paramExpr->getExpression()->evaluate().getString(); } GEMMOptions options(namedOptionStr); // // invokeMPI() // // invokeMPI does not manage an empty bitmap yet, but it is specified in _schema. // so to make it compatible, we first create a copy of _schema without the empty tag attribute Attributes attrsNoEmptyTag = _schema.getAttributes(true /*exclude empty bitmap*/); ArrayDesc schemaNoEmptyTag(_schema.getName(), attrsNoEmptyTag, _schema.getDimensions(), defaultPartitioning()); // and now invokeMPI produces an array without empty bitmap except when it is not participating std::shared_ptr<Array> arrayNoEmptyTag = invokeMPI(inputArrays, options, query, schemaNoEmptyTag); // now we place a wrapper array around arrayNoEmptyTag, that adds a fake emptyTag (true everywhere) // but otherwise passes through requests for iterators on the other attributes. // And yes, the class name is the complete opposite of what it shold be. std::shared_ptr<Array> result; if (arrayNoEmptyTag->getArrayDesc().getEmptyBitmapAttribute() == NULL) { result = make_shared<NonEmptyableArray>(arrayNoEmptyTag); } else { result = arrayNoEmptyTag; } // return the scidb array LOG4CXX_DEBUG(logger, "GEMMPhysical::execute(): (successful) end"); return result; } REGISTER_PHYSICAL_OPERATOR_FACTORY(GEMMPhysical, "gemm", "GEMMPhysical"); } // namespace
namespace scidb { static log4cxx::LoggerPtr logger(log4cxx::Logger::getLogger("scidb.mpi")); static bool checkLauncher(uint32_t testDelay, uint64_t launchId, MpiOperatorContext* ctx) { std::shared_ptr<MpiLauncher> launcher(ctx->getLauncher(launchId)); if (isDebug()) { if (launcher) { // when running tests, slow down to give launcher a chance to exit ::sleep(testDelay); } } if (launcher && !launcher->isRunning()) { throw SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_OPERATION_FAILED) << "MPI launcher process already terminated"; } return true; } static bool checkTimeout(double startTime, double timeout, uint64_t launchId, MpiOperatorContext* ctx) { if (mpi::hasExpired(startTime, timeout)) { throw SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_OPERATION_FAILED) << "MPI slave process failed to communicate in time"; } return true; } static bool checkLauncherWithTimeout(uint32_t testDelay, double startTime, double timeout, uint64_t launchId, MpiOperatorContext* ctx) { bool rc = checkLauncher(testDelay, launchId, ctx); assert(rc); return (checkTimeout(startTime, timeout, launchId, ctx) && rc); } void MpiSlaveProxy::waitForHandshake(std::shared_ptr<MpiOperatorContext>& ctx) { if (_connection) { throw (InvalidStateException(REL_FILE, __FUNCTION__, __LINE__) << "Connection to MPI slave already established"); } LOG4CXX_DEBUG(logger, "MpiSlaveProxy::waitForHandshake: launchId="<<_launchId); MpiOperatorContext::LaunchErrorChecker errChecker = boost::bind(&checkLauncherWithTimeout, _delayForTestingInSec, mpi::getTimeInSecs(), static_cast<double>(_MPI_SLAVE_RESPONSE_TIMEOUT), _1, _2); std::shared_ptr<scidb::ClientMessageDescription> msg = ctx->popMsg(_launchId, errChecker); assert(msg); _connection = msg->getClientContext(); if (msg->getMessageType() != scidb::mtMpiSlaveHandshake) { throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR) << "MPI slave handshake is invalid"); } std::shared_ptr<scidb_msg::MpiSlaveHandshake> handshake = 
std::dynamic_pointer_cast<scidb_msg::MpiSlaveHandshake>(msg->getRecord()); assert(handshake); // parse the handshake if (!handshake->has_pid()) { throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR) << "MPI slave handshake has no PID"); } const pid_t slavePid = handshake->pid(); if (slavePid == ::getpid() || slavePid == ::getppid() || slavePid < 2) { throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR) << "MPI slave handshake has invalid PID"); } if (!handshake->has_ppid()) { throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR) << "MPI slave handshake has no PPID"); } const pid_t slavePPid = handshake->ppid(); if (slavePPid == ::getpid() || slavePPid == ::getppid() || slavePPid < 2) { throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR) << "MPI slave handshake has invalid PPID"); } const string clusterUuid = Cluster::getInstance()->getUuid(); if (handshake->cluster_uuid() != clusterUuid) { throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR) << "MPI slave handshake has invalid clusterUuid"); } InstanceID instanceId = Cluster::getInstance()->getLocalInstanceId(); if (handshake->instance_id() != instanceId) { throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR) << "MPI slave handshake has invalid instanceId"); } if (handshake->launch_id() != _launchId) { throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR) << "MPI slave handshake has invalid launchId"); } std::shared_ptr<scidb::Query> query(Query::getValidQueryPtr(_query)); if (handshake->rank() != query->getInstanceID()) { // logical instance ID throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR) << "MPI slave handshake has invalid rank"); } _pids.push_back(slavePid); _pids.push_back(slavePPid); ClientContext::DisconnectHandler dh = boost::bind(&MpiMessageHandler::handleMpiSlaveDisconnect, _launchId, _1); _connection->attachQuery(query->getQueryID(), dh); LOG4CXX_DEBUG(logger, 
"MpiSlaveProxy::waitForHandshake: handshake: " <<" pid="<<handshake->pid() <<", ppid="<<handshake->ppid() <<", cluster_uuid="<<handshake->cluster_uuid() <<", instance_id="<<handshake->instance_id() <<", launch_id="<<handshake->launch_id() <<", rank="<<handshake->rank()); } void MpiSlaveProxy::sendCommand(mpi::Command& cmd, std::shared_ptr<MpiOperatorContext>& ctx) { if (!_connection) { throw (InvalidStateException(REL_FILE, __FUNCTION__, __LINE__) << "No connection to MPI slave"); } // set command std::shared_ptr<scidb_msg::MpiSlaveCommand> cmdPtr(new scidb_msg::MpiSlaveCommand()); cmdPtr->set_command(cmd.getCmd()); // set args const std::vector<std::string>& args = cmd.getArgs(); google::protobuf::RepeatedPtrField<std::string>* msgArgs = cmdPtr->mutable_args(); assert(msgArgs); msgArgs->Reserve(args.size()); for (std::vector<std::string>::const_iterator iter = args.begin(); iter != args.end(); ++iter) { cmdPtr->add_args(*iter); } // send to slave scidb::MessagePtr msgPtr(cmdPtr); boost::asio::const_buffer binary(NULL,0); try { scidb::sendAsyncClient(_connection, scidb::mtMpiSlaveCommand, msgPtr, binary); } catch (const scidb::SystemException& e) { LOG4CXX_ERROR(logger, "MpiSlaveProxy::sendCommand: " << "FAILED to send MpiSlaveCommand to slave because: " << e.what()); throw; } LOG4CXX_DEBUG(logger, "MpiSlaveProxy::sendCommand: MpiSlaveCommand " << cmd.toString() << " sent to slave"); } /// @todo XXX TODO tigor: make it timeout ? 
TBD int64_t MpiSlaveProxy::waitForStatus(std::shared_ptr<MpiOperatorContext>& ctx, bool raise) { if (!_connection) { throw (InvalidStateException(REL_FILE, __FUNCTION__, __LINE__) << "No connection to MPI slave"); } LOG4CXX_DEBUG(logger, "MpiSlaveProxy::waitForStatus: launchId="<<_launchId); MpiOperatorContext::LaunchErrorChecker errChecker = boost::bind(&checkLauncher, _delayForTestingInSec, _1, _2); std::shared_ptr<scidb::ClientMessageDescription> msg = ctx->popMsg(_launchId, errChecker); assert(msg); LOG4CXX_DEBUG(logger, "MpiSlaveProxy::waitForStatus: message from client: " <<" ctx = " << msg->getClientContext().get() <<", msg type = "<< msg->getMessageType() <<", queryID = "<<msg->getQueryId()); if (_connection != msg->getClientContext()) { if (!msg->getClientContext() && msg->getMessageType() == scidb::SYSTEM_NONE_MSG_ID) { throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR) << "MPI slave disconnected prematurely"); } throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR) << "MPI slave connection context mismatch"); } if (msg->getMessageType() != scidb::mtMpiSlaveResult) { throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR) << "MPI slave returned invalid status"); } std::shared_ptr<scidb_msg::MpiSlaveResult> result = std::dynamic_pointer_cast<scidb_msg::MpiSlaveResult>(msg->getRecord()); assert(result); if (!result->has_status()) { throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR) << "MPI slave returned no status"); } if (raise && result->status() != 0) { std::stringstream ss; ss << "MPI Slave Execution returned status " << result->status(); throw (SYSTEM_EXCEPTION(SCIDB_SE_OPERATOR, SCIDB_LE_OPERATION_FAILED) << ss.str()); } return result->status(); } void MpiSlaveProxy::waitForExit(std::shared_ptr<MpiOperatorContext>& ctx) { if (!_connection) { throw (InvalidStateException(REL_FILE, __FUNCTION__, __LINE__) << "No connection to MPI slave"); } LOG4CXX_DEBUG(logger, "MpiSlaveProxy::waitForExit: 
launchId="<<_launchId); MpiOperatorContext::LaunchErrorChecker errChecker = boost::bind(&checkTimeout, mpi::getTimeInSecs(), static_cast<double>(_MPI_SLAVE_RESPONSE_TIMEOUT), _1, _2); std::shared_ptr<scidb::ClientMessageDescription> msg = ctx->popMsg(_launchId, errChecker); assert(msg); LOG4CXX_DEBUG(logger, "MpiSlaveProxy::waitForExit: " <<" ctx = " << msg->getClientContext().get() <<", msg type = "<< msg->getMessageType() <<", queryID = "<<msg->getQueryId()); if (msg->getMessageType() != scidb::SYSTEM_NONE_MSG_ID) { throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR) << "MPI slave returned invalid status"); } assert(!msg->getClientContext()); _connection.reset(); } void MpiSlaveProxy::destroy(bool error) { QueryID queryIdForKill(INVALID_QUERY_ID); if (error) { _inError=true; queryIdForKill = _queryId; } const string clusterUuid = Cluster::getInstance()->getUuid(); // kill the slave proc and its parent orted for ( std::vector<pid_t>::const_iterator iter=_pids.begin(); iter!=_pids.end(); ++iter) { pid_t pid = *iter; //XXX TODO tigor: kill proceess group (-pid) ? LOG4CXX_DEBUG(logger, "MpiSlaveProxy::destroy: killing slave pid = "<<pid); MpiErrorHandler::killProc(_installPath, clusterUuid, pid, queryIdForKill); } std::string pidFile = mpi::getSlavePidFile(_installPath, _queryId, _launchId); MpiErrorHandler::cleanupSlavePidFile(_installPath, clusterUuid, pidFile, queryIdForKill); // rm log file if (!logger->isTraceEnabled() && !_inError) { string logFileName = mpi::getSlaveLogFile(_installPath, _queryId, _launchId); scidb::File::remove(logFileName.c_str(), false); } } } //namespace
namespace scidb { static log4cxx::LoggerPtr logger(log4cxx::Logger::getLogger("scidb.mpi")); #if defined(NDEBUG) static const bool DBG = false; #else static const bool DBG = true; #endif MpiLauncher::MpiLauncher(uint64_t launchId, const boost::shared_ptr<Query>& q) : _pid(0), _status(0), _queryId(q->getQueryID()), _launchId(launchId), _query(q), _waiting(false), _inError(false), _MPI_LAUNCHER_KILL_TIMEOUT(scidb::getLivenessTimeout()) { } MpiLauncher::MpiLauncher(uint64_t launchId, const boost::shared_ptr<Query>& q, uint32_t timeout) : _pid(0), _status(0), _queryId(q->getQueryID()), _launchId(launchId), _query(q), _waiting(false), _inError(false), _MPI_LAUNCHER_KILL_TIMEOUT(timeout) { } void MpiLauncher::getPids(vector<pid_t>& pids) { ScopedMutexLock lock(_mutex); if (_pid <= 1) { throw InvalidStateException(REL_FILE, __FUNCTION__, __LINE__) << " MPI launcher is not running"; } pids.push_back(_pid); } void MpiLauncher::launch(const vector<string>& slaveArgs, const boost::shared_ptr<const InstanceMembership>& membership, const size_t maxSlaves) { vector<string> args; { ScopedMutexLock lock(_mutex); if (_pid != 0 || _waiting) { throw InvalidStateException(REL_FILE, __FUNCTION__, __LINE__) << " MPI launcher is already running"; } boost::shared_ptr<Query> query = _query.lock(); Query::validateQueryPtr(query); buildArgs(args, slaveArgs, membership, query, maxSlaves); } pid_t pid = fork(); if (pid < 0) { // error int err = errno; throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_SYSCALL_ERROR) << "fork" << pid << err <<""); } else if (pid > 0) { // parent ScopedMutexLock lock(_mutex); if (_pid != 0 || _waiting) { throw InvalidStateException(REL_FILE, __FUNCTION__, __LINE__) << " MPI launcher is corrupted after launch"; } _pid = pid; LOG4CXX_DEBUG(logger, "MPI launcher process spawned, pid="<<_pid); return; } else { // child becomeProcGroupLeader(); recordPids(); setupLogging(); if (DBG) { std::cerr << "LAUNCHER pid="<<getpid() << ", pgid="<< ::getpgid(0) << ", ppid="<< 
::getppid()<<std::endl; } closeFds(); boost::scoped_array<const char*> argv(new const char*[args.size()+1]); initExecArgs(args, argv); const char *path = argv[0]; if (DBG) { std::cerr << "LAUNCHER pid="<<::getpid()<<" args for "<<path<<" are ready" << std::endl; for (size_t i=0; i<args.size(); ++i) { const char * arg = argv[i]; if (!arg) break; cerr << "LAUNCHER arg["<<i<<"] = "<< argv[i] << std::endl; } } int rc = ::execv(path, const_cast<char* const*>(argv.get())); assert(rc == -1); rc=rc; // avoid compiler warning perror("LAUNCHER execv"); _exit(1); } throw SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNREACHABLE_CODE); } bool MpiLauncher::isRunning() { pid_t pid=0; int status=0; { ScopedMutexLock lock(_mutex); if (_pid<=0) { return false; } pid = _pid; } const bool doNotWait=true; bool rc = waitForExit(pid, &status, doNotWait); if (!rc) { return true; } ScopedMutexLock lock(_mutex); _pid = -pid; _status = status; return false; } void MpiLauncher::destroy(bool force) { pid_t pid=0; int status=0; string pidFile; { ScopedMutexLock lock(_mutex); if (_pid == 0 || _waiting) { throw (InvalidStateException(REL_FILE, __FUNCTION__, __LINE__) << " MPI launcher already destroyed"); } _waiting = true; pid = _pid; status = _status; pidFile = mpi::getLauncherPidFile(_installPath, _queryId, _launchId); if (pid > 0) { if (!force) { scheduleKillTimer(); } else { // kill right away boost::shared_ptr<boost::asio::deadline_timer> dummyTimer; boost::system::error_code dummyErr; handleKillTimeout(dummyTimer, dummyErr); } } if (force) { _inError=true; } } if (pid < 0) { completeLaunch(-pid, pidFile, status); return; } bool rc = waitForExit(pid,&status); assert(rc); rc=rc; { ScopedMutexLock lock(_mutex); if (!_waiting || pid != _pid) { throw InvalidStateException(REL_FILE, __FUNCTION__, __LINE__) << " MPI launcher is corrupted after collecting process exit code"; } _pid = -pid; _status = status; if (_killTimer) { size_t n = _killTimer->cancel(); assert(n<2); n=n; } } 
completeLaunch(pid, pidFile, status); } void MpiLauncher::completeLaunch(pid_t pid, const std::string& pidFile, int status) { // rm args file boost::scoped_ptr<SharedMemoryIpc> shmIpc(mpi::newSharedMemoryIpc(_ipcName)); shmIpc->remove(); shmIpc.reset(); // rm pid file scidb::File::remove(pidFile.c_str(), false); // rm log file if (!logger->isTraceEnabled() && !_inError) { string logFileName = mpi::getLauncherLogFile(_installPath, _queryId, _launchId); scidb::File::remove(logFileName.c_str(), false); } if (WIFSIGNALED(status)) { LOG4CXX_ERROR(logger, "SciDB MPI launcher (pid="<<pid<<") terminated by signal = " << WTERMSIG(status) << (WCOREDUMP(status)? ", core dumped" : "")); throw SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_OPERATION_FAILED) << "MPI launcher process"; } else if (WIFEXITED(status)) { int rc = WEXITSTATUS(status); if (rc != 0) { LOG4CXX_ERROR(logger, "SciDB MPI launcher (pid="<<_pid<<") exited with status = " << rc); throw SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_OPERATION_FAILED) << "MPI launcher process"; } else { LOG4CXX_DEBUG(logger, "SciDB MPI launcher (pid="<<_pid<<") exited with status = " << rc); return; } } throw SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNREACHABLE_CODE); } void MpiLauncher::handleKillTimeout(boost::shared_ptr<boost::asio::deadline_timer>& killTimer, const boost::system::error_code& error) { ScopedMutexLock lock(_mutex); if (error == boost::asio::error::operation_aborted) { assert(_pid < 0); LOG4CXX_TRACE(logger, " MPI launcher kill timer cancelled"); return; } if (error) { assert(false); LOG4CXX_WARN(logger, "MPI launcher kill timer encountered error"<<error); } if (_pid <= 0) { LOG4CXX_WARN(logger, "MPI launcher kill timer cannot kill pid="<<_pid); return; } if (!_waiting) { assert(false); LOG4CXX_ERROR(logger, "MPI launcher kill timer cannot kill pid="<<_pid); throw InvalidStateException(REL_FILE, __FUNCTION__, __LINE__) << " MPI launcher process cannot be killed"; } LOG4CXX_WARN(logger, "MPI launcher is about to 
kill group pid="<<_pid); // kill launcher's proc group MpiErrorHandler::killProc(_installPath, -_pid); } static void validateLauncherArg(const std::string& arg) { const char *notAllowed = " \n"; if (arg.find_first_of(notAllowed) != string::npos) { throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_INVALID_FUNCTION_ARGUMENT) << (string("MPI launcher argument with whitespace: ")+arg)); } } /// @todo XXX tigor: move command args into a file void MpiLauncher::buildArgs(vector<string>& args, const vector<string>& slaveArgs, const boost::shared_ptr<const InstanceMembership>& membership, const boost::shared_ptr<Query>& query, const size_t maxSlaves) { for (vector<string>::const_iterator iter=slaveArgs.begin(); iter!=slaveArgs.end(); ++iter) { validateLauncherArg(*iter); } const Instances& instances = membership->getInstanceConfigs(); map<InstanceID,const InstanceDesc*> sortedInstances; getSortedInstances(sortedInstances, instances, query); ostringstream buf; const string clusterUuid = Cluster::getInstance()->getUuid(); buf << _queryId; const string queryId = buf.str(); buf.str(""); buf << _launchId; const string launchId = buf.str(); // preallocate memory const size_t ARGS_PER_INSTANCE = 16; const size_t ARGS_PER_LAUNCH = 4; const size_t MPI_PREFIX_CORRECTION = 2; size_t totalArgsNum = ARGS_PER_LAUNCH + (ARGS_PER_INSTANCE+slaveArgs.size()) * std::min(maxSlaves, sortedInstances.size()) - MPI_PREFIX_CORRECTION; args.clear(); args.reserve(totalArgsNum); InstanceID myId = Cluster::getInstance()->getLocalInstanceId(); args.push_back(string("")); //place holder for the binary args.push_back(string("--verbose")); args.push_back(string("--tag-output")); args.push_back(string("--timestamp-output")); // first, find my own install path, and add coordinator arguments for (map<InstanceID,const InstanceDesc*>::const_iterator i = sortedInstances.begin(); i != sortedInstances.end(); ++i) { assert(i->first<sortedInstances.size()); const InstanceDesc* desc = i->second; assert(desc); 
InstanceID currId = desc->getInstanceId(); assert(currId < instances.size()); if (currId != myId) { continue; } assert(args[0].empty()); const string& installPath = desc->getPath(); _installPath = installPath; args[0] = MpiManager::getLauncherBinFile(installPath); addPerInstanceArgs(myId, desc, clusterUuid, queryId, launchId, slaveArgs, args); } assert(!args[0].empty()); // second, loop again to actually start all the instances size_t count = 1; for (map<InstanceID,const InstanceDesc*>::const_iterator i = sortedInstances.begin(); i != sortedInstances.end() && count<maxSlaves; ++i,++count) { const InstanceDesc* desc = i->second; InstanceID currId = desc->getInstanceId(); if (currId == myId) { --count; continue; } addPerInstanceArgs(myId, desc, clusterUuid, queryId, launchId, slaveArgs, args); } int64_t shmSize(0); vector<string>::iterator iter=args.begin(); iter += ARGS_PER_LAUNCH; // compute arguments size const size_t DELIM_SIZE=sizeof('\n'); for (; iter!=args.end(); ++iter) { string& arg = (*iter); shmSize += (arg.size()+DELIM_SIZE); } LOG4CXX_TRACE(logger, "MPI launcher arguments size = " << shmSize); // Create shared memory to pass the arguments to the launcher _ipcName = mpi::getIpcName(_installPath, clusterUuid, queryId, myId, launchId) + ".launch_args"; LOG4CXX_TRACE(logger, "MPI launcher arguments ipc = " << _ipcName); boost::scoped_ptr<SharedMemoryIpc> shmIpc(mpi::newSharedMemoryIpc(_ipcName)); char* ptr(NULL); try { shmIpc->create(SharedMemoryIpc::RDWR); shmIpc->truncate(shmSize); ptr = reinterpret_cast<char*>(shmIpc->get()); } catch(scidb::SharedMemoryIpc::SystemErrorException& e) { LOG4CXX_ERROR(logger, "Cannot map shared memory: " << e.what()); throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_OPERATION_FAILED) << "shared_memory_mmap"); } catch(scidb::SharedMemoryIpc::InvalidStateException& e) { LOG4CXX_ERROR(logger, "Unexpected error while mapping shared memory: " << e.what()); throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR) << 
e.what()); } assert(ptr); size_t off = 0; iter=args.begin(); iter += ARGS_PER_LAUNCH; for (; iter!=args.end(); ++iter) { string& arg = (*iter); if (off == 0) { } else if (arg == "-H") { *(ptr+off) = '\n'; ++off; } else { *(ptr+off) = ' '; ++off; } memcpy((ptr+off), arg.data(), arg.size()); off += arg.size(); arg.clear(); } *(ptr+off) = '\n'; ++off; assert(static_cast<int64_t>(off) <= shmSize); shmIpc->close(); shmIpc->flush(); assert(args.size() >= ARGS_PER_LAUNCH+2); args[ARGS_PER_LAUNCH+0] = "--app"; args[ARGS_PER_LAUNCH+1] = mpi::getIpcFile(_installPath,_ipcName); args.resize(ARGS_PER_LAUNCH+2); } void MpiLauncher::addPerInstanceArgs(const InstanceID myId, const InstanceDesc* desc, const string& clusterUuid, const string& queryId, const string& launchId, const vector<string>& slaveArgs, vector<string>& args) { InstanceID currId = desc->getInstanceId(); ostringstream instanceIdStr; instanceIdStr << currId; const string& host = desc->getHost(); const string& installPath = desc->getPath(); ostringstream portStr; portStr << desc->getPort(); // mpirun command line: // [":", "-H", <IP>, "-np", <#>, "-wd", <path>, "--prefix", <path>, "-x", "LD_LIBRARY_PATH"]* validateLauncherArg(host); args.push_back("-H"); args.push_back(host); args.push_back("-np"); args.push_back("1"); validateLauncherArg(installPath); args.push_back("-wd"); args.push_back(installPath); if (currId != myId) { // XXX NOTE: --prefix is not appended for this instance (the coordinator) // and this instance's arguments go first in the argument list because of // of an apparent bug in mpirun handling of --prefix const string mpiDir = MpiManager::getMpiDir(installPath); validateLauncherArg(mpiDir); args.push_back("--prefix"); args.push_back(mpiDir); } args.push_back("-x"); args.push_back("LD_LIBRARY_PATH"); const string slaveBinFile = mpi::getSlaveBinFile(installPath); validateLauncherArg(slaveBinFile); args.push_back(slaveBinFile); // slave args args.push_back(clusterUuid); args.push_back(queryId); 
args.push_back(instanceIdStr.str()); args.push_back(launchId); args.push_back(portStr.str()); args.insert(args.end(), slaveArgs.begin(), slaveArgs.end()); } void MpiLauncher::getSortedInstances(map<InstanceID,const InstanceDesc*>& sortedInstances, const Instances& instances, const boost::shared_ptr<Query>& query) { for (Instances::const_iterator i = instances.begin(); i != instances.end(); ++i) { InstanceID id = i->getInstanceId(); try { // lid should be equal mpi rank InstanceID lid = query->mapPhysicalToLogical(id); sortedInstances[lid] = &(*i); } catch(SystemException& e) { if (e.getLongErrorCode() != SCIDB_LE_INSTANCE_OFFLINE) { throw; } } } assert(sortedInstances.size() == query->getInstancesCount()); } void MpiLauncher::closeFds() { //XXX TODO: move to Sysinfo long maxfd = ::sysconf(_SC_OPEN_MAX); if (maxfd<2) { maxfd = 1024; } cerr << "LAUNCHER: maxfd = " << maxfd << endl; // close all fds except for stderr,stdout int rc = scidb::File::closeFd(0); //stdin rc=rc; // avoid compiler warning for (long fd=3; fd <= maxfd ; ++fd) { rc = scidb::File::closeFd(fd); rc=rc; // avoid compiler warning } } void MpiLauncher::becomeProcGroupLeader() { if (setpgid(0,0) != 0) { perror("setpgid"); _exit(1); } } void MpiLauncher::setupLogging() { std::string path = mpi::getLauncherLogFile(_installPath, _queryId, _launchId); mpi::connectStdIoToLog(path); } void MpiLauncher::recordPids() { assert(!_installPath.empty()); string path = mpi::getLauncherPidFile(_installPath, _queryId, _launchId); mpi::recordPids(path); } void MpiLauncher::initExecArgs(const vector<string>& args, boost::scoped_array<const char*>& argv) { size_t argsSize = args.size(); if (argsSize<1) { cerr << "LAUNCHER: initExecArgs failed to get args:" << argsSize << endl; _exit(1); } for (size_t i=0; i < argsSize; ++i) { argv[i] = args[i].c_str(); } argv[argsSize] = NULL; } void MpiLauncher::scheduleKillTimer() { // this->_mutex must be locked assert (_pid > 1); assert(!_killTimer); _killTimer = 
shared_ptr<boost::asio::deadline_timer>(new boost::asio::deadline_timer(getIOService())); int rc = _killTimer->expires_from_now(posix_time::seconds(_MPI_LAUNCHER_KILL_TIMEOUT)); if (rc != 0) { throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_SYSCALL_ERROR) << "boost::asio::expires_from_now" << rc << rc << _MPI_LAUNCHER_KILL_TIMEOUT); } _killTimer->async_wait(boost::bind(&MpiLauncher::handleKillTimeout, shared_from_this(), _killTimer, boost::asio::placeholders::error)); } bool MpiLauncher::waitForExit(pid_t pid, int *status, bool noWait) { int opts = 0; if (noWait) { opts = WNOHANG; } while(true) { pid_t rc = ::waitpid(pid,status,opts); if ((rc == -1) && (errno==EINTR)) { continue; } if (rc == 0 && noWait) { return false; } if ((rc <= 0) || (rc != pid)) { int err = errno; throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_SYSCALL_ERROR) << "wait" << rc << err << pid); } return true; } throw SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNREACHABLE_CODE); return false; } } //namespace
namespace scidb {
// File-local debug switch (always off in this file).
static const bool DBG = false;
static log4cxx::LoggerPtr logger(log4cxx::Logger::getLogger("scidb.query.ops.mpi"));

///
/// some operators may not be able to work in degraded mode while they are being implemented
/// this call can make them exit if that is the case.
/// The check fails when the current membership view id differs from the one in the
/// query's coordinator liveness, or when the number of member instances differs from
/// the number of instances participating in the query.
/// @param query the query to check
/// @throws UserException (SCIDB_SE_EXECUTION, SCIDB_LE_NO_QUORUM2) when the check fails
/// TODO: add a more explicit message of what is happening
void throwIfDegradedMode(shared_ptr<Query>& query) {
    const boost::shared_ptr<const InstanceMembership> membership =
        Cluster::getInstance()->getInstanceMembership();
    if ((membership->getViewId() != query->getCoordinatorLiveness()->getViewId()) ||
        (membership->getInstances().size() != query->getInstancesCount())) {
        // because we can't yet handle the extra data from
        // replicas that we would be fed in "degraded mode"
        throw USER_EXCEPTION(SCIDB_SE_EXECUTION, SCIDB_LE_NO_QUORUM2);
    }
}

/// Attach the operator to a query and obtain the MPI operator context for it.
/// If a query is already attached the call is a no-op (the same query is expected).
/// @param query the query this operator executes in
void MPIPhysical::setQuery(const boost::shared_ptr<Query>& query)
{
    boost::shared_ptr<Query> myQuery = _query.lock();
    if (myQuery) {
        // already initialized for this query
        assert(query==myQuery);
        assert(_ctx);
        return;
    }
    PhysicalOperator::setQuery(query);
    // a fresh context is offered to the manager; the context returned by
    // checkAndSetCtx() is the one kept
    _ctx = boost::shared_ptr<MpiOperatorContext>(new MpiOperatorContext(query));
    _ctx = MpiManager::getInstance()->checkAndSetCtx(query,_ctx);
}

/// Destroy the launcher of the last launch (if it is the one this operator created)
/// and release the operator context.
/// @param query the query being executed
void MPIPhysical::postSingleExecute(shared_ptr<Query> query)
{
    // On a non-participating launcher instance it is difficult
    // to determine when the launch is complete without a sync point.
    // postSingleExecute() is run after all instances report success of their execute() phase,
    // that is effectively a sync point.
    assert(query->getCoordinatorID() == COORDINATOR_INSTANCE);
    assert(_mustLaunch);
    assert(_ctx);
    const uint64_t lastIdInUse = _ctx->getLastLaunchIdInUse();

    boost::shared_ptr<MpiLauncher> launcher(_ctx->getLauncher(lastIdInUse));
    assert(launcher);
    if (launcher && launcher == _launcher) {
        LOG4CXX_DEBUG(logger, "MPIPhysical::postSingleExecute: destroying last launcher for launch = " << lastIdInUse);
        assert(lastIdInUse == _launchId);

        launcher->destroy();
        _launcher.reset();
    }
    _ctx.reset();
}

/// Start a new MPI launch: synchronize all instances, create the slave proxy for this
/// instance (when its logical id is below maxSlaves), start the launcher on the instance
/// for which getCoordinatorID() == COORDINATOR_INSTANCE, wait for the slave handshake
/// and retire the launcher/slave of the previous launch.
/// @param query     the query being executed
/// @param maxSlaves number of slaves to start; must not exceed the query's instance count
/// @return true if this instance has a slave for the launch, false otherwise
bool MPIPhysical::launchMPISlaves(shared_ptr<Query>& query, const size_t maxSlaves)
{
    LOG4CXX_DEBUG(logger, "MPIPhysical::launchMPISlaves(query, maxSlaves: " << maxSlaves << ") called.");
    assert(maxSlaves <= query->getInstancesCount());

    // This barrier guarantees MPIPhysical::setQuery is called on all instances
    // before any slaves are launched.
    // It also makes sure a non-participating launcher waits for the current launch to finish before starting a new one.
    syncBarrier(0, query);
    syncBarrier(1, query);

    _launchId = _ctx->getNextLaunchId(); // bump the launch ID by 1

    Cluster* cluster = Cluster::getInstance();
    const boost::shared_ptr<const InstanceMembership> membership = cluster->getInstanceMembership();
    const string& installPath = MpiManager::getInstallPath(membership);

    uint64_t lastIdInUse = _ctx->getLastLaunchIdInUse();
    assert(lastIdInUse < _launchId);

    boost::shared_ptr<MpiSlaveProxy> slave;

    // check if our logical ID is within the set of instances that will have a corresponding slave
    InstanceID iID = query->getInstanceID();
    if ( iID < maxSlaves) {
        slave = boost::make_shared<MpiSlaveProxy>(_launchId, query, installPath);
        _ctx->setSlave(slave);
    }

    _mustLaunch = (query->getCoordinatorID() == COORDINATOR_INSTANCE);
    if (_mustLaunch) {
        // the launcher of the previous launch (if any) is destroyed before a new one is started
        boost::shared_ptr<MpiLauncher> oldLauncher = _ctx->getLauncher(lastIdInUse);
        if (oldLauncher) {
            assert(lastIdInUse == oldLauncher->getLaunchId());
            LOG4CXX_DEBUG(logger, "MPIPhysical::launchMPISlaves(): destroying last launcher for launch = " << lastIdInUse);
            oldLauncher->destroy();
            oldLauncher.reset();
        }
        _launcher = boost::shared_ptr<MpiLauncher>(MpiManager::getInstance()->newMPILauncher(_launchId, query));
        _ctx->setLauncher(_launcher);
        // no extra slave arguments are passed
        std::vector<std::string> args;
        _launcher->launch(args, membership, maxSlaves);
    }

    if ( iID < maxSlaves) {
        assert(slave);
        //-------------------- Get the handshake
        LOG4CXX_DEBUG(logger, "MPIPhysical::launchMPISlaves(): slave->waitForHandshake() 1 called.");
        slave->waitForHandshake(_ctx);
        LOG4CXX_DEBUG(logger, "MPIPhysical::launchMPISlaves(): slave->waitForHandshake() 1 returned.");
    }

    if ( iID < maxSlaves || _mustLaunch) {
        // After the handshake the old slave must be gone
        LOG4CXX_DEBUG(logger, "MPIPhysical::launchMPISlaves():"
                      << " lastLaunchIdInUse=" << lastIdInUse
                      << " launchId=" << _launchId);
        boost::shared_ptr<MpiSlaveProxy> oldSlave = _ctx->getSlave(lastIdInUse);
        if (oldSlave) {
            assert(lastIdInUse == oldSlave->getLaunchId());
            LOG4CXX_DEBUG(logger, "MPIPhysical::launchMPISlaves(): oldSlave->destroy() & .reset()");
            oldSlave->destroy();
            oldSlave.reset();
        }
        _ctx->complete(lastIdInUse);
    }

    if ( iID < maxSlaves) {
        // remember the base IPC name used by allocateMPISharedMemory() for this launch
        _ipcName = mpi::getIpcName(installPath, cluster->getUuid(), query->getQueryID(),
                                   cluster->getLocalInstanceId(), _launchId);
        LOG4CXX_DEBUG(logger, "MPIPhysical::launchMPISlaves(): instance " << iID << " slave started.");
        return true;
    } else {
        LOG4CXX_DEBUG(logger, "MPIPhysical::launchMPISlaves(): instance " << iID << " slave bypass.");
        return false;
    }
}

// XXX TODO: consider returning std::vector<scidb::SharedMemoryPtr>
// XXX TODO: which would require supporting different types of memory (double, char etc.)
/// Create numBufs shared memory regions for the current launch.
/// Region ii is named _ipcName + "." + ii, is registered with the operator context and
/// is initialized for writing with a size of elemSizes[ii] * numElems[ii].
/// @param numBufs   number of regions; also the length of the three arrays
/// @param elemSizes per-region element size (presumably in bytes - confirm against initIpcForWrite)
/// @param numElems  per-region element count
/// @param dbgNames  per-region names, used for trace logging only
/// @return the created regions, in index order
std::vector<MPIPhysical::SMIptr_t> MPIPhysical::allocateMPISharedMemory(size_t numBufs,
                                                                        size_t elemSizes[],
                                                                        size_t numElems[],
                                                                        string dbgNames[])
{
    LOG4CXX_DEBUG(logger, "MPIPhysical::allocateMPISharedMemory(numBufs "<<numBufs<<",,,)");

    if(logger->isTraceEnabled()) {
        LOG4CXX_TRACE(logger, "MPIPhysical::allocateMPISharedMemory(): allocations are: ");
        for(size_t ii=0; ii< numBufs; ii++) {
            LOG4CXX_TRACE(logger, "MPIPhysical::allocateMPISharedMemory():"
                          << " elemSizes["<<ii<<"] "<< dbgNames[ii] << " len " << numElems[ii]);
        }
    }

    std::vector<SMIptr_t> shmIpc(numBufs);
    bool preallocate = Config::getInstance()->getOption<bool>(CONFIG_PREALLOCATE_SHM);
    for(size_t ii=0; ii<numBufs; ii++) {
        std::stringstream suffix;
        suffix << "." << ii ;
        std::string ipcNameFull= _ipcName + suffix.str();
        LOG4CXX_TRACE(logger, "IPC name = " << ipcNameFull);
        shmIpc[ii] = SMIptr_t(mpi::newSharedMemoryIpc(ipcNameFull, preallocate)); // can I get 'em off ctx instead?
        _ctx->addSharedMemoryIpc(_launchId, shmIpc[ii]);

        // the returned pointer is only checked; callers reach the memory through the SMIptr_t
        char* ptr = MpiLauncher::initIpcForWrite(shmIpc[ii].get(), (elemSizes[ii] * numElems[ii]));
        assert(ptr); ptr=ptr;
    }
    return shmIpc;
}

/// Close and remove every non-null shared memory region in shmIpc, unmapping all of
/// them except the one at index resultIpcIndx (which stays mapped).
/// @param shmIpc        regions to release; null entries are skipped
/// @param resultIpcIndx index of the region to keep mapped; an index outside the
///                      vector causes every region to be unmapped
/// @throws SystemException (SCIDB_LE_OPERATION_FAILED) as soon as a remove() fails;
///         the regions after the failing one are then left untouched
void MPIPhysical::releaseMPISharedMemoryInputs(std::vector<MPIPhysical::SMIptr_t>& shmIpc,
                                               size_t resultIpcIndx)
{
    for(size_t i=0; i<shmIpc.size(); i++) {
        if (!shmIpc[i]) {
            continue;
        }
        SharedMemoryIpc *ipc = shmIpc[i].get();
        ipc->close();
        if (i!=resultIpcIndx) {
            ipc->unmap();
        }
        if (!ipc->remove()) {
            throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_OPERATION_FAILED)
                   << "shared_memory_remove");
        }
    }
}

} // namespace
namespace scidb { static log4cxx::LoggerPtr logger(log4cxx::Logger::getLogger("scidb.mpi")); static double getTimeInSecs() { struct timespec ts; if (clock_gettime(CLOCK_REALTIME, &ts) == -1) { assert(false); throw SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_CANT_GET_SYSTEM_TIME); } return (ts.tv_sec + ts.tv_nsec*1e-9); } static bool checkForTimeout(double startTime, double timeout, uint64_t launchId, MpiOperatorContext* ctx) { if ((getTimeInSecs() - startTime) > timeout) { throw SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_OPERATION_FAILED) << "MPI slave process failed to communicate in time"; } return true; } static bool checkLauncher(double startTime, double timeout, uint64_t launchId, MpiOperatorContext* ctx) { boost::shared_ptr<MpiLauncher> launcher(ctx->getLauncher(launchId)); if (launcher && !launcher->isRunning()) { throw SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_OPERATION_FAILED) << "MPI launcher process"; } checkForTimeout(startTime, timeout, launchId, ctx); return true; } void MpiSlaveProxy::waitForHandshake(boost::shared_ptr<MpiOperatorContext>& ctx) { if (_connection) { throw (InvalidStateException(REL_FILE, __FUNCTION__, __LINE__) << "Connection to MPI slave already established"); } MpiOperatorContext::LaunchErrorChecker errChecker = boost::bind(&checkLauncher, getTimeInSecs(), static_cast<double>(_MPI_SLAVE_RESPONSE_TIMEOUT), _1, _2); boost::shared_ptr<scidb::ClientMessageDescription> msg = ctx->popMsg(_launchId, errChecker); assert(msg); _connection = msg->getClientContext(); if (msg->getMessageType() != scidb::mtMpiSlaveHandshake) { throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR) << "MPI slave handshake is invalid"); } boost::shared_ptr<scidb_msg::MpiSlaveHandshake> handshake = boost::dynamic_pointer_cast<scidb_msg::MpiSlaveHandshake>(msg->getRecord()); assert(handshake); // parse the handshake if (!handshake->has_pid()) { throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR) << "MPI slave handshake has no PID"); 
} if (!handshake->has_ppid()) { throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR) << "MPI slave handshake has no PPID"); } _pids.push_back(handshake->pid()); _pids.push_back(handshake->ppid()); string clusterUuid = Cluster::getInstance()->getUuid(); if (handshake->cluster_uuid() != clusterUuid) { throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR) << "MPI slave handshake has invalid clusterUuid"); } InstanceID instanceId = Cluster::getInstance()->getLocalInstanceId(); if (handshake->instance_id() != instanceId) { throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR) << "MPI slave handshake has invalid instanceId"); } if (handshake->launch_id() != _launchId) { throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR) << "MPI slave handshake has invalid launchId"); } boost::shared_ptr<scidb::Query> query( _query.lock()); Query::validateQueryPtr(query); if (handshake->rank() != query->getInstanceID()) { // logical instance ID throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR) << "MPI slave handshake has invalid rank"); } ClientContext::DisconnectHandler dh = boost::bind(&MpiMessageHandler::handleMpiSlaveDisconnect, _launchId, _1); _connection->attachQuery(query->getQueryID(), dh); LOG4CXX_DEBUG(logger, "MpiSlaveProxy::waitForHandshake: handshake: " <<" pid="<<handshake->pid() <<", ppid="<<handshake->ppid() <<", cluster_uuid="<<handshake->cluster_uuid() <<", instance_id="<<handshake->instance_id() <<", launch_id="<<handshake->launch_id() <<", rank="<<handshake->rank()); } void MpiSlaveProxy::sendCommand(mpi::Command& cmd, boost::shared_ptr<MpiOperatorContext>& ctx) { if (!_connection) { throw (InvalidStateException(REL_FILE, __FUNCTION__, __LINE__) << "No connection to MPI slave"); } // set command boost::shared_ptr<scidb_msg::MpiSlaveCommand> cmdPtr(new scidb_msg::MpiSlaveCommand()); cmdPtr->set_command(cmd.getCmd()); // set args const std::vector<std::string>& args = cmd.getArgs(); 
google::protobuf::RepeatedPtrField<std::string>* msgArgs = cmdPtr->mutable_args(); assert(msgArgs); msgArgs->Reserve(args.size()); for (std::vector<std::string>::const_iterator iter = args.begin(); iter != args.end(); ++iter) { cmdPtr->add_args(*iter); } // send to slave scidb::MessagePtr msgPtr(cmdPtr); boost::asio::const_buffer binary(NULL,0); try { scidb::sendAsync(_connection, scidb::mtMpiSlaveCommand, msgPtr, binary); } catch (const scidb::SystemException& e) { LOG4CXX_ERROR(logger, "MpiSlaveProxy::sendCommand: " << "FAILED to send MpiSlaveCommand to slave because: " << e.what()); throw; } LOG4CXX_DEBUG(logger, "MpiSlaveProxy::sendCommand: MpiSlaveCommand " << cmd.toString() << " sent to slave"); } /// @todo XXX TODO tigor: make it timeout ? TBD int64_t MpiSlaveProxy::waitForStatus(boost::shared_ptr<MpiOperatorContext>& ctx, bool raise) { if (!_connection) { throw (InvalidStateException(REL_FILE, __FUNCTION__, __LINE__) << "No connection to MPI slave"); } MpiOperatorContext::LaunchErrorChecker noopChecker; boost::shared_ptr<scidb::ClientMessageDescription> msg = ctx->popMsg(_launchId, noopChecker); assert(msg); LOG4CXX_DEBUG(logger, "MpiSlaveProxy::waitForStatus: message from client: " <<" ctx = " << msg->getClientContext().get() <<", msg type = "<< msg->getMessageType() <<", queryID = "<<msg->getQueryId()); if (_connection != msg->getClientContext()) { if (!msg->getClientContext() && msg->getMessageType() == scidb::SYSTEM_NONE_MSG_ID) { throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR) << "MPI slave disconnected prematurely"); } throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR) << "MPI slave connection context mismatch"); } if (msg->getMessageType() != scidb::mtMpiSlaveResult) { throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR) << "MPI slave returned invalid status"); } boost::shared_ptr<scidb_msg::MpiSlaveResult> result = boost::dynamic_pointer_cast<scidb_msg::MpiSlaveResult>(msg->getRecord()); 
assert(result); if (!result->has_status()) { throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR) << "MPI slave returned no status"); } if (raise && result->status() != 0) { throw (SYSTEM_EXCEPTION(SCIDB_SE_OPERATOR, SCIDB_LE_OPERATION_FAILED) << result->status()); } return result->status(); } void MpiSlaveProxy::waitForExit(boost::shared_ptr<MpiOperatorContext>& ctx) { if (!_connection) { throw (InvalidStateException(REL_FILE, __FUNCTION__, __LINE__) << "No connection to MPI slave"); } MpiOperatorContext::LaunchErrorChecker timeChecker = boost::bind(&checkForTimeout, getTimeInSecs(), static_cast<double>(_MPI_SLAVE_RESPONSE_TIMEOUT), _1, _2); boost::shared_ptr<scidb::ClientMessageDescription> msg = ctx->popMsg(_launchId, timeChecker); assert(msg); LOG4CXX_DEBUG(logger, "MpiSlaveProxy::waitForExit: " <<" ctx = " << msg->getClientContext().get() <<", msg type = "<< msg->getMessageType() <<", queryID = "<<msg->getQueryId()); if (msg->getMessageType() != scidb::SYSTEM_NONE_MSG_ID) { throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR) << "MPI slave returned invalid status"); } assert(!msg->getClientContext()); _connection.reset(); } void MpiSlaveProxy::destroy(bool error) { if (error) { _inError=true; } // kill the slave proc and its parent orted for ( std::vector<pid_t>::const_iterator iter=_pids.begin(); iter!=_pids.end(); ++iter) { pid_t pid = *iter; //XXX TODO tigor: kill proceess group (-pid) ? MpiErrorHandler::killProc(_installPath, pid); } // rm pid file std::string pidFile = mpi::getSlavePidFile(_installPath, _queryId, _launchId); scidb::File::remove(pidFile.c_str(), false); // rm log file if (!logger->isTraceEnabled() && !_inError) { string logFileName = mpi::getSlaveLogFile(_installPath, _queryId, _launchId); scidb::File::remove(logFileName.c_str(), false); } } } //namespace
namespace scidb { static log4cxx::LoggerPtr logger(log4cxx::Logger::getLogger("scidb.linear_algebra.ops.scalapack")); inline bool hasSingleAttribute(ArrayDesc const& desc) { return desc.getAttributes().size() == 1 || (desc.getAttributes().size() == 2 && desc.getAttributes()[1].isEmptyIndicator()); } void checkScaLAPACKInputs(std::vector<ArrayDesc> schemas, boost::shared_ptr<Query> query, size_t nMatsMin, size_t nMatsMax) { enum dummy {ROW=0, COL=1}; enum dummy2 { ATTR0=0 }; const size_t NUM_MATRICES = schemas.size(); if(schemas.size() < nMatsMin || schemas.size() > nMatsMax) { throw PLUGIN_USER_EXCEPTION(DLANameSpace, SCIDB_SE_INFER_SCHEMA, DLA_ERROR2); } // Check the properties first by argument, then by order property is determined in AFL statement: // size, chunkSize, overlap. // Check individual properties in the loop, and any inter-matrix properties after the loop // TODO: in all of these, name the argument # at fault for(size_t iArray=0; iArray < NUM_MATRICES; iArray++) { // check: attribute count == 1 if (!hasSingleAttribute(schemas[iArray])) { throw PLUGIN_USER_EXCEPTION(DLANameSpace, SCIDB_SE_INFER_SCHEMA, DLA_ERROR2); // TODO: offending matrix is iArray } // check: attribute type is double if (schemas[iArray].getAttributes()[ATTR0].getType() != TID_DOUBLE) { throw PLUGIN_USER_EXCEPTION(DLANameSpace, SCIDB_SE_INFER_SCHEMA, DLA_ERROR5); // TODO: offending matrix is iArray } // check: nDim == 2 (a matrix) // TODO: relax nDim to be 1 and have it imply NCOL=1 (column vector) // if you want a row vector, we could make transpose accept the column vector and output a 1 x N matrix // and call that a "row vector" The other way could never be acceptable. 
// const size_t SCALAPACK_IS_2D = 2 ; if (schemas[iArray].getDimensions().size() != SCALAPACK_IS_2D) { throw PLUGIN_USER_EXCEPTION(DLANameSpace, SCIDB_SE_INFER_SCHEMA, DLA_ERROR3); // TODO: offending matrix is iArray } // check: size is bounded const Dimensions& dims = schemas[iArray].getDimensions(); if (dims[ROW].getLength() == INFINITE_LENGTH || dims[COL].getLength() == INFINITE_LENGTH) { throw PLUGIN_USER_EXCEPTION(DLANameSpace, SCIDB_SE_INFER_SCHEMA, DLA_ERROR9); } // TODO: check: sizes are not larger than largest ScaLAPACK fortran INTEGER // TEMPORARY until #2202 defines how to interpret arrays not starting at 0 // "dimensions must start at 0" for(unsigned dim =ROW; dim <= COL; dim++) { if(dims[dim].getStart() != 0) { throw PLUGIN_USER_EXCEPTION(DLANameSpace, SCIDB_SE_INFER_SCHEMA, DLA_ERROR44); } } // check: chunk interval not too small if (dims[ROW].getChunkInterval() < slpp::SCALAPACK_MIN_BLOCK_SIZE || dims[COL].getChunkInterval() < slpp::SCALAPACK_MIN_BLOCK_SIZE ) { // the cache will thrash and performance will be unexplicably horrible to the user throw PLUGIN_USER_EXCEPTION(DLANameSpace, SCIDB_SE_INFER_SCHEMA, DLA_ERROR41); // too small } // check: chunk interval not too large if (dims[ROW].getChunkInterval() > slpp::SCALAPACK_MAX_BLOCK_SIZE || dims[COL].getChunkInterval() > slpp::SCALAPACK_MAX_BLOCK_SIZE ) { // the cache will thrash and performance will be unexplicably horrible to the user throw PLUGIN_USER_EXCEPTION(DLANameSpace, SCIDB_SE_INFER_SCHEMA, DLA_ERROR42); // too large } // TODO: the following does not work correctly. postWarning() itself uses SCIDB_WARNING // does not work correctly from a plugin, so seeking an example of how to do // postWarning() from a plugin. if (false) { // broken code inside postWarning(SCIDB_WARNING()) faults and needs a different argument. 
for(size_t d = ROW; d <= COL; d++) { if(dims[d].getChunkInterval() != slpp::SCALAPACK_EFFICIENT_BLOCK_SIZE) { query->postWarning(SCIDB_WARNING(DLA_WARNING4) << slpp::SCALAPACK_EFFICIENT_BLOCK_SIZE << slpp::SCALAPACK_EFFICIENT_BLOCK_SIZE); } } } // check: no overlap allowed // TODO: improvement? if there's overlap, we may be able to ignore it, // else invoke a common piece of code to remove it // and in both cases emit a warning about non-optimality if (dims[ROW].getChunkOverlap()!=0 || dims[COL].getChunkOverlap()!=0) { stringstream ss; ss<<"in matrix "<<iArray; throw (PLUGIN_USER_EXCEPTION(DLANameSpace, SCIDB_SE_INFER_SCHEMA, DLA_ERROR40) << ss.str()); } } // check: the chunkSizes from the user must be identical (until auto-repart is working) const bool AUTO_REPART_WORKING = false ; // #2032 if( ! AUTO_REPART_WORKING ) { int64_t commonChunkSize = schemas[0].getDimensions()[ROW].getChunkInterval(); // TODO: remove these checks if #2023 is fixed and requiresRepart() is functioning correctly for(size_t iArray=0; iArray < NUM_MATRICES; iArray++) { const Dimensions& dims = schemas[iArray].getDimensions(); // arbitrarily take first mentioned chunksize as the one for all to share if (dims[ROW].getChunkInterval() != commonChunkSize || dims[COL].getChunkInterval() != commonChunkSize ) { throw PLUGIN_USER_EXCEPTION(DLANameSpace, SCIDB_SE_INFER_SCHEMA, DLA_ERROR10); // TODO: name the matrix } } } // Chunksize matching critique // This is not what we want it to be, but has to be until #2023 is fixed, which // will allow the query planner and optimizer to repartition automatically, instead // of putting the burden on the user. // // (1) The required restriction to make ScaLAPACK work is that they are equal // in both dimensions (square chunks) and equal for all matrices. // (2) Legal values are in a range, expressed by SCALAPACK_{MIN,MAX}_BLOCK_SIZE // (3) So what do we do if the chunksize is not optimal? 
Can we go ahead and compute // the answer if the matrix is below a size where it will really matter? // Can we fix query->postWarning to warn in that case? // (4) If the user gives inputs that match, and don't need a repart, we can proceed. // (5) Else we will have to add reparts for the user [not implemented] // Should we repart some of them to another size? Or should we repart all of them // to the optimial aize? Unforunately, we don't have the information we would need // to make an intelligent choice ... // Due to the api of LogicalOperator::requiresRepart() we can't tell which situation // it is, because it still only functions on the first input only. // // TODO: after #2032 is fixed, have James fix note(4) above. // } // PGB: the requirement on names is that until such a time as we have syntax to disambiguate them by dimesion index // or other means, they must be distinct, else if stored, we will lose access to any but the first. // JHM: in math, its annoying to have the names keep getting longer for the same thing. So we only want to do // the appending of _? when required. std::pair<string, string> ScaLAPACKDistinctDimensionNames(const string& a, const string& b) { typedef std::pair<string, string> result_t ; result_t result; if (a != b) { // for algebra, avoid the renames when possible return result_t(a,b); } else { // fallback to appending _1 or _2 to both... would rather do it to just one, // but this is the only convention we have for conflicting in general. return result_t(a + "_1", b + "_2"); } } void log4cxx_debug_dimensions(const std::string& prefix, const Dimensions& dims) { if(logger->isDebugEnabled()) { for (size_t i=0; i<dims.size(); i++) { LOG4CXX_DEBUG(logger, prefix << " dims["<<i<<"] from " << dims[i].getStartMin() << " to " << dims[i].getEndMax()); } } } } // namespace
void PhysicalQueryPlanNode::supplantChild(const PhysNodePtr& targetChild, const PhysNodePtr& newChild) { assert(newChild); assert(targetChild); assert(newChild.get() != this); int removed = 0; std::vector<PhysNodePtr> newChildren; if (logger->isTraceEnabled()) { std::ostringstream os; os << "Supplanting targetChild Node:\n"; targetChild->toString(os, 0 /*indent*/,false /*children*/); os << "\nwith\n"; newChild->toString(os, 0 /*indent*/,false /*children*/); LOG4CXX_TRACE(logger, os.str()); } for(auto &child : _childNodes) { if (child != targetChild) { newChildren.push_back(child); } else { // Set the parent of the newChild to this node. newChild->_parent = shared_from_this(); // NOTE: Any existing children of the newChild are removed from the // Query Plan. if ((newChild->_childNodes).size() > 0) { LOG4CXX_INFO(logger, "Child nodes of supplanting node are being removed from the tree."); } // Re-parent the children of the targetChild to the newChild newChild->_childNodes.swap(targetChild->_childNodes); for (auto grandchild : newChild -> _childNodes) { assert(grandchild != newChild); grandchild->_parent = newChild; } // Remove any references to the children from the targetChild targetChild->_childNodes.clear(); targetChild->resetParent(); // Add the newChild to this node newChildren.push_back(newChild); ++removed; } } _childNodes.swap(newChildren); if (logger->isTraceEnabled()) { std::ostringstream os; newChild->toString(os); LOG4CXX_TRACE(logger, "New Node subplan:\n" << os.str()); } SCIDB_ASSERT(removed==1); }
namespace scidb { // Logger for query processor. static to prevent visibility of variable outside of file static log4cxx::LoggerPtr logger(log4cxx::Logger::getLogger("scidb.qproc.processor")); // LogicalQueryPlanNode LogicalQueryPlanNode::LogicalQueryPlanNode( const std::shared_ptr<ParsingContext>& parsingContext, const std::shared_ptr<LogicalOperator>& logicalOperator): _logicalOperator(logicalOperator), _parsingContext(parsingContext) { } LogicalQueryPlanNode::LogicalQueryPlanNode( const std::shared_ptr<ParsingContext>& parsingContext, const std::shared_ptr<LogicalOperator>& logicalOperator, const std::vector<std::shared_ptr<LogicalQueryPlanNode> > &childNodes): _logicalOperator(logicalOperator), _childNodes(childNodes), _parsingContext(parsingContext) { } const ArrayDesc& LogicalQueryPlanNode::inferTypes(std::shared_ptr< Query> query) { std::vector<ArrayDesc> inputSchemas; ArrayDesc outputSchema; for (size_t i=0, end=_childNodes.size(); i<end; i++) { inputSchemas.push_back(_childNodes[i]->inferTypes(query)); } outputSchema = _logicalOperator->inferSchema(inputSchemas, query); //FIXME: May be cover inferSchema method with another one and assign alias there? 
if (!_logicalOperator->getAliasName().empty()) { outputSchema.addAlias(_logicalOperator->getAliasName()); } _logicalOperator->setSchema(outputSchema); LOG4CXX_DEBUG(logger, "Inferred schema for operator " << _logicalOperator->getLogicalName() << ": " << outputSchema); return _logicalOperator->getSchema(); } void LogicalQueryPlanNode::inferArrayAccess(std::shared_ptr<Query>& query) { //XXX TODO: consider non-recursive implementation for (size_t i=0, end=_childNodes.size(); i<end; i++) { _childNodes[i]->inferArrayAccess(query); } assert(_logicalOperator); _logicalOperator->inferArrayAccess(query); } std::string LogicalQueryPlanNode::inferPermissions(std::shared_ptr<Query>& query) { std::stringstream ss; // Consider non-recursive implementation for (size_t i=0, end=_childNodes.size(); i<end; i++) { ss << _childNodes[i]->inferPermissions(query); } assert(_logicalOperator); ss << _logicalOperator->inferPermissions(query); std::string permissions = ss.str(); // Remove duplicates std::map<char, int> hashMap; std::string::const_iterator itStr; for ( itStr = permissions.begin(); itStr != permissions.end(); ++itStr) { // Add/overwrite the node in the hashMap hashMap[*itStr] = 1; } std::map<char, int>::const_iterator itHashMap; std::string response; for ( itHashMap = hashMap.begin(); itHashMap != hashMap.end(); ++itHashMap) { // Append each character from the hashMap response += itHashMap->first; } return response; } void LogicalQueryPlanNode::toString(std::ostream &out, int indent, bool children) const { Indent prefix(indent); out << prefix('>', false); out << "[lInstance] children "<<_childNodes.size()<<"\n"; _logicalOperator->toString(out,indent+1); if (children) { for (size_t i = 0; i< _childNodes.size(); i++) { _childNodes[i]->toString(out, indent+1); } } } PhysicalQueryPlanNode::PhysicalQueryPlanNode(const std::shared_ptr<PhysicalOperator>& physicalOperator, bool ddl, bool tile) : _physicalOperator(physicalOperator), _parent(), _ddl(ddl), _tile(tile), 
_isSgMovable(true), _isSgOffsetable(true), _distribution() { } PhysicalQueryPlanNode::PhysicalQueryPlanNode(const std::shared_ptr<PhysicalOperator>& physicalOperator, const std::vector<std::shared_ptr<PhysicalQueryPlanNode> > &childNodes, bool ddl, bool tile): _physicalOperator(physicalOperator), _childNodes(childNodes), _parent(), _ddl(ddl), _tile(tile), _isSgMovable(true), _isSgOffsetable(true), _distribution() { } void PhysicalQueryPlanNode::toString(std::ostream &out, int indent, bool children) const { Indent prefix(indent); out << prefix('>', false); out<<"[pNode] "<<_physicalOperator->getPhysicalName()<<" ddl "<<isDdl()<<" tile "<<supportsTileMode()<<" children "<<_childNodes.size()<<"\n"; _physicalOperator->toString(out,indent+1); if (children) { out << prefix(' '); out << "output full chunks: "; out << (outputFullChunks() ? "yes" : "no"); out << "\n"; out << prefix(' '); out << "changes dstribution: "; out << (changesDistribution() ? "yes" : "no"); out << "\n"; } out << prefix(' '); out<<"props sgm "<<_isSgMovable<<" sgo "<<_isSgOffsetable<<"\n"; out << prefix(' '); out<<"diout "<<_distribution<<"\n"; const ArrayDesc& schema = _physicalOperator->getSchema(); out << prefix(' '); out<<"bound "<<_boundaries <<" cells "<<_boundaries.getNumCells(); if (_boundaries.getStartCoords().size() == schema.getDimensions().size()) { out << " chunks "; try { uint64_t n = _boundaries.getNumChunks(schema.getDimensions()); out << n; } catch (PhysicalBoundaries::UnknownChunkIntervalException&) { out << '?'; } out << " est_bytes " << _boundaries.getSizeEstimateBytes(schema) << '\n'; } else { out <<" [improperly initialized]\n"; } if (children) { for (size_t i = 0; i< _childNodes.size(); i++) { _childNodes[i]->toString(out, indent+1); } } } bool PhysicalQueryPlanNode::isStoringSg() const { if ( isSgNode() ) { return (!getSgArrayName(_physicalOperator->getParameters()).empty()); } return false; } string PhysicalQueryPlanNode::getSgArrayName(const PhysicalOperator::Parameters& 
sgParameters) { std::string arrayName; if (sgParameters.size() >= 3) { arrayName = static_cast<OperatorParamReference*>(sgParameters[2].get())->getObjectName(); } return arrayName; } bool PhysicalQueryPlanNode::getRedimensionIsStrict(const PhysicalOperator::Parameters& redimParameters) { bool isStrict = true; if (redimParameters.size() == 2 && redimParameters[1]->getParamType() == scidb::PARAM_PHYSICAL_EXPRESSION) { OperatorParamPhysicalExpression* paramExpr = static_cast<OperatorParamPhysicalExpression*>(redimParameters[1].get()); SCIDB_ASSERT(paramExpr->isConstant()); isStrict = paramExpr->getExpression()->evaluate().getBool(); } return isStrict; } bool PhysicalQueryPlanNode::getInputIsStrict(const PhysicalOperator::Parameters& inputParameters) { bool isStrict = true; if (inputParameters.size() == 6 && inputParameters[5]->getParamType() == scidb::PARAM_PHYSICAL_EXPRESSION) { OperatorParamPhysicalExpression* paramExpr = static_cast<OperatorParamPhysicalExpression*>(inputParameters[5].get()); SCIDB_ASSERT(paramExpr->isConstant()); isStrict = paramExpr->getExpression()->evaluate().getBool(); } else if (inputParameters.size() == 7) { ASSERT_EXCEPTION((inputParameters[6]->getParamType() == scidb::PARAM_PHYSICAL_EXPRESSION), "Invalid input() parameters 6"); OperatorParamPhysicalExpression* paramExpr = static_cast<OperatorParamPhysicalExpression*>(inputParameters[6].get()); SCIDB_ASSERT(paramExpr->isConstant()); isStrict = paramExpr->getExpression()->evaluate().getBool(); } return isStrict; } void PhysicalQueryPlanNode::supplantChild(const PhysNodePtr& targetChild, const PhysNodePtr& newChild) { assert(newChild); assert(targetChild); assert(newChild.get() != this); int removed = 0; std::vector<PhysNodePtr> newChildren; if (logger->isTraceEnabled()) { std::ostringstream os; os << "Supplanting targetChild Node:\n"; targetChild->toString(os, 0 /*indent*/,false /*children*/); os << "\nwith\n"; newChild->toString(os, 0 /*indent*/,false /*children*/); LOG4CXX_TRACE(logger, 
os.str()); } for(auto &child : _childNodes) { if (child != targetChild) { newChildren.push_back(child); } else { // Set the parent of the newChild to this node. newChild->_parent = shared_from_this(); // NOTE: Any existing children of the newChild are removed from the // Query Plan. if ((newChild->_childNodes).size() > 0) { LOG4CXX_INFO(logger, "Child nodes of supplanting node are being removed from the tree."); } // Re-parent the children of the targetChild to the newChild newChild->_childNodes.swap(targetChild->_childNodes); for (auto grandchild : newChild -> _childNodes) { assert(grandchild != newChild); grandchild->_parent = newChild; } // Remove any references to the children from the targetChild targetChild->_childNodes.clear(); targetChild->resetParent(); // Add the newChild to this node newChildren.push_back(newChild); ++removed; } } _childNodes.swap(newChildren); if (logger->isTraceEnabled()) { std::ostringstream os; newChild->toString(os); LOG4CXX_TRACE(logger, "New Node subplan:\n" << os.str()); } SCIDB_ASSERT(removed==1); } // LogicalPlan LogicalPlan::LogicalPlan(const std::shared_ptr<LogicalQueryPlanNode>& root): _root(root) { } void LogicalPlan::toString(std::ostream &out, int indent, bool children) const { Indent prefix(indent); out << prefix('>', false); out << "[lPlan]:\n"; _root->toString(out, indent+1, children); } // PhysicalPlan PhysicalPlan::PhysicalPlan(const std::shared_ptr<PhysicalQueryPlanNode>& root): _root(root) { } void PhysicalPlan::toString(std::ostream &out, int const indent, bool children) const { Indent prefix(indent); out << prefix('>', false); out << "[pPlan]:"; if (_root.get() != NULL) { out << "\n"; _root->toString(out, indent+1, children); } else { out << "[NULL]\n"; } } } // namespace