/**
 * Pop the next message queued for this launch and drop the slave connection.
 *
 * The message is obtained from ctx->popMsg() using an error checker bound to
 * checkForTimeout(), the current time, and _MPI_SLAVE_RESPONSE_TIMEOUT.
 * NOTE(review): presumably popMsg() invokes the checker while it waits for a
 * message -- confirm against MpiOperatorContext.
 *
 * @param ctx operator context from which the message for _launchId is popped
 * @throws InvalidStateException if there is no connection to the slave
 * @throws SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR) if the
 *         popped message is not of type scidb::SYSTEM_NONE_MSG_ID
 */
void MpiSlaveProxy::waitForExit(boost::shared_ptr<MpiOperatorContext>& ctx)
{
    if (!_connection) {
        throw (InvalidStateException(REL_FILE, __FUNCTION__, __LINE__)
               << "No connection to MPI slave");
    }

    // The checker carries the wait's start time and the allowed duration.
    const double waitStart = getTimeInSecs();
    const double waitLimit = static_cast<double>(_MPI_SLAVE_RESPONSE_TIMEOUT);
    MpiOperatorContext::LaunchErrorChecker timeoutCheck =
        boost::bind(&checkForTimeout, waitStart, waitLimit, _1, _2);

    boost::shared_ptr<scidb::ClientMessageDescription> reply =
        ctx->popMsg(_launchId, timeoutCheck);
    assert(reply);

    LOG4CXX_DEBUG(logger, "MpiSlaveProxy::waitForExit: "
                  <<" ctx = " << reply->getClientContext().get()
                  <<", msg type = "<< reply->getMessageType()
                  <<", queryID = "<<reply->getQueryId());

    // Only a SYSTEM_NONE_MSG_ID message is accepted here.
    const bool statusIsNone =
        (reply->getMessageType() == scidb::SYSTEM_NONE_MSG_ID);
    if (!statusIsNone) {
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
               << "MPI slave returned invalid status");
    }

    // The accepted message is expected to carry no client context.
    assert(!reply->getClientContext());
    _connection.reset();
}
/**
 * Error checker that fails once more than @p timeout seconds have elapsed
 * since @p startTime (both compared against getTimeInSecs()).
 *
 * @param startTime time at which the wait began, as returned by getTimeInSecs()
 * @param timeout   maximum allowed elapsed time
 * @param launchId  unused by this checker
 * @param ctx       unused by this checker
 * @return true whenever the deadline has not been exceeded
 * @throws SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_OPERATION_FAILED) when
 *         the elapsed time is strictly greater than @p timeout
 */
static bool checkForTimeout(double startTime,
                            double timeout,
                            uint64_t launchId,
                            MpiOperatorContext* ctx)
{
    const bool deadlinePassed = (getTimeInSecs() - startTime) > timeout;
    if (!deadlinePassed) {
        return true;
    }
    throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_OPERATION_FAILED)
           << "MPI slave process failed to communicate in time");
}
/**
 * Report whether at least @p timeout seconds have elapsed since @p startTime.
 *
 * @param startTime reference time, compared against getTimeInSecs()
 * @param timeout   allowed duration; a negative value means "never expires"
 * @return false if @p timeout is negative; otherwise true exactly when the
 *         elapsed time is greater than or equal to @p timeout
 */
bool hasExpired(double startTime, double timeout)
{
    // A negative timeout disables expiry; the clock is not consulted.
    if (timeout < 0) {
        return false;
    }
    return (getTimeInSecs() - startTime) >= timeout;
}
/**
 * Pop the handshake message for this launch, validate it, and attach the
 * slave connection to the current query.
 *
 * The message is obtained from ctx->popMsg() with an error checker bound to
 * checkLauncher(), the current time, and _MPI_SLAVE_RESPONSE_TIMEOUT.
 * The handshake must carry a PID and a PPID, and its cluster UUID, instance
 * id, launch id and rank must match the local cluster UUID, the local
 * instance id, _launchId and the query's instance id, respectively.
 *
 * Side effects visible in this function: _connection is assigned from the
 * message's client context before any validation, and the PID/PPID are
 * appended to _pids before the remaining fields are checked, so both remain
 * set if a later check throws.
 *
 * @param ctx operator context from which the message for _launchId is popped
 * @throws InvalidStateException if a connection is already established
 * @throws SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR) if the
 *         message type or any handshake field is invalid
 */
void MpiSlaveProxy::waitForHandshake(boost::shared_ptr<MpiOperatorContext>& ctx)
{
    if (_connection) {
        throw (InvalidStateException(REL_FILE, __FUNCTION__, __LINE__)
               << "Connection to MPI slave already established");
    }

    MpiOperatorContext::LaunchErrorChecker errChecker =
        boost::bind(&checkLauncher,
                    getTimeInSecs(),
                    static_cast<double>(_MPI_SLAVE_RESPONSE_TIMEOUT),
                    _1, _2);
    boost::shared_ptr<scidb::ClientMessageDescription> msg =
        ctx->popMsg(_launchId, errChecker);
    assert(msg);

    // Assigned before validation; stays set if a check below throws.
    _connection = msg->getClientContext();

    if (msg->getMessageType() != scidb::mtMpiSlaveHandshake) {
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
               << "MPI slave handshake is invalid");
    }

    boost::shared_ptr<scidb_msg::MpiSlaveHandshake> handshake =
        boost::dynamic_pointer_cast<scidb_msg::MpiSlaveHandshake>(msg->getRecord());
    assert(handshake);

    // parse the handshake
    if (!handshake->has_pid()) {
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
               << "MPI slave handshake has no PID");
    }
    if (!handshake->has_ppid()) {
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
               << "MPI slave handshake has no PPID");
    }
    _pids.push_back(handshake->pid());
    _pids.push_back(handshake->ppid());

    string clusterUuid = Cluster::getInstance()->getUuid();
    if (handshake->cluster_uuid() != clusterUuid) {
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
               << "MPI slave handshake has invalid clusterUuid");
    }

    InstanceID instanceId = Cluster::getInstance()->getLocalInstanceId();
    if (handshake->instance_id() != instanceId) {
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
               << "MPI slave handshake has invalid instanceId");
    }

    if (handshake->launch_id() != _launchId) {
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
               << "MPI slave handshake has invalid launchId");
    }

    boost::shared_ptr<scidb::Query> query( _query.lock());
    Query::validateQueryPtr(query);

    // logical instance ID
    if (handshake->rank() != query->getInstanceID()) {
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
               << "MPI slave handshake has invalid rank");
    }

    // Register a disconnect handler for this launch on the slave connection.
    ClientContext::DisconnectHandler dh =
        boost::bind(&MpiMessageHandler::handleMpiSlaveDisconnect, _launchId, _1);
    _connection->attachQuery(query->getQueryID(), dh);

    LOG4CXX_DEBUG(logger, "MpiSlaveProxy::waitForHandshake: handshake: "
                  <<" pid="<<handshake->pid()
                  <<", ppid="<<handshake->ppid()
                  <<", cluster_uuid="<<handshake->cluster_uuid()
                  <<", instance_id="<<handshake->instance_id()
                  <<", launch_id="<<handshake->launch_id()
                  <<", rank="<<handshake->rank());
}