/**
 * Pop the next message queued for this launch and verify that it is the
 * expected terminal message, then release the slave connection.
 *
 * The message is obtained via ctx->popMsg() for _launchId; popMsg() is handed
 * a checker bound to checkForTimeout() with the current time and
 * _MPI_SLAVE_RESPONSE_TIMEOUT.
 *
 * @param ctx operator context from which the message for _launchId is popped
 * @throws InvalidStateException if no connection is currently held
 * @throws SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR) if the
 *         popped message is not of type scidb::SYSTEM_NONE_MSG_ID
 */
void MpiSlaveProxy::waitForExit(boost::shared_ptr<MpiOperatorContext>& ctx)
{
    // Waiting for an exit only makes sense while a connection is held.
    if (!_connection) {
        throw (InvalidStateException(REL_FILE, __FUNCTION__, __LINE__)
               << "No connection to MPI slave");
    }

    // Capture the wait start and the allowed duration for the checker.
    const double waitStart = getTimeInSecs();
    const double maxWait = static_cast<double>(_MPI_SLAVE_RESPONSE_TIMEOUT);
    MpiOperatorContext::LaunchErrorChecker timeoutChecker =
        boost::bind(&checkForTimeout, waitStart, maxWait, _1, _2);

    boost::shared_ptr<scidb::ClientMessageDescription> exitMsg =
        ctx->popMsg(_launchId, timeoutChecker);
    assert(exitMsg);

    LOG4CXX_DEBUG(logger, "MpiSlaveProxy::waitForExit: "
                  <<" ctx = " << exitMsg->getClientContext().get()
                  <<", msg type = "<< exitMsg->getMessageType()
                  <<", queryID = "<<exitMsg->getQueryId());

    // Anything other than SYSTEM_NONE_MSG_ID is rejected.
    if (exitMsg->getMessageType() != scidb::SYSTEM_NONE_MSG_ID) {
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
               << "MPI slave returned invalid status");
    }

    // The accepted message is expected to carry no client context.
    assert(!exitMsg->getClientContext());

    // Drop our reference to the slave connection.
    _connection.reset();
}
/**
 * Checker that throws once more than @p timeout seconds have elapsed since
 * @p startTime (both compared against getTimeInSecs()).
 *
 * @param startTime time, as reported by getTimeInSecs(), when waiting began
 * @param timeout   maximum allowed elapsed time
 * @param launchId  unused in this function
 * @param ctx       unused in this function
 * @return true whenever the elapsed time does not exceed @p timeout
 * @throws SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_OPERATION_FAILED) when
 *         the elapsed time exceeds @p timeout
 */
static bool checkForTimeout(double startTime, double timeout,
                            uint64_t launchId, MpiOperatorContext* ctx)
{
    const bool timedOut = (getTimeInSecs() - startTime) > timeout;
    if (!timedOut) {
        return true;
    }
    throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_OPERATION_FAILED)
           << "MPI slave process failed to communicate in time");
}
// Example no. 3
// 0
/**
 * Report whether at least @p timeout seconds have passed since @p startTime.
 *
 * @param startTime time, as reported by getTimeInSecs(), when timing began
 * @param timeout   allowed duration; a negative value never expires
 * @return true if the elapsed time is greater than or equal to @p timeout,
 *         false otherwise (always false for a negative @p timeout)
 */
bool hasExpired(double startTime, double timeout)
{
    // Negative timeout: never expires, and the clock is not consulted.
    if (timeout < 0) {
        return false;
    }
    return (getTimeInSecs() - startTime) >= timeout;
}
/**
 * Pop the handshake message for this launch, validate it, and bind the slave
 * connection to the current query.
 *
 * Steps, in order:
 *  1. pop the message for _launchId from @p ctx (popMsg() is given a checker
 *     bound to checkLauncher() with the current time and
 *     _MPI_SLAVE_RESPONSE_TIMEOUT);
 *  2. store the message's client context in _connection;
 *  3. require message type scidb::mtMpiSlaveHandshake and the presence of
 *     pid and ppid, which are appended to _pids;
 *  4. compare cluster UUID, instance ID, launch ID and rank against the local
 *     cluster, _launchId and the query's instance ID;
 *  5. attach the query to the connection with a disconnect handler bound to
 *     MpiMessageHandler::handleMpiSlaveDisconnect and _launchId.
 *
 * Note that _connection (step 2) and _pids (step 3) are updated before the
 * later checks run, so they stay modified if one of those checks throws.
 *
 * @param ctx operator context from which the message for _launchId is popped
 * @throws InvalidStateException if a connection is already held
 * @throws SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR) if any
 *         handshake check fails
 */
void MpiSlaveProxy::waitForHandshake(boost::shared_ptr<MpiOperatorContext>& ctx)
{
    // A handshake is only awaited while no connection is held yet.
    if (_connection) {
        throw (InvalidStateException(REL_FILE, __FUNCTION__, __LINE__)
               << "Connection to MPI slave already established");
    }

    const double waitStart = getTimeInSecs();
    const double maxWait = static_cast<double>(_MPI_SLAVE_RESPONSE_TIMEOUT);
    MpiOperatorContext::LaunchErrorChecker launchChecker =
        boost::bind(&checkLauncher, waitStart, maxWait, _1, _2);

    boost::shared_ptr<scidb::ClientMessageDescription> slaveMsg =
        ctx->popMsg(_launchId, launchChecker);
    assert(slaveMsg);

    // Remember the connection before any validation takes place.
    _connection = slaveMsg->getClientContext();

    if (slaveMsg->getMessageType() != scidb::mtMpiSlaveHandshake) {
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
            << "MPI slave handshake is invalid");
    }

    boost::shared_ptr<scidb_msg::MpiSlaveHandshake> hs =
        boost::dynamic_pointer_cast<scidb_msg::MpiSlaveHandshake>(slaveMsg->getRecord());
    assert(hs);

    // Both process IDs must be present in the handshake.
    if (!hs->has_pid()) {
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
            << "MPI slave handshake has no PID");
    }
    if (!hs->has_ppid()) {
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
               << "MPI slave handshake has no PPID");
    }

    // Record pid first, then ppid.
    _pids.push_back(hs->pid());
    _pids.push_back(hs->ppid());

    // The handshake must come from this cluster ...
    string expectedUuid = Cluster::getInstance()->getUuid();
    if (hs->cluster_uuid() != expectedUuid) {
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
               << "MPI slave handshake has invalid clusterUuid");
    }

    // ... name this local instance ...
    InstanceID localInstance = Cluster::getInstance()->getLocalInstanceId();
    if (hs->instance_id() != localInstance) {
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
               << "MPI slave handshake has invalid instanceId");
    }

    // ... and belong to the launch this proxy represents.
    if (hs->launch_id() != _launchId) {
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
               << "MPI slave handshake has invalid launchId");
    }

    boost::shared_ptr<scidb::Query> liveQuery(_query.lock());
    Query::validateQueryPtr(liveQuery);

    // The rank is compared against the query's logical instance ID.
    if (hs->rank() != liveQuery->getInstanceID()) {
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
               << "MPI slave handshake has invalid rank");
    }

    // Attach the query to the connection together with a disconnect handler.
    ClientContext::DisconnectHandler onDisconnect =
        boost::bind(&MpiMessageHandler::handleMpiSlaveDisconnect, _launchId, _1);
    _connection->attachQuery(liveQuery->getQueryID(), onDisconnect);

    LOG4CXX_DEBUG(logger, "MpiSlaveProxy::waitForHandshake: handshake: "
                  <<" pid="<<hs->pid()
                  <<", ppid="<<hs->ppid()
                  <<", cluster_uuid="<<hs->cluster_uuid()
                  <<", instance_id="<<hs->instance_id()
                  <<", launch_id="<<hs->launch_id()
                  <<", rank="<<hs->rank());
}