void start()
    {
        ActivityTimer s(totalCycles, timeActivities, NULL);
        dataLinkStart();

        eogPending = false;
        if (container.queryLocal() || firstNode())
        {
            CMessageBuffer reqMsg;
            reqMsg.setReplyTag(replyTag);
            reqMsg.append(smt_actMsg);
            reqMsg.append(container.queryOwner().queryGraphId());
            reqMsg.append(container.queryId());

            if (!container.queryJob().queryJobComm().sendRecv(reqMsg, 0, container.queryJob().querySlaveMpTag(), LONGTIMEOUT))
                throwUnexpected();

            masterReplyMsg.swapWith(reqMsg);
        }
    }
    void doBroadcast()
    {
        try
        {
            unsigned i = 0;
            unsigned n;
            if (1 == nodeindex)
                n = broadcastSlave-1;
            else if (broadcastSlave==nodeindex)
                n = 0;
            else
                n = nodeindex-1;
            loop {
                unsigned t = target(i++,n);
                if (t>numnodes)
                    break;
                if (t != broadcastSlave)
                {
#ifdef _TRACEBROADCAST
                    ActPrintLog(activity, "Broadcast node %d Sending to node %d size %d",nodeindex,t,broadcasting.length());
#endif
                    mptag_t rt = createReplyTag();
                    broadcasting.setReplyTag(rt); // simulate sendRecv
                    comm->send(broadcasting, t, mpTag);     
                    CMessageBuffer rMsg;
                    comm->recv(rMsg, t, rt);                    
#ifdef _TRACEBROADCAST
                    ActPrintLog(activity, "Broadcast node %d Sent to node %d size %d received back %d",nodeindex,t,broadcasting.length(),rMsg.length());
#endif
                }
            }
        }
        catch (IException *e)
        {
            ActPrintLog(activity, e, "CBroadcaster::broadcast exception");
            throw;
        }
#ifdef _TRACEBROADCAST
        ActPrintLog(activity, "do broadcast done done");
#endif
    }
Beispiel #3
0
static bool RegisterSelf(SocketEndpoint &masterEp)
{
    StringBuffer slfStr;
    StringBuffer masterStr;
    LOG(MCdebugProgress, thorJob, "registering %s - master %s",slfEp.getUrlStr(slfStr).str(),masterEp.getUrlStr(masterStr).str());
    try
    {
        SocketEndpoint ep = masterEp;
        ep.port = getFixedPort(getMasterPortBase(), TPORT_mp);
        Owned<INode> masterNode = createINode(ep);
        CMessageBuffer msg;
        if (!queryWorldCommunicator().recv(msg, masterNode, MPTAG_THORREGISTRATION))
            return false;
        PROGLOG("Initialization received");
        unsigned vmajor, vminor;
        msg.read(vmajor);
        msg.read(vminor);
        if (vmajor != THOR_VERSION_MAJOR || vminor != THOR_VERSION_MINOR)
        {
            replyError(TE_FailedToRegisterSlave, "Thor master/slave version mismatch");
            return false;
        }
        Owned<IGroup> rawGroup = deserializeIGroup(msg);
        globals->Release();
        globals = createPTree(msg);
        mergeCmdParams(globals); // cmd line

        unsigned slavesPerNode = globals->getPropInt("@slavesPerNode", 1);
        unsigned channelsPerSlave = globals->getPropInt("@channelsPerSlave", 1);
        unsigned localThorPortInc = globals->getPropInt("@localThorPortInc", DEFAULT_SLAVEPORTINC);
        unsigned slaveBasePort = globals->getPropInt("@slaveport", DEFAULT_THORSLAVEPORT);
        setClusterGroup(masterNode, rawGroup, slavesPerNode, channelsPerSlave, slaveBasePort, localThorPortInc);

        unsigned numStrands, blockSize;
        if (globals->hasProp("Debug/@forceNumStrands"))
            numStrands = globals->getPropInt("Debug/@forceNumStrands");
        else
        {
            numStrands = defaultForceNumStrands;
            globals->setPropInt("Debug/@forceNumStrands", defaultForceNumStrands);
        }
        if (globals->hasProp("Debug/@strandBlockSize"))
            blockSize = globals->getPropInt("Debug/@strandBlockSize");
        else
        {
            blockSize = defaultStrandBlockSize;
            globals->setPropInt("Debug/@strandBlockSize", defaultStrandBlockSize);
        }
        PROGLOG("Strand defaults: numStrands=%u, blockSize=%u", numStrands, blockSize);

        const char *_masterBuildTag = globals->queryProp("@masterBuildTag");
        const char *masterBuildTag = _masterBuildTag?_masterBuildTag:"no build tag";
        PROGLOG("Master build: %s", masterBuildTag);
        if (!_masterBuildTag || 0 != strcmp(BUILD_TAG, _masterBuildTag))
        {
            StringBuffer errStr("Thor master/slave build mismatch, master = ");
            errStr.append(masterBuildTag).append(", slave = ").append(BUILD_TAG);
            ERRLOG("%s", errStr.str());
#ifndef _DEBUG
            replyError(TE_FailedToRegisterSlave, errStr.str());
            return false;
#endif
        }
        msg.read((unsigned &)masterSlaveMpTag);
        msg.clear();
        msg.setReplyTag(MPTAG_THORREGISTRATION);
        if (!queryNodeComm().reply(msg))
            return false;

        PROGLOG("Registration confirmation sent");
        if (!queryNodeComm().recv(msg, 0, MPTAG_THORREGISTRATION)) // when all registered
            return false;

        ::masterNode = LINK(masterNode);

        PROGLOG("verifying mp connection to rest of cluster");
        if (!queryNodeComm().verifyAll())
            ERRLOG("Failed to connect to all nodes");
        else
            PROGLOG("verified mp connection to rest of cluster");
        LOG(MCdebugProgress, thorJob, "registered %s",slfStr.str());
    }
    catch (IException *e)
    {
        FLLOG(MCexception(e), thorJob, e,"slave registration error");
        e->Release();
        return false;
    }
    return true;
}
static bool RegisterSelf(SocketEndpoint &masterEp)
{
    StringBuffer slfStr;
    StringBuffer masterStr;
    LOG(MCdebugProgress, thorJob, "registering %s - master %s",slfEp.getUrlStr(slfStr).toCharArray(),masterEp.getUrlStr(masterStr).toCharArray());
    try
    {
        SocketEndpoint ep = masterEp;
        ep.port = getFixedPort(getMasterPortBase(), TPORT_mp);
        Owned<INode> masterNode = createINode(ep);
        CMessageBuffer msg;
        if (!queryWorldCommunicator().recv(msg, masterNode, MPTAG_THORREGISTRATION))
            return false;
        PROGLOG("Initialization received");
        unsigned vmajor, vminor;
        msg.read(vmajor);
        msg.read(vminor);
        if (vmajor != THOR_VERSION_MAJOR || vminor != THOR_VERSION_MINOR)
        {
            replyError("Thor master/slave version mismatch");
            return false;
        }
        Owned<IGroup> group = deserializeIGroup(msg);
        setClusterGroup(group);

        SocketEndpoint myEp = queryMyNode()->endpoint();
        rank_t groupPos = group->rank(queryMyNode());
        if (RANK_NULL == groupPos)
        {
            replyError("Node not part of thorgroup");
            return false;
        }
        if (globals->hasProp("@SLAVENUM") && (mySlaveNum != (unsigned)groupPos))
        {
            VStringBuffer errStr("Slave group rank[%d] does not match provided cmd line slaveNum[%d]", mySlaveNum, (unsigned)groupPos);
            replyError(errStr.str());
            return false;
        }
        globals->Release();
        globals = createPTree(msg);
        mergeCmdParams(globals); // cmd line 

        const char *_masterBuildTag = globals->queryProp("@masterBuildTag");
        const char *masterBuildTag = _masterBuildTag?_masterBuildTag:"no build tag";
        PROGLOG("Master build: %s", masterBuildTag);
#ifndef _DEBUG
        if (!_masterBuildTag || 0 != strcmp(BUILD_TAG, _masterBuildTag))
        {
            StringBuffer errStr("Thor master/slave build mismatch, master = ");
            replyError(errStr.append(masterBuildTag).append(", slave = ").append(BUILD_TAG).str());
            return false;
        }
#endif
        msg.read((unsigned &)masterSlaveMpTag);
        msg.clear();
        msg.setReplyTag(MPTAG_THORREGISTRATION);
        if (!queryClusterComm().reply(msg))
            return false;

        PROGLOG("Registration confirmation sent");
        if (!queryClusterComm().recv(msg, 0, MPTAG_THORREGISTRATION)) // when all registered
            return false;
        PROGLOG("verifying mp connection to rest of cluster");
        if (!queryClusterComm().verifyAll())
            ERRLOG("Failed to connect to all nodes");
        else
            PROGLOG("verified mp connection to rest of cluster");
        ::masterNode = LINK(masterNode);
        LOG(MCdebugProgress, thorJob, "registered %s",slfStr.toCharArray());
    }
    catch (IException *e)
    {
        FLLOG(MCexception(e), thorJob, e,"slave registration error");
        e->Release();
        return false;
    }
    return true;
}