void sendResult(const void *row, IOutputRowSerializer *serializer)
 {
     CMessageBuffer mb;
     size32_t start = mb.length();
     size32_t sz = 0;
     mb.append(sz);
     if (row&&hadElement) {
         CMemoryRowSerializer mbs(mb);
         serializer->serialize(mbs,(const byte *)row);
         sz = mb.length()-start-sizeof(size32_t);
         mb.writeDirect(start,sizeof(size32_t),&sz);
     }
     container.queryJob().queryJobComm().send(mb, 0, masterMpTag);
 }
const void *CPartialResultAggregator::getResult()
{
    CMessageBuffer mb;
    if (activity.receiveMsg(mb, 0, activity.queryMpTag()))
    {
        if (mb.length())
        {
            CThorStreamDeserializerSource ds(mb.length(), mb.readDirect(mb.length()));
            RtlDynamicRowBuilder rowBuilder(activity.queryRowAllocator());
            size32_t sz = activity.queryRowDeserializer()->deserialize(rowBuilder,ds);
            return rowBuilder.finalizeRowClear(sz);
        }
    }
    return NULL;
}
    virtual void process()
    {
        CChooseSetsActivityMaster::process();

        IHThorChooseSetsArg *helper = (IHThorChooseSetsArg *)queryHelper();
        unsigned numSets = helper->getNumSets();
        unsigned nslaves = container.queryJob().querySlaves();

        MemoryBuffer countMb;
        rowcount_t *counts = (rowcount_t *)countMb.reserveTruncate((numSets*(nslaves+2)) * sizeof(rowcount_t));
        rowcount_t *totals = counts + nslaves*numSets;
        rowcount_t *tallies = totals + numSets;
        memset(counts, 0, countMb.length());

        unsigned s=nslaves;
        CMessageBuffer msg;
        while (s--)
        {
            msg.clear();
            rank_t sender;
            if (!receiveMsg(msg, RANK_ALL, mpTag, &sender))
                return;
            assertex(msg.length() == numSets*sizeof(rowcount_t));
            unsigned set = (unsigned)sender - 1;
            memcpy(&counts[set*numSets], msg.toByteArray(), numSets*sizeof(rowcount_t));
        }
        for (s=0; s<nslaves; s++)
        {
            unsigned i=0;
            for (; i<numSets; i++)
                totals[i] += counts[s * numSets + i];
        }
        msg.clear();
        msg.append(numSets*sizeof(rowcount_t), totals);
        unsigned endTotalsPos = msg.length();
        for (s=0; s<nslaves; s++)
        {
            msg.rewrite(endTotalsPos);
            msg.append(numSets*sizeof(rowcount_t), tallies);
            container.queryJob().queryJobComm().send(msg, s+1, mpTag);
            unsigned i=0;
            for (; i<numSets; i++)
                tallies[i] += counts[s * numSets + i];
        }
    }
 virtual void flushResults(bool complete=false)
 {
     if (resultData.length() || complete)
     {
         ActPrintLog("flushing result");
         addResult(numResults, resultData, complete);
         resultData.clear();
         ActPrintLog("result flushed");
     }
 }
Beispiel #5
0
const void *getAggregate(CActivityBase &activity, unsigned partialResults, IRowInterfaces &rowIf, IHThorCompoundAggregateExtra &aggHelper, mptag_t mpTag)
{
    // JCSMORE - pity this isn't common routine with similar one in aggregate, but helper is not common
    CThorRowArray slaveResults;
    slaveResults.ensure(partialResults);
    unsigned _partialResults = partialResults;
    while (_partialResults--)
    {
        CMessageBuffer mb;
        rank_t sender;
        if (!activity.receiveMsg(mb, RANK_ALL, mpTag, &sender)) return false;
        if (activity.queryAbortSoon()) return 0;
        if (mb.length())
        {
            CThorStreamDeserializerSource ds(mb.length(), mb.readDirect(mb.length()));
            RtlDynamicRowBuilder rowBuilder(rowIf.queryRowAllocator());
            size32_t sz = rowIf.queryRowDeserializer()->deserialize(rowBuilder, ds);
            slaveResults.setRow(sender-1, rowBuilder.finalizeRowClear(sz));
        }
    }
    RtlDynamicRowBuilder result(rowIf.queryRowAllocator(), false);
    size32_t sz;
    bool first = true;
    _partialResults = 0;
    for (;_partialResults<partialResults; _partialResults++)
    {
        const void *partialResult = slaveResults.item(_partialResults);
        if (partialResult)
        {
            if (first)
            {
                first = false;
                sz = cloneRow(result, slaveResults.item(_partialResults), rowIf.queryRowMetaData());
            }
            else
                sz = aggHelper.mergeAggregate(result, partialResult);
        }
    }
    if (first)
        sz = aggHelper.clearAggregate(result);
    return result.finalizeRowClear(sz);
}
Beispiel #6
0
    void process()
    {
        processed = 0;

        input = inputs.item(0);
        startInput(input);

        processed = THORDATALINK_STARTED;

        OwnedConstThorRow row = input->ungroupedNextRow();
        CMessageBuffer mb;
        size32_t lenpos = mb.length(); // its 0 really
        mb.append((size32_t)0);
        if (row) {
            CMemoryRowSerializer msz(mb);
            ::queryRowSerializer(input)->serialize(msz,(const byte *)row.get());
            size32_t sz = mb.length()-lenpos-sizeof(size32_t);
            mb.writeDirect(lenpos,sizeof(size32_t),&sz);
            processed++;
        }
        container.queryJob().queryJobComm().send(mb, 0, masterMpTag);
    }
Beispiel #7
0
bool SortSlaveMP::sendRecv(CMessageBuffer &mb, unsigned timeout)
{
    if (!comm->sendRecv(mb,rank,tag,timeout))
        return false;
    byte ok = 255;
    if (mb.length()) {
        mb.read(ok);
        if (ok==1)
            return true;
        if (ok==0) {
            int err;
            mb.read(err);
            StringAttr errstr;
            mb.read(errstr);
            throw MakeStringException(err, "%s", errstr.get());
        }
    }
    throw MakeStringException(-1,"SortSlaveMP::sendRecv() protocol error %d",(int)ok);
    return false;
}
    void process()
    {
        CWorkUnitWriteMasterBase::process();

        unsigned nslaves = container.queryJob().querySlaves();

        CMessageBuffer mb;
        unsigned s=0;
        for (; s<nslaves; s++)
        {
            loop
            {
                if (!container.queryJob().queryJobComm().send(mb, s+1, mpTag)) return;
                if (!receiveMsg(mb, s+1, mpTag)) return;
                if (0 == mb.length())
                    break;
                unsigned numGot;
                mb.read(numGot);
                unsigned l=mb.remaining();
                if (workunitWriteLimit && totalSize+resultData.length()+l > workunitWriteLimit)
                {
                    StringBuffer errMsg("Dataset too large to output to workunit (limit is set to ");
                    errMsg.append(workunitWriteLimit/0x100000).append(") megabytes, in result (");
                    if (resultName.length())
                        errMsg.append("name=").append(resultName);
                    else
                        errMsg.append("sequence=").append(resultSeq);
                    errMsg.append(")");
                    throw MakeThorException(TE_WorkUnitWriteLimitExceeded, "%s", errMsg.str());
                }
                resultData.append(l, mb.readDirect(l));
                mb.clear();
                numResults += numGot;

                if (-1 != flushThreshold && resultData.length() >= (unsigned)flushThreshold)
                    flushResults();
            }
        }
        flushResults(true);
    }
    virtual void process()
    {
        CMasterActivity::process();

        IHThorDistributionArg * helper = (IHThorDistributionArg *)queryHelper();
        IOutputMetaData *rcSz = helper->queryInternalRecordSize();

        unsigned nslaves = container.queryJob().querySlaves();

        IDistributionTable * * result = (IDistributionTable * *)createThorRow(rcSz->getMinRecordSize()); // not a real row
        helper->clearAggregate(result);


        while (nslaves--)
        {
            rank_t sender;
            CMessageBuffer msg;
            if (!receiveMsg(msg, RANK_ALL, mpTag, &sender))
                return;
#if THOR_TRACE_LEVEL >= 5
            ActPrintLog("Received distribution result from node %d", (unsigned)sender);
#endif
            if (msg.length())
                helper->merge(result, msg);
        }

        StringBuffer tmp;
        tmp.append("<XML>");
        helper->gatherResult(result, tmp);
        tmp.append("</XML>");

#if THOR_TRACE_LEVEL >= 5
        ActPrintLog("Distribution result: %s", tmp.str());
#endif

        helper->sendResult(tmp.length(), tmp.str());

        destroyThorRow(result);
    }
    bool receive(MemoryBuffer &mb)
    {
#ifdef _TRACEBROADCAST
        ActPrintLog(activity, "Broadcast node %d Receiving on tag %d",nodeindex,(int)mpTag);
#endif
        CMessageBuffer msg;
        rank_t sender;
        BooleanOnOff onOff(receiving);
        if (comm->recv(msg, RANK_ALL, mpTag, &sender))
        {
#ifdef _TRACEBROADCAST
            ActPrintLog(activity, "Broadcast node %d Received %d from %d",nodeindex, msg.length(), sender);
#endif
            try
            {
                mb.swapWith(msg);
                msg.clear(); // send empty reply
#ifdef _TRACEBROADCAST
                ActPrintLog(activity, "Broadcast node %d reply to %d",nodeindex, sender);
#endif
                comm->reply(msg);
                if (aborted) 
                    return false;
#ifdef _TRACEBROADCAST
                ActPrintLog(activity, "Broadcast node %d Received %d",nodeindex, mb.length());
#endif
            }
            catch (IException *e)
            {
                ActPrintLog(activity, e, "CBroadcaster::recv(2): exception");
                throw;
            }
        }
#ifdef _TRACEBROADCAST
        ActPrintLog(activity, "receive done");
#endif
        return (0 != mb.length());
    }
Beispiel #11
0
    void processMessage(CMessageBuffer &mb)
    {
        ICoven &coven=queryCoven();
        SessionId id;
        int fn;
        mb.read(fn);
        switch (fn) {
        case MSR_REGISTER_PROCESS_SESSION: {
                acceptConnections.wait();
                acceptConnections.signal();
                Owned<INode> node(deserializeINode(mb));
                Owned<INode> servernode(deserializeINode(mb));  // hopefully me, but not if forwarded
                int role=0;
                if (mb.length()-mb.getPos()>=sizeof(role)) { // a capability block present
                    mb.read(role);
                    if (!manager.authorizeConnection(role,false)) {
                        SocketEndpoint sender = mb.getSender();
                        mb.clear();
                        coven.reply(mb);
                        MilliSleep(100+getRandom()%1000); // Causes client to 'work' for a short time.
                        Owned<INode> node = createINode(sender);
                        coven.disconnect(node);
                        break;
                    }
#ifdef _DEBUG
                    StringBuffer eps;
                    PROGLOG("Connection to %s authorized",mb.getSender().getUrlStr(eps).str());
#endif
                }
                
                IGroup *covengrp;
                id = manager.registerClientProcess(node.get(),covengrp,(DaliClientRole)role);
                mb.clear().append(id);
                if (covengrp->rank(servernode)==RANK_NULL) { // must have been redirected
                    covengrp->Release(); // no good, so just use one we know about (may use something more sophisticated later)
                    INode *na = servernode.get();
                    covengrp = createIGroup(1, &na);
                }
                covengrp->serialize(mb);
                covengrp->Release();
                coven.reply(mb);
            }
            break;
        case MSR_SECONDARY_REGISTER_PROCESS_SESSION: {
                mb.read(id);
                Owned<INode> node (deserializeINode(mb));
                int role;
                mb.read(role);
                manager.addProcessSession(id,node.get(),(DaliClientRole)role);
                mb.clear();
                coven.reply(mb);
            }
            break;
        case MSR_REGISTER_SESSION: {
                SecurityToken tok;
                SessionId parentid;
                mb.read(tok).read(parentid);
                SessionId id = manager.registerSession(tok,parentid);
                mb.clear().append(id);
                coven.reply(mb);
            }
            break;
        case MSR_SECONDARY_REGISTER_SESSION: {
                mb.read(id);
                manager.addSession(id);
                mb.clear();
                coven.reply(mb);
            }
            break;
        case MSR_LOOKUP_PROCESS_SESSION: {
                // looks up from node or from id
                Owned<INode> node (deserializeINode(mb));
                if (node->endpoint().isNull()&&(mb.length()-mb.getPos()>=sizeof(id))) {
                    mb.read(id);
                    INode *n = manager.getProcessSessionNode(id);
                    if (n)
                        node.setown(n);
                    node->serialize(mb.clear());
                }
                else {
                    id = manager.lookupProcessSession(node.get());
                    mb.clear().append(id);
                }
                coven.reply(mb);
            }
            break;
        case MSR_STOP_SESSION: {
                SessionId sessid;
                bool failed;
                mb.read(sessid).read(failed);
                manager.stopSession(sessid,failed);
                mb.clear();
                coven.reply(mb);
            }
            break;
        case MSR_LOOKUP_LDAP_PERMISSIONS: {
                StringAttr key;
                StringAttr obj;
                Owned<IUserDescriptor> udesc=createUserDescriptor();
                StringAttr username;
                StringAttr passwordenc;
                mb.read(key).read(obj);
                udesc->deserialize(mb);
#ifndef _NO_DALIUSER_STACKTRACE
                //following debug code to be removed
                StringBuffer sb;
                udesc->getUserName(sb);
                if (0==sb.length())
                {
                    DBGLOG("UNEXPECTED USER (NULL) in dasess.cpp CSessionRequestServer::processMessage() line %d", __LINE__);
                }
#endif
                unsigned auditflags = 0;
                if (mb.length()-mb.getPos()>=sizeof(auditflags))
                    mb.read(auditflags);
                int err = 0;
                int ret=manager.getPermissionsLDAP(key,obj,udesc,auditflags,&err);
                mb.clear().append(ret);
                if (err)
                    mb.append(err);
                coven.reply(mb);
            }
            break;
        }
    }
Beispiel #12
0
    bool send(INode *node,unsigned timeout)
    {
        unsigned retries = 3;
        loop {
            try {
                CMessageBuffer mb;
                serialize(mb);
                if (queryWorldCommunicator().sendRecv(mb,node,MPTAG_SASHA_REQUEST,timeout?timeout:12*60*60*1000)) { // could take a long time!
                    clearIds();
                    clearResults();
                    if (action==SCA_WORKUNIT_SERVICES_GET) {
                        mb.swapWith(wusbuf);
                    }
                    else {
                        unsigned n=0;
                        unsigned i;
                        if (mb.length()-mb.getPos()>=sizeof(unsigned)) {
                            mb.read(n);
                            for (i=0;i<n;i++) {
                                StringAttr s;
                                mb.read(s);
                                addId(s.get());
                            }
                            if (mb.length()-mb.getPos()>=sizeof(unsigned)+sizeof(bool)) {
                                mb.read(resultoverflow);
                                mb.read(n);
                                for (i=0;i<n;i++) {
                                    StringAttr res;
                                    mb.read(res);
                                    size32_t reslen = res.length();
                                    results.append(*new StringAttrItem(res,reslen));
                                    resultsize += reslen;
                                }
                                if (mb.length()-mb.getPos()>=sizeof(unsigned)) {
                                    mb.read(numdts);
                                    free(dts);
                                    dts = NULL;
                                    if (numdts) {
                                        dts = (CDateTime *)calloc(numdts,sizeof(CDateTime));
                                        for (i=0;i<numdts;i++)
                                            dts[i].deserialize(mb);
                                    }
                                }

                            }
                        }
                    }
                    return true;
                }   
                else
                    break;
            }
            catch (IException *e) {
                if ((--retries==0)||(action==SCA_STOP))
                    throw;
                EXCLOG(e,"CSashaCommand send");
                ::Release(e);
            }
            try { // shouldn't really be necessary but make sure socket really closed
                queryWorldCommunicator().disconnect(node);
            }
            catch (IException *e) {
                EXCLOG(e,"CSashaCommand disconnect");
                ::Release(e);
            }
        }; 
        return false;
    }
    virtual void process() override
    {
        ActPrintLog("INDEXWRITE: Start");
        init();

        IRowStream *stream = inputStream;
        ThorDataLinkMetaInfo info;
        input->getMetaInfo(info);
        outRowAllocator.setown(getRowAllocator(helper->queryDiskRecordSize()));
        start();
        if (refactor)
        {
            assertex(isLocal);
            if (active)
            {
                unsigned targetWidth = partDesc->queryOwner().numParts()-(buildTlk?1:0);
                assertex(0 == container.queryJob().querySlaves() % targetWidth);
                unsigned partsPerNode = container.queryJob().querySlaves() / targetWidth;
                unsigned myPart = queryJobChannel().queryMyRank();

                IArrayOf<IRowStream> streams;
                streams.append(*LINK(stream));
                --partsPerNode;

 // Should this be merging 1,11,21,31 etc.
                unsigned p=0;
                unsigned fromPart = targetWidth+1 + (partsPerNode * (myPart-1));
                for (; p<partsPerNode; p++)
                {
                    streams.append(*createRowStreamFromNode(*this, fromPart++, queryJobChannel().queryJobComm(), mpTag, abortSoon));
                }
                ICompare *icompare = helper->queryCompare();
                assertex(icompare);
                Owned<IRowLinkCounter> linkCounter = new CThorRowLinkCounter;
                myInputStream.setown(createRowStreamMerger(streams.ordinality(), streams.getArray(), icompare, false, linkCounter));
                stream = myInputStream;
            }
            else // serve nodes, creating merged parts
                rowServer.setown(createRowServer(this, stream, queryJobChannel().queryJobComm(), mpTag));
        }
        processed = THORDATALINK_STARTED;

        // single part key support
        // has to serially pull all data fron nodes 2-N
        // nodes 2-N, could/should start pushing some data (as it's supposed to be small) to cut down on serial nature.
        unsigned node = queryJobChannel().queryMyRank();
        if (singlePartKey)
        {
            if (1 == node)
            {
                try
                {
                    open(*partDesc, false, helper->queryDiskRecordSize()->isVariableSize());
                    loop
                    {
                        OwnedConstThorRow row = inputStream->ungroupedNextRow();
                        if (!row)
                            break;
                        if (abortSoon) return;
                        processRow(row);
                    }

                    unsigned node = 2;
                    while (node <= container.queryJob().querySlaves())
                    {
                        Linked<IOutputRowDeserializer> deserializer = ::queryRowDeserializer(input);
                        CMessageBuffer mb;
                        Owned<ISerialStream> stream = createMemoryBufferSerialStream(mb);
                        CThorStreamDeserializerSource rowSource;
                        rowSource.setStream(stream);
                        bool successSR;
                        loop
                        {
                            {
                                BooleanOnOff tf(receivingTag2);
                                successSR = queryJobChannel().queryJobComm().sendRecv(mb, node, mpTag2);
                            }
                            if (successSR)
                            {
                                if (rowSource.eos())
                                    break;
                                Linked<IEngineRowAllocator> allocator = ::queryRowAllocator(input);
                                do
                                {
                                    RtlDynamicRowBuilder rowBuilder(allocator);
                                    size32_t sz = deserializer->deserialize(rowBuilder, rowSource);
                                    OwnedConstThorRow fRow = rowBuilder.finalizeRowClear(sz);
                                    processRow(fRow);
                                }
                                while (!rowSource.eos());
                            }
                        }
                        node++;
                    }
                }
                catch (CATCHALL)
                {
                    close(*partDesc, partCrc, true);
                    throw;
                }
                close(*partDesc, partCrc, true);
                doStopInput();
            }
            else
            {
                CMessageBuffer mb;
                CMemoryRowSerializer mbs(mb);
                Linked<IOutputRowSerializer> serializer = ::queryRowSerializer(input);
                loop
                {
                    BooleanOnOff tf(receivingTag2);
                    if (queryJobChannel().queryJobComm().recv(mb, 1, mpTag2)) // node 1 asking for more..
                    {
                        if (abortSoon) break;
                        mb.clear();
                        do
                        {
                            OwnedConstThorRow row = inputStream->ungroupedNextRow();
                            if (!row) break;
                            serializer->serialize(mbs, (const byte *)row.get());
                        } while (mb.length() < SINGLEPART_KEY_TRANSFER_SIZE); // NB: at least one row
                        if (!queryJobChannel().queryJobComm().reply(mb))
                            throw MakeThorException(0, "Failed to send index data to node 1, from node %d", node);
                        if (0 == mb.length())
                            break;
                    }
                }
            }
        }
Beispiel #14
0
bool SortSlaveMP::marshall(ISortSlaveMP &slave, ICommunicator* comm, mptag_t tag)
{
    CMessageBuffer mb;
    rank_t sender;
    comm->recv(mb,0,tag,&sender);       // NB only recv from master
    if (mb.length()==0) {
        PROGLOG("Stopping SortSlaveMP::marshall");
        return false;
    }
    byte fn;
    mb.read(fn);
    CMessageBuffer mbout;
    mbout.init(mb.getSender(),tag,mb.getReplyTag());
    byte okout=1;
    mbout.append(okout);
#ifdef FULLTRACE
    StringBuffer tmp1;
    PROGLOG(">SortSlaveMP::marshall(%d) got %d from %s tag %d replytag %d",(int)fn, mb.length(), mb.getSender().getUrlStr(tmp1).str(),tag,mb.getReplyTag());
#endif
    bool replydone = false;
    Owned<IException> err;
    try {
        switch ((MPSlaveFunctions)(int)fn) {
            case FN_Connect: {
                unsigned _part;
                unsigned _numnodes;
                mb.read(_part).read(_numnodes);
                bool ret = slave.Connect(_part,_numnodes);
                mbout.append(ret);
            }
            break;
            case FN_StartGather: {
                slave.StartGather();
            }
            break;
            case FN_GetGatherInfo: {
                bool hasserializer;
                mb.read(hasserializer);
                rowcount_t numlocal;
                unsigned overflowscale;
                offset_t totalsize;
                slave.GetGatherInfo(numlocal,totalsize,overflowscale,hasserializer);
                mbout.append(numlocal).append(totalsize).append(overflowscale);
            }
            break;
            case FN_GetMinMax: {
                size32_t keybuffsize;
                void *keybuff;
                size32_t avrecsize;
                rowcount_t ret = slave.GetMinMax(keybuffsize,keybuff,avrecsize);
                serializeblk(mbout,keybuffsize,keybuff).append(avrecsize).append(ret);
                free(keybuff);
            }
            break;
            case FN_GetMultiMidPointStart: {
                replydone = true;
                comm->reply(mbout);
                size32_t lkeybuffsize;
                void * lkeybuff;
                size32_t hkeybuffsize;
                void * hkeybuff;
                deserializeblk(mb,lkeybuffsize,lkeybuff);
                deserializeblk(mb,hkeybuffsize,hkeybuff);
                slave.GetMultiMidPointStart(lkeybuffsize,lkeybuff,hkeybuffsize,hkeybuff);
                free(lkeybuff);
                free(hkeybuff);
            }
            break;
            case FN_MultiBinChopStop: {
                unsigned num;
                mb.read(num);
                void *out = mbout.reserveTruncate(num*sizeof(rowcount_t));
                slave.MultiBinChopStop(num,(rowcount_t *)out);
            }
            break;
            case FN_GetMultiMidPointStop: {
                size32_t mkeybuffsize=0;
                void * mkeybuff = NULL;
                slave.GetMultiMidPointStop(mkeybuffsize,mkeybuff);
                serializeblk(mbout,mkeybuffsize,mkeybuff);
                free(mkeybuff);
            }
            break;
            case FN_MultiBinChopStart: {
                replydone = true;
                comm->reply(mbout);
                size32_t keybuffsize;
                void * keybuff;
                deserializeblk(mb,keybuffsize,keybuff);
                byte cmpfn;
                mb.read(cmpfn);
                slave.MultiBinChopStart(keybuffsize,(const byte *)keybuff,cmpfn);
                free(keybuff);
            }
            break;
            case FN_MultiBinChop: {
                size32_t keybuffsize;
                void * keybuff;
                deserializeblk(mb,keybuffsize,keybuff);
                unsigned num;
                byte cmpfn;
                mb.read(num).read(cmpfn);
                void *out = mbout.reserveTruncate(num*sizeof(rowcount_t));
                slave.MultiBinChop(keybuffsize,(const byte *)keybuff,num,(rowcount_t *)out,cmpfn);
                free(keybuff);
            }
            break;
            case FN_OverflowAdjustMapStart: {
                replydone = true;
                comm->reply(mbout);
                unsigned mapsize;
                mb.read(mapsize);
                const void * map = mb.readDirect(mapsize*sizeof(rowcount_t));
                size32_t keybuffsize;
                void * keybuff;
                deserializeblk(mb,keybuffsize,keybuff);
                byte cmpfn;
                mb.read(cmpfn);
                bool useaux;
                mb.read(useaux);
                slave.OverflowAdjustMapStart(mapsize,(rowcount_t *)map,keybuffsize,(const byte *)keybuff,cmpfn,useaux);
                free(keybuff);
            }
            break;
            case FN_OverflowAdjustMapStop: {
                unsigned mapsize;
                mb.read(mapsize);
                rowcount_t ret=0;
                size32_t retofs = mbout.length();
                mbout.append(ret);
                void *map=mbout.reserveTruncate(mapsize*sizeof(rowcount_t));
                ret = slave.OverflowAdjustMapStop(mapsize,(rowcount_t *)map);     // could avoid copy here if passed mb
                mbout.writeDirect(retofs,sizeof(ret),&ret);
            }
            break;
            case FN_MultiMerge: {
                replydone = true;
                comm->reply(mbout);
                unsigned mapsize;
                mb.read(mapsize);
                const void *map = mb.readDirect(mapsize*sizeof(rowcount_t));
                unsigned num;
                mb.read(num);
                SocketEndpointArray epa;
                for (unsigned i=0;i<num;i++) {
                    SocketEndpoint ep;
                    ep.deserialize(mb);
                    epa.append(ep);
                }
                slave.MultiMerge(mapsize,(rowcount_t *)map,num,epa.getArray());
            }
            break;
            case FN_MultiMergeBetween: {
                replydone = true;
                comm->reply(mbout);
                unsigned mapsize;
                mb.read(mapsize);
                const void *map = mb.readDirect(mapsize*sizeof(rowcount_t));
                const void *mapupper = mb.readDirect(mapsize*sizeof(rowcount_t));
                unsigned num;
                mb.read(num);
                SocketEndpointArray epa;
                for (unsigned i=0;i<num;i++) {
                    SocketEndpoint ep;
                    ep.deserialize(mb);
                    epa.append(ep);
                }
                slave.MultiMergeBetween(mapsize,(rowcount_t *)map,(rowcount_t *)mapupper,num,epa.getArray());
            }
            break;
            case FN_SingleMerge: {
                replydone = true;   
                comm->reply(mbout);     // async
                slave.SingleMerge();
            }
            break;
            case FN_FirstRowOfFile: {
                StringAttr filename;
                mb.read(filename);
                size32_t rowbufsize = 0;
                byte *rowbuf = NULL;
                bool ret = slave.FirstRowOfFile(filename,rowbufsize,rowbuf);
                serializeblk(mbout,rowbufsize,rowbuf);
                free(rowbuf);
                mbout.append(ret);
            }
            break;
            case FN_GetMultiNthRow: {
                unsigned numsplits;
                mb.read(numsplits);
                size32_t mkeybuffsize = 0;
                void * mkeybuf  = NULL;
                slave.GetMultiNthRow(numsplits,mkeybuffsize,mkeybuf);
                serializeblk(mbout,mkeybuffsize,mkeybuf);
                free(mkeybuf);
            }
            break;
            case FN_StartMiniSort: {
                replydone = true;   
                rowcount_t totalrows;
                mb.read(totalrows);
                comm->reply(mbout);     // async
                slave.StartMiniSort(totalrows);
            }
            break;
            case FN_Close: {
                replydone = true;   
                comm->reply(mbout);     // async
                slave.Close();
            }
            break;
            case FN_CloseWait: {
                slave.CloseWait();
            }
            break;
            case FN_Disconnect: {
                comm->reply(mbout);     // async
                replydone = true;   
                slave.Disconnect();
            }
            // fall through
            return false;
            default:
                throw MakeStringException(-1,"unknown function %d",(int)fn);
        }
    }
    catch (IException *e) {
        EXCLOG(e,"SortSlaveMP::marshall");
        if (!replydone)  {
            mbout.clear();
            okout = 0;
            mbout.append(okout);
            int err = e->errorCode();
            mbout.append(err);
            StringBuffer outs;
            e->errorMessage(outs);
            mbout.append(outs.str());
        }
        err.setown(e);
    }
    if (!replydone) {
#ifdef FULLTRACE
        StringBuffer tmp1;
        PROGLOG("<SortSlaveMP::marshall(%d) send %d to %s tag %d",(int)fn, mbout.length(), mbout.getSender().getUrlStr(tmp1).str(),mbout.getReplyTag());
#endif
        comm->reply(mbout);
    }
    if (err.get())
        throw err.getClear();
    return true;
}
    void doBroadcast()
    {
        try
        {
            unsigned i = 0;
            unsigned n;
            if (1 == nodeindex)
                n = broadcastSlave-1;
            else if (broadcastSlave==nodeindex)
                n = 0;
            else
                n = nodeindex-1;
            loop {
                unsigned t = target(i++,n);
                if (t>numnodes)
                    break;
                if (t != broadcastSlave)
                {
#ifdef _TRACEBROADCAST
                    ActPrintLog(activity, "Broadcast node %d Sending to node %d size %d",nodeindex,t,broadcasting.length());
#endif
                    mptag_t rt = createReplyTag();
                    broadcasting.setReplyTag(rt); // simulate sendRecv
                    comm->send(broadcasting, t, mpTag);     
                    CMessageBuffer rMsg;
                    comm->recv(rMsg, t, rt);                    
#ifdef _TRACEBROADCAST
                    ActPrintLog(activity, "Broadcast node %d Sent to node %d size %d received back %d",nodeindex,t,broadcasting.length(),rMsg.length());
#endif
                }
            }
        }
        catch (IException *e)
        {
            ActPrintLog(activity, e, "CBroadcaster::broadcast exception");
            throw;
        }
#ifdef _TRACEBROADCAST
        ActPrintLog(activity, "do broadcast done done");
#endif
    }
    virtual void process() override
    {
        ActPrintLog("INDEXWRITE: Start");
        init();

        IRowStream *stream = inputStream;
        ThorDataLinkMetaInfo info;
        input->getMetaInfo(info);
        outRowAllocator.setown(getRowAllocator(helper->queryDiskRecordSize()));
        start();
        if (refactor)
        {
            assertex(isLocal);
            if (active)
            {
                unsigned targetWidth = partDesc->queryOwner().numParts()-(buildTlk?1:0);
                assertex(0 == container.queryJob().querySlaves() % targetWidth);
                unsigned partsPerNode = container.queryJob().querySlaves() / targetWidth;
                unsigned myPart = queryJobChannel().queryMyRank();

                IArrayOf<IRowStream> streams;
                streams.append(*LINK(stream));
                --partsPerNode;

 // Should this be merging 1,11,21,31 etc.
                unsigned p=0;
                unsigned fromPart = targetWidth+1 + (partsPerNode * (myPart-1));
                for (; p<partsPerNode; p++)
                {
                    streams.append(*createRowStreamFromNode(*this, fromPart++, queryJobChannel().queryJobComm(), mpTag, abortSoon));
                }
                ICompare *icompare = helper->queryCompare();
                assertex(icompare);
                Owned<IRowLinkCounter> linkCounter = new CThorRowLinkCounter;
                myInputStream.setown(createRowStreamMerger(streams.ordinality(), streams.getArray(), icompare, false, linkCounter));
                stream = myInputStream;
            }
            else // serve nodes, creating merged parts
                rowServer.setown(createRowServer(this, stream, queryJobChannel().queryJobComm(), mpTag));
        }
        processed = THORDATALINK_STARTED;

        // single part key support
        // has to serially pull all data fron nodes 2-N
        // nodes 2-N, could/should start pushing some data (as it's supposed to be small) to cut down on serial nature.
        unsigned node = queryJobChannel().queryMyRank();
        if (singlePartKey)
        {
            if (1 == node)
            {
                try
                {
                    open(*partDesc, false, helper->queryDiskRecordSize()->isVariableSize());
                    for (;;)
                    {
                        OwnedConstThorRow row = inputStream->ungroupedNextRow();
                        if (!row)
                            break;
                        if (abortSoon) return;
                        processRow(row);
                    }

                    unsigned node = 2;
                    while (node <= container.queryJob().querySlaves())
                    {
                        Linked<IOutputRowDeserializer> deserializer = ::queryRowDeserializer(input);
                        CMessageBuffer mb;
                        Owned<ISerialStream> stream = createMemoryBufferSerialStream(mb);
                        CThorStreamDeserializerSource rowSource;
                        rowSource.setStream(stream);
                        bool successSR;
                        for (;;)
                        {
                            {
                                BooleanOnOff tf(receivingTag2);
                                successSR = queryJobChannel().queryJobComm().sendRecv(mb, node, mpTag2);
                            }
                            if (successSR)
                            {
                                if (rowSource.eos())
                                    break;
                                Linked<IEngineRowAllocator> allocator = ::queryRowAllocator(input);
                                do
                                {
                                    RtlDynamicRowBuilder rowBuilder(allocator);
                                    size32_t sz = deserializer->deserialize(rowBuilder, rowSource);
                                    OwnedConstThorRow fRow = rowBuilder.finalizeRowClear(sz);
                                    processRow(fRow);
                                }
                                while (!rowSource.eos());
                            }
                        }
                        node++;
                    }
                }
                catch (CATCHALL)
                {
                    close(*partDesc, partCrc, true);
                    throw;
                }
                close(*partDesc, partCrc, true);
                stop();
            }
            else
            {
                CMessageBuffer mb;
                CMemoryRowSerializer mbs(mb);
                Linked<IOutputRowSerializer> serializer = ::queryRowSerializer(input);
                for (;;)
                {
                    BooleanOnOff tf(receivingTag2);
                    if (queryJobChannel().queryJobComm().recv(mb, 1, mpTag2)) // node 1 asking for more..
                    {
                        if (abortSoon) break;
                        mb.clear();
                        do
                        {
                            OwnedConstThorRow row = inputStream->ungroupedNextRow();
                            if (!row) break;
                            serializer->serialize(mbs, (const byte *)row.get());
                        } while (mb.length() < SINGLEPART_KEY_TRANSFER_SIZE); // NB: at least one row
                        if (!queryJobChannel().queryJobComm().reply(mb))
                            throw MakeThorException(0, "Failed to send index data to node 1, from node %d", node);
                        if (0 == mb.length())
                            break;
                    }
                }
            }
        }
        else
        {
            if (!refactor || active)
            {
                try
                {
                    StringBuffer partFname;
                    getPartFilename(*partDesc, 0, partFname);
                    ActPrintLog("INDEXWRITE: process: handling fname : %s", partFname.str());
                    open(*partDesc, false, helper->queryDiskRecordSize()->isVariableSize());
                    ActPrintLog("INDEXWRITE: write");

                    BooleanOnOff tf(receiving);
                    if (!refactor || !active)
                        receiving = false;
                    do
                    {
                        OwnedConstThorRow row = inputStream->ungroupedNextRow();
                        if (!row)
                            break;
                        processRow(row);
                    } while (!abortSoon);
                    ActPrintLog("INDEXWRITE: write level 0 complete");
                }
                catch (CATCHALL)
                {
                    close(*partDesc, partCrc, isLocal && !buildTlk && 1 == node);
                    throw;
                }
                close(*partDesc, partCrc, isLocal && !buildTlk && 1 == node);
                stop();

                ActPrintLog("INDEXWRITE: Wrote %" RCPF "d records", processed & THORDATALINK_COUNT_MASK);

                if (buildTlk)
                {
                    ActPrintLog("INDEXWRITE: sending rows");
                    NodeInfoArray tlkRows;

                    CMessageBuffer msg;
                    if (firstNode())
                    {
                        if (processed & THORDATALINK_COUNT_MASK)
                        {
                            if (enableTlkPart0)
                                tlkRows.append(* new CNodeInfo(0, firstRow.get(), firstRowSize, totalCount));
                            tlkRows.append(* new CNodeInfo(1, lastRow.get(), lastRowSize, totalCount));
                        }
                    }
                    else
                    {
                        if (processed & THORDATALINK_COUNT_MASK)
                        {
                            CNodeInfo row(queryJobChannel().queryMyRank(), lastRow.get(), lastRowSize, totalCount);
                            row.serialize(msg);
                        }
                        queryJobChannel().queryJobComm().send(msg, 1, mpTag);
                    }

                    if (firstNode())
                    {
                        ActPrintLog("INDEXWRITE: Waiting on tlk to complete");

                        // JCSMORE if refactor==true, is rowsToReceive here right??
                        unsigned rowsToReceive = (refactor ? (tlkDesc->queryOwner().numParts()-1) : container.queryJob().querySlaves()) -1; // -1 'cos got my own in array already
                        ActPrintLog("INDEXWRITE: will wait for info from %d slaves before writing TLK", rowsToReceive);
                        while (rowsToReceive--)
                        {
                            msg.clear();
                            receiveMsg(msg, RANK_ALL, mpTag); // NH->JCS RANK_ALL_OTHER not supported for recv
                            if (abortSoon)
                                return;
                            if (msg.length())
                            {
                                CNodeInfo *ni = new CNodeInfo();
                                ni->deserialize(msg);
                                tlkRows.append(*ni);
                            }
                        }
                        tlkRows.sort(CNodeInfo::compare);

                        StringBuffer path;
                        getPartFilename(*tlkDesc, 0, path);
                        ActPrintLog("INDEXWRITE: creating toplevel key file : %s", path.str());
                        try
                        {
                            open(*tlkDesc, true, helper->queryDiskRecordSize()->isVariableSize());
                            if (tlkRows.length())
                            {
                                CNodeInfo &lastNode = tlkRows.item(tlkRows.length()-1);
                                memset(lastNode.value, 0xff, lastNode.size);
                            }
                            ForEachItemIn(idx, tlkRows)
                            {
                                CNodeInfo &info = tlkRows.item(idx);
                                builder->processKeyData((char *)info.value, info.pos, info.size);
                            }
                            close(*tlkDesc, tlkCrc, true);
                        }
                        catch (CATCHALL)
                        {
                            abortSoon = true;
                            close(*tlkDesc, tlkCrc, true);
                            removeFiles(*partDesc);
                            throw;
                        }
                    }
                }
                else if (!isLocal && firstNode())