LoadBalanceScheme
LoadBalancer::DetermineAppropriateScheme(avtContract_p input)
{

    //
    // Look up the database associated with this pipeline so that we can
    // consult its metadata.
    //
    int index = input->GetPipelineIndex();
    const LBInfo &lbinfo = pipelineInfo[index];
    std::string dbname = lbinfo.db;
    avtDatabase *db = dbMap[dbname];

    avtDataRequest_p data = input->GetDataRequest();
    avtDatabaseMetaData *md = db->GetMetaData(db->GetMostRecentTimestep());
    string meshName;

    TRY
    {
        meshName = md->MeshForVar(data->GetVariable());
    }
    CATCHALL
    {
        // The variable has no mesh in the metadata; it is probably a CMFE
        // expression.  Fall back to the default scheme.
        return scheme;
    }
    ENDTRY;

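    //
    // If the format plugin can decompose the data among processors itself,
    // defer to it.
    //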
    if (md->GetFormatCanDoDomainDecomposition())
        return LOAD_BALANCE_DBPLUGIN_DYNAMIC;

    const avtMeshMetaData *mmd = md->GetMesh(meshName);

    if (mmd && mmd->loadBalanceScheme != LOAD_BALANCE_UNKNOWN)
    {
        debug1 << "Default load balance scheme \""
               << LoadBalanceSchemeToString(scheme).c_str() << "\""
               << " being overridden in favor of \""
               << LoadBalanceSchemeToString(mmd->loadBalanceScheme).c_str() << "\""
               << " for mesh \"" << meshName.c_str() << "\"" << endl;
        return mmd->loadBalanceScheme;
    }

    return scheme;
}
std::string
LoadBalancer::GetMeshName(avtContract_p input, int stateIndex)
{
    const LBInfo &lbinfo = pipelineInfo[input->GetPipelineIndex()];
    avtDatabase *db = dbMap[lbinfo.db];

    avtDataRequest_p data = input->GetDataRequest();
    avtDatabaseMetaData *md = db->GetMetaData(stateIndex);
    string meshName;

    TRY
    {
        meshName = md->MeshForVar(data->GetVariable());
    }
    CATCHALL
    {
    }
    ENDTRY

    return meshName;
}
avtDataRequest_p
LoadBalancer::Reduce(avtContract_p input)
{
    avtDataRequest_p data = input->GetDataRequest();

    //
    // It is difficult for the load balancer to communicate with the
    // originating source, since that communication happens through callbacks.
    // So we do it by setting a Boolean in the contract.  Since only one path
    // actually replicates data, and many paths don't, we unset the Boolean
    // now and set it again in the one case where we actually replicate data.
    //

#ifdef PARALLEL
    // only used in parallel
    bool dataReplicationRequested = input->ReplicateSingleDomainOnAllProcessors();
#endif

    input->SetReplicateSingleDomainOnAllProcessors(false);

    //
    // Pipeline index 0 is reserved for meta-data.  It should already be
    // load balanced.
    //
    if (input->GetPipelineIndex() == 0)
    {
        return data;
    }

    //
    // Handle load balancing specially for serial engines.
    //
    if (nProcs <= 1)
    {
        bool doDynLB = CheckDynamicLoadBalancing(input);
        if (!doDynLB && scheme != LOAD_BALANCE_STREAM)
        {
            pipelineInfo[input->GetPipelineIndex()].complete = true;
            return data;
        }
        else
        {
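            // Stream domains one at a time: hand back a single domain per
            // call to Reduce(), advancing the "current" cursor until every
            // domain in the list has been processed.  Disable per-filter
            // progress updates; the load balancer reports progress itself.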
            avtDataObjectSource::RegisterProgressCallback(NULL,NULL);
            avtSILRestriction_p orig_silr   = data->GetRestriction();
            avtSILRestriction_p silr        = new avtSILRestriction(orig_silr);
            avtDataRequest_p new_data       = new avtDataRequest(data, silr);
            avtSILRestrictionTraverser trav(silr);

            vector<int> list;
            trav.GetDomainList(list);
    
            if (pipelineInfo[input->GetPipelineIndex()].current < 0)
                pipelineInfo[input->GetPipelineIndex()].current  = 0;
            int domain = list[pipelineInfo[input->GetPipelineIndex()].current];
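            // The streaming ghost generator can request a specific domain
            // next (a non-negative return value); if so, it overrides the
            // cursor-based choice.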
            int sggDomain = avtStreamingGhostGenerator::LBGetNextDomain();
            if (sggDomain >= 0)
                domain = sggDomain;
            vector<int> domainList(1, domain);
            new_data->GetRestriction()
                                   ->RestrictDomainsForLoadBalance(domainList);
            UpdateProgress(pipelineInfo[input->GetPipelineIndex()].current,
                           (int)list.size());
            pipelineInfo[input->GetPipelineIndex()].current++;
            if (pipelineInfo[input->GetPipelineIndex()].current == (int)list.size())
                pipelineInfo[input->GetPipelineIndex()].complete = true;
            return new_data;
        }
    }

#ifdef PARALLEL

    avtSILRestriction_p orig_silr   = data->GetRestriction();
    avtSILRestriction_p silr        = new avtSILRestriction(orig_silr);
    avtDataRequest_p new_data = new avtDataRequest(data, silr);
    avtSILRestrictionTraverser trav(silr);

    // set up MPI message tags
    static int lastDomDoneMsg = GetUniqueMessageTag();
    static int newDomToDoMsg = GetUniqueMessageTag();
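    // lastDomDoneMsg: slave -> master, reporting the last completed domain
    // (-1 on first contact).  newDomToDoMsg: master -> slave, carrying the
    // next domain to process, or -1 (all done) / -2 (abort).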

    // Make sure that we have domain to file mapping available.
    LBInfo &lbInfo(pipelineInfo[input->GetPipelineIndex()]);
    std::string meshName = GetMeshName(input, dbState[lbInfo.db]);
    GetIOInformation(lbInfo.db, dbState[lbInfo.db], meshName);

    if (scheme == LOAD_BALANCE_STREAM)
    {
        if (pipelineInfo[input->GetPipelineIndex()].current < 0)
        {
            pipelineInfo[input->GetPipelineIndex()].current = 0;

            //
            // We probably want to do something more sophisticated in the future 
            // (like walking through a SIL).  For now, just use the "chunks"
            // mechanism set up with convenience methods.
            //
            vector<int> list;
            trav.GetDomainList(list);

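            // Contiguous partition: each processor gets list.size()/nProcs
            // domains, and the first list.size()%nProcs processors get one
            // extra; e.g. 10 domains on 4 procs gives counts 3,3,2,2.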
            int amountPer = list.size() / nProcs;
            int oneExtraUntil = list.size() % nProcs;
            int lastDomain = 0;
            for (int i = 0 ; i < nProcs ; i++)
            {
                if (i == rank)
                {
                    int amount = amountPer + (i < oneExtraUntil ? 1 : 0);
                    for (int j = 0 ; j < amount ; j++)
                    {
                        domainListForStreaming.push_back(list[j+lastDomain]);
                    }
                }
                lastDomain += amountPer + (i < oneExtraUntil ? 1 : 0);
            }
        }

        int domain = domainListForStreaming[pipelineInfo[input->GetPipelineIndex()].current];
        int sggDomain = avtStreamingGhostGenerator::LBGetNextDomain();
        if (sggDomain >= 0)
            domain = sggDomain;
        vector<int> domainList(1, domain);
        new_data->GetRestriction()
                               ->RestrictDomainsForLoadBalance(domainList);
        UpdateProgress(pipelineInfo[input->GetPipelineIndex()].current,
                       (int)domainListForStreaming.size());
        pipelineInfo[input->GetPipelineIndex()].current++;
        if (pipelineInfo[input->GetPipelineIndex()].current == (int)domainListForStreaming.size())
        {
            pipelineInfo[input->GetPipelineIndex()].complete = true;
            domainListForStreaming.clear();
        }
    }
    // Can we do dynamic load balancing?
    else if (! CheckDynamicLoadBalancing(input))
    {
        //
        // We probably want to do something more sophisticated in the future 
        // (like walking through a SIL).  For now, just use the "chunks"
        // mechanism set up with convenience methods.
        //
        vector<int> list;
        vector<int> mylist;
        trav.GetDomainList(list);

        if (dataReplicationRequested && list.size() == 1)
        {
            silr->RestrictDomainsForLoadBalance(list);
            pipelineInfo[input->GetPipelineIndex()].complete = true;

            // Communicate back to the pipeline that we are replicating.
            input->SetReplicateSingleDomainOnAllProcessors(true);

            return data;
        }

        //
        // For variables (including meshes) that require specific types of
        // load balancing, we override the scheme here
        //
        LoadBalanceScheme theScheme = DetermineAppropriateScheme(input);

        if (theScheme == LOAD_BALANCE_CONTIGUOUS_BLOCKS_TOGETHER)
        {
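            // Same contiguous partition as the streaming setup above: rank i
            // gets a consecutive run of domains, sized amountPer plus one
            // extra for the first oneExtraUntil ranks.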
            int amountPer = list.size() / nProcs;
            int oneExtraUntil = list.size() % nProcs;
            int lastDomain = 0;
            for (int i = 0 ; i < nProcs ; i++)
            {
                if (i == rank)
                {
                    int amount = amountPer + (i < oneExtraUntil ? 1 : 0);
                    for (int j = 0 ; j < amount ; j++)
                    {
                        mylist.push_back(list[j+lastDomain]);
                    }
                }
                lastDomain += amountPer + (i < oneExtraUntil ? 1 : 0);
            }
        }
        else if (theScheme == LOAD_BALANCE_STRIDE_ACROSS_BLOCKS)
        {
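            // Round-robin by position in the domain list: rank r takes the
            // domains at list indices r, r+nProcs, r+2*nProcs, ...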
            for (size_t j = 0 ; j < list.size() ; j++)
            {
                if (j % nProcs == (size_t)rank)
                    mylist.push_back(list[j]);
            }
        }
        else if (theScheme == LOAD_BALANCE_ABSOLUTE)
        {
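            // Assign by domain id rather than list position, so a given
            // domain always maps to the same processor no matter how the
            // SIL restriction changes.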
            for (size_t j = 0 ; j < list.size() ; j++)
            {
                if (list[j] % nProcs == rank)
                    mylist.push_back(list[j]);
            }
        }
        else if (theScheme == LOAD_BALANCE_RESTRICTED)
        {
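            // Restrict each processor to the domains named in its I/O hints,
            // presumably because only that processor can read them
            // efficiently (or at all).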
            LBInfo &lbInfo(pipelineInfo[input->GetPipelineIndex()]);
            IOInfo &ioInfo(ioMap[lbInfo.db]);
            const HintList &hints(ioInfo.ioInfo.GetHints());

            for (size_t j = 0 ; j < list.size() ; j++)
            {
                if (hints.size() > (size_t)rank)
                {
                    const vector<int> &doms = hints[rank];
                    int ndoms = doms.size();
                    for (int h=0; h<ndoms; h++)
                    {
                        if (doms[h] == list[j])
                        {
                            mylist.push_back(list[j]);
                            break;
                        }
                    }
                }
            }
        }
        else if (theScheme == LOAD_BALANCE_RANDOM_ASSIGNMENT)
        {
            // all procs randomly jumble the list of domain ids
            // all procs compute same jumbled list due to same seed
            // [ which won't be true on a heterogeneous platform ]
            size_t j;
            vector<int> jumbledList = list;
            srand(0xDeadBeef);
            for (j = 0 ; j < list.size() * 5; j++)
            {
               int i1 = rand() % list.size();
               int i2 = rand() % list.size();
               int tmp = jumbledList[i1];
               jumbledList[i1] = jumbledList[i2];
               jumbledList[i2] = tmp;
            }
            // now, do round-robin assignment from the jumbled list
            for (j = 0 ; j < list.size() ; j++)
            {
                if (j % nProcs == (size_t)rank)
                    mylist.push_back(jumbledList[j]);
            }
        }
        else if (theScheme == LOAD_BALANCE_DBPLUGIN_DYNAMIC)
        {
            // Every processor gets the complete list
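            // and the format plugin decomposes the data across processors.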
            mylist = list;
        }

        silr->RestrictDomainsForLoadBalance(mylist);
        pipelineInfo[input->GetPipelineIndex()].complete = true;
    }
    else
    {
        // disable progress updates from the filters this time around
        avtDataObjectSource::RegisterProgressCallback(NULL,NULL);

        LBInfo &lbInfo(pipelineInfo[input->GetPipelineIndex()]);
        IOInfo &ioInfo(ioMap[lbInfo.db]);
        if (rank == 0)
        {
            // -------------------------------------
            //     MASTER LOADBALANCER PROCESSES
            // -------------------------------------
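            // Protocol: each slave reports its last completed domain (-1 on
            // first contact) and blocks waiting for a new assignment.  The
            // master replies with a domain id to process, or -1 (all work
            // done) / -2 (abort) once the incomplete list is exhausted.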

            // Allocate enough space to hold the completed domains
            ioInfo.domains.resize(nProcs);
            ioInfo.files.resize(nProcs);
            bool validFileMap = (ioInfo.fileMap.size() != 0);

            // Get the list of domains to process
            vector<int> domainList;
            trav.GetDomainList(domainList);

            // Make a work list and a completed list
            size_t         totaldomains = domainList.size();
            deque<int>  incomplete(domainList.begin(), domainList.end());
            vector<int> complete;

            debug5 << "LoadBalancer Master -- starting with " 
                   << incomplete.size() << " domains\n";

            // pull from the incomplete list and push onto the complete list
            // until all domains are complete
            bool abort = false;
            int domain;
            UpdateProgress(0,0);
            while (complete.size() < totaldomains)
            {
                // check for an abort
                if (!abort &&
                    CheckAbort(false))
                {
                    abort = true;
                    totaldomains -= incomplete.size();
                    incomplete.clear();
                }

                // update the progress; each domain counts twice, once when
                // it is handed out and once when it is completed
                UpdateProgress(complete.size() + (domainList.size() - incomplete.size()),
                               domainList.size()*2);


                // get the completed domain number
                MPI_Status stat;
                MPI_Recv(&domain, 1, MPI_INT, MPI_ANY_SOURCE,
                         lastDomDoneMsg, VISIT_MPI_COMM, &stat);
                int processor = stat.MPI_SOURCE;

                // -1 means the first pass by the slave; nothing completed yet
                if (domain != -1)
                {
                    // add it to the complete list
                    complete.push_back(domain);
                }

                // figure out what to tell this processor to do
                if (incomplete.empty())
                    continue;

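                // Choose this processor's next domain with locality in mind:
                // prefer (1) a domain it has already read, then (2) a domain
                // in a file it already has open, then (3) a domain in the
                // file opened by the fewest processors, else the head of the
                // incomplete list.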
                // find a cached domain for next processor
                deque<int>::iterator i;
                for (i = incomplete.begin(); i != incomplete.end(); i++)
                {
                    if (ioInfo.domains[processor].find(*i) != 
                        ioInfo.domains[processor].end())
                        break;
                }
                // if no match, try to find one that is in a file
                // already opened by this processor
                if (i == incomplete.end())
                {
                    for (i = incomplete.begin(); i != incomplete.end(); i++)
                    {
                        int fileno = 0;
                        if (validFileMap)
                            fileno = ioInfo.fileMap[*i];
                        if (ioInfo.files[processor].count(fileno) > 0)
                            break;
                    }
                }
                // if still no match, find one that is in a file
                // opened by the fewest number of processors
                if (i == incomplete.end())
                {
                    int mindomain = -1;
                    int minopen   = 999999999;
                    for (i = incomplete.begin(); i != incomplete.end(); i++)
                    {
                        int fileno = 0;
                        if (validFileMap)
                            fileno = ioInfo.fileMap[*i];
                        // count the number of processors which have
                        // this file opened
                        int nopen = 0;
                        for (size_t j=0; j<ioInfo.files.size(); j++)
                            if (ioInfo.files[j].count(fileno) > 0)
                                nopen++;
                        if (nopen < minopen)
                        {
                            mindomain = *i;
                            minopen   = nopen;
                        }
                    }
                    for (i = incomplete.begin(); i != incomplete.end(); i++)
                    {
                        if (*i == mindomain)
                            break;
                    }
                }                    

                // if no match, just take the next one in line
                if (i == incomplete.end())
                    i=incomplete.begin();

                domain = *i;
                incomplete.erase(i);

                ioInfo.domains[processor].insert(domain);
                if (validFileMap)
                    ioInfo.files[processor].insert(ioInfo.fileMap[domain]);
                else
                    ioInfo.files[processor].insert(0);

                // send the new domain number to that processor
                debug5 << "LoadBalancer Master: sending domain " 
                       << domain << " to processor "<<processor<<"\n";
                MPI_Send(&domain, 1, MPI_INT, processor, newDomToDoMsg, VISIT_MPI_COMM);
            }

            // we're all done -- tell every slave: -2 means abort, -1 means
            // all the work is complete
            int status = abort ? -2 : -1;
            for (int i=1; i<nProcs; i++)
                MPI_Send(&status, 1, MPI_INT, i, newDomToDoMsg, VISIT_MPI_COMM);

            if (abort)
                EXCEPTION0(AbortException);

            // all work is done
            UpdateProgress(1,0);
            lbInfo.complete = true;
            new_data->GetRestriction()->TurnOffAll();
            MPI_Barrier(VISIT_MPI_COMM);
        }
        else
        {
            // -------------------------------------
            //            SLAVE PROCESSES
            // -------------------------------------

            // send our last completed domain to the master
            int domain = lbInfo.current;
            MPI_Send(&domain, 1, MPI_INT, 0, lastDomDoneMsg, VISIT_MPI_COMM);

            // get our new work unit
            MPI_Status stat;
            MPI_Recv(&domain, 1, MPI_INT, 0, newDomToDoMsg, VISIT_MPI_COMM, &stat);
            lbInfo.current = domain;

            if (domain == -2)
            {
                EXCEPTION0(AbortException);
            }
            else if (domain == -1)
            {
                //  -1 is a tag for "no work" -- we are all done
                lbInfo.complete = true;
                new_data->GetRestriction()->TurnOffAll();
                MPI_Barrier(VISIT_MPI_COMM);
            }
            else
            {
                vector<int> domainList(1, domain);
                new_data->GetRestriction()
                                   ->RestrictDomainsForLoadBalance(domainList);
            }
        }
    }

    // By intersecting with the original restriction, we will ensure that
    // we are catching restrictions beyond domains, like materials, etc.
    // See comments in SIL restriction code regarding 'FastIntersect'.
    new_data->GetRestriction()->FastIntersect(orig_silr);

    return new_data;
#else
    EXCEPTION1(VisItException, "nProcs was > 1 in a non-parallel engine");
#endif

}
bool
LoadBalancer::CheckDynamicLoadBalancing(avtContract_p input)
{
    //
    // See if we have already decided.  If so, just return our cached
    // decision.
    //
    int index = input->GetPipelineIndex();
    LBInfo &lbinfo = pipelineInfo[index];
    if (lbinfo.haveInitializedDLB)
        return lbinfo.doDLB;

    //
    // If the user has not explicitly asked for DLB, then don't do it.
    //
    if (!allowDynamic)
    {
        // Almost always false; true only when we are already streaming.
        lbinfo.doDLB = (scheme == LOAD_BALANCE_STREAM);
        lbinfo.haveInitializedDLB = true;
        return lbinfo.doDLB;
    }

    //
    // Some hard and fast rules:
    //
    // Pipeline index 0 is reserved for meta-data and inlined pipelines.  So
    // no DLB for those.
    //
    // Cannot dynamic load balance some pipelines because of the filters
    // in the pipeline.
    //
    // We cannot do dynamic load balancing if the database does not believe
    // we can do dynamic load balancing (for example because we need ghost
    // data communicated or materials reconstructed).
    //
    avtDataRequest_p data = input->GetDataRequest();
    std::string dbname = lbinfo.db;
    avtDatabase *db = dbMap[dbname];
    if (input->GetPipelineIndex() == 0 ||
        input->ShouldUseStreaming() == false || 
        db->CanDoStreaming(data) == false)
    {
        lbinfo.doDLB = false;
        lbinfo.haveInitializedDLB = true;
        return false;
    }

    //
    // Don't do DLB if we have only 2 or 3 procs; since the master does no
    // domain work itself, dedicating one of so few processors to scheduling
    // isn't worth it.
    //
    if (nProcs == 2 || nProcs == 3)
    {
        lbinfo.doDLB = false;
        lbinfo.haveInitializedDLB = true;
        return false;
    }

    //
    // The user has asked for DLB, and nothing in the pipeline prevents it.
    // Do it!
    //
    lbinfo.doDLB = true;
    lbinfo.haveInitializedDLB = true;
    return true;
}