Example #1
0
void bitonic_sort_rec(_Iterator begin, _Iterator end, _Compare comp, const mxx::comm& comm, int pbeg, int size, bool dir) {

    // get next power of two
    int p2 = pow(2, ceil(log(size)/log(2)));

    // determine where the two sub-recursions are
    int pmid = pbeg + p2/2;
    int pend = pbeg + p2;
    if (pend > comm.size())
        pend = comm.size();
    // recursive base-case
    if (pend - pbeg <= 1)
        return;

    // sort the two subsequences, where the first is always a power of 2
    if (comm.rank() < pmid) {
        // sort descending
        bitonic_sort_rec(begin, end, comp, comm, pbeg, p2/2, !dir);
    } else {
        // sort ascending
        bitonic_sort_rec(begin, end, comp, comm, pmid, size - p2/2, dir);
    }
    // merge bitonic decreasing sequence
    bitonic_merge(begin, end, comp, comm, pbeg, pend, dir);
}
Example #2
0
      void printEdgeListDistribution(Iterator begin, Iterator end, mxx::comm &comm)
      {
        T localWorkLoad = std::distance(begin, end);

        T maxLoad  = mxx::reduce(localWorkLoad, 0, mxx::max<T>() , comm);
        T minLoad  = mxx::reduce(localWorkLoad, 0, mxx::min<T>() , comm);
        T meanLoad = mxx::reduce(localWorkLoad, 0, std::plus<T>(), comm)/ comm.size();

        auto sep = ",";
        LOG_IF(comm.rank() == 0, INFO) << "Distribution of edge list; min-mean-max : " << minLoad << sep << meanLoad << sep << maxLoad;
      }
Example #3
0
void bitonic_split(_Iterator begin, _Iterator end, _Compare comp, const mxx::comm& comm, int partner, int dir) {
    typedef typename std::iterator_traits<_Iterator>::value_type T;
    size_t np = std::distance(begin, end);
    //std::cout << "[Rank " << comm.rank() << "]: split with partner= " << partner << ", dir=" << dir << std::endl;
    std::vector<T> recv_buf(np);
    std::vector<T> merge_buf(np);

    // send/recv
    mxx::datatype dt = mxx::get_datatype<T>();
    MPI_Sendrecv(&*begin, np, dt.type(), partner, 0,
                 &recv_buf[0], np, dt.type(), partner, 0, comm, MPI_STATUS_IGNORE);

    // merge in `dir` direction into merge buffer
    if ((dir == asc && partner > comm.rank()) || (dir == desc && partner < comm.rank())) {
        // forward merge and keep the `np` smaller elements
        _Iterator l = begin;
        typename std::vector<T>::iterator r = recv_buf.begin();
        typename std::vector<T>::iterator o = merge_buf.begin();

        for (size_t i = 0; i < np; ++i) {
            if (comp(*l, *r)) {
                *o = *l;
                ++o;
                ++l;
            } else {
                *o = *r;
                ++o;
                ++r;
            }
        }
    } else {
        // backward merge and keep the `np` larger elements
        _Iterator l = begin+np-1;
        typename std::vector<T>::iterator r = recv_buf.begin()+np-1;
        typename std::vector<T>::iterator o = merge_buf.begin()+np-1;

        for (size_t i = 0; i < np; ++i) {
            if (comp(*l, *r)) {
                *o = *r;
                --o;
                --r;
            } else {
                *o = *l;
                --o;
                --l;
            }
        }
    }
    // copy results into local buffer
    std::copy(merge_buf.begin(), merge_buf.end(), begin);
}
Example #4
0
void bitonic_merge(_Iterator begin, _Iterator end, _Compare comp, const mxx::comm& comm, int pbeg, int pend, int dir) {
    MXX_ASSERT(pbeg <= comm.rank() && comm.rank() < pend);

    // get size and terminate at recursive base-case
    int size = pend - pbeg;
    if (size <= 1)
        return;

    // get next greater power of 2
    int p2 = pow(2, ceil(log(size)/log(2)));
    // merge with splits as is done in the power of 2 case
    int pmid = pbeg + p2/2;

    if (comm.rank() < pmid && comm.rank() + p2/2 < pend) {
        // this processor has a partner in the second half
        int partner_rank = comm.rank() + p2/2;
        bitonic_split(begin, end, comp, comm, partner_rank, dir);
        bitonic_merge(begin, end, comp, comm, pbeg, pmid, dir);
    } else if (comm.rank() < pmid) {
        // this process doesn't have a partner but has to recursively
        // participate in the next merge
        bitonic_merge(begin, end, comp, comm, pbeg, pmid, dir);
    } else { // if (comm.rank() >= pmid)
        int partner_rank = comm.rank() - p2/2;
        bitonic_split(begin, end, comp, comm, partner_rank, dir);
        bitonic_merge(begin, end, comp, comm, pmid, pend, dir);
    }
}
        void populateEdgeList( std::vector< std::pair<T, T> > &edgeList, 
            uint64_t chainLength, 
            uint8_t mode,
            const mxx::comm &comm = mxx::comm())
        {
          mxx::section_timer timer;

          //sanity check
          assert(mode == LOWTOHIGH_IDS);

          //Know the portion of graph to be generated locally 
          //Lets divide the chainLength value by the number of processes
          mxx::partition::block_decomposition<T> part(chainLength, comm.size(), comm.rank());

          if(mode == LOWTOHIGH_IDS)
          {
            //First node (vertex) id on this MPI node
            T beginNodeId = part.excl_prefix_size(); 
            T lastNodeId = part.excl_prefix_size() + part.local_size() -1 ;

            if(part.local_size() > 0)
            {
              for(int i = beginNodeId; i < lastNodeId; i++)
              {
                edgeList.emplace_back(i, i + 1);
                edgeList.emplace_back(i + 1, i);
              }

              //If not last rank, attach edges from last node of this rank to first node of next rank
              if(part.prefix_size() < chainLength)
              {
                edgeList.emplace_back(lastNodeId, lastNodeId+1);
                edgeList.emplace_back(lastNodeId+1, lastNodeId);
              }
            }
          }

          timer.end_section("graph generation completed");
        }
Example #6
0
void bitonic_sort(_Iterator begin, _Iterator end, _Compare comp, const mxx::comm& comm) {
    size_t np = std::distance(begin, end);
    if (!mxx::all_same(np, comm)) {
        throw std::runtime_error("bitonic sort only works for the same number of elements on each process.");
    }

    if (!std::is_sorted(begin, end, comp)) {
        // sort local elements first
        std::sort(begin, end, comp);
    }

    // recursively sort via bitonic sort
    impl::bitonic_sort_rec(begin, end, comp, comm, 0, comm.size(), impl::asc);
}
Example #7
0
void bulk_rmq(const std::size_t n, const std::vector<index_t>& local_els,
              std::vector<std::tuple<index_t, index_t, index_t>>& ranges,
              const mxx::comm& comm) {
    // 3.) bulk-parallel-distributed RMQ for ranges (B2[i-1],B2[i]+1) to get min_lcp[i]
    //     a.) allgather per processor minimum lcp
    //     b.) create messages to left-most and right-most processor (i, B2[i])
    //     c.) on rank==B2[i]: get either left-flank min (i < B2[i])
    //         or right-flank min (i > B2[i])
    //         -> in bulk by ordering messages into two groups and sorting by B2[i]
    //            then linear scan for min starting from left-most/right-most element
    //     d.) answer with message (i, min(...))
    //     e.) on rank==i: receive min answer from left-most and right-most
    //         processor for each range
    //          -> sort by i
    //     f.) for each range: get RMQ of intermediary processors
    //                         min(min_p_left, RMQ(p_left+1,p_right-1), min_p_right)

    mxx::partition::block_decomposition_buffered<index_t> part(n, comm.size(), comm.rank());

    // get size parameters
    std::size_t local_size = local_els.size();
    std::size_t prefix_size = part.excl_prefix_size();

    // create RMQ for local elements
    rmq<typename std::vector<index_t>::const_iterator> local_rmq(local_els.begin(), local_els.end());

    // get per processor minimum and it's position
    auto local_min_it = local_rmq.query(local_els.begin(), local_els.end());
    index_t min_pos = (local_min_it - local_els.begin()) + prefix_size;
    index_t local_min = *local_min_it;
    //assert(local_min == *std::min_element(local_els.begin(), local_els.end()));
    std::vector<index_t> proc_mins = mxx::allgather(local_min, comm);
    std::vector<index_t> proc_min_pos = mxx::allgather(min_pos, comm);

    // create range-minimum-query datastructure for the processor minimums
    rmq<typename std::vector<index_t>::iterator> proc_mins_rmq(proc_mins.begin(), proc_mins.end());

    // 1.) duplicate vector of triplets
    // 2.) (asserting that t[1] < t[2]) send to target processor for t[1]
    // 3.) on t[1]: return min and min index of right flank
    // 4.) send duplicated to t[2]
    // 5.) on t[2]: return min and min index of left flank
    // 6.) if (target-p(t[1]) == target-p(t[2])):
    //      a) target-p(t[1]) participates in twice (2x too much, could later be
    //         algorithmically optimized away)
    //         and returns min only from range (t[1],t[2])
    // 7.) on i: sort both by i, then pairwise iterate through and get target-p
    //     for both min indeces. if there are p's in between: use proc_min_rmq
    //     to get value and index of min p, then get global min index of that
    //     processor (needs to be gotten as well during the allgather)
    //  TODO: maybe template value type as different type than index_type
    //        (right now has to be identical, which suffices for LCP)

    std::vector<std::tuple<index_t, index_t, index_t> > ranges_right(ranges);

    // first communication
    mxx::all2all_func(
        ranges,
        [&](const std::tuple<index_t, index_t, index_t>& x) {
            return part.target_processor(std::get<1>(x));
        }, comm);
    // find mins from start to right border locally
    for (auto it = ranges.begin(); it != ranges.end(); ++it)
    {
        assert(std::get<1>(*it) < std::get<2>(*it));
        auto range_begin = local_els.begin() + (std::get<1>(*it) - prefix_size);
        auto range_end = local_els.end();
        if (std::get<2>(*it) < prefix_size + local_size)
        {
            range_end = local_els.begin() + (std::get<2>(*it) - prefix_size);
        }
        auto range_min = local_rmq.query(range_begin, range_end);
        std::get<1>(*it) = (range_min - local_els.begin()) + prefix_size;
        std::get<2>(*it) = *range_min;
    }
    // send results back to originator
    mxx::all2all_func(
        ranges,
        [&](const std::tuple<index_t, index_t, index_t>& x) {
            return part.target_processor(std::get<0>(x));
        }, comm);

    // second communication
    mxx::all2all_func(
        ranges_right,
        [&](const std::tuple<index_t, index_t, index_t>& x) {
            return part.target_processor(std::get<2>(x)-1);
        }, comm);
    // find mins from start to right border locally
    for (auto it = ranges_right.begin(); it != ranges_right.end(); ++it)
    {
        assert(std::get<1>(*it) < std::get<2>(*it));
        auto range_end = local_els.begin() + (std::get<2>(*it) - prefix_size);
        auto range_begin = local_els.begin();
        if (std::get<1>(*it) >= prefix_size)
        {
            range_begin = local_els.begin() + (std::get<1>(*it) - prefix_size);
        }
        auto range_min = local_rmq.query(range_begin, range_end);
        std::get<1>(*it) = (range_min - local_els.begin()) + prefix_size;
        std::get<2>(*it) = *range_min;
    }
    // send results back to originator
    mxx::all2all_func(
        ranges_right,
        [&](const std::tuple<index_t, index_t, index_t>& x) {
            return part.target_processor(std::get<0>(x));
        }, comm);

    // get total min and save into ranges
    assert(ranges.size() == ranges_right.size());

    // sort both by target index
    std::sort(ranges.begin(), ranges.end(),
              [](const std::tuple<index_t, index_t, index_t>& x,
                 const std::tuple<index_t, index_t, index_t>& y) {
        return std::get<0>(x) < std::get<0>(y);
    });
    std::sort(ranges_right.begin(), ranges_right.end(),
              [](const std::tuple<index_t, index_t, index_t>& x,
                 const std::tuple<index_t, index_t, index_t>& y) {
        return std::get<0>(x) < std::get<0>(y);
    });

    // iterate through both results (left and right)
    // and get overall minimum
    for (std::size_t i=0; i < ranges.size(); ++i)
    {
        assert(std::get<0>(ranges[i]) == std::get<0>(ranges_right[i]));
        std::size_t left_min_idx = std::get<1>(ranges[i]);
        std::size_t right_min_idx = std::get<1>(ranges_right[i]);
        // get min of both ranges
        if (std::get<2>(ranges[i]) > std::get<2>(ranges_right[i]))
        {
            ranges[i] = ranges_right[i];
        }

        // if the answer is different
        if (left_min_idx != right_min_idx)
        {
            int p_left = part.target_processor(left_min_idx);
            int p_right = part.target_processor(right_min_idx);
            // get minimum of both elements
            if (p_left + 1 < p_right)
            {
                // get min from per process min RMQ
                auto p_min_it =
                    proc_mins_rmq.query(proc_mins.begin() + p_left + 1,
                                        proc_mins.begin() + p_right);
                index_t p_min = *p_min_it;
                // if smaller, save as overall minimum
                if (p_min < std::get<2>(ranges[i]))
                {
                    std::get<1>(ranges[i]) = proc_min_pos[p_min_it - proc_mins.begin()];
                    std::get<2>(ranges[i]) = p_min;
                }
            }
        }
    }
}