Ejemplo n.º 1
0
/// Reads contig pairs from `ifs`, reduces them inside a sliding "active
/// window", and writes the reduced contig pairs to `out`.
///
/// The window holds pairs that share the same first-read reference and whose
/// first-read start lies before the largest first-read end seen so far. Each
/// incoming pair is merged into the best-overlapping pair already in the
/// window, or appended if nothing overlaps. When an incoming pair falls
/// outside the window (or the input is exhausted) the window is flushed:
/// gapless pairs are marked and coalesced, the window is sorted into a
/// canonical order, written out, and cleared.
///
/// Progress is reported on std::cerr each time `ifs.position()` passes the
/// next multiple of 0.1, and once more at end of input.
///
/// @param out  destination handed to `ContigPair::write` for every emitted pair
/// @param ifs  input from which `ContigPair`s are constructed; a constructed
///             pair with `count == 0` is treated as end of input
/// @return 0 once end of input has been reached (the only exit from the loop)
static int process(VDB::Writer const &out, LineBuffer &ifs)
{
    auto active = std::vector<ContigPair>();
    
    auto ref = decltype(active.front().first.ref)(0); ///< the active reference (first read)
    auto end = decltype(active.front().first.end)(0); ///< the largest ending position (first read) seen so far; this is the end of the active window

    unsigned long long in_count = 0;
    unsigned long long out_count = 0;
    unsigned long long gapless_count = 0;
    auto time0 = time(nullptr);
    auto freq = 0.1;        ///< progress reporting interval, in units of ifs.position()
    auto report = freq;     ///< next value of ifs.position() at which to report

    for ( ; ; ) {
        auto pair = ContigPair(ifs);
        auto const isEOF = pair.count == 0;
        
        if ((!active.empty() && (pair.first.ref != ref || pair.first.start >= end)) || isEOF) {
            // new pair is outside the active window (or EOF);
            // output the active contig pairs and empty the window
            
            // Pass 1: mark gapless pairs by zeroing first.end and second.start.
            // From here on "first.end == 0 && second.start == 0" means gapless.
            for (auto && i : active) {
                if (i.first.ref == i.second.ref && i.second.start < i.first.end) {
                    // the region is gapless, i.e. the mate-pair gap has been filled in
                    i.first.end = i.second.start = 0;
                }
            }
            // Pass 2: grow each gapless pair by absorbing overlapping pairs of
            // the same group that are either gapless too or whose gap it covers.
            for (auto i = decltype(active.size())(0); i < active.size(); ++i) {
                // Skip pairs that still have a gap. The marker set in pass 1 is
                // first.end/second.start; testing second.end here (as the code
                // previously did) skipped every pair and made this pass a no-op.
                if (active[i].first.end != 0 || active[i].second.start != 0) continue;
                // active[i] is gapless

                auto const group = active[i].group;
                // extent of the gapless region; named so as not to shadow the
                // window-level `end`
                auto spanStart = active[i].first.start;
                auto spanEnd = active[i].second.end;
            AGAIN:
                for (auto j = decltype(i)(0); j < active.size(); ++j) {
                    if (j == i) continue;
                    auto const &J = active[j];
                    if (J.group != group || J.second.ref != ref || J.first.start >= spanEnd || J.second.end <= spanStart) continue;
                    
                    // active[j] overlaps active[i]
                    if ((J.first.end == 0 && J.second.start == 0) ///< active[j] is also gapless
                        || (spanStart < J.first.end && J.second.start < spanEnd)) ///< or active[i] covers active[j]'s gap
                    {
                        spanStart = std::min(spanStart, J.first.start);
                        spanEnd = std::max(spanEnd, J.second.end);
                        active[i].first.start = spanStart;
                        active[i].second.end = spanEnd;
                        active[i].count += J.count;
                        // erasing an element before i shifts active[i] down by one
                        if (j < i)
                            --i;
                        active.erase(active.begin() + j);
                        // the span grew, so earlier candidates may now overlap; rescan
                        goto AGAIN;
                    }
                }
            }
            // NOTE(review): this comparator does not look like a strict weak
            // ordering. For equal first.start, gapless-vs-gapped compares
            // second.ref while gapped-vs-gapped compares first.end first, so
            // three such pairs can form a cycle — confirm the intended order.
            std::sort(active.begin(), active.end(), ///< want order to be canonical; should be mostly in-order already
                      [](ContigPair const &a, ContigPair const &b) {
                          if (a.first.start < b.first.start) return true;
                          if (a.first.start > b.first.start) return false;
                          if (a.first.end == 0 && a.second.start == 0) {
                              if (b.first.end == 0 && b.second.start == 0) {
                                  if (a.second.end < b.second.end) return false; ///< longer one goes first
                                  if (a.second.end > b.second.end) return true;
                              }
                              else if (a.second.ref == b.second.ref) {
                                  return true; ///< gapless one goes first
                              }
                              else {
                                  return a.second.ref < b.second.ref;
                              }
                          }
                          else if (b.first.end == 0 && b.second.start == 0) {
                              if (a.second.ref == b.second.ref) {
                                  return false; ///< gapless one goes first
                              }
                              else {
                                  return a.second.ref < b.second.ref;
                              }
                          }
                          else {
                              // both have a gap
                              if (a.first.end < b.first.end) return true;
                              if (a.first.end > b.first.end) return false;
                              if (a.second.ref < b.second.ref) return true;
                              if (a.second.ref > b.second.ref) return false;
                              if (a.second.start < b.second.start) return true;
                              if (a.second.start > b.second.start) return false;
                              if (a.second.end < b.second.end) return true;
                              if (a.second.end > b.second.end) return false;
                          }
                          // everything else ties (or both are gapless with equal extent)
                          return a.group < b.group;
                      });
            for (auto && i : active) {
                if (i.second.start == 0 && i.first.end == 0)
                    ++gapless_count;
                i.write(out);
                ++out_count;
            }
            active.clear();
            // at EOF there is no new pair to add; emit the final report and return
            if (isEOF) goto REPORT;
        }
        if (active.empty()) {
            // the new pair opens a fresh window
            ref = pair.first.ref;
            end = pair.first.end;
            active.emplace_back(pair);
        }
        else {
            // Repeatedly merge the new pair with its best-overlapping pair in
            // the window until it is either absorbed or overlaps nothing.
            for ( ; ; ) {
                unsigned maxOverlap = 0;
                auto merge = active.size(); ///< index of an existing contig pair into which the new pair should be merged
                
                for (auto i = active.size(); i != 0; ) {  ///< the best overlap is probably near the end of the list, so start at the back
                    --i;                            ///< and loop backwards
                    auto const &j = active[i];
                    if (j.group == pair.group && j.second.ref == pair.second.ref) {
                        if (j == pair) {
                            /// found an exact match, and since the list is unique, we're done
                            merge = i;
                            break;
                        }
                        
                        auto const start1 = std::max(pair.first.start, j.first.start);
                        auto const start2 = std::max(pair.second.start, j.second.start);
                        auto const end1 = std::min(pair.first.end, j.first.end);
                        auto const end2 = std::min(pair.second.end, j.second.end);
                        
                        /// the regions of overlap are [start1 - end1), [start2 - end2)
                        /// if either are empty (start >= end) then we aren't interested in the contig pair
                        if (start1 < end1 && start2 < end2) {
                            unsigned const overlap = (end1 - start1) + (end2 - start2);
                            if (maxOverlap < overlap) {
                                maxOverlap = overlap;
                                merge = i;
                            }
                        }
                    }
                }
                if (merge == active.size()) {
                    // nothing to merge with; the pair joins the window as-is
                    end = std::max(end, pair.first.end);
                    active.emplace_back(pair);
                    break;
                }
                auto const mergedPair = active[merge] + pair;
                if (active[merge] == mergedPair) {
                    // presumably operator== compares coordinates only, so the
                    // assignment carries over the merged count — verify against ContigPair
                    active[merge] = mergedPair;
                    break;
                }
                // the merge changed the pair; pull the old entry out of the
                // window and look for further overlaps with the enlarged pair
                pair = mergedPair;
                active.erase(active.begin() + merge);
            }
        }

        ++in_count;
        if (ifs.position() >= report) {
            report += freq;
        REPORT:
            auto elapsed = double(time(nullptr) - time0);
            // time() has one-second resolution; omit the rates until a second has passed
            if (elapsed > 0)
                std::cerr << "prog: " << unsigned(ifs.position() * 100.0) << "%; " << in_count << " alignments processed (" << in_count / elapsed << " per sec); (" << gapless_count << " gapless) " << out_count << " contig pairs generated (" << out_count / elapsed << " per sec); ratio: " << double(in_count) / out_count << std::endl;
            else
                std::cerr << "prog: " << unsigned(ifs.position() * 100.0) << "%; " << in_count << " alignments processed; (" << gapless_count << " gapless) " << out_count << " contig pairs generated; ratio: " << double(in_count) / out_count << std::endl;
            if (isEOF)
                return 0;
        }
    }
}