Example #1
0
// Brings the unitig graph back in sync after vertices have been flagged dead.
//
// Runs four OpenMP-parallel passes over vertices_:
//   1. invalidate the SdBG nodes of every dead vertex and flag it deleted;
//   2. merge each maximal linear chain of surviving vertices into the vertex
//      at the head of the chain;
//   3. merge the remaining vertices, which are treated as loops;
//   4. re-register rev_start_node of every surviving vertex in start_node_map_.
//
// NOTE(review): `marked` is function-local static, so this function is not
// reentrant; presumably it is never called concurrently -- confirm.
void UnitigGraph::Refresh_() {
    // Serialises the whole body of the loop-merging pass (pass 3).
    omp_lock_t reassemble_lock;
    omp_init_lock(&reassemble_lock);
    // One bit per vertex, used as a per-vertex claim flag in passes 2 and 3.
    static AtomicBitVector marked;
    marked.reset(vertices_.size());

    // update the sdbg
#pragma omp parallel for
    for (vertexID_t i = 0; i < vertices_.size(); ++i) {
        if (vertices_[i].is_dead && !vertices_[i].is_deleted) {
            // Walk the forward strand backwards from end_node to start_node,
            // invalidating every node visited on the way.
            int64_t cur_node = vertices_[i].end_node;
            while (cur_node != vertices_[i].start_node) {
                sdbg_->SetInvalid(cur_node);
                cur_node = sdbg_->UniqueIncoming(cur_node);
                assert(cur_node != -1);
                cur_node = sdbg_->GetLastIndex(cur_node);
            }
            // The loop stops on start_node without invalidating it.
            sdbg_->SetInvalid(cur_node);

            // Same walk on the reverse-complement strand, skipped when both
            // strands share the same end node.
            if (vertices_[i].rev_end_node != vertices_[i].end_node) {
                cur_node = vertices_[i].rev_end_node;
                while (cur_node != vertices_[i].rev_start_node) {
                    sdbg_->SetInvalid(cur_node);
                    cur_node = sdbg_->UniqueIncoming(cur_node);
                    assert(cur_node != -1);
                    cur_node = sdbg_->GetLastIndex(cur_node);
                }
                sdbg_->SetInvalid(cur_node);
            }

            vertices_[i].is_deleted = true;
        }
    }

    // Merge linear chains. A vertex can head a chain only if one of its two
    // strands has no simple-path predecessor.
#pragma omp parallel for
    for (vertexID_t i = 0; i < vertices_.size(); ++i) {
        if (vertices_[i].is_deleted) { continue; }
        // dir == 0: extend from the forward strand; dir == 1: from the reverse strand.
        int dir;
        if (assembly_algorithms::PrevSimplePathNode(*sdbg_, vertices_[i].start_node) == -1) {
            dir = 0;
        } else if (assembly_algorithms::PrevSimplePathNode(*sdbg_, vertices_[i].rev_start_node) == -1) {
            dir = 1;
        } else {
            continue;
        }

        // Claim vertex i; give up if it has already been claimed.
        // (presumably lock() is an atomic test-and-set returning false when
        // the bit was already set -- confirm against AtomicBitVector)
        if (!marked.lock(i)) { continue; }

        std::vector<std::pair<vertexID_t, bool> > linear_path; // first: vertex_id, second: is_rc
        int64_t cur_end = dir == 0 ? vertices_[i].end_node : vertices_[i].rev_end_node;
        int64_t new_start = dir == 0 ? vertices_[i].start_node : vertices_[i].rev_start_node;
        int64_t new_rc_end = dir == 0 ? vertices_[i].rev_end_node : vertices_[i].end_node;

        // Follow simple-path successors and record every vertex on the chain.
        while (true) {
            int64_t next_start = assembly_algorithms::NextSimplePathNode(*sdbg_, cur_end);
            if (next_start == -1) {
                break;
            }

            auto next_vertex_iter = start_node_map_.find(next_start);
            assert(next_vertex_iter != start_node_map_.end());
            UnitigGraphVertex &next_vertex = vertices_[next_vertex_iter->second];
            assert(!next_vertex.is_deleted);

            // The chain enters this vertex on its reverse strand when the
            // successor node is not the vertex's forward start_node.
            bool is_rc = next_vertex.start_node != next_start;
            linear_path.push_back(std::make_pair(next_vertex_iter->second, is_rc));

            cur_end = is_rc ? next_vertex.rev_end_node : next_vertex.end_node;
        }

        // Nothing to merge. Vertex i stays marked, which keeps pass 3 from
        // treating it as a loop.
        if (linear_path.empty()) { continue; }

        // The same chain can be discovered from its other end by another
        // thread, so the last vertex must be claimed as well. On conflict the
        // thread holding the smaller head id backs off and releases its claim;
        // the one holding the larger id spins until that release happens.
        if (i != linear_path.back().first && !marked.lock(linear_path.back().first)) { // if i == linear_path.back().first it is a palindrome self loop
            if (linear_path.back().first > i) {
                marked.unset(i);
                continue;
            } else {
                while (!marked.lock(linear_path.back().first)) {
                    // wait for the other thread to release the lock
                }
            }
        }

        // assemble the linear path

        int64_t depth = vertices_[i].depth;
        int64_t length = vertices_[i].length;

        // Fold length and depth of every chain member into the head and flag
        // the member deleted.
        for (unsigned j = 0; j < linear_path.size(); ++j) {
            UnitigGraphVertex &next_vertex = vertices_[linear_path[j].first];
            length += next_vertex.length;
            depth += next_vertex.depth;
            next_vertex.is_deleted = true;
        }

        vertices_[i].length = length;
        vertices_[i].depth = depth;

        // The merged vertex ends where the last chain member ends, taking the
        // orientation in which that member was traversed into account.
        int64_t new_end;
        int64_t new_rc_start;
        if (linear_path.back().second) {
            new_end = vertices_[linear_path.back().first].rev_end_node;
            new_rc_start = vertices_[linear_path.back().first].start_node;
        } else {
            new_end = vertices_[linear_path.back().first].end_node;
            new_rc_start = vertices_[linear_path.back().first].rev_start_node;
        }

        vertices_[i].start_node = new_start;
        vertices_[i].end_node = new_end;
        vertices_[i].rev_start_node = new_rc_start;
        vertices_[i].rev_end_node = new_rc_end;
        vertices_[i].is_changed = true;
        // When the chain ends on vertex i itself, the loop above flagged i as
        // deleted; undo that.
        if (i == linear_path.back().first) {
            vertices_[i].is_deleted = false;
        }
    }

    // looped path
    // Vertices that are neither deleted nor marked were not reached by the
    // linear pass; they are merged here as loops.
#pragma omp parallel for
    for (vertexID_t i = 0; i < vertices_.size(); ++i) {
        if (!vertices_[i].is_deleted && !marked.get(i)) {
            // Check, take the lock, then check again: another thread may have
            // absorbed vertex i into its loop while this one was waiting.
            omp_set_lock(&reassemble_lock);
            if (!vertices_[i].is_deleted && !marked.get(i)) {
                uint32_t length = vertices_[i].length;
                int64_t depth = vertices_[i].depth;

                vertices_[i].is_changed = true;
                vertices_[i].is_loop = true;
                // Flagging i as deleted up front lets the walk below detect a
                // return to vertex i through its reverse complement.
                // NOTE(review): this flag is never cleared afterwards, so the
                // merged loop vertex stays deleted and is skipped by the final
                // start_node_map_ pass -- confirm this is intended.
                vertices_[i].is_deleted = true;
                bool is_palindrome = false;

                int64_t cur_end = vertices_[i].end_node;
                // Walk successors until the walk returns to i's start_node.
                while (true) {
                    int64_t next_start = assembly_algorithms::NextSimplePathNode(*sdbg_, cur_end);
                    assert(next_start != -1);
                    if (next_start == vertices_[i].start_node) {
                        break;
                    }

                    auto next_vertex_iter = start_node_map_.find(next_start);
                    assert(next_vertex_iter != start_node_map_.end());
                    UnitigGraphVertex &next_vertex = vertices_[next_vertex_iter->second];

                    if (next_vertex.is_deleted) {
                        // that means the loop has already gone through its rc
                        is_palindrome = true;
                    }

                    length += next_vertex.length;
                    depth += next_vertex.depth;
                    next_vertex.is_deleted = true;

                    cur_end = (next_vertex.start_node == next_start) ? next_vertex.end_node : next_vertex.rev_end_node;
                }

                vertices_[i].depth = depth;
                vertices_[i].length = length;
                vertices_[i].is_palindrome = is_palindrome;
                // start_node is kept; the other three endpoints are recomputed
                // from it.
                vertices_[i].end_node = sdbg_->GetLastIndex(assembly_algorithms::PrevSimplePathNode(*sdbg_, vertices_[i].start_node));
                vertices_[i].rev_start_node = sdbg_->ReverseComplement(vertices_[i].end_node);
                vertices_[i].rev_end_node = sdbg_->ReverseComplement(vertices_[i].start_node);
            }
            omp_unset_lock(&reassemble_lock);
        }
    }

    // Point the reverse-strand start node of every surviving vertex at its
    // (possibly merged) owner.
    // NOTE(review): start_node_map_ is written from several threads here;
    // assumes the container supports concurrent insertion -- confirm.
#pragma omp parallel for
    for (vertexID_t i = 0; i < vertices_.size(); ++i) {
        if (!vertices_[i].is_deleted) {
            start_node_map_[vertices_[i].rev_start_node] = i;
        }
    }

    omp_destroy_lock(&reassemble_lock);
}
Example #2
0
namespace assembly_algorithms {

// Per-node flags shared by Trim() and MarkNode(): Trim() resets the vector to
// dbg.size bits, MarkNode() sets bits, and Trim() finally invalidates every
// flagged node.
static AtomicBitVector marked;
// Contig statistics read by PrintStat(), which treats the key as a contig
// length and the value as the number of contigs of that length. It is cleared
// before, and handed to, each unitig output call in this file.
static map<int64_t, int> histogram;
// Forward declaration; the definition follows the other functions in this namespace.
static inline void MarkNode(SuccinctDBG &dbg, int64_t node_idx);

// Returns the successor of cur_node on a simple path, or -1 when there is
// none: cur_node must have a unique outgoing node, and that node must in turn
// have a unique incoming node.
int64_t NextSimplePathNode(SuccinctDBG &dbg, int64_t cur_node) {
    const int64_t successor = dbg.UniqueOutgoing(cur_node);
    if (successor == -1) {
        return -1;
    }
    return dbg.UniqueIncoming(successor) == -1 ? -1 : successor;
}

// Returns the predecessor of cur_node on a simple path, or -1 when there is
// none: cur_node must have a unique incoming node, and that node must in turn
// have a unique outgoing node.
int64_t PrevSimplePathNode(SuccinctDBG &dbg, int64_t cur_node) {
    const int64_t predecessor = dbg.UniqueIncoming(cur_node);
    if (predecessor == -1) {
        return -1;
    }
    return dbg.UniqueOutgoing(predecessor) == -1 ? -1 : predecessor;
}

// Removes tips from the graph and returns how many were found.
//
// A tip is a path of fewer than `len` nodes that starts at a dead end and
// either
//   * reaches a branching point (its neighbour on the path is not uniquely
//     linked back to it), or
//   * is completely isolated (dead end on both sides) and its sequence length
//     `nodes + dbg.kmer_k - 1` is below `min_final_contig_len`.
//
// dbg                  - graph to trim; flagged nodes are invalidated in place.
// len                  - upper bound (exclusive) on the number of nodes walked.
// min_final_contig_len - isolated paths at least this long are kept.
//
// The first two passes only flag nodes in the file-level `marked` bit vector;
// the graph itself is modified in the final pass only.
int64_t Trim(SuccinctDBG &dbg, int len, int min_final_contig_len) {
    int64_t number_tips = 0;
    marked.reset(dbg.size);

    // Pass 1: start at nodes without outgoing edges and walk backwards.
#pragma omp parallel for reduction(+:number_tips)
    for (int64_t node_idx = 0; node_idx < dbg.size; ++node_idx) {
        if (dbg.IsValidNode(node_idx) && !marked.get(node_idx) && dbg.IsLast(node_idx) && dbg.OutdegreeZero(node_idx)) {
            vector<int64_t> path = {node_idx};
            int64_t prev_node;
            int64_t cur_node = node_idx;
            bool is_tip = false;
            for (int i = 1; i < len; ++i) {
                prev_node = dbg.UniqueIncoming(cur_node);
                if (prev_node == -1) {
                    // No unique predecessor: a tip only if the path is isolated and short.
                    is_tip = dbg.IndegreeZero(cur_node) && (i + dbg.kmer_k - 1 < min_final_contig_len);
                    break;
                } else if (dbg.UniqueOutgoing(prev_node) == -1) {
                    // The predecessor branches: the path hangs off a junction.
                    is_tip = true;
                    break;
                } else {
                    path.push_back(prev_node);
                    cur_node = prev_node;
                }
            }

            if (is_tip) {
                for (unsigned i = 0; i < path.size(); ++i) {
                    MarkNode(dbg, path[i]);
                }
                ++number_tips;
            }
        }
    }

    // Pass 2: start at nodes without incoming edges and walk forwards.
#pragma omp parallel for reduction(+:number_tips)
    for (int64_t node_idx = 0; node_idx < dbg.size; ++node_idx) {
        if (dbg.IsValidNode(node_idx) && dbg.IsLast(node_idx) && !marked.get(node_idx) && dbg.IndegreeZero(node_idx)) {
            vector<int64_t> path = {node_idx};
            int64_t next_node;
            int64_t cur_node = node_idx;
            bool is_tip = false;
            for (int i = 1; i < len; ++i) {
                next_node = dbg.UniqueOutgoing(cur_node);
                if (next_node == -1) {
                    // No unique successor: a tip only if the path is isolated and short.
                    is_tip = dbg.OutdegreeZero(cur_node) && (i + dbg.kmer_k - 1 < min_final_contig_len);
                    break;
                } else if (dbg.UniqueIncoming(next_node) == -1) {
                    // The successor merges several paths: this one hangs off a
                    // junction. Stop here; cur_node does not advance in this
                    // branch, so further iterations would only repeat the same
                    // two queries.
                    is_tip = true;
                    break;
                } else {
                    path.push_back(next_node);
                    cur_node = next_node;
                }
            }

            if (is_tip) {
                for (unsigned i = 0; i < path.size(); ++i) {
                    MarkNode(dbg, path[i]);
                }
                ++number_tips;
            }
        }
    }

    // Pass 3: apply the removals collected by the first two passes.
#pragma omp parallel for
    for (int64_t node_idx = 0; node_idx < dbg.size; ++node_idx) {
        if (marked.get(node_idx)) {
            dbg.SetInvalid(node_idx);
        }
    }

    return number_tips;
}

// Removes tips in rounds of increasing length limit and returns the total
// number of tips removed.
//
// Trim() is called with limits 2, 4, 8, ... while the limit is below
// max_tip_len, then once more with max_tip_len itself. Progress and the time
// of each round are printed to stdout.
//
// dbg                  - graph to trim in place.
// max_tip_len          - length limit of the last round.
// min_final_contig_len - forwarded to Trim().
int64_t RemoveTips(SuccinctDBG &dbg, int max_tip_len, int min_final_contig_len) {
    int64_t number_tips = 0;
    xtimer_t timer;

    // One trimming round with the given length limit.
    auto trim_round = [&](int len) {
        printf("Removing tips with length less than %d\n", len);
        timer.reset();
        timer.start();
        number_tips += Trim(dbg, len, min_final_contig_len);
        timer.stop();
        // int64_t is not `long` on every platform; print through long long.
        printf("Accumulated tips removed: %lld; time elapsed: %.4f\n",
               static_cast<long long>(number_tips), timer.elapsed());
    };

    for (int len = 2; len < max_tip_len; len *= 2) {
        trim_round(len);
    }
    trim_round(max_tip_len);

    return number_tips;
}

// Finds bubbles in the graph and removes their error branches; returns the
// number of bubbles for which RemoveErrorBranches() succeeded.
//
// dbg             - graph to process in place.
// max_bubble_len  - search limit handed to BranchGroup; a value <= 0 selects
//                   the default of 2 * kmer_k + 2.
// low_depth_ratio - forwarded to BranchGroup::RemoveErrorBranches().
//
// Candidates are collected in parallel and then re-searched and resolved one
// by one.
// NOTE(review): the candidate order depends on thread scheduling; the stored
// bubble length suggests a sort may have been intended -- confirm.
int64_t PopBubbles(SuccinctDBG &dbg, int max_bubble_len, double low_depth_ratio) {
    const int kMaxBranchesPerGroup = 4;
    if (max_bubble_len <= 0) { max_bubble_len = dbg.kmer_k * 2 + 2; }

    // Guards appends to the shared candidate list.
    omp_lock_t bubble_lock;
    omp_init_lock(&bubble_lock);

    // first: bubble length, second: index of the node the bubble starts at
    vector<std::pair<int, int64_t> > bubble_candidates;

#pragma omp parallel for
    for (int64_t node_idx = 0; node_idx < dbg.size; ++node_idx) {
        // Only valid, last-index nodes with more than one outgoing edge can open a bubble.
        if (!dbg.IsValidNode(node_idx) || !dbg.IsLast(node_idx) || !(dbg.Outdegree(node_idx) > 1)) {
            continue;
        }
        BranchGroup probe(&dbg, node_idx, kMaxBranchesPerGroup, max_bubble_len);
        if (!probe.Search()) {
            continue;
        }
        omp_set_lock(&bubble_lock);
        bubble_candidates.emplace_back(probe.length(), node_idx);
        omp_unset_lock(&bubble_lock);
    }

    int64_t num_bubbles = 0;
    for (const auto &candidate : bubble_candidates) {
        // Search again: an earlier removal may have changed this bubble.
        BranchGroup bubble(&dbg, candidate.second, kMaxBranchesPerGroup, max_bubble_len);
        if (!bubble.Search()) {
            continue;
        }
        if (bubble.RemoveErrorBranches(low_depth_ratio)) {
            ++num_bubbles;
        }
    }

    omp_destroy_lock(&bubble_lock);
    return num_bubbles;
}

// Builds the unitig graph of `dbg` and writes its initial unitigs.
//
// contigs_file / multi_file - always written.
// final_contig_file         - optional; when non-NULL the overload that also
//                             takes min_final_contig_len is used.
//
// Prints graph size, contig statistics and timings to stdout.
void AssembleFromUnitigGraph(SuccinctDBG &dbg, FILE *contigs_file, FILE *multi_file, FILE *final_contig_file, int min_final_contig_len) {
    xtimer_t stopwatch;

    // Phase 1: build the graph.
    stopwatch.reset();
    stopwatch.start();
    UnitigGraph graph(&dbg);
    graph.InitFromSdBG();
    stopwatch.stop();
    printf("unitig graph size: %u, time for building: %lf\n", graph.size(), stopwatch.elapsed());

    // Phase 2: output unitigs and report statistics.
    stopwatch.reset();
    stopwatch.start();
    histogram.clear();
    if (final_contig_file != NULL) {
        graph.OutputInitUnitigs(contigs_file, multi_file, final_contig_file, histogram, min_final_contig_len);
    } else {
        graph.OutputInitUnitigs(contigs_file, multi_file, histogram);
    }
    PrintStat();
    stopwatch.stop();
    printf("Time to output: %lf\n", stopwatch.elapsed());
}

// Builds the unitig graph of `dbg` and writes its final unitigs to
// final_contig_file, passing min_final_contig_len on to the writer.
//
// Prints graph size, contig statistics and timings to stdout.
void AssembleFinalFromUnitigGraph(SuccinctDBG &dbg, FILE *final_contig_file, int min_final_contig_len) {
    xtimer_t stopwatch;

    // Phase 1: build the graph.
    stopwatch.reset();
    stopwatch.start();
    UnitigGraph graph(&dbg);
    graph.InitFromSdBG();
    stopwatch.stop();
    printf("unitig graph size: %u, time for building: %lf\n", graph.size(), stopwatch.elapsed());

    // Phase 2: output unitigs and report statistics.
    stopwatch.reset();
    stopwatch.start();
    histogram.clear();
    graph.OutputFinalUnitigs(final_contig_file, histogram, min_final_contig_len);
    PrintStat();
    stopwatch.stop();
    printf("Time to output: %lf\n", stopwatch.elapsed());
}

// Writes the initial unitigs, then repeatedly removes unitigs of low local
// depth and finally writes the unitigs changed by that removal.
//
// contigs_file / multi_file           - receive the initial unitigs.
// final_contig_file                   - optional; when non-NULL the overload
//                                       that also takes min_final_contig_len
//                                       is used for the initial output.
// addi_contig_file / addi_multi_file  - receive the changed unitigs.
// min_depth                           - starting depth threshold; multiplied
//                                       by 1.1 after each successful round.
// min_len, local_ratio                - forwarded to RemoveLocalLowDepth().
//
// Removal stops when RemoveLocalLowDepth() returns false or the threshold
// reaches kMaxDepth. Statistics and timings are printed to stdout.
void RemoveLowLocalAndOutputChanged(SuccinctDBG &dbg, FILE *contigs_file, FILE *multi_file, FILE *final_contig_file, 
                                    FILE *addi_contig_file, FILE *addi_multi_file, 
                                    double min_depth, int min_len, double local_ratio, int min_final_contig_len) {
    xtimer_t timer;
    timer.reset();
    timer.start();
    UnitigGraph unitig_graph(&dbg);
    unitig_graph.InitFromSdBG();
    timer.stop();
    printf("Simple path graph size: %u, time for building: %lf\n", unitig_graph.size(), timer.elapsed());

    // Initial output, before anything is removed.
    timer.reset();
    timer.start();
    histogram.clear();
    if (final_contig_file == NULL) {
        unitig_graph.OutputInitUnitigs(contigs_file, multi_file, histogram);
    } else {
        unitig_graph.OutputInitUnitigs(contigs_file, multi_file, final_contig_file, histogram, min_final_contig_len);
    }
    PrintStat();
    timer.stop();
    printf("Time to output: %lf\n", timer.elapsed());

    const double kMaxDepth = 65535;
    const int kLocalWidth = 1000;
    int64_t num_removed = 0;

    // Raise the depth threshold by 10% per round for as long as a round
    // reports success.
    timer.reset();
    timer.start();
    while (min_depth < kMaxDepth &&
           unitig_graph.RemoveLocalLowDepth(min_depth, min_len, kLocalWidth, local_ratio, num_removed)) {
        min_depth *= 1.1;
    }
    timer.stop();
    // int64_t is not `long` on every platform; print through long long.
    printf("Number of unitigs removed: %lld, time: %lf\n", static_cast<long long>(num_removed), timer.elapsed());

    histogram.clear();
    unitig_graph.OutputChangedUnitigs(addi_contig_file, addi_multi_file, histogram);
    PrintStat();
}

// Repeatedly removes unitigs of low local depth, then writes the final
// unitigs to final_contig_file.
//
// min_depth            - starting depth threshold; multiplied by 1.1 after
//                        each successful round.
// min_len, local_ratio - forwarded to RemoveLocalLowDepth().
// min_final_contig_len - forwarded to OutputFinalUnitigs().
//
// Removal stops when RemoveLocalLowDepth() returns false or the threshold
// reaches kMaxDepth. Statistics are printed to stdout.
void RemoveLowLocalAndOutputFinal(SuccinctDBG &dbg, FILE *final_contig_file, 
                                  double min_depth, int min_len, double local_ratio, int min_final_contig_len) {
    UnitigGraph unitig_graph(&dbg);
    unitig_graph.InitFromSdBG();
    printf("Simple path graph size: %u\n", unitig_graph.size());

    const double kMaxDepth = 65535;
    const int kLocalWidth = 1000;
    int64_t num_removed = 0;

    while (min_depth < kMaxDepth && 
           unitig_graph.RemoveLocalLowDepth(min_depth, min_len, kLocalWidth, local_ratio, num_removed)) {
        min_depth *= 1.1;
    }
    // int64_t is not `long` on every platform; print through long long.
    printf("Number of unitigs removed: %lld\n", static_cast<long long>(num_removed));

    histogram.clear();
    unitig_graph.OutputFinalUnitigs(final_contig_file, histogram, min_final_contig_len);
    PrintStat();
}

// Prints summary statistics of the file-level length histogram to stdout:
// total length, N50, mean length, number of contigs and maximum length.
//
// genome_size - reference size used for N50; 0 means "use the total contig
//               length".
//
// N50 is the largest length L such that contigs of length >= L cover at least
// half of genome_size; it is reported as -1 when no length reaches that
// coverage. The maximum is reported as 0 for an empty histogram.
void PrintStat(long long genome_size) {
    int64_t total_length = 0;
    int64_t total_contigs = 0;
    for (auto it = histogram.begin(); it != histogram.end(); ++it) {
        total_length += it->first * it->second;
        total_contigs += it->second;
    }
    if (genome_size == 0) { genome_size = total_length; }

    const int64_t average_length = total_contigs > 0 ? total_length / total_contigs : 0;

    // N50: accumulate from the longest contigs downwards.
    int64_t n50 = -1;
    int64_t acc_length = 0;
    for (auto it = histogram.rbegin(); it != histogram.rend(); ++it) {
        acc_length += it->first * it->second;
        if (acc_length * 2 >= genome_size) {
            n50 = it->first;
            break;
        }
    }

    const int64_t max_length = histogram.empty() ? 0 : histogram.rbegin()->first;

    // int64_t is not `long` on every platform; print through long long.
    printf("Total length: %lld, N50: %lld, Mean: %lld, number of contigs: %lld\n",
           static_cast<long long>(total_length), static_cast<long long>(n50),
           static_cast<long long>(average_length), static_cast<long long>(total_contigs));
    printf("Maximum length: %lld\n", static_cast<long long>(max_length));
}

// Flags a node for removal: sets the bit of dbg.GetLastIndex(node_idx) in the
// file-level `marked` vector.
static inline void MarkNode(SuccinctDBG &dbg, int64_t node_idx) {
    const int64_t last_idx = dbg.GetLastIndex(node_idx);
    marked.set(last_idx);
}

} // namespace assembly_algorithms
Example #3
0
// Removes tips from the graph and returns how many were found.
//
// A tip is a path of fewer than `len` nodes that starts at a dead end and
// either
//   * reaches a branching point (its neighbour on the path is not uniquely
//     linked back to it), or
//   * is completely isolated (dead end on both sides) and its sequence length
//     `nodes + dbg.kmer_k - 1` is below `min_final_contig_len`.
//
// dbg                  - graph to trim; flagged nodes are invalidated in place.
// len                  - upper bound (exclusive) on the number of nodes walked.
// min_final_contig_len - isolated paths at least this long are kept.
//
// The first two passes only flag nodes in the shared `marked` bit vector; the
// graph itself is modified in the final pass only.
int64_t Trim(SuccinctDBG &dbg, int len, int min_final_contig_len) {
    int64_t number_tips = 0;
    marked.reset(dbg.size);

    // Pass 1: start at nodes without outgoing edges and walk backwards.
#pragma omp parallel for reduction(+:number_tips)
    for (int64_t node_idx = 0; node_idx < dbg.size; ++node_idx) {
        if (dbg.IsValidNode(node_idx) && !marked.get(node_idx) && dbg.IsLast(node_idx) && dbg.OutdegreeZero(node_idx)) {
            vector<int64_t> path = {node_idx};
            int64_t prev_node;
            int64_t cur_node = node_idx;
            bool is_tip = false;
            for (int i = 1; i < len; ++i) {
                prev_node = dbg.UniqueIncoming(cur_node);
                if (prev_node == -1) {
                    // No unique predecessor: a tip only if the path is isolated and short.
                    is_tip = dbg.IndegreeZero(cur_node) && (i + dbg.kmer_k - 1 < min_final_contig_len);
                    break;
                } else if (dbg.UniqueOutgoing(prev_node) == -1) {
                    // The predecessor branches: the path hangs off a junction.
                    is_tip = true;
                    break;
                } else {
                    path.push_back(prev_node);
                    cur_node = prev_node;
                }
            }

            if (is_tip) {
                for (unsigned i = 0; i < path.size(); ++i) {
                    MarkNode(dbg, path[i]);
                }
                ++number_tips;
            }
        }
    }

    // Pass 2: start at nodes without incoming edges and walk forwards.
#pragma omp parallel for reduction(+:number_tips)
    for (int64_t node_idx = 0; node_idx < dbg.size; ++node_idx) {
        if (dbg.IsValidNode(node_idx) && dbg.IsLast(node_idx) && !marked.get(node_idx) && dbg.IndegreeZero(node_idx)) {
            vector<int64_t> path = {node_idx};
            int64_t next_node;
            int64_t cur_node = node_idx;
            bool is_tip = false;
            for (int i = 1; i < len; ++i) {
                next_node = dbg.UniqueOutgoing(cur_node);
                if (next_node == -1) {
                    // No unique successor: a tip only if the path is isolated and short.
                    is_tip = dbg.OutdegreeZero(cur_node) && (i + dbg.kmer_k - 1 < min_final_contig_len);
                    break;
                } else if (dbg.UniqueIncoming(next_node) == -1) {
                    // The successor merges several paths: this one hangs off a
                    // junction. Stop here; cur_node does not advance in this
                    // branch, so further iterations would only repeat the same
                    // two queries.
                    is_tip = true;
                    break;
                } else {
                    path.push_back(next_node);
                    cur_node = next_node;
                }
            }

            if (is_tip) {
                for (unsigned i = 0; i < path.size(); ++i) {
                    MarkNode(dbg, path[i]);
                }
                ++number_tips;
            }
        }
    }

    // Pass 3: apply the removals collected by the first two passes.
#pragma omp parallel for
    for (int64_t node_idx = 0; node_idx < dbg.size; ++node_idx) {
        if (marked.get(node_idx)) {
            dbg.SetInvalid(node_idx);
        }
    }

    return number_tips;
}
Example #4
0
// Flags a node for removal: sets the bit of dbg.GetLastIndex(node_idx) in the
// shared `marked` vector.
static inline void MarkNode(SuccinctDBG &dbg, int64_t node_idx) {
    const int64_t last_idx = dbg.GetLastIndex(node_idx);
    marked.set(last_idx);
}