Ejemplo n.º 1
0
    void Graph::insertSequenceAlignment(const Alignment& alignment,
                                        const string& seq,
                                        const string& label) {
        const string& aln_sequence = alignment.sequence();
        const deque<int>& seq_idxs = alignment.seq_idxs();
        const deque<int>& node_ids = alignment.node_ids();

        int first_id = -1;
        int head_id = -1;
        int tail_id = -1;
        pair<uint32_t, uint32_t> prefix_end_ids;
        pair<uint32_t, uint32_t> suffix_end_ids;

        // because of local alignment prefix or sufix of sequence can be unaligned
        // so we add it directly to graph
        deque<uint32_t> valid_seq_idxs;
        for (auto idx : seq_idxs) {
            if (idx != -1) {
                valid_seq_idxs.emplace_back(idx);
            }
        }
        uint32_t aln_seq_start_idx = valid_seq_idxs.front();
        uint32_t aln_seq_end_idx = valid_seq_idxs.back();

        if (aln_seq_start_idx > 0) {
            prefix_end_ids = addUnmatchedSequence(
                                    aln_sequence.substr(0, aln_seq_start_idx),
                                    label,
                                    false);
            first_id = prefix_end_ids.first;
            head_id = prefix_end_ids.second;
        }

        if (aln_seq_end_idx < aln_sequence.length()) {
            suffix_end_ids = addUnmatchedSequence(
                                    aln_sequence.substr(aln_seq_end_idx + 1),
                                    label,
                                    false);
            tail_id = suffix_end_ids.first;
        }

        // aligned part of sequence
        uint32_t size = max(seq_idxs.size(), node_ids.size());
        for (uint32_t i = 0; i < size; ++i) {
            auto& seq_idx = seq_idxs[i];
            auto& match_id = node_ids[i];

            if (seq_idx == -1) {
                continue;
            }

            int node_id = -1;
            char base = aln_sequence[seq_idx];
            if (match_id == -1) {
                // if sequence base unmatched with graph node add new node
                addNode(base);
                node_id = next_id_ - 1;
            } else if (nodes_[match_id]->base() == base) {
                // if sequence base matched to a node with same base
                node_id = match_id;
            } else {
                // if sequence base matched to a node with different base
                // which is aligned to a node with same base
                int found_node_id = -1;
                for (auto id : nodes_[match_id]->getAlignedIds()) {
                    if (nodes_[id]->base() == base) {
                        found_node_id = id;
                        break;
                    }
                }

                if (found_node_id == -1) {
                    // we didn't find aligned node with same base
                    addNode(base);
                    node_id = next_id_ - 1;
                    // add all aligned to nodes to newly created node
                    for (auto id : nodes_[match_id]->getAlignedIds()) {
                        nodes_[node_id]->addAlignedNode(id);
                    }
                    nodes_[node_id]->addAlignedNode(match_id);

                    // to nodes aligned to newly created node add this node
                    // as aligned to
                    for (auto id : nodes_[node_id]->getAlignedIds()) {
                        nodes_[id]->addAlignedNode(node_id);
                    }
                } else {
                    // node id is found node id
                    node_id = found_node_id;
                }
            }

            if (head_id != -1 && node_id != -1) {
                addEdge(head_id, node_id, label);
            }
            head_id = node_id;
            if (first_id == -1) {
                first_id = head_id;
            }
        }

        // connect aligned part with unaligned suffix
        if (head_id != -1 && tail_id != -1) {
            addEdge(head_id, tail_id, label);
        }

        // resort nodes order
        topological_sort();

        sequences_.emplace_back(seq);
        labels_.emplace_back(label);
        start_ids_.emplace_back(first_id);
    }