Ejemplo n.º 1
0
string alignment_to_sam(const Alignment& alignment,
                        const string& refseq,
                        const int32_t refpos,
                        const string& cigar,
                        const string& mateseq,
                        const int32_t matepos,
                        const int32_t tlen) {
    stringstream sam;

    sam << (!alignment.name().empty() ? alignment.name() : "*") << "\t"
        << sam_flag(alignment) << "\t"
        << (refseq.empty() ? "*" : refseq) << "\t"
        << refpos + 1 << "\t"
        //<< (alignment.path().mapping_size() ? refpos + 1 : 0) << "\t" // positions are 1-based in SAM, 0 means unmapped
        << alignment.mapping_quality() << "\t"
        << (alignment.has_path() && alignment.path().mapping_size() ? cigar : "*") << "\t"
        << (mateseq == refseq ? "=" : mateseq) << "\t"
        << matepos + 1 << "\t"
        << tlen << "\t"
        << (!alignment.sequence().empty() ? alignment.sequence() : "*") << "\t";
    // hack much?
    if (!alignment.quality().empty()) {
        const string& quality = alignment.quality();
        for (int i = 0; i < quality.size(); ++i) {
            sam << quality_short_to_char(quality[i]);
        }
    } else {
        sam << "*";
        //sam << string(alignment.sequence().size(), 'I');
    }
    //<< (alignment.has_quality() ? string_quality_short_to_char(alignment.quality()) : string(alignment.sequence().size(), 'I'));
    if (!alignment.read_group().empty()) sam << "\tRG:Z:" << alignment.read_group();
    sam << "\n";
    return sam.str();
}
Ejemplo n.º 2
0
void QualAdjAligner::align(Alignment& alignment, Graph& g, bool print_score_matrices) {

    gssw_graph* graph = create_gssw_graph(g, 0, nullptr);

    const string& sequence = alignment.sequence();
    const string& quality = alignment.quality();

    if (quality.length() != sequence.length()) {
        cerr << "error:[Aligner] sequence and quality strings different lengths, cannot perform base quality adjusted alignmenterror:[Aligner] sequence and quality strings different lengths, cannot perform base quality adjusted alignment" << endl;
    }

    gssw_graph_fill_qual_adj(graph, sequence.c_str(), quality.c_str(),
                             nt_table, adjusted_score_matrix,
                             scaled_gap_open, scaled_gap_extension, 15, 2);

    gssw_graph_mapping* gm = gssw_graph_trace_back_qual_adj (graph,
                             sequence.c_str(),
                             quality.c_str(),
                             sequence.size(),
                             nt_table,
                             adjusted_score_matrix,
                             scaled_gap_open,
                             scaled_gap_extension);

    gssw_mapping_to_alignment(graph, gm, alignment, print_score_matrices);

#ifdef debug
    gssw_print_graph_mapping(gm, stderr);
#endif

    gssw_graph_mapping_destroy(gm);
    gssw_graph_destroy(graph);
}
Ejemplo n.º 3
0
    Alignment Filter::avg_qual_filter(Alignment& aln){
        double total_qual = 0.0;
        // If the parameter for window size is zero, set the local equivalent
        // to the entire size of the qual scores.
        int win_len = window_length; //>= 1 ? window_length : aln.quality().size();

        cerr << "UNTESTED" << endl;
        exit(1);

        std::function<double(int64_t, int64_t)> calc_avg_qual = [](int64_t total_qual, int64_t length){
            return ((double) total_qual / (double) length);
        };

        /**
         * Helper function.
         * Sums qual scores from start : start + window_length
         * if start + window_len > qualstr.size(), return a negative float
         * otherwise return the average quality within the window
         *
         */
        std::function<double(string, int, int)> window_qual = [&](string qualstr, int start, int window_length){
            if (start + window_length > qualstr.size()){
                return -1.0;
            }

            total_qual = 0.0;
            for (int i = start; i < start + window_length; i++){
                total_qual += (int) qualstr[i];
            }

            return calc_avg_qual(total_qual, window_length);
        };



        //TODO: handle reversing alignments
        string quals = aln.quality();
        for (int i = 0; i < quals.size() - win_len; i++){
            double w_qual = window_qual(quals, i, win_len);
            if ( w_qual < 0.0){
                break;
            }
            if (w_qual < min_avg_qual){
                return inverse ? aln : Alignment();
            }
        }

        return inverse ? Alignment() : aln;

    }
Ejemplo n.º 4
0
    /**
     * CLI: vg filter -d 10 -q 40 -r -R
     * -r: track depth of both novel variants and those in the graph.
     * -R: remove edits that fail the filter (otherwise toss the whole alignment)
     */
    Alignment Filter::qual_filter(Alignment& aln){
        string quals = aln.quality();
        int offset = qual_offset;
        for (int i = 0; i < quals.size(); i++){
            if (((int) quals[i] - offset) < min_qual){
                if (!remove_failing_edits){
                    return inverse ? aln : Alignment();
                }
                else{
                    //TODO: Should we really edit out poor-quality bases??
                    // It's complex and awkward.
                    return inverse ? aln : Alignment();
                }
            }
        }

        return inverse ? Alignment() : aln;
    }
Ejemplo n.º 5
0
void Pileups::compute_from_edit(NodePileup& pileup, int64_t& node_offset,
                                int64_t& read_offset,
                                const Node& node, const Alignment& alignment,
                                const Mapping& mapping, const Edit& edit) {
    string seq = edit.sequence();
    bool is_reverse = mapping.position().is_reverse();
    
    // ***** MATCH *****
    if (edit.from_length() == edit.to_length()) {
        assert (edit.from_length() > 0);
        make_match(seq, edit.from_length(), is_reverse);
        assert(seq.length() == edit.from_length());            
        int64_t delta = 1;
        for (int64_t i = 0; i < edit.from_length(); ++i) {
            BasePileup* base_pileup = get_create_base_pileup(pileup, node_offset);
            // reference_base if empty
            if (base_pileup->num_bases() == 0) {
                base_pileup->set_ref_base(node.sequence()[node_offset]);
            } else {
                assert(base_pileup->ref_base() == node.sequence()[node_offset]);
            }
            // add base to bases field (converting to ,. if match)
            char base = seq[i];
            if (!edit.sequence().empty() &&
                base_equal(seq[i], node.sequence()[node_offset], is_reverse)) {
                base = is_reverse ? ',' : '.';
            }
            *base_pileup->mutable_bases() += base;
            // add quality if there
            if (!alignment.quality().empty()) {
                *base_pileup->mutable_qualities() += alignment.quality()[read_offset];
            }
            // pileup size increases by 1
            base_pileup->set_num_bases(base_pileup->num_bases() + 1);
            // move right along read, and left/right depending on strand on reference
            node_offset += delta;
            ++read_offset;
        }
    }
    // ***** INSERT *****
    else if (edit.from_length() < edit.to_length()) {
        make_insert(seq, is_reverse);
        assert(edit.from_length() == 0);
        // we define insert (like sam) as insertion between current and next
        // position (on forward node coordinates). this means an insertion before
        // offset 0 is invalid! 
        int64_t insert_offset =  is_reverse ? node_offset : node_offset - 1;
        if (insert_offset >= 0) {        
            BasePileup* base_pileup = get_create_base_pileup(pileup, insert_offset);
            // reference_base if empty
            if (base_pileup->num_bases() == 0) {
                base_pileup->set_ref_base(node.sequence()[insert_offset]);
            } else {
                assert(base_pileup->ref_base() == node.sequence()[insert_offset]);
            }
            // add insertion string to bases field
            // todo: should we reverse complement this if mapping is reversed ??? 
            base_pileup->mutable_bases()->append(seq);
            if (!alignment.quality().empty()) {
                *base_pileup->mutable_qualities() += alignment.quality()[read_offset];
            }
            // pileup size increases by 1
            base_pileup->set_num_bases(base_pileup->num_bases() + 1);
        }
        else {
            // todo: need to either forget about these, or extend pileup format.
            // easy solution: change insert to come before position, and just add
            // optional pileup at n+1st base of node.  would like to figure out
            // how samtools does it first...
            /*
            stringstream ss;
            ss << "Warning: pileup does not support insertions before 0th base in node."
               << " Offending edit: " << pb2json(edit) << endl;
#pragma omp critical(cerr)
            cerr << ss.str();
            */
        }
        // move right along read (and stay put on reference)
        read_offset += edit.to_length();
    }
    // ***** DELETE *****
    else {
        assert(edit.to_length() == 0);
        assert(edit.sequence().empty());
        int64_t del_start = !is_reverse ? node_offset :
            node_offset - edit.from_length() + 1;
        seq = node.sequence().substr(del_start, edit.from_length());
        make_delete(seq, is_reverse);
        BasePileup* base_pileup = get_create_base_pileup(pileup, node_offset);
        // reference_base if empty
        if (base_pileup->num_bases() == 0) {
            base_pileup->set_ref_base(node.sequence()[node_offset]);
        } else {
            assert(base_pileup->ref_base() == node.sequence()[node_offset]);
        }
        // add deletion string to bases field
        // todo: should we reverse complement this if mapping is reversed ??? 
        base_pileup->mutable_bases()->append(seq);
        if (!alignment.quality().empty()) {
            *base_pileup->mutable_qualities() += alignment.quality()[read_offset];
        }
        // pileup size increases by 1
        base_pileup->set_num_bases(base_pileup->num_bases() + 1);
        int64_t delta = edit.from_length();
        // stay put on read, move left/right depending on strand on reference
        node_offset += delta;
    }
}
Ejemplo n.º 6
0
void alignment_quality_char_to_short(Alignment& alignment) {
    alignment.set_quality(string_quality_char_to_short(alignment.quality()));
}
Ejemplo n.º 7
0
void Aligner::align_internal(Alignment& alignment, vector<Alignment>* multi_alignments, Graph& g,
                             int64_t pinned_node_id, bool pin_left, int32_t max_alt_alns, bool print_score_matrices) {

    // check input integrity
    if (pin_left && !pinned_node_id) {
        cerr << "error:[Aligner] cannot choose pinned end in non-pinned alignment" << endl;
        exit(EXIT_FAILURE);
    }
    if (multi_alignments && !pinned_node_id) {
        cerr << "error:[Aligner] multiple traceback is not valid in local alignment, only pinned and global" << endl;
        exit(EXIT_FAILURE);
    }
    if (!(multi_alignments) && max_alt_alns != 1) {
        cerr << "error:[Aligner] cannot specify maximum number of alignments in single alignment" << endl;
        exit(EXIT_FAILURE);
    }


    // alignment pinning algorithm is based on pinning in bottom right corner, if pinning in top
    // left we need to reverse all the sequences first and translate the alignment back later

    // create reversed objects if necessary
    Graph reversed_graph;
    string reversed_sequence;
    if (pin_left) {
        reversed_sequence.resize(alignment.sequence().length());

        reverse_copy(alignment.sequence().begin(), alignment.sequence().end(), reversed_sequence.begin());
        reverse_graph(g, reversed_graph);
    }

    // choose forward or reversed objects
    Graph* align_graph;
    string* align_sequence;
    if (pin_left) {
        align_graph = &reversed_graph;
        align_sequence = &reversed_sequence;
    }
    else {
        align_graph = &g;
        align_sequence = alignment.mutable_sequence();
    }

    // convert into gssw graph and get the counterpart to pinned node (if pinning)
    gssw_node* pinned_node = nullptr;
    gssw_graph* graph = create_gssw_graph(*align_graph, pinned_node_id, &pinned_node);

    if (pinned_node_id & !pinned_node) {
        cerr << "error:[Aligner] pinned node for pinned alignment is not in graph" << endl;
        exit(EXIT_FAILURE);
    }

    // perform dynamic programming
    gssw_graph_fill(graph, (*align_sequence).c_str(),
                    nt_table, score_matrix,
                    gap_open, gap_extension, 15, 2);

    // traceback either from pinned position or optimal local alignment
    if (pinned_node) {
        // trace back pinned alignment
        gssw_graph_mapping** gms = gssw_graph_trace_back_pinned_multi (graph,
                                   pinned_node,
                                   max_alt_alns,
                                   (*align_sequence).c_str(),
                                   (*align_sequence).size(),
                                   nt_table,
                                   score_matrix,
                                   gap_open,
                                   gap_extension);

        if (pin_left) {
            // translate graph and mappings into original node space
            unreverse_graph(reversed_graph);
            for (int32_t i = 0; i < max_alt_alns; i++) {
                unreverse_graph_mapping(gms[i]);
            }
        }

        // convert optimal alignment and store it in the input Alignment object (in the multi alignment,
        // this will have been set to the first in the vector)
        if (gms[0]->score > 0) {
            // have a mapping, can just convert normally
            gssw_mapping_to_alignment(graph, gms[0], alignment, print_score_matrices);
        }
        else {
            // gssw will not identify mappings with 0 score, infer location based on pinning

            Mapping* mapping = alignment.mutable_path()->add_mapping();
            mapping->set_rank(1);

            // locate at the end of the node
            Position* position = mapping->mutable_position();
            position->set_node_id(pinned_node_id);
            position->set_offset(pin_left ? 0 : pinned_node->len);

            // soft clip
            Edit* edit = mapping->add_edit();
            edit->set_to_length(alignment.sequence().length());
            edit->set_sequence(alignment.sequence());
        }


        if (multi_alignments) {
            // determine how many non-null alignments were returned
            int32_t num_non_null = max_alt_alns;
            for (int32_t i = 1; i < max_alt_alns; i++) {
                if (gms[i]->score <= 0) {
                    num_non_null = i;
                    break;
                }
            }

            // reserve to avoid illegal access errors that occur when the vector reallocates
            multi_alignments->reserve(num_non_null);

            // copy the primary alignment
            multi_alignments->emplace_back(alignment);

            // convert the alternate alignments and store them at the back of the vector (this will not
            // execute if we are doing single alignment)
            for (int32_t i = 1; i < num_non_null; i++) {
                gssw_graph_mapping* gm = gms[i];

                // make new alignment object
                multi_alignments->emplace_back();
                Alignment& next_alignment = multi_alignments->back();

                // copy over sequence information from the primary alignment
                next_alignment.set_sequence(alignment.sequence());
                next_alignment.set_quality(alignment.quality());

                // get path of the alternate alignment
                gssw_mapping_to_alignment(graph, gm, next_alignment, print_score_matrices);

            }
        }

        for (int32_t i = 0; i < max_alt_alns; i++) {
            gssw_graph_mapping_destroy(gms[i]);
        }
        free(gms);
    }
    else {
        // trace back local alignment
        gssw_graph_mapping* gm = gssw_graph_trace_back (graph,
                                 (*align_sequence).c_str(),
                                 (*align_sequence).size(),
                                 nt_table,
                                 score_matrix,
                                 gap_open,
                                 gap_extension);

        gssw_mapping_to_alignment(graph, gm, alignment, print_score_matrices);
        gssw_graph_mapping_destroy(gm);
    }

    //gssw_graph_print_score_matrices(graph, sequence.c_str(), sequence.size(), stderr);

    gssw_graph_destroy(graph);

}