Exemplo n.º 1
0
// Explore the neighborhood around a vertex looking for missing overlaps
SGEdgeStatsVisitor::CandidateVector SGEdgeStatsVisitor::getMissingCandidates(StringGraph* /*pGraph*/, 
                                                                             Vertex* pVertex, 
                                                                             int minOverlap) const
{
    CandidateVector out;

    // Mark the vertices that are reached from this vertex as black to indicate
    // they already are overlapping
    EdgePtrVec edges = pVertex->getEdges();
    for(size_t i = 0; i < edges.size(); ++i)
    {
        edges[i]->getEnd()->setColor(GC_BLACK);
    }
    pVertex->setColor(GC_BLACK);

    for(size_t i = 0; i < edges.size(); ++i)
    {
        Edge* pXY = edges[i];
        EdgePtrVec neighborEdges = pXY->getEnd()->getEdges();
        for(size_t j = 0; j < neighborEdges.size(); ++j)
        {
            Edge* pYZ = neighborEdges[j];
            if(pYZ->getEnd()->getColor() != GC_BLACK)
            {
                // Infer the overlap object from the edges
                Overlap ovrXY = pXY->getOverlap();
                Overlap ovrYZ = pYZ->getOverlap();

                if(SGAlgorithms::hasTransitiveOverlap(ovrXY, ovrYZ))
                {
                    Overlap ovr_xz = SGAlgorithms::inferTransitiveOverlap(ovrXY, ovrYZ);
                    if(ovr_xz.match.getMinOverlapLength() >= minOverlap)
                    {
                        out.push_back(Candidate(pYZ->getEnd(), ovr_xz));
                        pYZ->getEnd()->setColor(GC_BLACK);
                    }
                }
            }
        }
    }

    // Reset colors
    for(size_t i = 0; i < edges.size(); ++i)
        edges[i]->getEnd()->setColor(GC_WHITE);
    pVertex->setColor(GC_WHITE);
    for(size_t i = 0; i < out.size(); ++i)
        out[i].pEndpoint->setColor(GC_WHITE);
    return out;
}
Exemplo n.º 2
0
bool SGEdgeStatsVisitor::visit(StringGraph* pGraph, Vertex* pVertex)
{
    const int MIN_OVERLAP = pGraph->getMinOverlap();
    const double MAX_ERROR = pGraph->getErrorRate();

    static int visited = 0;
    ++visited;
    if(visited % 50000 == 0)
        std::cout << "visited: " << visited << "\n";

    // Add stats for the found overlaps
    EdgePtrVec edges = pVertex->getEdges();
    for(size_t i = 0; i < edges.size(); ++i)
    {
        Overlap ovr = edges[i]->getOverlap();
        int numDiff = ovr.match.countDifferences(pVertex->getStr(), edges[i]->getEnd()->getStr());
        int overlapLen = ovr.match.getMinOverlapLength();
        addOverlapToCount(overlapLen, numDiff, foundCounts);
    }
        
    // Explore the neighborhood around this graph for potentially missing overlaps
    CandidateVector candidates = getMissingCandidates(pGraph, pVertex, MIN_OVERLAP);
    MultiOverlap addedMO(pVertex->getID(), pVertex->getStr());
    for(size_t i = 0; i < candidates.size(); ++i)
    {
        Candidate& c = candidates[i];
        int numDiff = c.ovr.match.countDifferences(pVertex->getStr(), c.pEndpoint->getStr());
        double error_rate = double(numDiff) / double(c.ovr.match.getMinOverlapLength());

        if(error_rate < MAX_ERROR)
        {
            int overlapLen = c.ovr.match.getMinOverlapLength();
            addOverlapToCount(overlapLen, numDiff, missingCounts);
        }
    }
    
    return false;
}
Exemplo n.º 3
0
// Align the haplotype to the reference genome represented by the BWT/SSA pair
void HapgenUtil::alignHaplotypeToReferenceKmer(size_t k,
                                               const std::string& haplotype,
                                               const BWTIndexSet& referenceIndex,
                                               const ReadTable* pReferenceTable,
                                               HapgenAlignmentVector& outAlignments)
{
    PROFILE_FUNC("HapgenUtil::alignHaplotypesToReferenceKmer")
    int64_t max_interval_size = 4;

    if(haplotype.size() < k)
        return;

    std::vector<int> event_count_vector;
    std::vector<HapgenAlignment> tmp_alignments;
    int min_events = std::numeric_limits<int>::max();

    // Align forward and reverse haplotype to reference
    for(size_t i = 0; i <= 1; ++i)
    {
        bool is_reverse = i == 1;
        std::string query = is_reverse ? reverseComplement(haplotype) : haplotype;

        // Find shared kmers between the haplotype and the reference
        CandidateVector candidates;

        size_t nqk = query.size() - k + 1;
        for(size_t j = 0; j < nqk; ++j)
        {
            std::string kmer = query.substr(j, k);

            // Find the interval of this kmer in the reference
            BWTInterval interval = BWTAlgorithms::findInterval(referenceIndex, kmer);
            if(!interval.isValid() || interval.size() >= max_interval_size)
                continue; // not found or too repetitive

            // Extract the reference location of these hits
            for(int64_t k = interval.lower; k  <= interval.upper; ++k)
            {
                SAElem elem = referenceIndex.pSSA->calcSA(k, referenceIndex.pBWT);

                // Make a candidate alignment
                CandidateKmerAlignment candidate;
                candidate.query_index = j;
                candidate.target_index = elem.getPos();
                candidate.target_extrapolated_start = candidate.target_index - candidate.query_index;
                candidate.target_extrapolated_end = candidate.target_extrapolated_start + query.size();
                candidate.target_sequence_id = elem.getID();
                candidates.push_back(candidate);
            }
        }

        // Remove duplicate candidates
        std::sort(candidates.begin(), candidates.end(), CandidateKmerAlignment::sortByStart);
        CandidateVector::iterator new_end = std::unique(candidates.begin(), candidates.end(), CandidateKmerAlignment::equalByStart);
        candidates.resize(new_end - candidates.begin());
        
        for(size_t j = 0; j < candidates.size(); ++j)
        {
            // Extract window around reference
            size_t window_size = 200;
            int ref_start = candidates[j].target_extrapolated_start - window_size;
            int ref_end = candidates[j].target_extrapolated_end + window_size;
            const SeqItem& ref_record = pReferenceTable->getRead(candidates[j].target_sequence_id);
            const DNAString& ref_sequence = ref_record.seq;
            if(ref_start < 0)
                ref_start = 0;

            if(ref_end > (int)ref_sequence.length())
                ref_end = ref_sequence.length();

            std::string ref_substring = ref_sequence.substr(ref_start, ref_end - ref_start);

            // Align haplotype to the reference
            SequenceOverlap overlap = alignHaplotypeToReference(ref_substring, query);
            if(overlap.score < 0 || !overlap.isValid())
                continue;

            int alignment_start = ref_start + overlap.match[0].start;
            int alignment_end = ref_start + overlap.match[0].end; // inclusive
            int alignment_length = alignment_end - alignment_start + 1;

            // Crude count of the number of distinct variation events
            bool has_indel = false;
            int num_events = overlap.edit_distance;
            std::stringstream c_parser(overlap.cigar);
            int len;
            char t;
            while(c_parser >> len >> t) 
            {
                assert(len > 0);

                // Only count one event per insertion/deletion
                if(t == 'D' || t == 'I')
                {
                    num_events -= (len - 1);
                    has_indel = true;
                }
            }

            // Skip poor alignments
            double mismatch_rate = 1.0f - (overlap.getPercentIdentity() / 100.f);
            if(mismatch_rate > 0.05f || overlap.total_columns < 50)
            {
                if(Verbosity::Instance().getPrintLevel() > 4)
                {
                    printf("Haplotype Alignment - Ignoring low quality alignment (%.3lf, %dbp, %d events) to %s:%d\n", 
                        1.0f - mismatch_rate, overlap.total_columns, num_events, ref_record.id.c_str(), ref_start);
                }
                continue;
            }

            bool is_snp = !has_indel && overlap.edit_distance == 1;

            HapgenAlignment aln(candidates[j].target_sequence_id, 
                                alignment_start, 
                                alignment_length, 
                                overlap.score, 
                                num_events,
                                is_reverse, 
                                is_snp);

            tmp_alignments.push_back(aln);
            event_count_vector.push_back(num_events);
            if(Verbosity::Instance().getPrintLevel() > 4)
            {
                printf("Haplotype Alignment - Accepting alignment (%.3lf, %dbp, %d events) to %s:%d\n", 
                    1.0f - mismatch_rate, overlap.total_columns, num_events, ref_record.id.c_str(), ref_start);
            }            
            // Record the best edit distance
            if(num_events < min_events) 
                min_events = num_events;
        }
    }

    // Copy the best alignments into the output
    int MAX_DIFF_TO_BEST = 10;
    int MAX_EVENTS = 8;
    assert(event_count_vector.size() == tmp_alignments.size());
    for(size_t i = 0; i < event_count_vector.size(); ++i)
    {

        if(event_count_vector[i] <= MAX_EVENTS && event_count_vector[i] - min_events <= MAX_DIFF_TO_BEST)
            outAlignments.push_back(tmp_alignments[i]);
        else if(Verbosity::Instance().getPrintLevel() > 3)
            printf("Haplotype Alignment - Ignoring alignment with too many events (%d)\n", event_count_vector[i]);

    }
}
Exemplo n.º 4
0
// Align the haplotype to the reference genome represented by the BWT/SSA pair
void HapgenUtil::alignHaplotypeToReferenceKmer(size_t k,
        const std::string& haplotype,
        const BWTIndexSet& referenceIndex,
        const ReadTable* pReferenceTable,
        HapgenAlignmentVector& outAlignments)
{
    PROFILE_FUNC("HapgenUtil::alignHaplotypesToReferenceKmer")
    int64_t max_interval_size = 4;

    if(haplotype.size() < k)
        return;

    std::vector<int> event_count_vector;
    std::vector<HapgenAlignment> tmp_alignments;
    int min_events = std::numeric_limits<int>::max();

    // Align forward and reverse haplotype to reference
    for(size_t i = 0; i <= 1; ++i)
    {
        bool is_reverse = i == 1;
        std::string query = is_reverse ? reverseComplement(haplotype) : haplotype;

        // Find shared kmers between the haplotype and the reference
        CandidateVector candidates;

        size_t nqk = query.size() - k + 1;
        for(size_t j = 0; j < nqk; ++j)
        {
            std::string kmer = query.substr(j, k);

            // Find the interval of this kmer in the reference
            BWTInterval interval = BWTAlgorithms::findInterval(referenceIndex, kmer);
            if(!interval.isValid() || interval.size() >= max_interval_size)
                continue; // not found or too repetitive

            // Extract the reference location of these hits
            for(int64_t k = interval.lower; k  <= interval.upper; ++k)
            {
                SAElem elem = referenceIndex.pSSA->calcSA(k, referenceIndex.pBWT);

                // Make a candidate alignment
                CandidateKmerAlignment candidate;
                candidate.query_index = j;
                candidate.target_index = elem.getPos();
                candidate.target_extrapolated_start = candidate.target_index - candidate.query_index;
                candidate.target_extrapolated_end = candidate.target_extrapolated_start + query.size();
                candidate.target_sequence_id = elem.getID();
                candidates.push_back(candidate);
            }
        }

        // Remove duplicate candidates
        std::sort(candidates.begin(), candidates.end(), CandidateKmerAlignment::sortByStart);
        CandidateVector::iterator new_end = std::unique(candidates.begin(), candidates.end(), CandidateKmerAlignment::equalByStart);
        candidates.resize(new_end - candidates.begin());

        for(size_t j = 0; j < candidates.size(); ++j)
        {
            // Extract window around reference
            size_t window_size = 200;
            int ref_start = candidates[j].target_extrapolated_start - window_size;
            int ref_end = candidates[j].target_extrapolated_end + window_size;

            const DNAString& ref_sequence = pReferenceTable->getRead(candidates[j].target_sequence_id).seq;
            if(ref_start < 0)
                ref_start = 0;

            if(ref_end > (int)ref_sequence.length())
                ref_end = ref_sequence.length();

            std::string ref_substring = ref_sequence.substr(ref_start, ref_end - ref_start);

            // Align haplotype to the reference
            SequenceOverlap overlap = Overlapper::computeOverlap(query, ref_substring);

            // Skip terrible alignments
            double percent_aligned = (double)overlap.getOverlapLength() / query.size();
            if(percent_aligned < 0.95f)
                continue;
            /*
            // Skip alignments that are not full-length matches of the haplotype
            if(overlap.match[0].start != 0 || overlap.match[0].end != (int)haplotype.size() - 1)
                continue;
            */
            int alignment_start = ref_start + overlap.match[1].start;
            int alignment_end = ref_start + overlap.match[1].end; // inclusive
            int alignment_length = alignment_end - alignment_start + 1;

            // Crude count of the number of distinct variation events
            int num_events = overlap.edit_distance;
            std::stringstream c_parser(overlap.cigar);
            int len;
            char t;
            while(c_parser >> len >> t)
            {
                assert(len > 0);

                // Only count one event per insertion/deletion
                if(t == 'D' || t == 'I')
                    num_events -= (len - 1);
            }


            HapgenAlignment aln(candidates[j].target_sequence_id, alignment_start, alignment_length, overlap.score, is_reverse);
            tmp_alignments.push_back(aln);
            event_count_vector.push_back(num_events);

            // Record the best edit distance
            if(num_events < min_events)
                min_events = num_events;
        }
    }

    // Copy the best alignments into the output
    int MAX_DIFF_TO_BEST = 10;
    int MAX_EVENTS = 8;
    assert(event_count_vector.size() == tmp_alignments.size());
    for(size_t i = 0; i < event_count_vector.size(); ++i)
    {
        if(event_count_vector[i] <= MAX_EVENTS && event_count_vector[i] - min_events <= MAX_DIFF_TO_BEST)
            outAlignments.push_back(tmp_alignments[i]);
    }
}