コード例 #1
0
ファイル: overlap-long.cpp プロジェクト: Milt0n/sga
// Write an ascii pictogram of an overlap
// O = overhang of length block_size (region of no overlap)
// P = perfect match of length block_size
// I = matched block with mismatches
std::string ascii_overlap(const std::string& s0, const std::string& s1,
                          SequenceOverlap& ovr, int block_size = 50)
{
    std::string out;

    int pre_overhang = ovr.match[0].start / block_size;
    int post_overhang = (ovr.length[0] - ovr.match[0].end - 1) / block_size;

    std::string p0;
    std::string p1;

    ovr.makePaddedMatches(s0, s1, &p0, &p1);

    out.append(pre_overhang, '-');

    for(size_t i = 0; i < p0.size(); i += block_size)
    {
        if(p0.substr(i, block_size) == p1.substr(i, block_size))
            out.append(1, '=');
        else
            out.append(1, 'x');
    }
    out.append(post_overhang, '-');
    return out;
}
コード例 #2
0
ファイル: KmerOverlaps.cpp プロジェクト: csf-ngs/sga
SequenceOverlapPairVector KmerOverlaps::retrieveMatches(const std::string& query, size_t k, 
                                                        int min_overlap, double min_identity,
                                                        int bandwidth, const BWTIndexSet& indices)
{
    PROFILE_FUNC("OverlapHaplotypeBuilder::retrieveMatches")
    assert(indices.pBWT != NULL);
    assert(indices.pSSA != NULL);

    int64_t max_interval_size = 200;
    SequenceOverlapPairVector overlap_vector;

    // Use the FM-index to look up intervals for each kmer of the read. Each index
    // in the interval is stored individually in the KmerMatchMap. We then
    // backtrack to map these kmer indices to read IDs. As reads can share
    // multiple kmers, we use the map to avoid redundant lookups.
    // There is likely a faster algorithm which performs direct decompression
    // of the read sequences without having to expand the intervals to individual
    // indices. The current algorithm suffices for now.
    KmerMatchMap prematchMap;
    size_t num_kmers = query.size() - k + 1;
    for(size_t i = 0; i < num_kmers; ++i)
    {
        std::string kmer = query.substr(i, k);
        BWTInterval interval = BWTAlgorithms::findInterval(indices, kmer);
        if(interval.isValid() && interval.size() < max_interval_size) 
        {
            for(int64_t j = interval.lower; j <= interval.upper; ++j)
            {
                KmerMatch match = { i, static_cast<size_t>(j), false };
                prematchMap.insert(std::make_pair(match, false));
            }
        }

        kmer = reverseComplement(kmer);
        interval = BWTAlgorithms::findInterval(indices, kmer);
        if(interval.isValid() && interval.size() < max_interval_size) 
        {
            for(int64_t j = interval.lower; j <= interval.upper; ++j)
            {
                KmerMatch match = { i, static_cast<size_t>(j), true };
                prematchMap.insert(std::make_pair(match, false));
            }
        }
    }

    // Backtrack through the kmer indices to turn them into read indices.
    // This mirrors the calcSA function in SampledSuffixArray except we mark each entry
    // as visited once it is processed.
    KmerMatchSet matches;
    for(KmerMatchMap::iterator iter = prematchMap.begin(); iter != prematchMap.end(); ++iter)
    {
        // This index has been visited
        if(iter->second)
            continue;

        // Mark this as visited
        iter->second = true;

        // Backtrack the index until we hit the starting symbol
        KmerMatch out_match = iter->first;
        while(1) 
        {
            char b = indices.pBWT->getChar(out_match.index);
            out_match.index = indices.pBWT->getPC(b) + indices.pBWT->getOcc(b, out_match.index - 1);

            // Check if the hash indicates we have visited this index. If so, stop the backtrack
            KmerMatchMap::iterator find_iter = prematchMap.find(out_match);
            if(find_iter != prematchMap.end())
            {
                // We have processed this index already
                if(find_iter->second)
                    break;
                else
                    find_iter->second = true;
            }

            if(b == '$')
            {
                // We've found the lexicographic index for this read. Turn it into a proper ID
                out_match.index = indices.pSSA->lookupLexoRank(out_match.index);
                matches.insert(out_match);
                break;
            }
        }
    }

    // Refine the matches by computing proper overlaps between the sequences
    // Use the overlaps that meet the thresholds to build a multiple alignment
    for(KmerMatchSet::iterator iter = matches.begin(); iter != matches.end(); ++iter)
    {
        std::string match_sequence = BWTAlgorithms::extractString(indices.pBWT, iter->index);
        if(iter->is_reverse)
            match_sequence = reverseComplement(match_sequence);
        
        // Ignore identical matches
        if(match_sequence == query)
            continue;

        // Compute the overlap. If the kmer match occurs a single time in each sequence we use
        // the banded extension overlap strategy. Otherwise we use the slow O(M*N) overlapper.
        SequenceOverlap overlap;
        std::string match_kmer = query.substr(iter->position, k);
        size_t pos_0 = query.find(match_kmer);
        size_t pos_1 = match_sequence.find(match_kmer);
        assert(pos_0 != std::string::npos && pos_1 != std::string::npos);

        // Check for secondary occurrences
        if(query.find(match_kmer, pos_0 + 1) != std::string::npos || 
           match_sequence.find(match_kmer, pos_1 + 1) != std::string::npos) {
            // One of the reads has a second occurrence of the kmer. Use
            // the slow overlapper.
            overlap = Overlapper::computeOverlap(query, match_sequence);
        } else {
            overlap = Overlapper::extendMatch(query, match_sequence, pos_0, pos_1, bandwidth);
        }

        bool bPassedOverlap = overlap.getOverlapLength() >= min_overlap;
        bool bPassedIdentity = overlap.getPercentIdentity() / 100 >= min_identity;

        if(bPassedOverlap && bPassedIdentity)
        {
            SequenceOverlapPair op;
            op.sequence[0] = query;
            op.sequence[1] = match_sequence;
            op.overlap = overlap;
            op.is_reversed = iter->is_reverse;
            overlap_vector.push_back(op);
        }
    }

    return overlap_vector;
}
コード例 #3
0
ファイル: HapgenUtil.cpp プロジェクト: SherlockThang/sga
// Align the haplotype to the reference genome represented by the BWT/SSA pair
void HapgenUtil::alignHaplotypeToReferenceKmer(size_t k,
                                               const std::string& haplotype,
                                               const BWTIndexSet& referenceIndex,
                                               const ReadTable* pReferenceTable,
                                               HapgenAlignmentVector& outAlignments)
{
    PROFILE_FUNC("HapgenUtil::alignHaplotypesToReferenceKmer")
    int64_t max_interval_size = 4;

    if(haplotype.size() < k)
        return;

    std::vector<int> event_count_vector;
    std::vector<HapgenAlignment> tmp_alignments;
    int min_events = std::numeric_limits<int>::max();

    // Align forward and reverse haplotype to reference
    for(size_t i = 0; i <= 1; ++i)
    {
        bool is_reverse = i == 1;
        std::string query = is_reverse ? reverseComplement(haplotype) : haplotype;

        // Find shared kmers between the haplotype and the reference
        CandidateVector candidates;

        size_t nqk = query.size() - k + 1;
        for(size_t j = 0; j < nqk; ++j)
        {
            std::string kmer = query.substr(j, k);

            // Find the interval of this kmer in the reference
            BWTInterval interval = BWTAlgorithms::findInterval(referenceIndex, kmer);
            if(!interval.isValid() || interval.size() >= max_interval_size)
                continue; // not found or too repetitive

            // Extract the reference location of these hits
            for(int64_t k = interval.lower; k  <= interval.upper; ++k)
            {
                SAElem elem = referenceIndex.pSSA->calcSA(k, referenceIndex.pBWT);

                // Make a candidate alignment
                CandidateKmerAlignment candidate;
                candidate.query_index = j;
                candidate.target_index = elem.getPos();
                candidate.target_extrapolated_start = candidate.target_index - candidate.query_index;
                candidate.target_extrapolated_end = candidate.target_extrapolated_start + query.size();
                candidate.target_sequence_id = elem.getID();
                candidates.push_back(candidate);
            }
        }

        // Remove duplicate candidates
        std::sort(candidates.begin(), candidates.end(), CandidateKmerAlignment::sortByStart);
        CandidateVector::iterator new_end = std::unique(candidates.begin(), candidates.end(), CandidateKmerAlignment::equalByStart);
        candidates.resize(new_end - candidates.begin());
        
        for(size_t j = 0; j < candidates.size(); ++j)
        {
            // Extract window around reference
            size_t window_size = 200;
            int ref_start = candidates[j].target_extrapolated_start - window_size;
            int ref_end = candidates[j].target_extrapolated_end + window_size;
            const SeqItem& ref_record = pReferenceTable->getRead(candidates[j].target_sequence_id);
            const DNAString& ref_sequence = ref_record.seq;
            if(ref_start < 0)
                ref_start = 0;

            if(ref_end > (int)ref_sequence.length())
                ref_end = ref_sequence.length();

            std::string ref_substring = ref_sequence.substr(ref_start, ref_end - ref_start);

            // Align haplotype to the reference
            SequenceOverlap overlap = alignHaplotypeToReference(ref_substring, query);
            if(overlap.score < 0 || !overlap.isValid())
                continue;

            int alignment_start = ref_start + overlap.match[0].start;
            int alignment_end = ref_start + overlap.match[0].end; // inclusive
            int alignment_length = alignment_end - alignment_start + 1;

            // Crude count of the number of distinct variation events
            bool has_indel = false;
            int num_events = overlap.edit_distance;
            std::stringstream c_parser(overlap.cigar);
            int len;
            char t;
            while(c_parser >> len >> t) 
            {
                assert(len > 0);

                // Only count one event per insertion/deletion
                if(t == 'D' || t == 'I')
                {
                    num_events -= (len - 1);
                    has_indel = true;
                }
            }

            // Skip poor alignments
            double mismatch_rate = 1.0f - (overlap.getPercentIdentity() / 100.f);
            if(mismatch_rate > 0.05f || overlap.total_columns < 50)
            {
                if(Verbosity::Instance().getPrintLevel() > 4)
                {
                    printf("Haplotype Alignment - Ignoring low quality alignment (%.3lf, %dbp, %d events) to %s:%d\n", 
                        1.0f - mismatch_rate, overlap.total_columns, num_events, ref_record.id.c_str(), ref_start);
                }
                continue;
            }

            bool is_snp = !has_indel && overlap.edit_distance == 1;

            HapgenAlignment aln(candidates[j].target_sequence_id, 
                                alignment_start, 
                                alignment_length, 
                                overlap.score, 
                                num_events,
                                is_reverse, 
                                is_snp);

            tmp_alignments.push_back(aln);
            event_count_vector.push_back(num_events);
            if(Verbosity::Instance().getPrintLevel() > 4)
            {
                printf("Haplotype Alignment - Accepting alignment (%.3lf, %dbp, %d events) to %s:%d\n", 
                    1.0f - mismatch_rate, overlap.total_columns, num_events, ref_record.id.c_str(), ref_start);
            }            
            // Record the best edit distance
            if(num_events < min_events) 
                min_events = num_events;
        }
    }

    // Copy the best alignments into the output
    int MAX_DIFF_TO_BEST = 10;
    int MAX_EVENTS = 8;
    assert(event_count_vector.size() == tmp_alignments.size());
    for(size_t i = 0; i < event_count_vector.size(); ++i)
    {

        if(event_count_vector[i] <= MAX_EVENTS && event_count_vector[i] - min_events <= MAX_DIFF_TO_BEST)
            outAlignments.push_back(tmp_alignments[i]);
        else if(Verbosity::Instance().getPrintLevel() > 3)
            printf("Haplotype Alignment - Ignoring alignment with too many events (%d)\n", event_count_vector[i]);

    }
}
コード例 #4
0
ファイル: HapgenUtil.cpp プロジェクト: nathanhaigh/sga
// Align the haplotype to the reference genome represented by the BWT/SSA pair
void HapgenUtil::alignHaplotypeToReferenceKmer(size_t k,
        const std::string& haplotype,
        const BWTIndexSet& referenceIndex,
        const ReadTable* pReferenceTable,
        HapgenAlignmentVector& outAlignments)
{
    PROFILE_FUNC("HapgenUtil::alignHaplotypesToReferenceKmer")
    int64_t max_interval_size = 4;

    if(haplotype.size() < k)
        return;

    std::vector<int> event_count_vector;
    std::vector<HapgenAlignment> tmp_alignments;
    int min_events = std::numeric_limits<int>::max();

    // Align forward and reverse haplotype to reference
    for(size_t i = 0; i <= 1; ++i)
    {
        bool is_reverse = i == 1;
        std::string query = is_reverse ? reverseComplement(haplotype) : haplotype;

        // Find shared kmers between the haplotype and the reference
        CandidateVector candidates;

        size_t nqk = query.size() - k + 1;
        for(size_t j = 0; j < nqk; ++j)
        {
            std::string kmer = query.substr(j, k);

            // Find the interval of this kmer in the reference
            BWTInterval interval = BWTAlgorithms::findInterval(referenceIndex, kmer);
            if(!interval.isValid() || interval.size() >= max_interval_size)
                continue; // not found or too repetitive

            // Extract the reference location of these hits
            for(int64_t k = interval.lower; k  <= interval.upper; ++k)
            {
                SAElem elem = referenceIndex.pSSA->calcSA(k, referenceIndex.pBWT);

                // Make a candidate alignment
                CandidateKmerAlignment candidate;
                candidate.query_index = j;
                candidate.target_index = elem.getPos();
                candidate.target_extrapolated_start = candidate.target_index - candidate.query_index;
                candidate.target_extrapolated_end = candidate.target_extrapolated_start + query.size();
                candidate.target_sequence_id = elem.getID();
                candidates.push_back(candidate);
            }
        }

        // Remove duplicate candidates
        std::sort(candidates.begin(), candidates.end(), CandidateKmerAlignment::sortByStart);
        CandidateVector::iterator new_end = std::unique(candidates.begin(), candidates.end(), CandidateKmerAlignment::equalByStart);
        candidates.resize(new_end - candidates.begin());

        for(size_t j = 0; j < candidates.size(); ++j)
        {
            // Extract window around reference
            size_t window_size = 200;
            int ref_start = candidates[j].target_extrapolated_start - window_size;
            int ref_end = candidates[j].target_extrapolated_end + window_size;

            const DNAString& ref_sequence = pReferenceTable->getRead(candidates[j].target_sequence_id).seq;
            if(ref_start < 0)
                ref_start = 0;

            if(ref_end > (int)ref_sequence.length())
                ref_end = ref_sequence.length();

            std::string ref_substring = ref_sequence.substr(ref_start, ref_end - ref_start);

            // Align haplotype to the reference
            SequenceOverlap overlap = Overlapper::computeOverlap(query, ref_substring);

            // Skip terrible alignments
            double percent_aligned = (double)overlap.getOverlapLength() / query.size();
            if(percent_aligned < 0.95f)
                continue;
            /*
            // Skip alignments that are not full-length matches of the haplotype
            if(overlap.match[0].start != 0 || overlap.match[0].end != (int)haplotype.size() - 1)
                continue;
            */
            int alignment_start = ref_start + overlap.match[1].start;
            int alignment_end = ref_start + overlap.match[1].end; // inclusive
            int alignment_length = alignment_end - alignment_start + 1;

            // Crude count of the number of distinct variation events
            int num_events = overlap.edit_distance;
            std::stringstream c_parser(overlap.cigar);
            int len;
            char t;
            while(c_parser >> len >> t)
            {
                assert(len > 0);

                // Only count one event per insertion/deletion
                if(t == 'D' || t == 'I')
                    num_events -= (len - 1);
            }


            HapgenAlignment aln(candidates[j].target_sequence_id, alignment_start, alignment_length, overlap.score, is_reverse);
            tmp_alignments.push_back(aln);
            event_count_vector.push_back(num_events);

            // Record the best edit distance
            if(num_events < min_events)
                min_events = num_events;
        }
    }

    // Copy the best alignments into the output
    int MAX_DIFF_TO_BEST = 10;
    int MAX_EVENTS = 8;
    assert(event_count_vector.size() == tmp_alignments.size());
    for(size_t i = 0; i < event_count_vector.size(); ++i)
    {
        if(event_count_vector[i] <= MAX_EVENTS && event_count_vector[i] - min_events <= MAX_DIFF_TO_BEST)
            outAlignments.push_back(tmp_alignments[i]);
    }
}
コード例 #5
0
void OverlapExtractorWithCorrection::getRawOverlapsCached(const std::string& query, bool is_reverse, SequenceOverlapPairVector* out_vector)
{
    PROFILE_FUNC("OverlapExtractorWithCorrection::getRawOverlaps")

    size_t nk = query.size() - m_k + 1;
    std::set<std::string> sequences_seen;

    for(size_t i = 0; i < nk; ++i)
    {
        std::string kmer = query.substr(i, m_k);
        std::string q_kmer = is_reverse ? reverseComplement(kmer) : kmer;
        const std::vector<size_t>& indices = m_cache_map.find(q_kmer)->second;

        for(size_t j = 0; j < indices.size(); ++j)
        {
            size_t index = indices[j];
            std::string match_sequence = m_strings[index];
            if(is_reverse)
                match_sequence = reverseComplement(match_sequence);

            if(sequences_seen.find(match_sequence) != sequences_seen.end())
                continue;
            sequences_seen.insert(match_sequence);
        
            // Ignore identical matches
            if(match_sequence.empty() || match_sequence == query)
                continue;

            // Compute the overlap. If the kmer match occurs a single time in each sequence we use
            // the banded extension overlap strategy. Otherwise we use the slow O(M*N) overlapper.
            SequenceOverlap overlap;
            size_t pos_0 = query.find(kmer);
            size_t pos_1 = match_sequence.find(kmer);

            // If there is a single occurrence of the kmer in each read,
            // use that position to seed the overlap calculation
            if(pos_0 == std::string::npos ||
               pos_1 == std::string::npos ||
               query.find(kmer, pos_0 + 1) != std::string::npos || 
               match_sequence.find(kmer, pos_1 + 1) != std::string::npos) 
            {
                // One of the reads has a second occurrence of the kmer. Use
                // the slow overlapper.
                overlap = Overlapper::computeOverlap(query, match_sequence);
            } else {
                overlap = Overlapper::extendMatch(query, match_sequence, pos_0, pos_1, 2);
            }
            bool bPassedOverlap = overlap.getOverlapLength() >= m_minOverlap;
            bool bPassedIdentity = overlap.getPercentIdentity() / 100 >= m_minIdentity;
            if(bPassedOverlap && bPassedIdentity)
            {
                SequenceOverlapPair op;
                op.sequence[0] = query;
                op.sequence[1] = match_sequence;
                op.overlap = overlap;
                op.is_reversed = is_reverse;
                out_vector->push_back(op);
            }
        }
    }
}
コード例 #6
0
void _approximateSeededMatch(const std::string& in_query,
                             int min_overlap, 
                             double min_identity,
                             int bandwidth, 
                             int max_interval,
                             bool do_reverse,
                             const BWTIndexSet& indices,
                             SequenceOverlapPairVector& out_vector)
{
    Timer timer("test", true);
    assert(indices.pBWT != NULL);
    assert(indices.pSSA != NULL);
    assert(indices.pCache != NULL);

    static size_t n_calls = 0;
    static size_t n_candidates = 0;
    static size_t n_output = 0;
    static double t_time = 0;

    n_calls++;

    int target_seed_length = 41;
    int seed_stride = target_seed_length / 2;
    size_t d = 1;

    SequenceOverlapPairVector overlap_vector;

    std::string strand_query = do_reverse ? reverseComplement(in_query) : in_query;

    // Initialize seeds 
    int seed_end = strand_query.size();
    int q = indices.pCache->getCachedLength();

    std::queue<ApproxSeed> seeds;

    while(seed_end > target_seed_length)
    {
        // For the last q bases of the seed, create all strings within edit distance d
        std::string qmer = strand_query.substr(seed_end - q, q);
        assert((int)qmer.size() == q);
        // 0-distance seed
        ApproxSeed seed;
        seed.query_index = seed_end - q;
        seed.interval = indices.pCache->lookup(qmer.c_str());
        seed.length = q;
        seeds.push(seed);

        for(int i = 0; i < q; ++i)
        {
            // Switch base to other 3 symbols
            char o = qmer[i];
            for(size_t j = 0; j < 4; ++j)
            {
                char b = "ACGT"[j];
                if(b != o)
                {
                    qmer[i] = b;
                    ApproxSeed seed;
                    seed.query_index = seed_end - q;
                    seed.interval = indices.pCache->lookup(qmer.c_str());
                    seed.length = q;
                    seed.edits.push_back(SeedEdit(i + seed.query_index, b));
                    seeds.push(seed);
                }
            }
            qmer[i] = o;
        }
        seed_end -= seed_stride;
    }
    
    // Extend seeds
    std::vector<ApproxSeed> finished_seeds;
    while(!seeds.empty())
    {
        ApproxSeed& seed = seeds.front();

        // query_index is the index of the last base
        // in the seed. get the next base
        char qb = strand_query[seed.query_index - 1];

        // Branch to inexact match
        if(seed.edits.size() < d)
        {
            for(size_t j = 0; j < 4; ++j)
            {
                char b = "ACGT"[j];
                if(b != qb)
                {
                    ApproxSeed new_seed;
                    new_seed.query_index = seed.query_index - 1;
                    new_seed.interval = seed.interval;
                    BWTAlgorithms::updateInterval(new_seed.interval, b, indices.pBWT);
                    if(new_seed.interval.isValid())
                    {
                        new_seed.length = seed.length + 1;
                        new_seed.edits = seed.edits;
                        new_seed.edits.push_back(SeedEdit(new_seed.query_index, b));

                        if(new_seed.length < target_seed_length && new_seed.query_index > 0)
                            seeds.push(new_seed);
                        else
                            finished_seeds.push_back(new_seed);
                    }
                }
            }
        }
            
        // Extend with the actual query base without branching
        seed.query_index = seed.query_index - 1;
        seed.length += 1;
        BWTAlgorithms::updateInterval(seed.interval, qb, indices.pBWT);
        if(!seed.interval.isValid() || seed.length >= target_seed_length || seed.query_index == 0)
        {
            if(seed.interval.isValid())
                finished_seeds.push_back(seed);
            seeds.pop();
        }
    }
    
    std::set<size_t> rank_set;
    
    for(size_t i = 0; i < finished_seeds.size(); ++i)
    {
        if(finished_seeds[i].interval.size() > max_interval)
            continue;

        //std::cout << finished_seeds[i] << "\n";
        std::string query_seed = strand_query.substr(finished_seeds[i].query_index, target_seed_length);
        std::string match_seed = query_seed;

        // Apply edits to the new sequence
        for(size_t j = 0; j < finished_seeds[i].edits.size(); ++j)
            match_seed[finished_seeds[i].edits[j].index - finished_seeds[i].query_index] = finished_seeds[i].edits[j].base;

        // Flip the seeds to match the strand of the query
        if(do_reverse)
        {
            query_seed = reverseComplement(query_seed);
            match_seed = reverseComplement(match_seed);
        }

        // Extract the prefix of every occurrence of this seed
        RankedPrefixVector extensions = BWTAlgorithms::extractRankedPrefixes(indices.pBWT, finished_seeds[i].interval);

        // Extend the seeds to the full-length string
        for(size_t j = 0; j < extensions.size(); ++j)
        {
            size_t rank = extensions[j].rank;

            // The second element of the returned pair is
            // false if the set already contains this rank
            if(!rank_set.insert(rank).second)
                continue;

            // Extract the reminder of the read
            std::string& prefix = extensions[j].prefix;
            int64_t start_index_of_read = indices.pSSA->lookupLexoRank(rank);
            std::string suffix = BWTAlgorithms::extractUntilInterval(indices.pBWT, 
                                                                     start_index_of_read, 
                                                                     finished_seeds[i].interval);

            std::string match_sequence = prefix + suffix;

            // Ignore identical matches
            if(match_sequence == strand_query)
                continue;

            // Change strands
            if(do_reverse)
                match_sequence = reverseComplement(match_sequence);

            // Compute the overlap
            SequenceOverlap overlap;
            size_t pos_0 = in_query.find(query_seed);
            size_t pos_1 = match_sequence.find(match_seed);
            assert(pos_0 != std::string::npos);
            assert(pos_1 != std::string::npos);

            if(in_query.find(query_seed, pos_0 + 1) != std::string::npos || 
               match_sequence.find(match_seed, pos_1 + 1) != std::string::npos) {
                // One of the reads has a second occurrence of the kmer. Use
                // the slow overlapper.
                overlap = Overlapper::computeOverlap(in_query, match_sequence);
            } else {
                overlap = Overlapper::extendMatch(in_query, match_sequence, pos_0, pos_1, bandwidth);
            }

            n_candidates += 1;
            bool bPassedOverlap = overlap.getOverlapLength() >= min_overlap;
            bool bPassedIdentity = overlap.getPercentIdentity() / 100 >= min_identity;

            if(bPassedOverlap && bPassedIdentity)
            {
                //printf("Rank\t%zu\t%zu\t%s\t%.2lf\t%d\n", n_calls,
                //    rank, match_sequence.c_str(), overlap.getPercentIdentity(), overlap.getOverlapLength());
                
                SequenceOverlapPair op;
                op.sequence[0] = in_query;
                op.sequence[1] = match_sequence;
                op.overlap = overlap;
                op.is_reversed = do_reverse;
                out_vector.push_back(op);
                n_output += 1;
            }                
        }
    }
    t_time += timer.getElapsedCPUTime();
    
    if(Verbosity::Instance().getPrintLevel() > 6 && n_calls % 100 == 0)
        printf("[approx seeds] n: %zu candidates: %zu valid: %zu (%.2lf) time: %.2lfs\n", 
            n_calls, n_candidates, n_output, (double)n_output / n_candidates, t_time);
}
コード例 #7
0
SequenceOverlapPairVector KmerOverlaps::PacBioRetrieveMatches(const std::string& query, size_t k, 
                                                        int min_overlap, double min_identity,
                                                        int bandwidth, const BWTIndexSet& indices,
														KmerDistribution& kd, int round)
{
    PROFILE_FUNC("OverlapHaplotypeBuilder::PacBioRetrieveMatches")
    assert(indices.pBWT != NULL);
    assert(indices.pSSA != NULL);
	
	//size_t numStringCount[query.size()+1] = 0;
	
	int64_t intervalSum = 0;
    static size_t n_calls = 0;
    static size_t n_candidates = 0;
    static size_t n_output = 0;
    static double t_time = 0;
	size_t count = 0;
	size_t numKmer = 0;
	size_t numRepeatKmer = 0;
	size_t totalKmer = 0;
	size_t numNoSeedRead = 0;
	size_t repeatCutoff = kd.getRepeatKmerCutoff();
	size_t errorCutoff = kd.getMedian() - kd.getSdv();
    Timer timer("test", true);
	
    n_calls++;
	//std::cout<<"PacBioRetrieveMatches\n";
	std::cout<<"\tk :\t"<<k<<"\n";
    SequenceOverlapPairVector overlap_vector;
	std::vector<long> identityVector(100);
	for(int j = 0;j < identityVector.size(); j++)
		identityVector[j] = 0;
		
    // Use the FM-index to look up intervals for each kmer of the read. Each index
    // in the interval is stored individually in the KmerMatchMap. We then
    // backtrack to map these kmer indices to read IDs. As reads can share
    // multiple kmers, we use the map to avoid redundant lookups.
    // There is likely a faster algorithm which performs direct decompression
    // of the read sequences without having to expand the intervals to individual
    // indices. The current algorithm suffices for now.
    KmerMatchMap prematchMap;
    size_t num_kmers = query.size() - k + 1;
	clock_t search_seeds_s = clock(), search_seeds_e;
    
	
	for(size_t i = 0; i < num_kmers; i++)
    {
        std::string kmer = query.substr(i, k);
        BWTInterval interval = BWTAlgorithms::findInterval(indices, kmer);
		if(interval.upper - interval.lower < errorCutoff)
			numNoSeedRead++;
		if((interval.upper - interval.lower) > 20 && (interval.upper - interval.lower) < repeatCutoff)
		{
			numKmer++;
			totalKmer++;
		}
		
		//To avoid the repeat region
		/*if((interval.upper - interval.lower) > repeatCutoff)
		{	
			numRepeatKmer++;
			totalKmer++;
			continue;
		}
		else
			interval.upper = ((interval.upper - interval.lower)>20)?interval.lower + 20 : interval.upper;*/
		
        if(interval.isValid() && interval.size()) 
        {
			//std::cout<<"\tinterval size : "<<interval.upper - interval.lower<<std::endl;
            for(int64_t j = interval.lower; j <= interval.upper; ++j)
            {
                KmerMatch match = { i, static_cast<size_t>(j), false };
                prematchMap.insert(std::make_pair(match, false));
            }
			intervalSum += interval.upper - interval.lower;
			count++;
        }

        kmer = reverseComplement(kmer);
        interval = BWTAlgorithms::findInterval(indices, kmer);
		interval.upper = ((interval.upper - interval.lower)>20)?interval.lower + 20 : interval.upper;
        if(interval.isValid() && interval.size())
        {
            for(int64_t j = interval.lower; j <= interval.upper; ++j)
            {
                KmerMatch match = { i, static_cast<size_t>(j), true };
                prematchMap.insert(std::make_pair(match, false));
            }
			intervalSum += interval.upper - interval.lower;
			count++;
        }
    }
	if(numNoSeedRead == num_kmers)
		std::cout<<"\tnoSeedRead : 1"<<std::endl;
	std::cout<<"\tnumber of kmer : "<<numKmer<<std::endl;
	std::cout<<"\tnumber of RepeatKmer : "<<numRepeatKmer<<std::endl;
	std::cout<<"\tnumber of totalkmer : "<<totalKmer<<std::endl;
	
	
    // Backtrack through the kmer indices to turn them into read indices.
    // This mirrors the calcSA function in SampledSuffixArray except we mark each entry
    // as visited once it is processed.
	//std::cout<<"\tintervalSum : "<<intervalSum<<std::endl;
	//std::cout<<"\tintervalCount : "<<count<<std::endl;
	std::cout<<"\tprematchMap :\t"<<prematchMap.size()<<std::endl;
    KmerMatchSet matches;
    for(KmerMatchMap::iterator iter = prematchMap.begin(); iter != prematchMap.end(); ++iter)
    {
		//std::cout<<"iter->first.position : "<<iter->first.position<<std::endl;
        // This index has been visited
        if(iter->second)
            continue;

        // Mark this as visited
        iter->second = true;

        // Backtrack the index until we hit the starting symbol
        KmerMatch out_match = iter->first;
        while(1) 
        {
            char b = indices.pBWT->getChar(out_match.index);
            out_match.index = indices.pBWT->getPC(b) + indices.pBWT->getOcc(b, out_match.index - 1);

            // Check if the hash indicates we have visited this index. If so, stop the backtrack
            KmerMatchMap::iterator find_iter = prematchMap.find(out_match);
            if(find_iter != prematchMap.end())
            {
                // We have processed this index already
                if(find_iter->second)
                    break;
                else
                    find_iter->second = true;
            }

            if(b == '$')
            {
                // We've found the lexicographic index for this read. Turn it into a proper ID
                out_match.index = indices.pSSA->lookupLexoRank(out_match.index);
				//std::cout<<"out_match.position"<<out_match.position<<std::endl;
                matches.insert(out_match);
                break;
            }
        }
    }
	search_seeds_e = clock();
	std::cout<<"\tmatchset :\t"<<matches.size()<<"\n";
    // Refine the matches by computing proper overlaps between the sequences
    // Use the overlaps that meet the thresholds to build a multiple alignment
	clock_t  extrac_s, extrac_e;
	clock_t  overlapE_s, overlapE_e;
	clock_t  overlapC_s, overlapC_e;
	double extrac_sum = 0.0;
	double overlapE_sum = 0.0, overlapC_sum = 0.0;
	int compute_count = 0,extend_count = 0;
	size_t acNumber = 0;
    for(KmerMatchSet::iterator iter = matches.begin(); iter != matches.end(); ++iter)
    {
		extrac_s = clock();
        std::string match_sequence;// = BWTAlgorithms::extractString(indices.pBWT, iter->index);
		
		if(indices.pReadTable != NULL)
            match_sequence = indices.pReadTable->getRead(iter->index).seq.toString();
		/*else
			match_sequence = BWTAlgorithms::extractString(indices.pBWT, iter->index);*/
		
		extrac_e = clock();
		extrac_sum += (double)extrac_e - extrac_s;
        
		if(iter->is_reverse)
            match_sequence = reverseComplement(match_sequence);
        
        // Ignore identical matches
        if(match_sequence == query)
            continue;

        // Compute the overlap. If the kmer match occurs a single time in each sequence we use
        // the banded extension overlap strategy. Otherwise we use the slow O(M*N) overlapper.
        SequenceOverlap overlap;
        std::string match_kmer = query.substr(iter->position, k);
        size_t pos_0 = iter->position;//query.find(match_kmer);
        size_t pos_1 = match_sequence.find(match_kmer);
        assert(pos_0 != std::string::npos && pos_1 != std::string::npos);

		//Timer* sTimer = new Timer("seeds overlap");
        // Check for secondary occurrences
        /*if(query.find(match_kmer, pos_0 + 1) != std::string::npos || 
           match_sequence.find(match_kmer, pos_1 + 1) != std::string::npos) {
            // One of the reads has a second occurrence of the kmer. Use
            // the slow overlapper.
			overlapC_s = clock();
			compute_count++;
            overlap = Overlapper::computeOverlap(query, match_sequence);
			overlapC_e = clock();
			overlapC_sum += (double)overlapC_e - overlapC_s;
        } 
	
		else {*/
			overlapE_s = clock();
			extend_count++;
            overlap = Overlapper::PacBioExtendMatch(query, match_sequence, pos_0, pos_1, bandwidth);
			overlapE_e = clock();
			overlapE_sum += (double)overlapE_e - overlapE_s;
		//}
	
		//delete sTimer;
        n_candidates += 1;
        bool bPassedOverlap = overlap.getOverlapLength() >= min_overlap;
        bool bPassedIdentity = overlap.getPercentIdentity()  >= min_identity;
		identityVector[(int)overlap.getPercentIdentity()] += 1;
		//overlap.printTotal_columns();
		//overlap.printEdit_distance();
		//std::cout<<"min_overlap == "<<overlap.getOverlapLength()<<"\n";
		//std::cout<<"overlap.getOverlapLength() / 100 == "<<overlap.getOverlapLength() / 100<<"\n";
		//std::cout<<"min_identity == "<<min_identity<<"\n";
		//std::cout<<"bPassedOverlap == "<<bPassedOverlap<<"\n";
		//std::cout<<"bPassedIdentity == "<<bPassedIdentity<<"\n";
		//std::cout<<match_sequence<<"\n";
        if(bPassedOverlap && bPassedIdentity)
        {
            SequenceOverlapPair op;
            op.sequence[0] = query;
            op.sequence[1] = match_sequence;
            op.overlap = overlap;
            op.is_reversed = iter->is_reverse;
            overlap_vector.push_back(op);
            n_output += 1;
			acNumber += 1;
			//numStringCount
        } 
    }
	
	std::cout<<"\tacceptable number of seeds == "<<acNumber<<"\n";
	std::cout<<"\tsearch seeds time : "<<(double)(search_seeds_e - search_seeds_s)/CLOCKS_PER_SEC<<std::endl;
	std::cout<<"\textract time : "<<extrac_sum/CLOCKS_PER_SEC<<std::endl;
	//std::cout<<"\tcompute_count : "<<compute_count<<std::endl;
	//std::cout<<"\tbanded_count : "<<extend_count<<std::endl;
	//std::cout<<"\tcompute overlap time : "<<overlapC_sum/CLOCKS_PER_SEC<<std::endl;
	//std::cout<<"\tbanded overlap time : "<<overlapE_sum/CLOCKS_PER_SEC<<std::endl;
	/*------------------output-identity------------------------------------
	double mean = 0.0, temp_mean = 0.0,temp = 0.0;
	for(int i = 0; i < 100; i++)
	{	//count*identity
		mean+=identityVector[i]*i;
		temp+=identityVector[i];
	}
	mean=mean/temp;
	
	for(int i = 0; i < 100; i++)
		//count*identity^2
		temp_mean+=identityVector[i]*pow(i,2);
	std::cout<<"-----------outputIdentity------------"<<std::endl;
	std::cout<<"\tround "<<round;
	std::cout<<"\tmean identity :\t"<<mean<<std::endl;
	std::cout<<"\tSD identity :\t"<<sqrt(temp_mean/temp - pow(mean,2))<<std::endl;
	std::cout<<"-------------------------------------\n"<<std::endl;
	/*---------------------------------------------------------------------*/
	
    t_time += timer.getElapsedCPUTime();

    if(Verbosity::Instance().getPrintLevel() > 6 && n_calls % 100 == 0)
        printf("[kmer overlaps] n: %zu candidates: %zu valid: %zu (%.2lf) time: %.2lfs\n", 
            n_calls, n_candidates, n_output, (double)n_output / n_candidates, t_time);
    return overlap_vector;
}