Exemplo n.º 1
0
// Align the haplotype to the reference genome represented by the BWT/SSA pair
void HapgenUtil::alignHaplotypeToReferenceBWASW(const std::string& haplotype,
        const BWTIndexSet& referenceIndex,
        HapgenAlignmentVector& outAlignments)
{
    PROFILE_FUNC("HapgenUtil::alignHaplotypesToReferenceBWASW")
    LRAlignment::LRParams params;

    params.zBest = 20;

    for(size_t i = 0; i <= 1; ++i)
    {
        LRAlignment::LRHitVector hits;
        std::string query = (i == 0) ? haplotype : reverseComplement(haplotype);
        LRAlignment::bwaswAlignment(query, referenceIndex.pBWT, referenceIndex.pSSA, params, hits);

        // Convert the hits into alignments
        for(size_t j = 0; j < hits.size(); ++j)
        {
            int q_alignment_length = hits[j].q_end - hits[j].q_start;

            // Skip non-complete alignments
            if((int)haplotype.length() == q_alignment_length)
            {
                HapgenAlignment aln(hits[j].targetID, hits[j].t_start, hits[j].length, hits[j].G, i == 1);
                outAlignments.push_back(aln);
            }
        }
    }
}
Exemplo n.º 2
0
// Coalesce a set of alignments into distinct locations
void HapgenUtil::coalesceAlignments(HapgenAlignmentVector& alignments)
{
    if(alignments.empty())
        return;

    // Sort the alignments by reference id, then position
    std::sort(alignments.begin(), alignments.end());

    HapgenAlignmentVector outAlignments;

    // Iterate over the alignments in sorted order
    // If an alignment is distinct (=does not overlap) from the
    // previous alignment, add it to the output collection.
    
    // First alignment is always ok
    outAlignments.push_back(alignments[alignments.size()-1]);

    // Kees: start from back because alignments are sorted in order of increasing score
    for(size_t i = alignments.size()-1; i-- > 0;)
    {
        // Check this alignment against the last alignment added to the output set
        HapgenAlignment& prevAlign = outAlignments.back();
        const HapgenAlignment& currAlign = alignments[i];

        int s1 = prevAlign.position;
        int e1 = s1 + prevAlign.length;

        int s2 = currAlign.position;
        int e2 = s2 + currAlign.length;
        bool intersecting = Interval::isIntersecting(s1, e1, s2, e2);

        if(prevAlign.referenceID != currAlign.referenceID || !intersecting)
        {
            outAlignments.push_back(currAlign);
        }
        else
        {
            // merge the intersecting alignment into a window that covers both
            prevAlign.position = std::min(s1, s2);
            prevAlign.length = std::max(e1, e2) - prevAlign.position;
        }
    }

    alignments = outAlignments;
}
Exemplo n.º 3
0
// Compute the best alignment of the haplotype collection to the reference
DindelReturnCode DindelUtil::computeBestAlignment(const StringVector& inHaplotypes, 
                                                  const SeqItemVector& variantMates,
                                                  const SeqItemVector& variantRCMates,
                                                  const GraphCompareParameters& parameters,
                                                  HapgenAlignment& bestAlignment)
{
    size_t MAX_DEPTH = 2000;
    if(variantMates.size() + variantRCMates.size() > MAX_DEPTH)
        return DRC_OVER_DEPTH;

    //
    // Align the haplotypes to the reference genome to generate candidate alignments
    //
    HapgenAlignmentVector candidateAlignments;
    for(size_t i = 0; i < inHaplotypes.size(); ++i)
        HapgenUtil::alignHaplotypeToReferenceBWASW(inHaplotypes[i], parameters.referenceIndex, candidateAlignments);

    // Remove duplicate or bad alignment pairs
    HapgenUtil::coalesceAlignments(candidateAlignments);

    if(candidateAlignments.empty())
        return DRC_NO_ALIGNMENT;

    //
    // Score each candidate alignment against the mates of all the variant reads
    //
    int bestCandidate = -1;
    double bestAverageScoreFrac = 0.0f;
    double secondBest = 0.0f;
    for(size_t i = 0; i < candidateAlignments.size(); ++i)
    {
        // Compute the average score of the reads' mates to the flanking sequence
        StringVector referenceFlanking;
        StringVector referenceHaplotypes;
        HapgenUtil::makeFlankingHaplotypes(candidateAlignments[i], parameters.pRefTable, 
                                           1000, inHaplotypes, referenceFlanking, referenceHaplotypes);

        // If valid flanking haplotypes could not be made, skip this alignment
        if(referenceFlanking.empty())
            continue;

        // Realign the mates
        LocalAlignmentResultVector localAlignments = HapgenUtil::alignReadsLocally(referenceFlanking[0], variantMates);
        LocalAlignmentResultVector localAlignmentsRC = HapgenUtil::alignReadsLocally(referenceFlanking[0], variantRCMates);

        // Merge alignments
        localAlignments.insert(localAlignments.end(), localAlignmentsRC.begin(), localAlignmentsRC.end());

        double sum = 0.0f;
        double count = 0.0f;

        for(size_t j = 0; j < localAlignments.size(); ++j)
        {
            double max_score = localAlignments[j].queryEndIndex - localAlignments[j].queryStartIndex + 1;
            double frac = (double)localAlignments[j].score / max_score;
            //printf("Score: %d frac: %lf\n", localAlignments[j].score, frac);
            sum += frac;
            count += 1;
        }

        double score = sum / count;
        if(score > bestAverageScoreFrac)
        {
            secondBest = bestAverageScoreFrac;
            bestAverageScoreFrac = score;
            bestCandidate = i;
        }
        else if(score > secondBest)
        {
            secondBest = score;
        }

        //printf("Alignment %zu mate-score: %lf\n", i, score);
    }

    if(bestCandidate == -1)
        return DRC_NO_ALIGNMENT;

    /*
    if(bestAverageScoreFrac < 0.9f)
        return DRC_POOR_ALIGNMENT;

    if(bestAverageScoreFrac - secondBest < 0.05f)
        return DRC_AMBIGUOUS_ALIGNMENT;
    */
    bestAlignment = candidateAlignments[bestCandidate];
    return DRC_OK;
}
Exemplo n.º 4
0
// Run dindel on a pair of samples
DindelReturnCode DindelUtil::runDindelPairMatePair(const std::string& id,
                                                   const StringVector& base_haplotypes,
                                                   const StringVector& variant_haplotypes,
                                                   const GraphCompareParameters& parameters,
                                                   std::ostream& baseOut,
                                                   std::ostream& variantOut,
                                                   std::ostream& callsOut,
                                                   DindelReadReferenceAlignmentVector* pReadAlignments)
{
    PROFILE_FUNC("runDindelPairMatePair")

    StringVector inHaplotypes;
    inHaplotypes.insert(inHaplotypes.end(), base_haplotypes.begin(), base_haplotypes.end());
    inHaplotypes.insert(inHaplotypes.end(), variant_haplotypes.begin(), variant_haplotypes.end());

    //
    // First, extract the reads from the normal and variant data sets that match each haplotype
    //
    assert(inHaplotypes.size() > 0);

    // Get canidate alignments for the input haplotypes
    HapgenAlignmentVector candidateAlignments;

    // Choose the kmer size for alignment
    size_t align_kmer = 31;
    for(size_t i = 0; i < inHaplotypes.size(); ++i)
    {
        HapgenAlignmentVector thisCandidateAlignments;
        HapgenUtil::alignHaplotypeToReferenceKmer(align_kmer,
                                                  inHaplotypes[i],
                                                  parameters.referenceIndex,
                                                  parameters.pRefTable,
                                                  thisCandidateAlignments);

        candidateAlignments.insert(candidateAlignments.end(), thisCandidateAlignments.begin(), thisCandidateAlignments.end());
    }
   
    // Remove duplicate or bad alignment pairs
    HapgenUtil::coalesceAlignments(candidateAlignments);

    if(Verbosity::Instance().getPrintLevel() > 3)
        printf("runDindel -- %zu candidate alignments found\n", candidateAlignments.size());
    
    size_t MAX_ALIGNMENTS = 10;
    if(candidateAlignments.size() > MAX_ALIGNMENTS)
        return DRC_AMBIGUOUS_ALIGNMENT;

    // Join each haplotype with flanking sequence from the reference genome for each alignment
    // This function also adds a haplotype (with flanking sequence) for the piece of the reference
    int FLANKING_SIZE = 0;
    if (parameters.dindelRealignParameters.realignMatePairs)
        FLANKING_SIZE = 1000;
    StringVector flankingHaplotypes;

    // This vector contains the internal portion of the haplotypes, without the flanking sequence
    // It is used to extract reads
    StringVector candidateHaplotypes;
    for(size_t i = 0; i < candidateAlignments.size(); ++i)
    {
        HapgenUtil::makeFlankingHaplotypes(candidateAlignments[i],
                                           parameters.pRefTable,
                                           FLANKING_SIZE,
                                           inHaplotypes,
                                           flankingHaplotypes,
                                           candidateHaplotypes);
    
    }

    if(Verbosity::Instance().getPrintLevel() > 3)
        printf("runDindel -- made %zu flanking haplotypes\n", candidateHaplotypes.size());

    // Normal reads
    SeqRecordVector normalReads;
    SeqRecordVector normalRCReads;

    // Remove non-unique candidate haplotypes
    std::sort(candidateHaplotypes.begin(), candidateHaplotypes.end());
    StringVector::iterator haplotype_iterator = std::unique(candidateHaplotypes.begin(), candidateHaplotypes.end());
    candidateHaplotypes.resize(haplotype_iterator - candidateHaplotypes.begin());

    // Set the value to use for extracting reads that potentially match the haplotype
    // Do not use a kmer for extraction greater than this value
    size_t KMER_CEILING = 31;
    size_t extractionKmer = parameters.kmer < KMER_CEILING ? parameters.kmer : KMER_CEILING;
    
    bool extractOK = true;
    if(!parameters.bReferenceMode)
    {
        // Reads on the same strand as the haplotype
        extractOK = HapgenUtil::extractHaplotypeReads(candidateHaplotypes, parameters.baseIndex, extractionKmer, 
                                                      false, parameters.maxReads, parameters.maxExtractionIntervalSize, &normalReads, NULL);

        if(!extractOK)
            return DRC_OVER_DEPTH;

        // Reads on the reverse strand
        extractOK = HapgenUtil::extractHaplotypeReads(candidateHaplotypes, parameters.baseIndex, extractionKmer, 
                                                      true, parameters.maxReads, parameters.maxExtractionIntervalSize, &normalRCReads, NULL);

        if(!extractOK)
            return DRC_OVER_DEPTH;
    }

    // Variant reads
    SeqRecordVector variantReads;
    SeqRecordVector variantRCReads;

    extractOK = HapgenUtil::extractHaplotypeReads(candidateHaplotypes, parameters.variantIndex, extractionKmer, 
                                                  false, parameters.maxReads, parameters.maxExtractionIntervalSize, &variantReads, NULL);

    if(!extractOK)
        return DRC_OVER_DEPTH;

    extractOK = HapgenUtil::extractHaplotypeReads(candidateHaplotypes, parameters.variantIndex, extractionKmer, 
                                                  true, parameters.maxReads, parameters.maxExtractionIntervalSize, &variantRCReads, NULL);

    if(!extractOK)
        return DRC_OVER_DEPTH;

    size_t normal_reads = normalReads.size() + normalRCReads.size();
    size_t variant_reads = variantReads.size() + variantRCReads.size();
    size_t total_reads = normal_reads + variant_reads;
    
    if(Verbosity::Instance().getPrintLevel() > 3)
        printf("Extracted %zu normal reads, %zu variant reads\n", normal_reads, variant_reads);

    if(total_reads > parameters.maxReads)
        return DRC_OVER_DEPTH;

    if (total_reads == 0)
        return DRC_UNDER_DEPTH;

    // Generate the input haplotypes for dindel
    // We need at least 2 haplotypes (one is the reference)
    size_t totFlankingHaplotypes = flankingHaplotypes.size();

    if(totFlankingHaplotypes < 2)
        return DRC_NO_ALIGNMENT;

    // Ensure the reference haplotype is a non-empty string
    if(flankingHaplotypes[0].size() == 0)
        return DRC_NO_ALIGNMENT;

    // Make Dindel referenceMappings
    StringVector dindelHaplotypes;
    std::set<DindelReferenceMapping> refMappings;

    //
    for(size_t i = 0; i < candidateAlignments.size(); ++i)
    {
        std::string upstream, defined, downstream;
        std::string refName = parameters.pRefTable->getRead(candidateAlignments[i].referenceID).id;

        HapgenUtil::extractReferenceSubstrings(candidateAlignments[i],parameters.pRefTable, 
                                               FLANKING_SIZE, upstream, defined, downstream);

        std::string refSeq = upstream + defined + downstream;
     
        int refStart = candidateAlignments[i].position - int(upstream.size()) + 1;

        // Here the score is used as an estimate of how unique "defined" is in the reference sequence.
        // "defined" is not the reference sequence but a candidate haplotype.
        // It is conservative because the flanking sequence is not used in this estimation.
    	DindelReferenceMapping rm(refName, 
                                  refSeq, 
                                  refStart, 
                                  candidateAlignments[i].score+2*FLANKING_SIZE, 
                                  candidateAlignments[i].isRC);

        std::set<DindelReferenceMapping>::iterator rmit = refMappings.find(rm);
        if(rmit == refMappings.end())
        {
            refMappings.insert(rm);
        }
	    else
        {
            if(rm.referenceAlignmentScore > rmit->referenceAlignmentScore) 
                rmit->referenceAlignmentScore = rm.referenceAlignmentScore;
        }
    }
    
    // RESET MAPPING SCORES
    for(std::set<DindelReferenceMapping>::iterator it = refMappings.begin(); it != refMappings.end(); it++) 
        it->referenceAlignmentScore = 1000;
    
    // make flankingHaplotypes unique
    std::set< std::string > setFlanking(flankingHaplotypes.begin(), flankingHaplotypes.end());

    for(std::set< std::string >::const_iterator it = setFlanking.begin(); it != setFlanking.end(); it++)
    {
        dindelHaplotypes.push_back(*it);
        //dindelRefMappings[i] = std::vector<DindelReferenceMapping>(refMappings.begin(),refMappings.end());
    }

    std::vector<DindelReferenceMapping> dRefMappings(refMappings.begin(),refMappings.end());
    DindelWindow dWindow(dindelHaplotypes, dRefMappings);

    //
    // Run Dindel
    //
    
    // Initialize VCF collections
    VCFCollection vcfCollections[2];

    // If in multisample mode, load the sample names into the VCFCollection
    if(parameters.variantIndex.pPopIdx != NULL)
    {
        for(size_t i = 0; i <= 1; ++i)
            vcfCollections[i].samples = parameters.variantIndex.pPopIdx->getSamples();
    }

    size_t start_i = parameters.bReferenceMode ? 1 : 0;

    DindelRealignWindowResult *pThisResult = NULL;
    DindelRealignWindowResult *pPreviousResult = NULL;

    for(size_t i = start_i; i <= 1; ++i)
    {
        SeqRecordVector& fwdReads = (i == 0) ? normalReads : variantReads;
        SeqRecordVector& rcReads = (i == 0) ? normalRCReads : variantRCReads;
        const BWTIndexSet* indices = &parameters.variantIndex;

        // Create dindel reads
        // Mates must be at the end of the array.
        std::vector<DindelRead> dReads;
        for(size_t j = 0; j < fwdReads.size(); ++j)
            dReads.push_back(convertToDindelRead(indices, fwdReads[j], true));

        for(size_t j = 0; j < rcReads.size(); ++j)
        {
            rcReads[j].seq.reverseComplement();
            std::reverse(rcReads[j].qual.begin(), rcReads[j].qual.end());
            dReads.push_back(convertToDindelRead(indices, rcReads[j], false));
        }

        pThisResult = new DindelRealignWindowResult();

        std::stringstream out_ss;
        try
        {
            DindelRealignWindow dRealignWindow(&dWindow, dReads, parameters.dindelRealignParameters);
            dRealignWindow.run("hmm", vcfCollections[i], pReadAlignments, id, pThisResult, pPreviousResult, parameters.pRefTable);
        }
        catch(std::string e)
        {
            std::cerr << "Dindel Exception: " << e << "\n";
            exit(DRC_EXCEPTION);
        }


        if(i == 0)
            pPreviousResult = pThisResult;
    }

    // Copy raw VCFRecords to output
    for(size_t i = 0; i <= 1; ++i)
    {
        std::ostream& curr_out = i == 0 ? baseOut : variantOut;
        for(size_t j = 0; j < vcfCollections[i].records.size(); ++j)
            curr_out << vcfCollections[i].records[j] << "\n";
    }

    // Make comparative calls
    size_t VARIANT_IDX = 1;
    size_t BASE_IDX = 0;
    bool has_base_calls = !vcfCollections[BASE_IDX].records.empty();
    for(size_t i = 0; i < vcfCollections[1].records.size(); ++i)
    {
        bool not_called_in_base = true;
        if(has_base_calls)
            not_called_in_base = vcfCollections[BASE_IDX].records[i].passStr == "NoCall" ||
                                 vcfCollections[BASE_IDX].records[i].passStr == "NoSupp";

        bool called_in_variant = vcfCollections[VARIANT_IDX].records[i].passStr == "PASS";
        if(called_in_variant && not_called_in_base)
            callsOut << vcfCollections[VARIANT_IDX].records[i] << "\n";
    }

    baseOut.flush();
    variantOut.flush();

    delete pThisResult;
    delete pPreviousResult;

    return DRC_OK;
}
Exemplo n.º 5
0
// Align the haplotype to the reference genome represented by the BWT/SSA pair
void HapgenUtil::alignHaplotypeToReferenceKmer(size_t k,
                                               const std::string& haplotype,
                                               const BWTIndexSet& referenceIndex,
                                               const ReadTable* pReferenceTable,
                                               HapgenAlignmentVector& outAlignments)
{
    PROFILE_FUNC("HapgenUtil::alignHaplotypesToReferenceKmer")
    int64_t max_interval_size = 4;

    if(haplotype.size() < k)
        return;

    std::vector<int> event_count_vector;
    std::vector<HapgenAlignment> tmp_alignments;
    int min_events = std::numeric_limits<int>::max();

    // Align forward and reverse haplotype to reference
    for(size_t i = 0; i <= 1; ++i)
    {
        bool is_reverse = i == 1;
        std::string query = is_reverse ? reverseComplement(haplotype) : haplotype;

        // Find shared kmers between the haplotype and the reference
        CandidateVector candidates;

        size_t nqk = query.size() - k + 1;
        for(size_t j = 0; j < nqk; ++j)
        {
            std::string kmer = query.substr(j, k);

            // Find the interval of this kmer in the reference
            BWTInterval interval = BWTAlgorithms::findInterval(referenceIndex, kmer);
            if(!interval.isValid() || interval.size() >= max_interval_size)
                continue; // not found or too repetitive

            // Extract the reference location of these hits
            for(int64_t k = interval.lower; k  <= interval.upper; ++k)
            {
                SAElem elem = referenceIndex.pSSA->calcSA(k, referenceIndex.pBWT);

                // Make a candidate alignment
                CandidateKmerAlignment candidate;
                candidate.query_index = j;
                candidate.target_index = elem.getPos();
                candidate.target_extrapolated_start = candidate.target_index - candidate.query_index;
                candidate.target_extrapolated_end = candidate.target_extrapolated_start + query.size();
                candidate.target_sequence_id = elem.getID();
                candidates.push_back(candidate);
            }
        }

        // Remove duplicate candidates
        std::sort(candidates.begin(), candidates.end(), CandidateKmerAlignment::sortByStart);
        CandidateVector::iterator new_end = std::unique(candidates.begin(), candidates.end(), CandidateKmerAlignment::equalByStart);
        candidates.resize(new_end - candidates.begin());
        
        for(size_t j = 0; j < candidates.size(); ++j)
        {
            // Extract window around reference
            size_t window_size = 200;
            int ref_start = candidates[j].target_extrapolated_start - window_size;
            int ref_end = candidates[j].target_extrapolated_end + window_size;
            const SeqItem& ref_record = pReferenceTable->getRead(candidates[j].target_sequence_id);
            const DNAString& ref_sequence = ref_record.seq;
            if(ref_start < 0)
                ref_start = 0;

            if(ref_end > (int)ref_sequence.length())
                ref_end = ref_sequence.length();

            std::string ref_substring = ref_sequence.substr(ref_start, ref_end - ref_start);

            // Align haplotype to the reference
            SequenceOverlap overlap = alignHaplotypeToReference(ref_substring, query);
            if(overlap.score < 0 || !overlap.isValid())
                continue;

            int alignment_start = ref_start + overlap.match[0].start;
            int alignment_end = ref_start + overlap.match[0].end; // inclusive
            int alignment_length = alignment_end - alignment_start + 1;

            // Crude count of the number of distinct variation events
            bool has_indel = false;
            int num_events = overlap.edit_distance;
            std::stringstream c_parser(overlap.cigar);
            int len;
            char t;
            while(c_parser >> len >> t) 
            {
                assert(len > 0);

                // Only count one event per insertion/deletion
                if(t == 'D' || t == 'I')
                {
                    num_events -= (len - 1);
                    has_indel = true;
                }
            }

            // Skip poor alignments
            double mismatch_rate = 1.0f - (overlap.getPercentIdentity() / 100.f);
            if(mismatch_rate > 0.05f || overlap.total_columns < 50)
            {
                if(Verbosity::Instance().getPrintLevel() > 4)
                {
                    printf("Haplotype Alignment - Ignoring low quality alignment (%.3lf, %dbp, %d events) to %s:%d\n", 
                        1.0f - mismatch_rate, overlap.total_columns, num_events, ref_record.id.c_str(), ref_start);
                }
                continue;
            }

            bool is_snp = !has_indel && overlap.edit_distance == 1;

            HapgenAlignment aln(candidates[j].target_sequence_id, 
                                alignment_start, 
                                alignment_length, 
                                overlap.score, 
                                num_events,
                                is_reverse, 
                                is_snp);

            tmp_alignments.push_back(aln);
            event_count_vector.push_back(num_events);
            if(Verbosity::Instance().getPrintLevel() > 4)
            {
                printf("Haplotype Alignment - Accepting alignment (%.3lf, %dbp, %d events) to %s:%d\n", 
                    1.0f - mismatch_rate, overlap.total_columns, num_events, ref_record.id.c_str(), ref_start);
            }            
            // Record the best edit distance
            if(num_events < min_events) 
                min_events = num_events;
        }
    }

    // Copy the best alignments into the output
    int MAX_DIFF_TO_BEST = 10;
    int MAX_EVENTS = 8;
    assert(event_count_vector.size() == tmp_alignments.size());
    for(size_t i = 0; i < event_count_vector.size(); ++i)
    {

        if(event_count_vector[i] <= MAX_EVENTS && event_count_vector[i] - min_events <= MAX_DIFF_TO_BEST)
            outAlignments.push_back(tmp_alignments[i]);
        else if(Verbosity::Instance().getPrintLevel() > 3)
            printf("Haplotype Alignment - Ignoring alignment with too many events (%d)\n", event_count_vector[i]);

    }
}
Exemplo n.º 6
0
// Align the haplotype to the reference genome represented by the BWT/SSA pair
void HapgenUtil::alignHaplotypeToReferenceKmer(size_t k,
        const std::string& haplotype,
        const BWTIndexSet& referenceIndex,
        const ReadTable* pReferenceTable,
        HapgenAlignmentVector& outAlignments)
{
    PROFILE_FUNC("HapgenUtil::alignHaplotypesToReferenceKmer")
    int64_t max_interval_size = 4;

    if(haplotype.size() < k)
        return;

    std::vector<int> event_count_vector;
    std::vector<HapgenAlignment> tmp_alignments;
    int min_events = std::numeric_limits<int>::max();

    // Align forward and reverse haplotype to reference
    for(size_t i = 0; i <= 1; ++i)
    {
        bool is_reverse = i == 1;
        std::string query = is_reverse ? reverseComplement(haplotype) : haplotype;

        // Find shared kmers between the haplotype and the reference
        CandidateVector candidates;

        size_t nqk = query.size() - k + 1;
        for(size_t j = 0; j < nqk; ++j)
        {
            std::string kmer = query.substr(j, k);

            // Find the interval of this kmer in the reference
            BWTInterval interval = BWTAlgorithms::findInterval(referenceIndex, kmer);
            if(!interval.isValid() || interval.size() >= max_interval_size)
                continue; // not found or too repetitive

            // Extract the reference location of these hits
            for(int64_t k = interval.lower; k  <= interval.upper; ++k)
            {
                SAElem elem = referenceIndex.pSSA->calcSA(k, referenceIndex.pBWT);

                // Make a candidate alignment
                CandidateKmerAlignment candidate;
                candidate.query_index = j;
                candidate.target_index = elem.getPos();
                candidate.target_extrapolated_start = candidate.target_index - candidate.query_index;
                candidate.target_extrapolated_end = candidate.target_extrapolated_start + query.size();
                candidate.target_sequence_id = elem.getID();
                candidates.push_back(candidate);
            }
        }

        // Remove duplicate candidates
        std::sort(candidates.begin(), candidates.end(), CandidateKmerAlignment::sortByStart);
        CandidateVector::iterator new_end = std::unique(candidates.begin(), candidates.end(), CandidateKmerAlignment::equalByStart);
        candidates.resize(new_end - candidates.begin());

        for(size_t j = 0; j < candidates.size(); ++j)
        {
            // Extract window around reference
            size_t window_size = 200;
            int ref_start = candidates[j].target_extrapolated_start - window_size;
            int ref_end = candidates[j].target_extrapolated_end + window_size;

            const DNAString& ref_sequence = pReferenceTable->getRead(candidates[j].target_sequence_id).seq;
            if(ref_start < 0)
                ref_start = 0;

            if(ref_end > (int)ref_sequence.length())
                ref_end = ref_sequence.length();

            std::string ref_substring = ref_sequence.substr(ref_start, ref_end - ref_start);

            // Align haplotype to the reference
            SequenceOverlap overlap = Overlapper::computeOverlap(query, ref_substring);

            // Skip terrible alignments
            double percent_aligned = (double)overlap.getOverlapLength() / query.size();
            if(percent_aligned < 0.95f)
                continue;
            /*
            // Skip alignments that are not full-length matches of the haplotype
            if(overlap.match[0].start != 0 || overlap.match[0].end != (int)haplotype.size() - 1)
                continue;
            */
            int alignment_start = ref_start + overlap.match[1].start;
            int alignment_end = ref_start + overlap.match[1].end; // inclusive
            int alignment_length = alignment_end - alignment_start + 1;

            // Crude count of the number of distinct variation events
            int num_events = overlap.edit_distance;
            std::stringstream c_parser(overlap.cigar);
            int len;
            char t;
            while(c_parser >> len >> t)
            {
                assert(len > 0);

                // Only count one event per insertion/deletion
                if(t == 'D' || t == 'I')
                    num_events -= (len - 1);
            }


            HapgenAlignment aln(candidates[j].target_sequence_id, alignment_start, alignment_length, overlap.score, is_reverse);
            tmp_alignments.push_back(aln);
            event_count_vector.push_back(num_events);

            // Record the best edit distance
            if(num_events < min_events)
                min_events = num_events;
        }
    }

    // Copy the best alignments into the output
    int MAX_DIFF_TO_BEST = 10;
    int MAX_EVENTS = 8;
    assert(event_count_vector.size() == tmp_alignments.size());
    for(size_t i = 0; i < event_count_vector.size(); ++i)
    {
        if(event_count_vector[i] <= MAX_EVENTS && event_count_vector[i] - min_events <= MAX_DIFF_TO_BEST)
            outAlignments.push_back(tmp_alignments[i]);
    }
}