Beispiel #1
0
// returns BAM file pointers to beginning of alignment data
bool BamMultiReaderPrivate::RewindReaders()
{

    m_errorString.clear();
    bool errorsEncountered = false;

    // iterate over readers
    std::vector<MergeItem>::iterator readerIter = m_readers.begin();
    std::vector<MergeItem>::iterator readerEnd = m_readers.end();
    for (; readerIter != readerEnd; ++readerIter) {
        MergeItem& item = (*readerIter);
        BamReader* reader = item.Reader;
        if (reader == 0) continue;

        // attempt rewind on BamReader
        if (!reader->Rewind()) {
            m_errorString.append(1, '\t');
            m_errorString.append(reader->GetErrorString());
            m_errorString.append(1, '\n');
            errorsEncountered = true;
        }
    }

    return !errorsEncountered;
}
Beispiel #2
0
// returns BAM file pointers to beginning of alignment data
bool BamMultiReader::Rewind(void) { 
    bool result = true;
    for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) {
        BamReader* reader = it->first;
        result &= reader->Rewind();
    }
    return result;
}
// Calls SNPs for one block of candidate alleles using haplotype-based genotyping.
//
// Outline:
//   1. Extend [leftPosition, rightPosition] by snpCallSettings.m_flankingSize on
//      each side and fetch the reference sequence of that window.
//   2. Read the alignments overlapping the window from bamObj, keep those that
//      pass the quality/length/map-quality/identity filters, and store their
//      window-local alignment.
//   3. Build and prune a consensus DAG from the reference and the kept reads,
//      then enumerate top-ranked non-reference paths.
//   4. Keep paths as haplotypes when every mismatch vertex on the path matches
//      a candidate allele (at most snpCallSettings.m_topK haplotypes besides
//      the reference).
//   5. Enumerate genotypes, compute priors/likelihoods/posteriors, and pick the
//      non-reference genotype with the largest posterior.
//   6. Emit one GenericVariant per variant vertex of that genotype.
//
// Parameters:
//   fastaObj         reference source, queried through GetSequence()/GetBase()
//   bamObj           BAM reader; it is rewound and its region is set to the window
//   chrID            reference id of the block
//   leftPosition     left coordinate of the block
//   rightPosition    right coordinate of the block
//   probAligner      forwarded to PyroHMMsnpHaplotypeDataLikelihood
//   allelesInBlock   candidate alleles of the block
//   snpCallSettings  calling parameters (flanking size, pruning level, topK,
//                    band, ploidy, quality filter)
//   variantResults   output; variants whose quality is at least
//                    snpCallSettings.m_variantQualityFilter are appended
//
// Returns without appending anything when no variant haplotype is found or
// when no non-reference genotype has a positive posterior.
void GenericIndividualSnpCall::PyroHMMsnp(Fasta &fastaObj, BamReader &bamObj, int chrID, int leftPosition, int rightPosition, GenericProbabilisticAlignment &probAligner, list<Allele>& allelesInBlock, VariantCallSetting& snpCallSettings, vector<GenericVariant> &variantResults)
{
    VariantCallSetting settingForPyroHMMsnp = snpCallSettings;

    // allele pool: random-access copy of the candidate alleles
    vector<Allele> allelePool(allelesInBlock.begin(), allelesInBlock.end());
    const int numAlleles = static_cast<int>(allelePool.size());

    // add the configured flanking segment at each side
    int windowLeftPosition  = leftPosition  - snpCallSettings.m_flankingSize;
    int windowRightPosition = rightPosition + snpCallSettings.m_flankingSize;

    // genome
    string genome;
    fastaObj.GetSequence(chrID, windowLeftPosition, windowRightPosition, genome);

    vector<PyroHMMsnp_Sequence_t> readsInWindow;

    // rewind BAM reader
    bamObj.Rewind();
    // set BAM region
    bamObj.SetRegion(chrID, windowLeftPosition, chrID, windowRightPosition);
    // read alignment
    BamAlignment al;
    while (bamObj.GetNextAlignment(al))
    {
        // skip if it is not a good alignment
        if (!GenericBamAlignmentTools::goodAlignment(al))
        {
            continue;
        }

        // skip if it is not valid at length
        if (!GenericBamAlignmentTools::validReadLength(al, m_minReadLength))
        {
            continue;
        }

        // skip if it is not valid at map quality
        if (!GenericBamAlignmentTools::validMapQuality(al, m_minMapQuality))
        {
            continue;
        }

        // skip if it is not valid at alignment identity
        if (!GenericBamAlignmentTools::validReadIdentity(al, m_maxMismatchFrac))
        {
            continue;
        }

        // get local alignment
        string t_localRead, t_localGenome;
        Cigar  t_cigar;
        BamMD  t_md;
        int    t_numMismatch, t_numInDel;
        GenericBamAlignmentTools::getLocalAlignment(al, windowLeftPosition, windowRightPosition-windowLeftPosition,
                                                    t_localRead, t_localGenome, t_cigar, t_md,
                                                    t_numMismatch, t_numInDel);

        // nothing of this read falls inside the window
        if (t_localRead.empty() || t_localGenome.empty())
            continue;

        // save into set
        PyroHMMsnp_Sequence_t t_seq;
        t_seq.t_ID           = GenericBamAlignmentTools::getBamAlignmentID(al);
        t_seq.t_sequence     = t_localRead;
        t_seq.t_cigar        = t_cigar;
        t_seq.t_md           = t_md;
        t_seq.t_numMismatch  = t_numMismatch;
        t_seq.t_numInDel     = t_numInDel;
        t_seq.t_mapQualScore = al.MapQuality;

        // offsets of the read ends relative to the window, clamped at zero
        if (al.Position>windowLeftPosition)
            t_seq.t_startPositionShift = al.Position-windowLeftPosition;
        else
            t_seq.t_startPositionShift = 0;

        if (al.GetEndPosition()<windowRightPosition)
            t_seq.t_endPositionShift = windowRightPosition-al.GetEndPosition();
        else
            t_seq.t_endPositionShift = 0;

        readsInWindow.push_back(t_seq);
    }

    int numData = readsInWindow.size();

    // construct the consensus sequence graph
    GenericDagGraph consensusGraph;
    vector<string>  consensusGraphReads;
    vector<Cigar>   consensusGraphReadCigars;
    vector<int>     consensusGraphReadStarts;

    // set of aligned reads to construct the graph
    consensusGraphReads.reserve(numData);
    consensusGraphReadCigars.reserve(numData);
    consensusGraphReadStarts.reserve(numData);
    for (int i=0; i<numData; ++i)
    {
        consensusGraphReads.push_back(readsInWindow[i].t_sequence);
        consensusGraphReadCigars.push_back(readsInWindow[i].t_cigar);
        consensusGraphReadStarts.push_back(readsInWindow[i].t_startPositionShift);
    }

    // build up the graph
    consensusGraph.buildDagGraph(genome, consensusGraphReads, consensusGraphReadCigars, consensusGraphReadStarts);
    consensusGraph.edgePruning(snpCallSettings.m_graphPruneLevel);

    // search topK paths, excluding reference (at most 30 paths are requested)
    vector<string>       topRankConsensusGraphPaths;
    vector<list<Vertex>> topRankConsensusGraphPathVertexs;
    vector<double>       topRankConsensusGraphPathWeights;
    consensusGraph.topRankPathsExcludeGenome(30, topRankConsensusGraphPaths, topRankConsensusGraphPathVertexs, topRankConsensusGraphPathWeights);

    // change vertex list to vertex set, for membership queries below
    vector<set<Vertex>>  topRankConsensusGraphPathVertexSet;
    topRankConsensusGraphPathVertexSet.reserve(topRankConsensusGraphPathVertexs.size());
    for (size_t i=0; i<topRankConsensusGraphPathVertexs.size(); i++)
    {
        const list<Vertex>& pathVertexList = topRankConsensusGraphPathVertexs[i];
        topRankConsensusGraphPathVertexSet.push_back(set<Vertex>(pathVertexList.begin(), pathVertexList.end()));
    }

    // get variant vertices: window-relative position and base of every candidate allele
    vector<int>    allelePositions;
    vector<string> alleleChars;
    for (list<Allele>::iterator alleleIter=allelesInBlock.begin(); alleleIter!=allelesInBlock.end(); ++alleleIter)
    {
        allelePositions.push_back(alleleIter->m_chrPosition-windowLeftPosition);
        alleleChars.push_back(alleleIter->m_allele);
    }
    // map allele to graph vertex
    set<Vertex> variantVertexs;
    map<int,Vertex> mapAlleleToVertex;
    map<Vertex,int> mapVertexToAllele;
    for (int v=0; v<consensusGraph.m_numVertexs; v++)
    {
        if (consensusGraph.m_skip[v])
            continue;

        if (!consensusGraph.m_isMismatch[v])
            continue;

        // graph genome positions are shifted by one relative to window offsets
        int gp = consensusGraph.m_genomePosition[v] - 1;

        for (int j=0; j<numAlleles; j++)
        {
            if (allelePositions[j]!=gp)
                continue;

            if (alleleChars[j]==consensusGraph.m_labels[v])
            {
                variantVertexs.insert(v);
                mapAlleleToVertex[j] = v;
                mapVertexToAllele[v] = j;
            }
        }
    }


    // set up the haplotypes; index 0 is always the reference
    vector<string> haplotypes;
    vector<int>    haplotypeToPathIndex;
    vector<set<Vertex>> haplotypeVariantVertexs;

    haplotypes.push_back(genome);
    haplotypeToPathIndex.push_back(-1);
    haplotypeVariantVertexs.push_back(set<Vertex>());

    int kk = 0;
    const int numPaths = static_cast<int>(topRankConsensusGraphPaths.size());
    for (int i=0; i<numPaths; i++)
    {
        // enough variant haplotypes collected
        if (kk>=snpCallSettings.m_topK)
            break;

        // skip paths whose length differs from the reference by more than 5 bases
        int deltaLength = static_cast<int>(topRankConsensusGraphPaths[i].length()) - static_cast<int>(genome.length());
        deltaLength = abs(deltaLength);

        if (deltaLength>5)
            continue;

        // candidate-allele vertices that lie on this path
        const set<Vertex>& pathVertexs = topRankConsensusGraphPathVertexSet[i];
        set<Vertex> pathVariantVertexs;
        for (set<Vertex>::iterator variantIter=variantVertexs.begin(); variantIter!=variantVertexs.end(); ++variantIter)
        {
            if (pathVertexs.find(*variantIter)!=pathVertexs.end())
            {
                pathVariantVertexs.insert(*variantIter);
            }
        }
        const bool hasVariantVertex = !pathVariantVertexs.empty();

        // all mismatch vertices on this path, candidate or not
        int totalNumberVariantVertexInPath = 0;
        for (set<Vertex>::const_iterator vertexIter=pathVertexs.begin(); vertexIter!=pathVertexs.end(); ++vertexIter)
        {
            int v = *vertexIter;
            if (consensusGraph.m_isMismatch[v])
            {
                totalNumberVariantVertexInPath += 1;
            }
        }

        // accept the path only if it carries no mismatch beyond the candidate alleles
        if (hasVariantVertex && totalNumberVariantVertexInPath<=static_cast<int>(pathVariantVertexs.size()))
        {
            haplotypes.push_back(topRankConsensusGraphPaths[i]);
            haplotypeToPathIndex.push_back(i);
            haplotypeVariantVertexs.push_back(pathVariantVertexs);

            kk++;
        }
    }

    int numHaplotypes = haplotypes.size();

    // skip if there is no variant haplotype
    if (numHaplotypes==1)
    {
        return;
    }

    // compute haplotype data likelihood
    vector<vector<long double>> haplotypeDataLikelihoods(numHaplotypes);
    PyroHMMsnpHaplotypeDataLikelihood(probAligner, snpCallSettings.m_band, numHaplotypes, haplotypes, readsInWindow, haplotypeDataLikelihoods);


    // genotype
    vector<vector<int>> genotypes;
    set<set<int>> genotypeDiscovered;
    for (int i=0; i<numHaplotypes; i++)
    {
        vector<int> precedeHaplotypes;
        PyroHMMsnpGenotypeSet(snpCallSettings.m_ploidy, i, numHaplotypes, precedeHaplotypes, genotypes, genotypeDiscovered);
    }

    int numGenotypes = genotypes.size();

    // genotype variant vertex: union of the variant vertices of its haplotypes
    vector<set<Vertex>> genotypeVariantVertexs;
    genotypeVariantVertexs.reserve(numGenotypes);
    for (int i=0; i<numGenotypes; i++)
    {
        set<Vertex> variantVertexInGenotype;
        for (int j=0; j<settingForPyroHMMsnp.m_ploidy; j++)
        {
            int haplotype = genotypes[i][j];
            const set<Vertex>& variantVertexInHaplotype = haplotypeVariantVertexs[haplotype];
            variantVertexInGenotype.insert(variantVertexInHaplotype.begin(), variantVertexInHaplotype.end());
        }
        genotypeVariantVertexs.push_back(variantVertexInGenotype);
    }

    // genotype priors
    vector<long double> genotypePriors(numGenotypes);
    PyroHMMsnpGenotypePrior(numGenotypes, genotypes, settingForPyroHMMsnp, genotypePriors);

    // genotype likelihoods
    vector<long double> genotypeLikelihoods(numGenotypes);
    PyroHMMsnpGenotypeLikelihood(numGenotypes, genotypes, readsInWindow.size(), haplotypeDataLikelihoods, snpCallSettings, genotypeLikelihoods);

    // genotype posteriors
    vector<long double> genotypePosteriors(numGenotypes);
    PyroHMMsnpGenotypePosterior(numGenotypes, genotypePriors, genotypeLikelihoods, genotypePosteriors);

    // search maximal genotype posterior; genotype 0 is deliberately not a candidate
    long double maxGenotypePosterior = 0;
    int inferGenotype = -1;
    for (int i=1; i<numGenotypes; i++)
    {
        if (maxGenotypePosterior<genotypePosteriors[i])
        {
            maxGenotypePosterior = genotypePosteriors[i];
            inferGenotype = i;
        }
    }

    // no non-reference genotype has a positive posterior: nothing to call
    if (inferGenotype<0)
    {
        return;
    }

    // all variant vertexs in the inferred genotype
    const set<Vertex>& inferGenotypeVariantVertexs = genotypeVariantVertexs[inferGenotype];

    // count haploid type of variant: per haplotype of the genotype, 1 if it carries the vertex
    map<Vertex,vector<int>> inferGenotypeVariantHaploidType;
    set<Vertex>::const_iterator inferVariantIter = inferGenotypeVariantVertexs.begin();
    for (; inferVariantIter!=inferGenotypeVariantVertexs.end(); ++inferVariantIter)
    {
        int v = *inferVariantIter;
        vector<int> variantHaploidType;
        for (int j=0; j<settingForPyroHMMsnp.m_ploidy; j++)
        {
            int haplotype = genotypes[inferGenotype][j];
            const set<Vertex>& variantVertexInHaplotype = haplotypeVariantVertexs[haplotype];
            if (variantVertexInHaplotype.find(v)==variantVertexInHaplotype.end())
            {
                variantHaploidType.push_back(0);
            }else
            {
                variantHaploidType.push_back(1);
            }
        }
        inferGenotypeVariantHaploidType[v] = variantHaploidType;
    }

    // variant score: total posterior of all genotypes that contain the vertex
    map<Vertex,long double> inferGenotypeVariantScore;
    inferVariantIter = inferGenotypeVariantVertexs.begin();
    for (; inferVariantIter!=inferGenotypeVariantVertexs.end(); ++inferVariantIter)
    {
        int v = *inferVariantIter;
        long double variantScore = 0;
        for (int i=0; i<numGenotypes; i++)
        {
            const set<Vertex>& variantVertexInGenotype = genotypeVariantVertexs[i];
            if (variantVertexInGenotype.find(v)!=variantVertexInGenotype.end())
                variantScore += genotypePosteriors[i];
        }

        inferGenotypeVariantScore[v] = variantScore;
    }

    // save variant result
    inferVariantIter = inferGenotypeVariantVertexs.begin();
    for (; inferVariantIter!=inferGenotypeVariantVertexs.end(); ++inferVariantIter)
    {
        GenericVariant result;

        int v = *inferVariantIter;
        int a = mapVertexToAllele[v];

        // candidate allele represented by this vertex; it also supplies the
        // chromosome id and position of the call
        const Allele& variantAllele = allelePool[a];

        const vector<int>& haploidType = inferGenotypeVariantHaploidType[v];
        for (int j=0; j<settingForPyroHMMsnp.m_ploidy; j++)
        {
            if (haploidType[j]==0)
            {
                // NOTE(review): the label is looked up with the genome position of v
                // used as a vertex index; presumably vertex g is the reference
                // vertex at that position - confirm against GenericDagGraph.
                int g = consensusGraph.m_genomePosition[v];

                Allele allele;
                allele.m_allele = consensusGraph.m_labels[g];
                result.m_alleles.push_back(allele);
            }else
            {
                result.m_alleles.push_back(variantAllele);
            }
        }

        result.m_chrID           = variantAllele.m_chrID;
        result.m_chrPosition     = variantAllele.m_chrPosition;
        result.m_probScoreRef    = genotypePosteriors[0];
        result.m_probScoreVar    = genotypePosteriors[inferGenotype];
        result.m_variantType     = VARIANT_SNP;

        // phred-scaled quality; a summed posterior that reaches (or, through
        // rounding, slightly exceeds) 1 saturates at 3000 instead of feeding a
        // non-positive value to log10
        long double variantScore = inferGenotypeVariantScore[v];
        if (1-variantScore<1e-300)
            result.m_quality     = 3000;
        else if (variantScore<1e-300)
            result.m_quality     = 0;
        else
            result.m_quality     = -10*log10(1-variantScore);

        char refBase;
        fastaObj.GetBase(result.m_chrID, result.m_chrPosition, refBase);
        result.m_reference       = refBase;

        // 0 for alleles equal to the reference base, 1 otherwise
        for (size_t i=0; i<result.m_alleles.size(); i++)
        {
            if (result.m_alleles[i].m_allele==result.m_reference)
                result.m_haploidType.push_back(0);
            else
                result.m_haploidType.push_back(1);
        }


        // filter
        if (result.m_quality>=snpCallSettings.m_variantQualityFilter)
            variantResults.push_back(result);

    }

}
// Collects SNP candidates in [leftPosition, rightPosition] of chrID with two
// passes over the region:
//   pass 1 gathers the mismatch positions reported for every good alignment
//          that carries an "MD" tag;
//   pass 2 feeds the alignments that pass all read filters through a pileup
//          engine whose visitor fills the per-block allele list and bamData.
// Alleles whose depth is not below m_minSnpRead and not below
// m_minSnpFrac * global depth are appended to variantCandidates.
// bamObj is rewound and has its region set before each pass.
void GenericIndividualSnpCall::simpleSnpCall(string &fastaObj, BamReader &bamObj, int chrID, int leftPosition, int rightPosition, vector<Allele> &variantCandidates, map<int,list<tuple<char,int,int,double>>> &bamData)
{
    set<int>       snpPositions;
    vector<Allele> snpAlleles;

    // ---- pass 1: locate mismatch positions inside the region ----
    bamObj.Rewind();
    bamObj.SetRegion(chrID, leftPosition, chrID, rightPosition);

    BamAlignment al;
    while (bamObj.GetNextAlignment(al))
    {
        // only good alignments with an MD tag contribute positions
        if (GenericBamAlignmentTools::goodAlignment(al) && al.HasTag("MD"))
        {
            vector<long> mismatchPositions;
            GenericBamAlignmentTools::getBamAlignmentMismatches(al, mismatchPositions);

            for (vector<long>::const_iterator posIter=mismatchPositions.begin(); posIter!=mismatchPositions.end(); ++posIter)
            {
                snpPositions.insert(*posIter);
            }
        }
    }

    // ---- pass 2: pile up the filtered alignments ----
    SimpleSnpCallPileupVisitor pileupVisitor(&fastaObj, chrID, leftPosition, rightPosition, m_downSample, &snpPositions, &snpAlleles, &bamData);

    PileupEngine pileupEngine;
    pileupEngine.AddVisitor(&pileupVisitor);

    bamObj.Rewind();
    bamObj.SetRegion(chrID, leftPosition, chrID, rightPosition);

    while (bamObj.GetNextAlignment(al))
    {
        // filters are evaluated left to right and stop at the first failure
        const bool usable = GenericBamAlignmentTools::goodAlignment(al)
                         && GenericBamAlignmentTools::validMapQuality(al, m_minMapQuality)
                         && GenericBamAlignmentTools::validReadIdentity(al, m_maxMismatchFrac)
                         && GenericBamAlignmentTools::validReadLength(al, m_minReadLength)
                         && al.HasTag("MD");

        if (usable)
        {
            pileupEngine.AddAlignment(al);
        }
    }
    pileupEngine.Flush();

    // ---- keep alleles with enough absolute and relative support ----
    for (size_t k=0; k<snpAlleles.size(); ++k)
    {
        Allele candidate = snpAlleles[k];

        const bool tooFewReads   = candidate.m_alleleDepth < m_minSnpRead;
        const bool tooLowPortion = !tooFewReads && (candidate.m_alleleDepth < m_minSnpFrac*candidate.m_globalDepth);

        if (!tooFewReads && !tooLowPortion)
        {
            variantCandidates.push_back(candidate);
        }
    }

}