void GenericIndividualSnpCall::PyroHMMsnp(Fasta &fastaObj, BamReader &bamObj, int chrID, int leftPosition, int rightPosition, GenericProbabilisticAlignment &probAligner, list<Allele>& allelesInBlock, VariantCallSetting& snpCallSettings, vector<GenericVariant> &variantResults) { VariantCallSetting settingForPyroHMMsnp = snpCallSettings; // allele pool vector<Allele> allelePool; for (list<Allele>::iterator allelesInBlockIter=allelesInBlock.begin(); allelesInBlockIter!=allelesInBlock.end(); allelesInBlockIter++) { allelePool.push_back(*allelesInBlockIter); } // add 10bp flanking segment at each side int windowLeftPosition = leftPosition - snpCallSettings.m_flankingSize; int windowRightPosition = rightPosition + snpCallSettings.m_flankingSize; // genome string genome; fastaObj.GetSequence(chrID, windowLeftPosition, windowRightPosition, genome); int globalDepth; double globalMapQual; int globalStrandPos; int globalStrandNeg; vector<PyroHMMsnp_Sequence_t> readsInWindow; // rewind BAM reader bamObj.Rewind(); // set BAM region bamObj.SetRegion(chrID, windowLeftPosition, chrID, windowRightPosition); // read alignment BamAlignment al; while (bamObj.GetNextAlignment(al)) { // skip if it is not a good alignment if (!GenericBamAlignmentTools::goodAlignment(al)) { continue; } // skip if it is not valid at length if (!GenericBamAlignmentTools::validReadLength(al, m_minReadLength)) { continue; } // skip if it is not valid at map quality if (!GenericBamAlignmentTools::validMapQuality(al, m_minMapQuality)) { continue; } // skip if it is not valid at alignment identity if (!GenericBamAlignmentTools::validReadIdentity(al, m_maxMismatchFrac)) { continue; } // global info globalDepth += 1; globalMapQual += al.MapQuality*al.MapQuality; if (al.IsReverseStrand()) globalStrandNeg += 1; else globalStrandPos += 1; // get local alignment string t_localRead, t_localGenome; Cigar t_cigar; BamMD t_md; int t_numMismatch, t_numInDel; GenericBamAlignmentTools::getLocalAlignment(al, windowLeftPosition, windowRightPosition-windowLeftPosition, t_localRead, t_localGenome, t_cigar, t_md, t_numMismatch, t_numInDel); if (t_localRead.empty() || t_localGenome.empty()) continue; // save into set PyroHMMsnp_Sequence_t t_seq; t_seq.t_ID = GenericBamAlignmentTools::getBamAlignmentID(al); t_seq.t_sequence = t_localRead; t_seq.t_cigar = t_cigar; t_seq.t_md = t_md; t_seq.t_numMismatch = t_numMismatch; t_seq.t_numInDel = t_numInDel; t_seq.t_mapQualScore = al.MapQuality; if (al.Position>windowLeftPosition) t_seq.t_startPositionShift = al.Position-windowLeftPosition; else t_seq.t_startPositionShift = 0; if (al.GetEndPosition()<windowRightPosition) t_seq.t_endPositionShift = windowRightPosition-al.GetEndPosition(); else t_seq.t_endPositionShift = 0; readsInWindow.push_back(t_seq); } int numData = readsInWindow.size(); // construct the consensus sequence graph GenericDagGraph consensusGraph; vector<string> consensusGraphReads; vector<Cigar> consensusGraphReadCigars; vector<int> consensusGraphReadStarts; // set of aligned reads to construct the graph for (int i=0; i<numData; ++i) { consensusGraphReads.push_back(readsInWindow[i].t_sequence); consensusGraphReadCigars.push_back(readsInWindow[i].t_cigar); consensusGraphReadStarts.push_back(readsInWindow[i].t_startPositionShift); } // build up the graph consensusGraph.buildDagGraph(genome, consensusGraphReads, consensusGraphReadCigars, consensusGraphReadStarts); consensusGraph.edgePruning(snpCallSettings.m_graphPruneLevel); // search topK paths, excluding reference vector<string> topRankConsensusGraphPaths; vector<list<Vertex>> topRankConsensusGraphPathVertexs; vector<double> topRankConsensusGraphPathWeights; consensusGraph.topRankPathsExcludeGenome(30, topRankConsensusGraphPaths, topRankConsensusGraphPathVertexs, topRankConsensusGraphPathWeights); // change vertex list to vertex set vector<set<Vertex>> topRankConsensusGraphPathVertexSet; for (int i=0; i<topRankConsensusGraphPathVertexs.size(); i++) { list<Vertex>::iterator vertexIter = topRankConsensusGraphPathVertexs[i].begin(); set<Vertex> vertexSet; for (; vertexIter!=topRankConsensusGraphPathVertexs[i].end(); vertexIter++) { vertexSet.insert(*vertexIter); } topRankConsensusGraphPathVertexSet.push_back(vertexSet); } // get variant vertices vector<int> allelePositions; vector<string> alleleChars; for (list<Allele>::iterator alleleIter=allelesInBlock.begin(); alleleIter!=allelesInBlock.end(); alleleIter++) { Allele allele = *alleleIter; allelePositions.push_back(allele.m_chrPosition-windowLeftPosition); alleleChars.push_back(allele.m_allele); } // map allele to graph vertex set<Vertex> variantVertexs; map<int,Vertex> mapAlleleToVertex; map<Vertex,int> mapVertexToAllele; for (int v=0; v<consensusGraph.m_numVertexs; v++) { if (consensusGraph.m_skip[v]) continue; if (!consensusGraph.m_isMismatch[v]) continue; int gp = consensusGraph.m_genomePosition[v] - 1; for (int j=0; j<allelePool.size(); j++) { int ap = allelePositions[j]; if (ap==gp) { if (alleleChars[j]==consensusGraph.m_labels[v]) { variantVertexs.insert(v); mapAlleleToVertex[j] = v; mapVertexToAllele[v] = j; } } } } // set up the haplotypes vector<string> haplotypes; vector<int> haplotypeToPathIndex; vector<set<Vertex>> haplotypeVariantVertexs; haplotypes.push_back(genome); haplotypeToPathIndex.push_back(-1); haplotypeVariantVertexs.push_back(set<Vertex>()); int kk = 0; for (int i=0; i<topRankConsensusGraphPaths.size(); i++) { if (kk>=snpCallSettings.m_topK) continue; bool hasVariantVertex = false; int deltaLength = (topRankConsensusGraphPaths[i].length()-genome.length()); deltaLength = abs(deltaLength); if (deltaLength>5) continue; set<Vertex> pathVertexs = topRankConsensusGraphPathVertexSet[i]; set<Vertex> pathVariantVertexs; for (set<Vertex>::iterator variantIter=variantVertexs.begin(); variantIter!=variantVertexs.end(); variantIter++) { if (pathVertexs.find(*variantIter)!=pathVertexs.end()) { hasVariantVertex = true; pathVariantVertexs.insert(*variantIter); } } int totalNumberVariantVertexInPath = 0; for (set<Vertex>::iterator vertexIter=pathVertexs.begin(); vertexIter!=pathVertexs.end(); vertexIter++) { int v = *vertexIter; if (consensusGraph.m_isMismatch[v]) { totalNumberVariantVertexInPath += 1; } } if (hasVariantVertex && totalNumberVariantVertexInPath<=pathVariantVertexs.size()) { haplotypes.push_back(topRankConsensusGraphPaths[i]); haplotypeToPathIndex.push_back(i); haplotypeVariantVertexs.push_back(pathVariantVertexs); kk++; } } int numHaplotypes = haplotypes.size(); // skip if there is no variant haplotype if (numHaplotypes==1) { return; } // compute haplotype data likelihood vector<vector<long double>> haplotypeDataLikelihoods(numHaplotypes); PyroHMMsnpHaplotypeDataLikelihood(probAligner, snpCallSettings.m_band, numHaplotypes, haplotypes, readsInWindow, haplotypeDataLikelihoods); // genotype vector<vector<int>> genotypes; set<set<int>> genotypeDiscovered; for (int i=0; i<numHaplotypes; i++) { vector<int> precedeHaplotypes; PyroHMMsnpGenotypeSet(snpCallSettings.m_ploidy, i, numHaplotypes, precedeHaplotypes, genotypes, genotypeDiscovered); } int numGenotypes = genotypes.size(); // genotype variant vertex vector<set<Vertex>> genotypeVariantVertexs; for (int i=0; i<numGenotypes; i++) { set<Vertex> variantVertexInGenotype; for (int j=0; j<settingForPyroHMMsnp.m_ploidy; j++) { int haplotype = genotypes[i][j]; set<Vertex> variantVertexInHaplotype = haplotypeVariantVertexs[haplotype]; variantVertexInGenotype.insert(variantVertexInHaplotype.begin(), variantVertexInHaplotype.end()); } genotypeVariantVertexs.push_back(variantVertexInGenotype); } // genotype priors vector<long double> genotypePriors(numGenotypes); PyroHMMsnpGenotypePrior(numGenotypes, genotypes, settingForPyroHMMsnp, genotypePriors); // genotype likelihoods vector<long double> genotypeLikelihoods(numGenotypes); PyroHMMsnpGenotypeLikelihood(numGenotypes, genotypes, readsInWindow.size(), haplotypeDataLikelihoods, snpCallSettings, genotypeLikelihoods); // genotype posteriors vector<long double> genotypePosteriors(numGenotypes); PyroHMMsnpGenotypePosterior(numGenotypes, genotypePriors, genotypeLikelihoods, genotypePosteriors); // search maximal genotype posterior long double maxGenotypePosterior = 0; int inferGenotype; for (int i=1; i<numGenotypes; i++) { if (maxGenotypePosterior<genotypePosteriors[i]) { maxGenotypePosterior = genotypePosteriors[i]; inferGenotype = i; } } // all variant vertexs in the inferred genotype set<Vertex> inferGenotypeVariantVertexs = genotypeVariantVertexs[inferGenotype]; // count haploid type of variant map<Vertex,vector<int>> inferGenotypeVariantHaploidType; set<Vertex>::iterator inferVariantIter = inferGenotypeVariantVertexs.begin(); for (; inferVariantIter!=inferGenotypeVariantVertexs.end(); inferVariantIter++) { int v = *inferVariantIter; vector<int> variantHaploidType; for (int j=0; j<settingForPyroHMMsnp.m_ploidy; j++) { int haplotype = genotypes[inferGenotype][j]; set<Vertex> variantVertexInHaplotype = haplotypeVariantVertexs[haplotype]; if (variantVertexInHaplotype.find(v)==variantVertexInHaplotype.end()) { variantHaploidType.push_back(0); }else { variantHaploidType.push_back(1); } } inferGenotypeVariantHaploidType[v] = variantHaploidType; } // variant score map<Vertex,long double> inferGenotypeVariantScore; inferVariantIter = inferGenotypeVariantVertexs.begin(); for (; inferVariantIter!=inferGenotypeVariantVertexs.end(); inferVariantIter++) { int v = *inferVariantIter; long double variantScore = 0; for (int i=0; i<numGenotypes; i++) { set<Vertex> variantVertexInGenotype = genotypeVariantVertexs[i]; if (variantVertexInGenotype.find(v)!=variantVertexInGenotype.end()) variantScore += genotypePosteriors[i]; } inferGenotypeVariantScore[v] = variantScore; } // save variant result inferVariantIter = inferGenotypeVariantVertexs.begin(); for (; inferVariantIter!=inferGenotypeVariantVertexs.end(); inferVariantIter++) { GenericVariant result; int v = *inferVariantIter; int a = mapVertexToAllele[v]; int variantChrID; int variantChrPos; vector<int> haploidType = inferGenotypeVariantHaploidType[v]; for (int j=0; j<settingForPyroHMMsnp.m_ploidy; j++) { if (haploidType[j]==0) { int g = consensusGraph.m_genomePosition[v]; Allele allele; allele.m_allele = consensusGraph.m_labels[g]; result.m_alleles.push_back(allele); }else { Allele allele = allelePool[a]; result.m_alleles.push_back(allele); variantChrID = allele.m_chrID; variantChrPos = allele.m_chrPosition; } } result.m_chrID = variantChrID; result.m_chrPosition = variantChrPos; result.m_probScoreRef = genotypePosteriors[0]; result.m_probScoreVar = genotypePosteriors[inferGenotype]; result.m_variantType = VARIANT_SNP; long double variantScore = inferGenotypeVariantScore[v]; if (fabs(1-variantScore)<1e-300) result.m_quality = 3000; else if (variantScore<1e-300) result.m_quality = 0; else result.m_quality = -10*log10(1-variantScore); char refBase; fastaObj.GetBase(result.m_chrID, result.m_chrPosition, refBase); result.m_reference = refBase; for (int i=0; i<result.m_alleles.size(); i++) { if (result.m_alleles[i].m_allele==result.m_reference) result.m_haploidType.push_back(0); else result.m_haploidType.push_back(1); } // filter if (result.m_quality>=snpCallSettings.m_variantQualityFilter) variantResults.push_back(result); } }
int GenericIndividualSnpCall::call(Fasta &fastaObj, BamReader &bamObj, BamRegion &roi, GenericProbabilisticAlignment &probAligner, VariantCallSetting& snpCallSettings, vector<GenericVariant> &variantSet) { RefVector chromosomes = bamObj.GetReferenceData(); // set up genome blocks vector<int> BlockChrID, BlockLeftPos, BlockRightPos; int BlockNumber=setupGenomeBlock(chromosomes, roi, BlockChrID, BlockLeftPos, BlockRightPos); int numSNP = 0; // iterate throught blocks for (int i=0; i<BlockNumber; ++i) { if (m_verbosity>=1) { cout << "processing " << chromosomes[BlockChrID[i]].RefName << ":" << BlockLeftPos[i]+1 << "-" << BlockRightPos[i] << endl; } clock_t startTime = clock(); // genome string BlockGenome; fastaObj.GetSequence(BlockChrID[i], BlockLeftPos[i], BlockRightPos[i], BlockGenome); map<int,list<tuple<char,int,int,double>>> BlockBamData; AlleleSet BlockSnpAlleleCandidates; // profile SNP sites by the simple method simpleSnpCall(BlockGenome, bamObj, BlockChrID[i], BlockLeftPos[i], BlockRightPos[i], BlockSnpAlleleCandidates, BlockBamData); // merge SNP sites to SNP blocks vector<tuple<int,int,list<Allele>>> BlockSnpLoci; mergeSnpSitesToBlocks(BlockSnpAlleleCandidates, BlockSnpLoci); // iterate through Snp locus for (int j=0; j<BlockSnpLoci.size(); j++) { int BlockSnpLeftPos = get<0>(BlockSnpLoci[j]); int BlockSnpRightPos = get<1>(BlockSnpLoci[j]); // it is a SNP site if (BlockSnpRightPos==BlockSnpLeftPos+1) { simpleBayesianSnpCall(fastaObj, bamObj, BlockChrID[i], BlockSnpLeftPos, BlockSnpRightPos, get<2>(BlockSnpLoci[j]), BlockBamData[BlockSnpLeftPos], snpCallSettings, variantSet); }else if (BlockSnpRightPos==BlockSnpLeftPos+2) { for (int pos=BlockSnpLeftPos; pos<BlockSnpRightPos; pos++) { list<Allele> fAlleles = get<2>(BlockSnpLoci[j]); list<Allele> tAlleles; for (list<Allele>::iterator faIter=fAlleles.begin(); faIter!=fAlleles.end(); faIter++) { if (faIter->m_chrPosition==pos) tAlleles.emplace_back(*faIter); } if (!tAlleles.empty()) simpleBayesianSnpCall(fastaObj, bamObj, BlockChrID[i], pos, pos+1, tAlleles, BlockBamData[pos], snpCallSettings, variantSet); } } else // it is a MNP site { PyroHMMsnp(fastaObj, bamObj, BlockChrID[i], BlockSnpLeftPos, BlockSnpRightPos, probAligner, get<2>(BlockSnpLoci[j]), snpCallSettings, variantSet); } } clock_t endTime = clock(); if (m_verbosity>=1) { cout << "time elapsed " << ((endTime-startTime)/(double)CLOCKS_PER_SEC/60.) << " minutes"; cout << ", "; cout << "call " << variantSet.size()-numSNP << " SNPs" << endl; } numSNP = variantSet.size(); } return variantSet.size(); }