//****************************************************************************** // ExtractDNA //****************************************************************************** void Bed2Fa::ExtractDNA() { /* Make sure that we can oen all of the files successfully*/ // open the fasta database for reading ifstream faDb(_dbFile.c_str(), ios::in); if ( !faDb ) { cerr << "Error: The requested fasta database file (" << _dbFile << ") could not be opened. Exiting!" << endl; exit (1); } // open and memory-map genome file FastaReference *fr = new FastaReference; bool memmap = true; fr->open(_dbFile, memmap); BED bed, nullBed; string sequence; _bed->Open(); while (_bed->GetNextBed(bed)) { if (_bed->_status == BED_VALID) { // make sure we are extracting >= 1 bp if (bed.zeroLength == false) { size_t seqLength = fr->sequenceLength(bed.chrom); // seqLength > 0 means chrom was found in index. // seqLength == 0 otherwise. if (seqLength) { // make sure this feature will not exceed the end of the chromosome. if ( (bed.start <= seqLength) && (bed.end <= seqLength) ) { int length = bed.end - bed.start; sequence = fr->getSubSequence(bed.chrom, bed.start, length); ReportDNA(bed, sequence); } else { cerr << "Feature (" << bed.chrom << ":" << bed.start << "-" << bed.end << ") beyond the length of " << bed.chrom << " size (" << seqLength << " bp). Skipping." << endl; } } else { cerr << "WARNING. chromosome (" << bed.chrom << ") was not found in the FASTA file. Skipping."<< endl; } } // handle zeroLength else { cerr << "Feature (" << bed.chrom << ":" << bed.start+1 << "-" << bed.end-1 << ") has length = 0, Skipping." << endl; } bed = nullBed; } } _bed->Close(); }
void getAlignment(Variant& var, FastaReference& reference, string& ref, vector<AltAlignment>& alignments, int window) { // default alignment params float matchScore = 10.0f; float mismatchScore = -9.0f; float gapOpenPenalty = 25.0f; float gapExtendPenalty = 3.33f; // establish reference sequence string pad = string(window/2, 'Z'); string leftFlank = reference.getSubSequence(var.sequenceName, var.zeroBasedPosition() - window/2, window/2); string rightFlank = reference.getSubSequence(var.sequenceName, var.zeroBasedPosition() + var.ref.size(), window/2); ref = pad + leftFlank + var.ref + rightFlank + pad; // and iterate through the alternates, generating alignments for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { string alt = pad + leftFlank + *a + rightFlank + pad; CSmithWatermanGotoh sw(matchScore, mismatchScore, gapOpenPenalty, gapExtendPenalty); unsigned int referencePos; string cigar; sw.Align(referencePos, cigar, ref, alt); alignments.push_back(AltAlignment(referencePos, alt, cigar)); } }
// Rebuilds RefSeq position by position.  Wherever cigarString holds 'Y' the
// reference base is re-fetched from Reff at the position following the last
// non-'Y' position seen (upper-cased); everywhere else the existing RefSeq
// base is kept and the current chromosome/position are remembered.
// Progress is traced to stdout, and write()/writeVertical() are called before
// and write() after the rebuild.
void SamRead::FixTandemRef()
{
    cout << "FOUND TANDEM" << endl;
    write();
    writeVertical();

    string prevChr = "nope"; // chromosome of the last non-'Y' column
    int prevPos = -1;        // reference position of the last column handled
    string rebuilt = "";

    for (int idx = 0; idx < seq.size(); idx++)
    {
        cout << idx << " " << seq[idx] << " " << cigarString[idx] << " " << RefSeq[idx] << endl;

        if (cigarString[idx] != 'Y')
        {
            // ordinary column: keep the base and remember where we are
            rebuilt += RefSeq[idx];
            prevChr = ChrPositions[idx];
            prevPos = Positions[idx];
            continue;
        }

        // 'Y' column: take the next reference base after the last known one
        rebuilt += toupper(Reff.getSubSequence(prevChr, prevPos + 1, 1).c_str()[0]);
        prevPos++;
    }

    RefSeq = rebuilt;
    cout << "done tandtem fix" << endl;
    write();
}
int main(int argc, char** argv) { string vcfFileName; string fastaFileName; int windowsize = 30; if (argc == 1) printSummary(argv); int c; while (true) { static struct option long_options[] = { /* These options set a flag. */ //{"verbose", no_argument, &verbose_flag, 1}, {"help", no_argument, 0, 'h'}, {"window-size", required_argument, 0, 'w'}, {"reference", required_argument, 0, 'r'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; c = getopt_long (argc, argv, "hw:r:", long_options, &option_index); if (c == -1) break; switch (c) { case 'w': windowsize = atoi(optarg); break; case 'r': fastaFileName = string(optarg); break; case 'h': printSummary(argv); break; case '?': printSummary(argv); exit(1); break; default: abort (); } } VariantCallFile variantFile; string inputFilename; if (optind == argc - 1) { inputFilename = argv[optind]; variantFile.open(inputFilename); } else { variantFile.open(std::cin); } if (!variantFile.is_open()) { cerr << "could not open VCF file" << endl; exit(1); } FastaReference reference; if (fastaFileName.empty()) { cerr << "a reference is required for haplotype allele generation" << endl; exit(1); } reference.open(fastaFileName); // pattern // when variants are within windowSize from each other, build up local haplotypes // establish all the haplotypes which exist within the window using genotypes+allele#+position map // generate a haplotype allele string for each unique haplotype // for completeness retain phasing information in the genotypes // write a new VCF record in which there are haplotype alleles and correctly described genotypes for each sample // if the variants are outside of the windowSize, just write out the record Variant var(variantFile); Variant outputVar(variantFile); cout << variantFile.header << endl; // get the first distances vector<Variant> cluster; while (variantFile.getNextVariant(var) || !cluster.empty()) { bool haplotypeCluster = false; if (variantFile.done()) { if 
(cluster.size() >= 1) { haplotypeCluster = true; } else { cout << cluster.front() << endl; cluster.clear(); } } else if (isPhased(var)) { if (cluster.empty() || cluster.back().sequenceName == var.sequenceName && var.position - cluster.back().position + cluster.back().ref.size() - 1 <= windowsize) { cluster.push_back(var); } else { if (cluster.size() == 1) { cout << cluster.front() << endl; cluster.clear(); if (!variantFile.done()) { cluster.push_back(var); } } else { haplotypeCluster = true; } } } else { // not phased if (cluster.empty()) { cout << var << endl; } else if (cluster.size() == 1) { cout << cluster.front() << endl; cout << var << endl; } else { haplotypeCluster = true; } } // we need to deal with the current cluster, as our next var is outside of bounds // process the last cluster if it's more than 1 var if (haplotypeCluster) { /* cerr << "cluster: "; for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) { cerr << " " << v->position; } cerr << endl; */ // generate haplotype alleles and genotypes! 
// get the reference sequence across the haplotype in question string referenceHaplotype = reference.getSubSequence(cluster.front().sequenceName, cluster.front().position - 1, cluster.back().position + cluster.back().ref.size() - cluster.front().position); // establish what haplotypes there are by parsing the (phased) genotypes across the samples over these records map<string, vector<vector<int> > > sampleHaplotypes; for (vector<string>::iterator s = var.sampleNames.begin(); s != var.sampleNames.end(); ++s) { // build the haplotype using the genotype fields in the variant cluster // only build haplotypes for samples with complete information string& sampleName = *s; vector<vector<int> >& haplotypes = sampleHaplotypes[sampleName]; bool completeCoverage = true; // ensure complete genotype coverage over the haplotype cluster for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) { if (v->samples.find(sampleName) == v->samples.end() || v->samples[sampleName].find("GT") == v->samples[sampleName].end()) { completeCoverage = false; break; } } if (!completeCoverage) { continue; // skip samples without complete coverage } // what's the ploidy? 
{ string& gt = cluster.front().samples[sampleName]["GT"].front(); vector<string> gtspec = split(gt, "|"); for (vector<string>::iterator g = gtspec.begin(); g != gtspec.end(); ++g) { vector<int> haplotype; haplotypes.push_back(haplotype); } } for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) { string& gt = v->samples[sampleName]["GT"].front(); vector<string> gtspec = split(gt, "|"); vector<string>::iterator g = gtspec.begin(); for (vector<vector<int> >::iterator h = haplotypes.begin(); h != haplotypes.end(); ++h, ++g) { int j; convert(*g, j); h->push_back(j); } } } set<vector<int> > uniqueHaplotypes; for (map<string, vector<vector<int> > >::iterator hs = sampleHaplotypes.begin(); hs != sampleHaplotypes.end(); ++hs) { vector<vector<int> >& haps = hs->second; for (vector<vector<int> >::iterator h = haps.begin(); h != haps.end(); ++h) { uniqueHaplotypes.insert(*h); } } // write new haplotypes map<vector<int>, string> haplotypeSeqs; map<vector<int>, int> haplotypeIndexes; map<int, string> alleles; int impossibleHaplotypes = 0; // always include the reference haplotype as 0 // when we come to it in the haplotypes, we'll ignore it int alleleIndex = 1; for (set<vector<int> >::iterator u = uniqueHaplotypes.begin(); u != uniqueHaplotypes.end(); ++u) { /* for (vector<int>::const_iterator z = u->begin(); z != u->end(); ++z) { cerr << *z; } cerr << endl; */ string haplotype = referenceHaplotype; bool isreference = true; bool impossibleHaplotype = false; int referenceInsertOffset = 0; int j = 0; // index into variant cluster int lastpos = 0; int lastrefend = 0; for (vector<int>::const_iterator z = u->begin(); z != u->end(); ++z, ++j) { int i = *z; if (i != 0) { isreference = false; Variant& vartoInsert = cluster.at(j); string& alternate = vartoInsert.alleles.at(i); if (vartoInsert.position < lastrefend) { cerr << "impossible haplotype, overlapping alleles at " << vartoInsert.sequenceName << ":" << vartoInsert.position << endl; impossibleHaplotype = 
true; break; } else { //cerr << vartoInsert.position << " " << cluster.front().position + referenceInsertOffset << endl; //cerr << "replacing " << vartoInsert.ref << " at " << vartoInsert.position - cluster.front().position + referenceInsertOffset << " with " << alternate << endl; haplotype.replace(vartoInsert.position - cluster.front().position + referenceInsertOffset, vartoInsert.ref.size(), alternate); if (alternate.size() != vartoInsert.ref.size()) { referenceInsertOffset += alternate.size() - vartoInsert.ref.size(); } lastpos = vartoInsert.position; lastrefend = vartoInsert.position + vartoInsert.ref.size(); } } } if (impossibleHaplotype) { ++impossibleHaplotypes; haplotypeIndexes[*u] = -1; // indicates impossible haplotype impossibleHaplotype = false; } else if (isreference) { alleles[0] = haplotype; haplotypeIndexes[*u] = 0; } else { alleles[alleleIndex] = haplotype; haplotypeIndexes[*u] = alleleIndex; ++alleleIndex; } haplotypeSeqs[*u] = haplotype; // if there's not a reference allele, add it if (alleles.find(0) == alleles.end()) { alleles[0] = referenceHaplotype; // nb, there is no reference haplotype among // the samples, so we don't have to add it to // the haplotypeIndexes } } outputVar.ref = alleles[0]; outputVar.alt.clear(); for (int i = 1; i < alleleIndex; ++i) { outputVar.alt.push_back(alleles[i]); } outputVar.sequenceName = cluster.front().sequenceName; outputVar.position = cluster.front().position; outputVar.filter = "."; outputVar.id = "."; outputVar.info = cluster.front().info; outputVar.samples.clear(); outputVar.format = cluster.front().format; // now the genotypes for (vector<string>::iterator s = var.sampleNames.begin(); s != var.sampleNames.end(); ++s) { string& sampleName = *s; vector<string> gt; vector<vector<int> > & hs = sampleHaplotypes[sampleName]; for (vector<vector<int> >::iterator h = hs.begin(); h != hs.end(); ++h) { int hi = haplotypeIndexes[*h]; if (hi != -1) { gt.push_back(convert(hi)); } else { // nonexistent or impossible 
haplotype gt.push_back("."); } } if (gt.size() != 0) { outputVar.samples[sampleName]["GT"].push_back(join(gt, "|")); } } if (cluster.size() - impossibleHaplotypes < 2) { for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) { cout << *v << endl; } } else { if (!outputVar.alt.empty()) { cout << outputVar << endl; } else { cerr << "no alternate alleles remain at " << outputVar.sequenceName << ":" << outputVar.position << " after haplotype validation" << endl; } } cluster.clear(); if (!variantFile.done()) cluster.push_back(var); } } exit(0); // why? return 0; }
int main (int argc, char** argv) { // Print Commandline string ss(argv[0]); // convert Char to String string commandline = "##Print Command line " + ss; int c; FastaReference* reference = NULL; int minbaseQ = 10; //default int windowlen = 40; //by default string regionstr; string RegionFile; string bamfile; bool STdin = false; bool has_region = false; bool has_regionFile = false; bool has_bamfile = false; bool has_ref = false; int ploidy = 2; bool SetLowComplexityRegionSWGapExt = false; bool SetLowComplexityRegion = false; if (argc < 2) { printSummary(argv); exit(1); } while (true) { static struct option long_options[] = { {"help", no_argument, 0, 'h'}, {"ploidy", required_argument, 0, 'p'}, {"window-size", required_argument, 0, 'w'}, {"reference", required_argument, 0, 'f'}, {"min-base-quality", required_argument, 0,'q'}, {"Region", required_argument, 0, 'R'}, {"STdin", no_argument, 0, 's'}, {"bam", required_argument, 0, 'b'}, {"Repeat-Extgap", no_argument, 0, 'E'}, {"LowCompex", no_argument, 0, 'l'}, {0, 0, 0, 0} }; int option_index = 0; c = getopt_long (argc, argv, "hslEf:q:w:s:r:R:p:b:", long_options, &option_index); /* Detect the end of the options. 
*/ if (c == -1) break; switch (c) { case 'f': reference = new FastaReference(optarg); // will exit on open failure commandline = commandline + " -f " + optarg; has_ref = true; break; case 'b': has_bamfile = true; bamfile = optarg; commandline = commandline + " -b " + optarg; break; case 'r': regionstr = optarg; has_region = true; commandline = commandline + " -r " + optarg; break; case 'R': RegionFile = optarg; has_regionFile = true; commandline = commandline + " -R " + optarg; break; case 's': STdin = true; commandline = commandline + " -s "; break; case 'q': minbaseQ = atoi(optarg); commandline = commandline + " -q " + optarg; break; case 'w': windowlen = atoi(optarg); commandline = commandline + " -w " + optarg; break; case 'p': ploidy = atoi(optarg); commandline = commandline + " -p " + optarg; break; case 'E': SetLowComplexityRegionSWGapExt = true; commandline = commandline + " -E "; break; case 'l': SetLowComplexityRegion = true; commandline = commandline + " -l "; break; case 'h': printSummary(argv); commandline = commandline + " -h "; exit(0); break; case '?': printSummary(argv); exit(1); break; default: abort(); break; } } //// Open Error log files ofstream cerrlog("bonsaiReport.txt"); streambuf *cerrsave = std::cerr.rdbuf(); // Redirect stream buffers if (cerrlog.is_open()) cerr.rdbuf(cerrlog.rdbuf()); cerr << commandline << endl; //Check for Reference Fasta sequence if (!has_ref) { cerr << "no FASTA reference provided, cannot realign" << endl; exit(1); } ////Check for reader BamReader reader; if (STdin == true) { if (!reader.Open("stdin")) { cerr << "could not open stdin bam for reading" << endl; cerr << reader.GetErrorString() << endl; reader.Close(); printSummary(argv); } } else { if (has_bamfile == true) { if (!reader.Open(bamfile)) { cerr << "ERROR: could not open bam files from stdin ... 
Aborting" << endl; cerr << reader.GetErrorString() << endl; reader.Close(); printSummary(argv); } if ( !reader.LocateIndex() ) reader.CreateIndex(); } else { cerr << "--bam flag is set but no bamfile is provided... Aborting" << endl; reader.Close(); printSummary(argv); } } //// Check Region Tags if ( (has_regionFile == true) && (has_region == true) ) { cerr << "ERROR: You provide both region and has provide a Set Region List... Aborting" << endl; exit(1); } //// store the names of all the reference sequences in the BAM file vector<RefData> referencedata = reader.GetReferenceData(); //// Store Region LIST vector<BamRegion> regionlist; if (has_region == true) { BamRegion region; ParseRegionString(regionstr, reader, region); regionlist.push_back(region); } else if (has_regionFile == true) { ifstream RG(RegionFile.c_str(), ios_base::in); string line; while(getline(RG,line)) { BamRegion region; ParseRegionString(line, reader, region); regionlist.push_back(region); } RG.close(); } else if ( (has_regionFile == false) && (has_region == false) ) { for (int i= 0; i < (int)referencedata.size(); i++) { string regionstr = referencedata.at(i).RefName; BamRegion region; ParseRegionString(regionstr, reader, region); if (!reader.SetRegion(region)) // Bam region will get [0,101) = 0 to 100 => [closed, half-opened) { cerr << "ERROR: set region " << regionstr << " failed. Check that REGION describes a valid range... 
Aborting" << endl; reader.Close(); exit(1); } else regionlist.push_back(region); } } //// BamWriter writer; if (!writer.Open("stdout", reader.GetHeaderText(), reader.GetReferenceData())) { cerr << "could not open stdout for writing" << endl; exit(1); } //// Smallest start position and Largest end position for Req Seq vector<RefData>::iterator refdataIter = referencedata.begin(); vector<BamRegion>::iterator regionListIter = regionlist.begin(); // CLASS RealignFunctionsClass RealignFunction; map<int, string> RefIDRedName; vector<SalRealignInfo> AlGroups; multimap<int, BamAlignment> SortRealignedAlignmentsMultimap; int refid = 0; BamAlignment alignment; bool IsNextAlignment = reader.GetNextAlignment(alignment); //cerr << " " << alignment.Name << " Chr " << alignment.RefID << " Startpos: " << alignment.Position << " Endpos: " << alignment.GetEndPosition() << " Length: " << alignment.Length << endl; int windowrealigned = 0; int TotalWindowDetected = 0; int TotalReadsAligned = 0; int TotalWindow = 0; int TotalReads = 0; while (refdataIter != referencedata.end() ) { string refname = refdataIter->RefName; RefIDRedName[refid] = refname; int reflength = refdataIter->RefLength; int winstartpos, winendpos; int AllowableBasesInWindow = 1; bool nextChrName = false; cerr << "##HeaderINFO: RefID = " << refdataIter->RefName << "\t" << "RefLen = " << reflength << endl; while (nextChrName == false ) { vector<int> minmaxRefSeqPos; bool IsPassDetectorNoRealignment = false; minmaxRefSeqPos.push_back(-1); minmaxRefSeqPos.push_back(0); //cerr << " region: " << (*regionListIter).LeftRefID << " : " << (*regionListIter).LeftPosition << " .. 
" << (*regionListIter).RightPosition << endl; if ((refid == (int)referencedata.size() - 1) && ((*regionListIter).LeftRefID == refid) && ((has_region==true) || (has_regionFile==true)) ) { //// if ( (has_region == true) || (has_regionFile == true) ) { winstartpos = (*regionListIter).LeftPosition; winendpos = winstartpos + windowlen - 1; reflength = (*regionListIter).RightPosition; if (reflength < winendpos) reflength = winendpos; // Get Next Alignment First if ( (refid == alignment.RefID) && (winstartpos == (*regionListIter).LeftPosition) && (IsNextAlignment == false) ) IsNextAlignment = reader.GetNextAlignment(alignment); } else if (has_region == false) { winstartpos = 0; winendpos = winstartpos + windowlen - 1; // Get Next Alignment First if ( (refid == alignment.RefID) && (winstartpos == 0) && (IsNextAlignment == false) ) IsNextAlignment = reader.GetNextAlignment(alignment); } //cerr << " winstart: " << winstartpos << " ; winend: " << winendpos; //cerr << " " << alignment.Name << " Chr " << alignment.RefID << " Startpos: " << alignment.Position << " Endpos: " << alignment.GetEndPosition() << " Length: " << alignment.Length << endl; //// while ((winstartpos < reflength)) { //// Check window end position if (winendpos > reflength) winendpos = reflength; // Reinitialized unsigned int NewReadMappedcount = 0; //// Save and Erase alignments that are outside of window (Deque?) 
if (!AlGroups.empty()) { minmaxRefSeqPos.at(0) = -1; minmaxRefSeqPos.at(1) = 0; //cerr << "#Start: Keep alignments with start position exceed the right end of the window/Region " << endl; vector<SalRealignInfo>::iterator Iter = AlGroups.begin(); while (Iter != AlGroups.end()) { // Erase alignment s if ((*Iter).al.GetEndPosition() < winstartpos) { //cerr << " ToWrite: " << (*Iter).second.size() << " ; " << (*Iter).al.Name << " ; " << (*Iter).al.Position << " < " << winstartpos << " : " << (*Iter).al.GetEndPosition() << endl; SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > ((*Iter).al.Position, (*Iter).al)); AlGroups.erase(Iter); //cerr << " ToWrite: DONE " << endl; } else { string referenceSequence = reference->getSubSequence(RefIDRedName[(*Iter).al.RefID], (*Iter).al.Position, 2*(*Iter).al.Length); if ((*Iter).HasRealign == true ) { (*Iter).currentReadPosition = 0; (*Iter).currentGenomeSeqPosition = 0; (*Iter).currentAlPosition = (*Iter).al.Position; (*Iter).cigarindex = 0; } (*Iter).CigarSoftclippingLength = 0; SalRealignInfo talr = (*Iter); //cerr << " ToKEEP: " << (*Iter).al.Name << " ; " << (*Iter).al.Position << " < " << winstartpos << " : " << (*Iter).al.GetEndPosition() << endl; RealignFunction.ParseAlignmentsAndExtractVariantsByBaseQualv(AlGroups, talr, Iter, (*Iter).al, referenceSequence, minmaxRefSeqPos, winstartpos, winendpos, (float) minbaseQ, false); ++Iter; //Increment iterator } } } // Write Sorted Alignments that are outside of window //cerr << "SortRealignedAlignmentsMultimap: " << SortRealignedAlignmentsMultimap.size() << " minmaxRefSeqPos.at(0)= " << minmaxRefSeqPos.at(0) << endl; if (!SortRealignedAlignmentsMultimap.empty()) // && (winWrite < winstartpos ) ) { //cerr << "#Start: Write alignments and delete alignments with start position exceed the right end of the window/Region " << endl; multimap<int, BamAlignment>::iterator sraIter = SortRealignedAlignmentsMultimap.begin(); while (sraIter != 
SortRealignedAlignmentsMultimap.end()) { //cerr << " (*sraIter).first= " << (*sraIter).first << " minmaxRefSeqPos.at(0)= " << minmaxRefSeqPos.at(0) << " winstartpos - ((windowlen - 1)*0.9)= " << winstartpos - ((windowlen - 1)*0.9) << endl; if (((float) (*sraIter).first < floor((float) (winstartpos - ((windowlen - 1)*0.9)))) && ((minmaxRefSeqPos.at(0) > 0) && ((*sraIter).first < minmaxRefSeqPos.at(0)))) { //writer.SaveAlignment((*sraIter).second); // Why sometimes, it doesn't work ????? if (!writer.SaveAlignment((*sraIter).second)) cerr << writer.GetErrorString() << endl; SortRealignedAlignmentsMultimap.erase(sraIter++); } else { ++sraIter; } } //cerr << "#Done: Write alignments and delete alignments with start position exceed the right end of the window/Region " << endl; } //cerr << " winstart: " << winstartpos << " ; winend: " << winendpos; //cerr << " " << alignment.Name << " Chr " << alignment.RefID << " Startpos: " << alignment.Position << " Endpos: " << alignment.GetEndPosition() << " Length: " << alignment.Length << endl; //cerr << ": " << alignment.RefID << " :" << RefIDRedName[alignment.RefID] << " : " << RefIDRedName[alignment.RefID] << endl; //cerr << "Start: Gather aligmenets that lie (fully or partially) within the window frame and group INDELs if there are ... 
" << endl; // Gather Reads within a window frame while ((IsNextAlignment) && (refid == alignment.RefID)) // Neeed more conditions { if (SetLowComplexityRegion == true) { string sequenceInWindow = reference->getSubSequence(RefIDRedName[alignment.RefID], winstartpos, (winendpos-winstartpos+1) ); if (IsWindowInRepeatRegion(sequenceInWindow) == true) { if ((IsWithinWindow(alignment, winstartpos, winendpos, AllowableBasesInWindow)) == 0) { TotalReads++; if (alignment.IsMapped()) { string referenceSequence = reference->getSubSequence(RefIDRedName[alignment.RefID], alignment.Position, 2*alignment.Length); vector<SalRealignInfo>::iterator tIter; SalRealignInfo alr; alr.al = alignment; alr.currentReadPosition = 0; alr.currentGenomeSeqPosition = 0; alr.currentAlPosition = alignment.Position; alr.cigarindex = 0; alr.HasRealign = false; alr.CigarSoftclippingLength = 0; string str = "ZZZZZZZZZZZZZZZZZ"; if (alignment.Name.find(str) != string::npos) { stringstream cigar; for (vector<CigarOp>::const_iterator cigarIter = alignment.CigarData.begin(); cigarIter != alignment.CigarData.end(); ++cigarIter) cigar << cigarIter->Length << cigarIter->Type; string cigarstr = cigar.str(); cerr << " TRACKING: " << alignment.RefID << " " << alignment.Name << " pos: " << alignment.Position << " cigar: " << cigarstr << endl; } RealignFunction.ParseAlignmentsAndExtractVariantsByBaseQualv(AlGroups, alr, tIter, alignment, referenceSequence, minmaxRefSeqPos, winstartpos, winendpos, (float) minbaseQ, true); NewReadMappedcount++; } else { SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > (alignment.Position, alignment)); cerr << "UNmapped : " << alignment.Name << endl; } } else if ((IsWithinWindow(alignment, winstartpos, winendpos, AllowableBasesInWindow)) == 1) { SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > (alignment.Position, alignment)); } else break; } else { if ((IsWithinWindow(alignment, winstartpos, winendpos, AllowableBasesInWindow)) < 2) 
SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > (alignment.Position, alignment)); else break; } } else // (SetLowComplexityRegion == false) { if ((IsWithinWindow(alignment, winstartpos, winendpos, AllowableBasesInWindow)) == 0) { TotalReads++; if (alignment.IsMapped()) { string referenceSequence = reference->getSubSequence(RefIDRedName[alignment.RefID], alignment.Position, 2 * alignment.Length); vector<SalRealignInfo>::iterator tIter; SalRealignInfo alr; alr.al = alignment; alr.currentReadPosition = 0; alr.currentGenomeSeqPosition = 0; alr.currentAlPosition = alignment.Position; alr.cigarindex = 0; alr.HasRealign = false; alr.CigarSoftclippingLength = 0; string str = "ZZZZZZZZZZZZZZZZZ"; if (alignment.Name.find(str) != string::npos) { stringstream cigar; for (vector<CigarOp>::const_iterator cigarIter = alignment.CigarData.begin(); cigarIter != alignment.CigarData.end(); ++cigarIter) cigar << cigarIter->Length << cigarIter->Type; string cigarstr = cigar.str(); cerr << " TRACKING: " << alignment.RefID << " " << alignment.Name << " pos: " << alignment.Position << " cigar: " << cigarstr << endl; } RealignFunction.ParseAlignmentsAndExtractVariantsByBaseQualv(AlGroups, alr, tIter, alignment, referenceSequence, minmaxRefSeqPos, winstartpos, winendpos, (float) minbaseQ, true); //cerr << " winstart: " << winstartpos << " ; winend: " << winendpos; //cerr << " INDEL: " << alignment.RefID << " " << alignment.Name << " pos: " << alignment.Position << " Length: " << alignment.Length << " CIGARstr: " << cigarstr << endl; NewReadMappedcount++; } else { SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > (alignment.Position, alignment)); cerr << "UNmapped : " << alignment.Name << endl; } } else if ((IsWithinWindow(alignment, winstartpos, winendpos, AllowableBasesInWindow)) == 1) { SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > (alignment.Position, alignment)); } else break; } ////Get next alignment IsNextAlignment = 
reader.GetNextAlignment(alignment); } //cerr << "Done: Gather aligmenets that lie (fully or partially) within the window frame and group INDELs if there are ... " << endl; //// Detector Corner bool ToRealign = MeetIndelDetectorThresholdv(AlGroups); cerr << "MeetIndelDetectorThresholdv(AlGroups).size()= " << AlGroups.size() << endl; // ************** if (ToRealign) { //cerr << " ToRealign: " << refdataIter->RefName << "\t" << reflength << "\t" << winstartpos << "\t" << winendpos << "\t" << AlGroups.size() << endl; //cerr << " minmaxRefSeqPos.at(1)= " << minmaxRefSeqPos.at(1) << " minmaxRefSeqPos.at(0)= " << minmaxRefSeqPos.at(0) << endl; ////// Perform Realign routines int TotalAlR = 0; // Total number of alignments to be realigned int NumAlR = 0; // Now many alignments are aligned TotalWindowDetected++; cerr << "#Start: Meet Threshold, Realigning ... " << endl; if (minmaxRefSeqPos.at(1) < winendpos) minmaxRefSeqPos.at(1) = winendpos; if (minmaxRefSeqPos.at(0) > winstartpos) minmaxRefSeqPos.at(0) = winstartpos; bool IsToRealign = RealignFunction.PruningByNaiveSelectionProcedureAndConstructHaplotypes2(winstartpos, winendpos, refid, refname, minmaxRefSeqPos, reference); if (IsToRealign == true) { RealignFunction.SelectHaplotypeCandidates_SmithWatermanBSv(AlGroups, minmaxRefSeqPos, SetLowComplexityRegionSWGapExt); minmaxRefSeqPos.at(0) = -1; minmaxRefSeqPos.at(1) = 0; int nextwinstartpos = winendpos + 1; int nextwinendpos = winstartpos + windowlen - 1; if (nextwinendpos > reflength) nextwinendpos = reflength; //cerr << " Before Realign : " << SortRealignedAlignmentsMultimap.size() << endl; RealignFunction.AdjustCigarsWRTChosenMultipleHaplotypesAndPrepareAlignmentsTobeWrittenOut(AlGroups, SortRealignedAlignmentsMultimap, reference, RefIDRedName, minmaxRefSeqPos, nextwinstartpos, nextwinendpos, minbaseQ, TotalAlR, NumAlR, ploidy); IsPassDetectorNoRealignment = false; // Set flag to false to deactivate write functions //cerr << " After Realign : " << 
SortRealignedAlignmentsMultimap.size() << endl; TotalReadsAligned += NumAlR; if (NumAlR > 0) // Realignment done windowrealigned++; } else cerr << "#Done: Meet Threshold, Realigning ... " << endl; } if (NewReadMappedcount > 0) TotalWindow++; RealignFunction.Clear(); //// Move the window frame winstartpos = winendpos + 1; winendpos = winstartpos + windowlen - 1; } //// Save and Erase remaining alignments that are outside of window (Deque?) if ((!AlGroups.empty())) { cerr << "#Start: Write Remaining alignments and delete all alignments" << endl; for (vector<SalRealignInfo>::iterator Iter = AlGroups.begin(); Iter != AlGroups.end(); ++Iter) { //cerr << " Remain alignment start: " << (*Iter).al.Name << " " << Iter->al.Position << " < " << winstartpos << " " << winendpos << endl; SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > ((*Iter).al.Position, (*Iter).al)); } cerr << "#Done: Write Remaining alignments and delete all alignments" << endl; } AlGroups.clear(); // Write Sorted remaining Alignments that are outside of window if (!SortRealignedAlignmentsMultimap.empty()) { for (multimap<int, BamAlignment>::iterator sraIter = SortRealignedAlignmentsMultimap.begin(); sraIter != SortRealignedAlignmentsMultimap.end(); ++sraIter) { //writer.SaveAlignment((*sraIter).second); if (!writer.SaveAlignment((*sraIter).second)) cerr << writer.GetErrorString() << endl; } SortRealignedAlignmentsMultimap.clear(); } } ++regionListIter; if ((*regionListIter).LeftRefID > refid) nextChrName = true; } //// If End of the chromosome position //// increament iterator ++refdataIter; ++refid; } reader.Close(); writer.Close(); cerr << "##-Completed- " << endl; cerr << " Total Reads processed = " << TotalReads << endl; cerr << " Total Reads Aligned = " << TotalReadsAligned << endl; cerr << " Total Window processed = " << TotalWindow << endl; cerr << " Total Window Detected = " << TotalWindowDetected << endl; cerr << " Total Windows Aligned = " << windowrealigned << endl; // Restore 
cerr's stream buffer before terminating if (cerrlog.is_open()) cerr.rdbuf(cerrsave); commandline.clear(); return 0; }
int main (int argc, char** argv) { std::string command; std::string fastaFileName; std::string seqname; std::string longseqname; bool dump = false; bool buildIndex = false; // flag to force index building bool printEntropy = false; // entropy printing bool readRegionsFromStdin = false; std::string region; int c; while (true) { static struct option long_options[] = { /* These options set a flag. */ {"help", no_argument, 0, 'h'}, {"index", no_argument, 0, 'i'}, {"entropy", no_argument, 0, 'e'}, {"region", required_argument, 0, 'r'}, {"stdin", no_argument, 0, 'c'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; c = getopt_long (argc, argv, "hciedr:", long_options, &option_index); /* Detect the end of the options. */ if (c == -1) break; switch (c) { case 0: /* If this option set a flag, do nothing else now. */ if (long_options[option_index].flag != 0) break; printf ("option %s", long_options[option_index].name); if (optarg) printf (" with arg %s", optarg); printf ("\n"); break; case 'e': printEntropy = true; break; case 'c': readRegionsFromStdin = true; break; case 'i': buildIndex = true; break; case 'r': region = optarg; break; case 'd': dump = true; break; case 'h': printSummary(); exit(0); break; case '?': /* getopt_long already printed an error message. */ printSummary(); exit(1); break; default: abort (); } } /* Print any remaining command line arguments (not options). */ if (optind < argc) { //cerr << "fasta file: " << argv[optind] << std::endl; fastaFileName = argv[optind]; } else { std::cerr << "Please specify a FASTA file." 
<< std::endl; printSummary(); exit(1); } if (buildIndex) { FastaIndex* fai = new FastaIndex(); //cerr << "generating fasta index file for " << fastaFileName << std::endl; fai->indexReference(fastaFileName); fai->writeIndexFile((std::string) fastaFileName + fai->indexFileExtension()); } std::string sequence; // holds sequence so we can optionally process it FastaReference fr; fr.open(fastaFileName); if (dump) { for (vector<std::string>::iterator s = fr.index->sequenceNames.begin(); s != fr.index->sequenceNames.end(); ++s) { std::cout << *s << "\t" << fr.getSequence(*s) << std::endl; } return 0; } if (region != "") { FastaRegion target(region); sequence = fr.getTargetSubSequence(target); } if (readRegionsFromStdin) { std::string regionstr; while (getline(cin, regionstr)) { FastaRegion target(regionstr); if (target.startPos == -1) { std::cout << fr.getSequence(target.startSeq) << std::endl; } else { std::cout << fr.getSubSequence(target.startSeq, target.startPos - 1, target.length()) << std::endl; } } } else { if (sequence != "") { if (printEntropy) { if (sequence.size() > 0) { std::cout << shannon_H((char*) sequence.c_str(), sequence.size()) << std::endl; } else { std::cerr << "please specify a region or sequence for which to calculate the shannon entropy" << std::endl; } } else { // if no statistical processing is requested, just print the sequence std::cout << sequence << std::endl; } } } return 0; }
int main(int argc, char** argv) { int window = 150; VariantCallFile variantFile; string fastaFileName; int c; while (true) { static struct option long_options[] = { /* These options set a flag. */ //{"verbose", no_argument, &verbose_flag, 1}, {"help", no_argument, 0, 'h'}, {"reference", required_argument, 0, 'r'}, {"window", required_argument, 0, 'w'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; c = getopt_long (argc, argv, "hw:r:", long_options, &option_index); if (c == -1) break; switch (c) { case 'r': fastaFileName = optarg; break; case 'w': window = atoi(optarg); break; case '?': printSummary(argv); exit(1); break; case 'h': printSummary(argv); break; default: abort (); } } if (optind < argc) { string filename = argv[optind]; variantFile.open(filename); } else { variantFile.open(std::cin); } if (!variantFile.is_open()) { cerr << "could not open VCF file" << endl; exit(1); } FastaReference fastaReference; if (fastaFileName.empty()) { cerr << "a reference is required" << endl; exit(1); } else { fastaReference.open(fastaFileName); } /* variantFile.addHeaderLine("##INFO=<ID=TYPE,Number=A,Type=String,Description=\"The type of allele, either snp, mnp, ins, del, or complex.\">"); variantFile.addHeaderLine("##INFO=<ID=LEN,Number=A,Type=Integer,Description=\"allele length\">"); if (!parseFlag.empty()) { variantFile.addHeaderLine("##INFO=<ID="+parseFlag+",Number=0,Type=Flag,Description=\"The allele was parsed using vcfallelicprimitives.\">"); } */ cout << variantFile.header << endl; Variant var(variantFile); while (variantFile.getNextVariant(var)) { // if there is no indel, there is nothing to realign bool hasIndel = false; for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { if (a->size() != var.ref.size()) { hasIndel = true; break; } } if (!hasIndel) { cout << var << endl; continue; } vector<AltAlignment> alignments; string ref; // determine window size to prevent mismapping with SW algorithm int 
currentWindow = window; int scale = 2; if (var.ref.size()*scale > currentWindow) currentWindow = var.ref.size()*scale; for (vector<string>::iterator a = var.alleles.begin(); a != var.alleles.end(); ++a) { if (a->size()*scale > currentWindow) { currentWindow = a->size()*scale; } } // while the entropy of either flank is < some target entropy (~1 is fine), increase the flank sizes while (currentWindow < 2000) { // limit to one step > than this string refTarget = fastaReference.getSubSequence(var.sequenceName, var.position - 1 - currentWindow/2, currentWindow); if (entropy(refTarget.substr(0, refTarget.size()/2)) < 1 || entropy(refTarget.substr(refTarget.size()/2)) < 1) { currentWindow *= scale; } else { break; } } // do the alignments getAlignment(var, fastaReference, ref, alignments, currentWindow); // stably left align the alignments for (vector<AltAlignment>::iterator a = alignments.begin(); a != alignments.end(); ++a) { Cigar cigarBefore = a->cigar; //cerr << a->seq << endl; //cerr << "before : " << a->pos << " " << joinCigar(a->cigar) << endl; long int prev = a->pos; stablyLeftAlign(a->seq, ref, a->cigar, 20, false); //cerr << "after : " << a->pos << " " << joinCigar(a->cigar) << endl; if (a->pos != prev) cerr << "modified alignment @ " << var << endl; } //cout << var << endl; // transform the mappings // chop off leading matching bases // find the range of bp in the alleles // make the new ref allele // make the new alt alleles // emit the var long int newPosition = var.position+currentWindow/2; long int newEndPosition = var.position-currentWindow/2; // check for no-indel case int newLength = var.ref.size(); bool giveUp = false; for (vector<AltAlignment>::iterator a = alignments.begin(); a != alignments.end() && !giveUp; ++a) { // get the first mismatching position Cigar::iterator c = a->cigar.begin(); int rp = 0; int sp = 0; bool hitMismatch = false; int matchingBpAtStart = 0; int matchingBpAtEnd = 0; // will be set to true if the first reference position 
match is broken by a SNP, not an indel bool leadingSNP = false; while (c != a->cigar.end()) { char op = c->second[0]; if (c == a->cigar.begin()) { if (op != 'M') { cerr << "alignment does not start on matched sequence" << endl; cerr << var << endl; exit(1); } int i = 0; for ( ; i < c->first; ++i) { if (ref[i] != a->seq[i]) { leadingSNP = true; break; } } matchingBpAtStart = i; } if (!leadingSNP && c == (a->cigar.begin()+1)) { // if the first thing we run into is an indel, step back, per VCF spec if (op == 'D' || op == 'I') { --matchingBpAtStart; } } if (c == (a->cigar.end()-1)) { if (op != 'M') { // soft clip at end // it'll be hard to interpret this // the alignments sometimes generate this // best thing to do is to move on //cerr << "alignment does not end on matched sequence" << endl; //cout << var << endl; //exit(1); giveUp = true; break; } int i = 0; for ( ; i < c->first; ++i) { if (ref[ref.size()-1-i] != a->seq[a->seq.size()-1-i]) { break; } } matchingBpAtEnd = i; } ++c; } int altMismatchLength = a->seq.size() - matchingBpAtEnd - matchingBpAtStart; int refMismatchLength = (var.ref.size() + currentWindow) - matchingBpAtEnd - matchingBpAtStart; //cerr << "alt mismatch length " << altMismatchLength << endl // << "ref mismatch length " << refMismatchLength << endl; long int newStart = var.position - currentWindow/2 + matchingBpAtStart; long int newEnd = newStart + refMismatchLength; //cerr << "ref should run from " << newStart << " to " << newStart + refMismatchLength << endl; newPosition = min(newStart, newPosition); newEndPosition = max(newEnd, newEndPosition); //cerr << newPosition << " " << newEndPosition << endl; //if (newRefSize < refMismatchLength) newRefSize = refMismatchLength; } // the alignment failed for some reason, continue if (giveUp) { cout << var << endl; continue; } //cerr << "new ref start " << newPosition << " and end " << newEndPosition << " was " << var.position << "," << var.position + var.ref.size() << endl; int newRefSize = newEndPosition 
- newPosition; string newRef = fastaReference.getSubSequence(var.sequenceName, newPosition-1, newRefSize); // get the number of bp to strip from the alts int stripFromStart = currentWindow/2 - (var.position - newPosition); int stripFromEnd = (currentWindow + newRefSize) - (stripFromStart + newRefSize) + (var.ref.size() - newRefSize); //cerr << "strip from start " << stripFromStart << endl; //cerr << "strip from end " << stripFromEnd << endl; vector<string> newAlt; vector<string>::iterator l = var.alt.begin(); bool failedAlt = false; for (vector<AltAlignment>::iterator a = alignments.begin(); a != alignments.end(); ++a, ++l) { int diff = newRef.size() - l->size(); string alt = a->seq.substr(stripFromStart, a->seq.size() - (stripFromEnd + stripFromStart)); newAlt.push_back(alt); if (alt.empty()) failedAlt = true; } // check the before/after haplotypes bool brokenRealignment = false; if (!newRef.empty() && !failedAlt) { int slop = 50; // 50 extra bp! int haplotypeStart = min(var.position, newPosition) - slop; int haplotypeEnd = max(var.position + var.ref.size(), newPosition + newRef.size()) + slop; string referenceHaplotype = fastaReference.getSubSequence(var.sequenceName, haplotypeStart - 1, haplotypeEnd - haplotypeStart); vector<string>::iterator o = var.alt.begin(); vector<string>::iterator n = newAlt.begin(); for ( ; o != var.alt.end() ; ++o, ++n) { // map the haplotypes string oldHaplotype = referenceHaplotype; string newHaplotype = referenceHaplotype; oldHaplotype.replace(var.position - haplotypeStart, var.ref.size(), *o); newHaplotype.replace(newPosition - haplotypeStart, newRef.size(), *n); if (oldHaplotype != newHaplotype) { cerr << "broken left alignment!" 
<< endl << "old " << oldHaplotype << endl << "new " << newHaplotype << endl; cerr << "was: " << var << endl; brokenRealignment = true; } } } // *if* everything is OK, update the variant if (!brokenRealignment && !newRef.empty() && !failedAlt) { var.ref = newRef; var.alt = newAlt; var.position = newPosition; } cout << var << endl; // for each parsedalternate, get the position // build a new vcf record for that position // unless we are already at the position ! // take everything which is unique to that allele (records) and append it to the new record // then handle genotypes; determine the mapping between alleleic primitives and convert to phased haplotypes // this means taking all the parsedAlternates and, for each one, generating a pattern of allele indecies corresponding to it //for (vector<Variant>::iterator v = variants.begin(); v != variants.end(); ++v) { } return 0; }
int main(int argc, char** argv) { int c; string fastaRef; bool keepFailures = false; bool excludeFailures = false; if (argc == 1) printSummary(argv); while (true) { static struct option long_options[] = { /* These options set a flag. */ //{"verbose", no_argument, &verbose_flag, 1}, {"help", no_argument, 0, 'h'}, {"fasta-reference", required_argument, 0, 'f'}, {"exclude-failures", no_argument, 0, 'x'}, {"keep-failures", no_argument, 0, 'k'}, //{"length", no_argument, &printLength, true}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; c = getopt_long (argc, argv, "hxkf:", long_options, &option_index); /* Detect the end of the options. */ if (c == -1) break; switch (c) { case 0: /* If this option set a flag, do nothing else now. */ if (long_options[option_index].flag != 0) break; printf ("option %s", long_options[option_index].name); if (optarg) printf (" with arg %s", optarg); printf ("\n"); break; case 'f': fastaRef = optarg; break; case 'x': excludeFailures = true; break; case 'k': keepFailures = true; break; case 'h': printSummary(argv); exit(0); break; case '?': /* getopt_long already printed an error message. 
*/ printSummary(argv); exit(1); break; default: abort (); } } if (fastaRef.empty()) { cerr << "a FASTA reference sequence must be specified" << endl; exit(1); } FastaReference ref; ref.open(fastaRef); VariantCallFile variantFile; string inputFilename; if (optind == argc - 1) { inputFilename = argv[optind]; variantFile.open(inputFilename); } else { variantFile.open(std::cin); } if (!variantFile.is_open()) { return 1; } if (keepFailures || excludeFailures) { cout << variantFile.header << endl; } Variant var(variantFile); while (variantFile.getNextVariant(var)) { int refstart = var.position - 1; // convert to 0-based string matchedRef = ref.getSubSequence(var.sequenceName, refstart, var.ref.size()); if (var.ref != matchedRef) { if (keepFailures) { cout << var << endl; } else if (!excludeFailures) { cout << "mismatched reference " << var.ref << " should be " << matchedRef << " at " << var.sequenceName << ":" << var.position << endl; } } else if (excludeFailures) { cout << var << endl; } } return 0; }
int main(int argc, char** argv) { string vcfFileName; string fastaFileName; int windowsize = 100; bool includePreviousBaseForIndels = false; bool useMNPs = true; int altwindowsize = 50; // constants for SmithWaterman algorithm float matchScore = 10.0f; float mismatchScore = -9.0f; float gapOpenPenalty = 15.0f; float gapExtendPenalty = 6.66f; bool useEntropy = false; bool useRepeatGapExtendPenalty = false; float repeatGapExtendPenalty = 1; bool adjustVcf = false; string adjustedTag = "remappedCIGAR"; if (argc == 1) printSummary(argv); int c; while (true) { static struct option long_options[] = { /* These options set a flag. */ //{"verbose", no_argument, &verbose_flag, 1}, {"help", no_argument, 0, 'h'}, {"ref-window-size", required_argument, 0, 'w'}, {"reference", required_argument, 0, 'r'}, {"match-score", required_argument, 0, 'm'}, {"mismatch-score", required_argument, 0, 'x'}, {"gap-open-penalty", required_argument, 0, 'o'}, {"gap-extend-penalty", required_argument, 0, 'e'}, {"alt-window-size", required_argument, 0, 's'}, {"entropy-gap-open", no_argument, 0, 'z'}, {"repeat-gap-extend", no_argument, 0, 'R'}, {"adjust-vcf", required_argument, 0, 'a'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. 
*/ int option_index = 0; c = getopt_long (argc, argv, "hza:w:r:m:x:o:e:s:R:", long_options, &option_index); if (c == -1) break; switch (c) { case 'w': windowsize = atoi(optarg); break; case 'a': adjustVcf = true; adjustedTag = optarg; break; case 'r': fastaFileName = string(optarg); break; case 'h': printSummary(argv); break; case 'm': matchScore = atof(optarg); break; case 'x': mismatchScore = atof(optarg); break; case 'o': gapOpenPenalty = atof(optarg); break; case 'e': gapExtendPenalty = atof(optarg); break; case 's': altwindowsize = atoi(optarg); break; case 'z': useEntropy = true; break; case 'R': useRepeatGapExtendPenalty = true; repeatGapExtendPenalty = atof(optarg); break; case '?': printSummary(argv); exit(1); break; default: abort (); } } VariantCallFile variantFile; string inputFilename; if (optind == argc - 1) { inputFilename = argv[optind]; variantFile.open(inputFilename); } else { variantFile.open(std::cin); } if (!variantFile.is_open()) { cerr << "could not open VCF file" << endl; exit(1); } FastaReference freference; if (fastaFileName.empty()) { cerr << "a reference is required" << endl; exit(1); } else { freference.open(fastaFileName); } if (adjustVcf) { vector<string> commandline; for (int i = 0; i < argc; ++i) commandline.push_back(argv[i]); variantFile.addHeaderLine("##INFO=<ID=" + adjustedTag + ",Number=A,Type=String,Description=\"CIGAR when remapped using"+ join(commandline, " ") +"\">"); } cout << variantFile.header << endl; Variant var(variantFile); while (variantFile.getNextVariant(var)) { //if (!adjustVcf) { cout << endl; cout << var << endl; //} map<string, vector<VariantAllele> > variantAlleles; vector<vector<pair<int, char> > > cigars; vector<int> positionDiffs; for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { //if (!adjustVcf) cout << endl; cout << endl; // try to remap locally string reference = freference.getSubSequence(var.sequenceName, var.position - 1 - windowsize, windowsize * 2 + var.ref.size()); // 
passed to sw align unsigned int referencePos; string cigar; string& alternate = *a; vector<VariantAllele>& variants = variantAlleles[alternate]; string alternateQuery = reference.substr(windowsize - altwindowsize, altwindowsize) + alternate + reference.substr(reference.size() - windowsize, altwindowsize); //cout << "REF:\t" << reference << endl; //cout << "ALT:\t" << string(windowsize - altwindowsize, ' ') << alternateQuery << endl; CSmithWatermanGotoh sw(matchScore, mismatchScore, gapOpenPenalty, gapExtendPenalty); if (useEntropy) sw.EnableEntropyGapPenalty(1); if (useRepeatGapExtendPenalty) sw.EnableRepeatGapExtensionPenalty(repeatGapExtendPenalty); sw.Align(referencePos, cigar, reference, alternateQuery); int altpos = 0; int refpos = 0; int len; string slen; vector<pair<int, char> > cigarData; string ref = reference.substr(referencePos); positionDiffs.push_back(referencePos); // TODO this... is borked stringstream refss; stringstream altss; if (!adjustVcf) cout << cigar << endl; cout << cigar << endl; for (string::iterator c = cigar.begin(); c != cigar.end(); ++c) { switch (*c) { case 'I': len = atoi(slen.c_str()); slen.clear(); if (altpos < altwindowsize) { cigarData.push_back(make_pair(len, 'M')); } else { cigarData.push_back(make_pair(len, *c)); } altss << alternateQuery.substr(altpos, len); refss << string(len, '-'); altpos += len; break; case 'D': len = atoi(slen.c_str()); slen.clear(); if (altpos < altwindowsize) { } else { cigarData.push_back(make_pair(len, *c)); } refss << ref.substr(refpos, len); altss << string(len, '-'); refpos += len; break; case 'M': len = atoi(slen.c_str()); slen.clear(); { for (int i = 0; i < len; ++i) { if (ref.at(refpos + i) == alternateQuery.at(altpos + i)) { if (!cigarData.empty() && cigarData.back().second == 'M') { cigarData.back().first++; } else { cigarData.push_back(make_pair(1, 'M')); } } else { if (!cigarData.empty() && cigarData.back().second == 'X') { cigarData.back().first++; } else { cigarData.push_back(make_pair(1, 
'X')); } } } } refss << ref.substr(refpos, len); altss << alternateQuery.substr(altpos, len); refpos += len; altpos += len; break; case 'S': len = atoi(slen.c_str()); slen.clear(); cigarData.push_back(make_pair(len, *c)); refss << ref.substr(refpos, len); //altss << alternateQuery.substr(altpos, len); // TODO deal with soft clipping, weird behavior refpos += len; altpos += len; break; default: len = 0; slen += *c; break; } } if (!adjustVcf) { cout << "ref:\t" << refss.str() << endl; cout << "alt:\t" << altss.str() << endl; } else { cout << "ref:\t" << refss.str() << endl; cout << "alt:\t" << altss.str() << endl; cigars.push_back(cigarData); } } if (adjustVcf) { int substart = cigars.front().front().first; int subend = cigars.front().back().first; // find the min and max match for (vector<vector<pair<int, char> > >::iterator c = cigars.begin(); c != cigars.end(); ++c) { if (c->front().second == 'M' && c->front().first <= substart) { substart = c->front().first; if (c->size() > 1 && c->at(1).second != 'X') { --substart; } } if (c->back().second == 'M' && c->back().first <= subend) { subend = c->back().first; } } // adjust the cigars and get the new reference length int reflen = 0; for (vector<vector<pair<int, char> > >::iterator c = cigars.begin(); c != cigars.end(); ++c) { c->front().first -= substart; c->back().first -= subend; int crf = cigarRefLen(*c); if (crf > reflen) reflen = crf; var.info[adjustedTag].push_back(joinCigar(*c)); } // find the lowest positional difference int pdiff = 0; for (vector<int>::iterator d = positionDiffs.begin(); d != positionDiffs.end(); ++d) { if (*d + altwindowsize < pdiff) pdiff = *d + altwindowsize; } // adjust the reference string var.position += pdiff; // adjust the variant position var.ref = freference.getSubSequence(var.sequenceName, var.position - 1, reflen); cout << var << endl; } } return 0; }
int main(int argc, char** argv) { string bedFileName; string vcfFileName; string fastaFileName; bool intersecting = false; bool unioning = false; bool invert = false; bool contained = true; bool overlapping = false; int windowsize = 30; if (argc == 1) printSummary(argv); int c; while (true) { static struct option long_options[] = { /* These options set a flag. */ //{"verbose", no_argument, &verbose_flag, 1}, {"help", no_argument, 0, 'h'}, {"bed", required_argument, 0, 'b'}, {"invert", no_argument, 0, 'v'}, {"intersect-vcf", required_argument, 0, 'i'}, {"union-vcf", required_argument, 0, 'u'}, {"contained", no_argument, 0, 'c'}, {"overlapping", no_argument, 0, 'o'}, {"window-size", required_argument, 0, 'w'}, {"reference", required_argument, 0, 'r'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; c = getopt_long (argc, argv, "hvcob:i:u:w:r:", long_options, &option_index); if (c == -1) break; switch (c) { case 'w': windowsize = atoi(optarg); break; case 'b': bedFileName = string(optarg); break; case 'i': intersecting = true; vcfFileName = string(optarg); break; case 'u': unioning = true; vcfFileName = string(optarg); break; case 'r': fastaFileName = string(optarg); break; case 'v': invert = true; break; case 'c': contained = true; break; case 'o': overlapping = true; break; case 'h': printSummary(argv); break; case '?': printSummary(argv); exit(1); break; default: abort (); } } bool usingBED = false; if (!bedFileName.empty()) { usingBED = true; } BedReader bed; if (usingBED) { bed.open(bedFileName); } VariantCallFile variantFile; string inputFilename; if (optind == argc - 1) { inputFilename = argv[optind]; variantFile.open(inputFilename); } else { variantFile.open(std::cin); } if (!variantFile.is_open()) { cerr << "could not open VCF file" << endl; exit(1); } if (usingBED) { variantFile.parseSamples = false; } VariantCallFile otherVariantFile; if (!vcfFileName.empty()) { otherVariantFile.open(vcfFileName); if 
(!otherVariantFile.is_open()) { cerr << "could not open VCF file " << vcfFileName << endl; exit(1); } } FastaReference reference; if (unioning || intersecting) { if (fastaFileName.empty()) { cerr << "a reference is required for haplotype-based intersection and unioniong" << endl; exit(1); } reference.open(fastaFileName); } if (!unioning && !intersecting) { variantFile.parseSamples = false; // faster, as when we are // only bed-intersecting we // can do position-only // output and don't have to // manipulate specific // alleles } // read the VCF file for union or intersection into an interval tree // indexed using some proximity window map<string, IntervalTree<Variant*> > variantIntervals; map<string, list<Variant> > otherVariants; map<string, vector<Interval<Variant*> > > otherVariantIntervals; if (unioning || intersecting) { Variant ovar(otherVariantFile); while (otherVariantFile.getNextVariant(ovar)) { long int left = ovar.position; long int right = left + ovar.ref.size(); // this should be 1-past the end otherVariants[ovar.sequenceName].push_back(ovar); Variant* v = &otherVariants[ovar.sequenceName].back(); otherVariantIntervals[ovar.sequenceName].push_back(Interval<Variant*>(left, right, v)); } for (map<string, vector<Interval<Variant*> > >::iterator j = otherVariantIntervals.begin(); j != otherVariantIntervals.end(); ++j) { variantIntervals[j->first] = IntervalTree<Variant*>(j->second); } } set<Variant*> outputVariants; long unsigned int lastOutputPosition = 0; string lastSequenceName; cout << variantFile.header; Variant var(variantFile); while (variantFile.getNextVariant(var)) { if (lastSequenceName.empty()) { lastSequenceName = var.sequenceName; } else if (lastSequenceName != var.sequenceName) { if (unioning) { vector<Interval<Variant*> > previousRecords; long int lastSeqLength = reference.sequenceLength(lastSequenceName); variantIntervals[lastSequenceName].findContained(lastOutputPosition, lastSeqLength, previousRecords); for (vector<Interval<Variant*> 
>::iterator r = previousRecords.begin(); r != previousRecords.end(); ++r) { Variant* v = r->value; if (outputVariants.find(v) == outputVariants.end()) { outputVariants.insert(v); cout << *v << endl; // does this output everything in correct order? } } lastSequenceName = var.sequenceName; lastOutputPosition = 0; } } if (usingBED) { BedTarget record(var.sequenceName, var.position, var.position + var.ref.size(), ""); vector<BedTarget*> overlaps = bed.targetsOverlapping(record); if (!invert && !overlaps.empty()) { cout << variantFile.line << endl; } else if (invert && overlaps.empty()) { cout << variantFile.line << endl; } } else if (unioning || intersecting) { // TODO check overlaps with union/intersection // hmm... for unioning, you might need to step through the original VCF records // but the idea is to exclude the haplotype-based duplicates vector<Interval<Variant*> > results; variantIntervals[var.sequenceName].findContained(var.position - windowsize, var.position + var.ref.size() + windowsize, results); vector<Variant*> overlapping; for (vector<Interval<Variant*> >::iterator r = results.begin(); r != results.end(); ++r) { overlapping.push_back(r->value); } if (unioning) { // unioning strategy // write out all the records from the last file // between the last one printed out and the first // one we're about to print out vector<Interval<Variant*> > previousRecords; variantIntervals[var.sequenceName].findOverlapping(lastOutputPosition, var.position - windowsize, previousRecords); map<long int, vector<Variant*> > variants; for (vector<Interval<Variant*> >::iterator r = previousRecords.begin(); r != previousRecords.end(); ++r) { Variant* v = r->value; if (outputVariants.find(v) == outputVariants.end()) { outputVariants.insert(v); variants[v->position].push_back(v); } } for (map<long int, vector<Variant*> >::iterator v = variants.begin(); v != variants.end(); ++v) { for (vector<Variant*>::iterator o = v->second.begin(); o != v->second.end(); ++o) { cout << **o << 
endl; lastOutputPosition = max(lastOutputPosition, (*o)->position); } } // TODO find the duplicates for the other file } if (overlapping.empty()) { if (unioning || (intersecting && invert)) { cout << var << endl; lastOutputPosition = max(lastOutputPosition, var.position); } } else { // get the min and max of the overlaps int haplotypeStart = var.position; int haplotypeEnd = var.position + var.ref.size(); for (vector<Variant*>::iterator v = overlapping.begin(); v != overlapping.end(); ++v) { haplotypeStart = min((*v)->position, (long unsigned int) haplotypeStart); haplotypeEnd = max((*v)->position + (*v)->ref.size(), (long unsigned int) haplotypeEnd); } // for everything overlapping and the current variant, construct the local haplotype within the bounds // if there is an exact match, the alllele in the current VCF does intersect string referenceHaplotype = reference.getSubSequence(var.sequenceName, haplotypeStart - 1, haplotypeEnd - haplotypeStart); map<string, vector<Variant*> > haplotypes; for (vector<Variant*>::iterator v = overlapping.begin(); v != overlapping.end(); ++v) { Variant& variant = **v; for (vector<string>::iterator a = variant.alt.begin(); a != variant.alt.end(); ++a) { string haplotype = referenceHaplotype; // get the relative start and end coordinates for the variant alternate allele int relativeStart = variant.position - haplotypeStart; haplotype.replace(relativeStart, variant.ref.size(), *a); haplotypes[haplotype].push_back(*v); } } // determine the non-intersecting alts vector<string> altsToRemove; for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { string haplotype = referenceHaplotype; int relativeStart = var.position - haplotypeStart; haplotype.replace(relativeStart, var.ref.size(), *a); map<string, vector<Variant*> >::iterator h = haplotypes.find(haplotype); if ((intersecting && !invert && h == haplotypes.end()) || (intersecting && invert && h != haplotypes.end()) || (unioning && h != haplotypes.end())) { 
altsToRemove.push_back(*a); } } // remove the non-overlapping (intersecting) or overlapping (unioning) alts for (vector<string>::iterator a = altsToRemove.begin(); a != altsToRemove.end(); ++a) { var.removeAlt(*a); } if (unioning) { // somehow sort the records and combine them? map<long int, vector<Variant*> > variants; for (vector<Variant*>::iterator o = overlapping.begin(); o != overlapping.end(); ++o) { if ((*o)->position <= var.position && // check ensures proper ordering of variants on output outputVariants.find(*o) == outputVariants.end()) { outputVariants.insert(*o); variants[(*o)->position].push_back(*o); } } // add in the current variant, if it has alts left if (!var.alt.empty()) { variants[var.position].push_back(&var); } for (map<long int, vector<Variant*> >::iterator v = variants.begin(); v != variants.end(); ++v) { for (vector<Variant*>::iterator o = v->second.begin(); o != v->second.end(); ++o) { cout << **o << endl; lastOutputPosition = max(lastOutputPosition, (*o)->position); } } } else { // if any alts remain, output the variant record if (!var.alt.empty()) { cout << var << endl; lastOutputPosition = max(lastOutputPosition, var.position); } } } } } // if unioning, and any variants remain, output them if (unioning) { for (map<string, list<Variant> >::iterator chrom = otherVariants.find(lastSequenceName); chrom != otherVariants.end(); ++chrom) { for (list<Variant>::iterator v = chrom->second.begin(); v != chrom->second.end(); ++v) { Variant* variant = &*v; if (outputVariants.find(variant) == outputVariants.end()) { outputVariants.insert(variant); cout << *variant << endl; // TODO guarantee sorting } } } } exit(0); // why? return 0; }
void SamRead::getRefSeq() { originalSeq = seq; originalQual = qual; RefSeq = ""; string NewSeq = ""; string NewQual = ""; string NewCigar = ""; string NewStrand = ""; vector<int> NewPositions; vector<string> NewChromosome; int InsOffset = 0; if (Reff.sequenceNameStartingWith(chr) == "") { cout << "ERROR chr " << chr << " not found\n"; return; } // correct star position of the read to account for Hard and soft clipped // bases as we are counting those now for (int i = 0; i < cigarString.size(); i++) { if (cigarString.c_str()[i] == 'H') { } else { pos = pos - i; break; } } for (int i = 0; i < cigarString.size(); i++) { if (cigarString.c_str()[i] == 'S') { } else { pos = pos - i; break; } } int Roffset = 0; int Coffset = 0; for (int i = 0; i < cigarString.length(); i++) { if (cigarString.c_str()[i] == 'M') { RefSeq += toupper( Reff.getSubSequence(chr, i + pos - 1 + Roffset, 1).c_str()[0]); NewSeq += seq.c_str()[i - Coffset]; NewQual += qual.c_str()[i - Coffset]; NewPositions.push_back(pos + i - InsOffset); NewChromosome.push_back(chr); if (toupper(Reff.getSubSequence(chr, i + pos - 1 + Roffset, 1) .c_str()[0]) == seq.c_str()[i - Coffset]) { NewCigar += 'M'; } else { NewCigar += 'X'; } } else if (cigarString.c_str()[i] == 'I') { RefSeq += '-'; Roffset += -1; NewSeq += seq.c_str()[i - Coffset]; NewQual += qual.c_str()[i - Coffset]; NewCigar += 'I'; InsOffset++; NewPositions.push_back(pos + i - InsOffset); NewChromosome.push_back(chr); } else if (cigarString.c_str()[i] == 'D') { NewSeq += '-'; NewQual += ' '; Coffset++; RefSeq += toupper( Reff.getSubSequence(chr, i + pos - 1 + Roffset, 1).c_str()[0]); NewCigar += 'D'; NewPositions.push_back(pos + i - InsOffset); NewChromosome.push_back(chr); } else if (cigarString.c_str()[i] == 'H') { RefSeq += 'H'; NewSeq += 'H'; NewQual += ' '; Coffset++; NewCigar += 'H'; NewPositions.push_back(-1); NewChromosome.push_back("nope"); } else if (cigarString.c_str()[i] == 'S') { RefSeq += '-'; NewSeq += seq.c_str()[i - Coffset]; NewQual += 
qual.c_str()[i - Coffset]; NewCigar += 'S'; NewPositions.push_back(pos + i - InsOffset); NewChromosome.push_back(chr); } else { cout << "Unexpected behavior in SamRead::getRefSeq()" << endl; } } seq = NewSeq; cigarString = NewCigar; qual.clear(); char lastQ = ' '; for (int i = 0; i < NewQual.size(); i++) { if (NewQual.c_str()[i] == ' ') { if (NewCigar.c_str()[i] == 'D') qual += lastQ; //'!'; else qual += '!'; } else { qual += NewQual.c_str()[i]; lastQ = NewQual.c_str()[i]; } } // build up the string that will indidcate if the read has to be flipped for (int i = 0; i < qual.size(); i++) { strand += "+"; } Positions = NewPositions; ChrPositions = NewChromosome; //************************Lookup Kmer counts ***************************// LookUpKmers(); vector<long> blank; cout << "After getRefSeq"; writeVertical(); }
int main(int argc, char** argv) { int c; string fastaRef; int windowSize = 0; if (argc == 1) printSummary(argv); while (true) { static struct option long_options[] = { /* These options set a flag. */ //{"verbose", no_argument, &verbose_flag, 1}, {"help", no_argument, 0, 'h'}, {"fasta-reference", required_argument, 0, 'f'}, {"window-size", required_argument, 0, 'w'}, //{"length", no_argument, &printLength, true}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; c = getopt_long (argc, argv, "hf:w:", long_options, &option_index); /* Detect the end of the options. */ if (c == -1) break; switch (c) { case 0: /* If this option set a flag, do nothing else now. */ if (long_options[option_index].flag != 0) break; printf ("option %s", long_options[option_index].name); if (optarg) printf (" with arg %s", optarg); printf ("\n"); break; case 'f': fastaRef = optarg; break; case 'w': windowSize = atoi(optarg); break; case 'h': printSummary(argv); exit(0); break; case '?': /* getopt_long already printed an error message. 
*/ printSummary(argv); exit(1); break; default: abort (); } } if (windowSize == 0) { cerr << "a window size must be specified" << endl; exit(1); } if (fastaRef.empty()) { cerr << "a FASTA reference sequence must be specified" << endl; exit(1); } FastaReference ref; ref.open(fastaRef); VariantCallFile variantFile; string inputFilename; if (optind == argc - 1) { inputFilename = argv[optind]; variantFile.open(inputFilename); } else { variantFile.open(std::cin); } if (!variantFile.is_open()) { return 1; } variantFile.addHeaderLine("##INFO=<ID=EntropyLeft,Number=1,Type=Float,Description=\"Entropy of left-flanking sequence of "+ convert(windowSize) +"bp\">"); variantFile.addHeaderLine("##INFO=<ID=EntropyCenter,Number=1,Type=Float,Description=\"Entropy of centered sequence of "+ convert(windowSize) +"bp\">"); variantFile.addHeaderLine("##INFO=<ID=EntropyRight,Number=1,Type=Float,Description=\"Entropy of right-flanking sequence of "+ convert(windowSize) +"bp\">"); variantFile.addHeaderLine("##INFO=<ID=EntropyRef,Number=1,Type=Float,Description=\"Entropy of REF allele\">"); variantFile.addHeaderLine("##INFO=<ID=EntropyAlt,Number=A,Type=Float,Description=\"Entropy of each ALT allele\">"); cout << variantFile.header << endl; Variant var(variantFile); while (variantFile.getNextVariant(var)) { // get the ref start and end positions int refstart = var.position - 1; // convert to 0-based int refend = var.position + var.ref.size() - 1; string leftseq = ref.getSubSequence(var.sequenceName, refstart - windowSize, windowSize); string rightseq = ref.getSubSequence(var.sequenceName, refend, windowSize); string centerseq = ref.getSubSequence(var.sequenceName, refstart - windowSize/2, windowSize); double entropyLeft = shannon_H((char*) &leftseq[0], windowSize); double entropyRight = shannon_H((char*) &rightseq[0], windowSize); double entropyCenter = shannon_H((char*) ¢erseq[0], windowSize); double entropyRef = shannon_H((char*) var.ref.c_str(), var.ref.size()); 
var.info["EntropyLeft"].clear(); var.info["EntropyRight"].clear(); var.info["EntropyCenter"].clear(); var.info["EntropyRef"].clear(); var.info["EntropyAlt"].clear(); var.info["EntropyLeft"].push_back(convert(entropyLeft)); var.info["EntropyRight"].push_back(convert(entropyRight)); var.info["EntropyCenter"].push_back(convert(entropyCenter)); var.info["EntropyRef"].push_back(convert(entropyRef)); for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { double entropyAlt = shannon_H((char*) a->c_str(), a->size()); var.info["EntropyAlt"].push_back(convert(entropyAlt)); } cout << var << endl; } return 0; }
/**
 * Streams a BAM from stdin, realigns reads against a variant graph (DAG)
 * built from the reference plus the VCF given in params, and writes the
 * result to stdout.
 *
 * For each alignment:
 *   1. If the read is on a new reference sequence, or has moved past 3/4 of
 *      the current window, a new DAG is built over a window of
 *      params.dag_window_size bases from the reference and the variants
 *      falling inside that window.
 *   2. If shouldRealign() says so (and the DAG is not empty / all-N), the
 *      read is aligned to the graph with gswalign(), flattened back into
 *      reference space, padded with params.flatten_flank reference bases when
 *      the alignment starts or ends with an indel, and kept only if it passes
 *      the acceptance test below; otherwise the original alignment is restored.
 *   3. Alignments go through a position-keyed queue so that reads whose
 *      position changed are still written in order (unless
 *      params.unsorted_output is set).
 *
 * With params.dry_run nothing is written as BAM; the read, the graph mapping
 * and the score/strand/position/CIGAR are printed to stdout instead.
 *
 * Exits with status 1 if stdin/stdout cannot be opened as BAM, if no VCF file
 * was given, or if the VCF cannot be opened.  Any exception thrown while
 * realigning a read is reported on stderr and the original alignment is kept.
 */
void realign_bam(Parameters& params) {

    FastaReference reference;
    reference.open(params.fasta_reference);

    int dag_window_size = params.dag_window_size;

    // open BAM file
    BamReader reader;
    if (!reader.Open("stdin")) {
        cerr << "could not open stdin for reading" << endl;
        exit(1);
    }

    BamWriter writer;
    if (!params.dry_run && !writer.Open("stdout", reader.GetHeaderText(), reader.GetReferenceData())) {
        cerr << "could not open stdout for writing" << endl;
        exit(1);
    }

    // store the names of all the reference sequences in the BAM file
    map<int, string> referenceIDToName;
    vector<RefData> referenceSequences = reader.GetReferenceData();
    int i = 0;
    for (RefVector::iterator r = referenceSequences.begin(); r != referenceSequences.end(); ++r) {
        referenceIDToName[i] = r->RefName;
        ++i;
    }

    vcf::VariantCallFile vcffile;
    if (!params.vcf_file.empty()) {
        if (!vcffile.open(params.vcf_file)) {
            cerr << "could not open VCF file " << params.vcf_file << endl;
            exit(1);
        }
    } else {
        cerr << "realignment requires VCF file" << endl;
        exit(1);
    }

    vcf::Variant var(vcffile);

    BamAlignment alignment;
    // alignments waiting to be written, keyed by (possibly updated) position
    map<long int, vector<BamAlignment> > alignmentSortQueue;

    // get alignment
    // assemble DAG in region around alignment
    // loop for each alignment in BAM:
    //     update DAG when current alignment gets close to edge of assembled DAG
    //     attempt to realign if read has a certain number of mismatches + gaps or softclips, weighted by basequal
    //     if alignment to DAG has fewer mismatches and gaps than original alignment, use it
    //         flatten read into reference space (for now just output alleles from VCF un-spanned insertions)
    //     write read to queue for streaming re-sorting (some positional change will occur)

    long int dag_start_position = 0;
    string currentSeqname;
    string ref;
    //vector<Cigar> cigars; // contains the Cigar strings of nodes in the graph
    //vector<long int> refpositions; // contains the reference start coords of nodes in the graph
    ReferenceMappings ref_map;
    gssw_graph* graph = gssw_graph_create(0);
    int8_t* nt_table = gssw_create_nt_table();
    int8_t* mat = gssw_create_score_matrix(params.match, params.mism);

    int total_reads = 0;
    int total_realigned = 0;
    int total_improved = 0;
    bool emptyDAG = false; // if the dag is constructed over empty sequence
                           // such as when realigning reads mapped to all-N sequence
    if (params.debug) {
        cerr << "about to start processing alignments" << endl;
    }

    while (reader.GetNextAlignment(alignment)) {

        // note: operator[] inserts an empty name for an unknown RefID
        string& seqname = referenceIDToName[alignment.RefID];

        if (params.debug) {
            cerr << "--------------------------------------------" << endl
                 << "processing alignment " << alignment.Name << " at "
                 << seqname << ":" << alignment.Position << endl;
        }

        /*
        if (!alignment.IsMapped() && graph->size == 0) {
            if (params.debug) {
                cerr << "unable to build DAG using unmapped read "
                     << alignment.Name << " @ "
                     << seqname << ":" << alignment.Position
                     << " no previous mapped read found and DAG currently empty" << endl;
            }
            alignmentSortQueue[dag_start_position+dag_window_size].push_back(alignment);
            continue;
        }
        */

        ++total_reads;

        BamAlignment originalAlignment = alignment;
        long unsigned int initialAlignmentPosition = alignment.Position;
        //if (dag_start_position == 1) {
        //    dag_start_position = max(1, (int)initialAlignmentPosition - dag_window_size/2);
        //}

        // should we construct a new DAG?  do so when 3/4 of the way through the current one
        // center on current position + 1/2 dag window
        // TODO check this scheme using some scribbles on paper
        // alignment.IsMapped()
        if ((seqname != currentSeqname
             || ((alignment.Position + (alignment.QueryBases.size()/2)
                  > (3*dag_window_size/4) + dag_start_position)))
            && alignment.Position < reference.sequenceLength(seqname)) {

            if (seqname != currentSeqname) {
                if (params.debug) {
                    cerr << "switched ref seqs" << endl;
                }
                dag_start_position = max((long int) 0, (long int) (alignment.GetEndPosition() - dag_window_size/2));
            // recenter DAG
            } else if (!ref_map.empty()) {
                dag_start_position = dag_start_position + dag_window_size/2;
                dag_start_position = max(dag_start_position, (long int) (alignment.GetEndPosition() - dag_window_size/2));
            } else {
                dag_start_position = alignment.Position - dag_window_size/2;
            }
            dag_start_position = max((long int)0, dag_start_position);

            // TODO get sequence length and use to bound noted window size (edge case)
            //cerr << "getting ref " << seqname << " " << max((long int) 0, dag_start_position) << " " << dag_window_size << endl;

            // get variants for new DAG
            vector<vcf::Variant> variants;
            if (!vcffile.setRegion(seqname,
                                   dag_start_position + 1,
                                   dag_start_position + dag_window_size)) {
                // this is not necessarily an error; there should be a better way to check for VCF file validity
                /*
                cerr << "could not set region on VCF file to " << currentSeqname << ":"
                     << dag_start_position << "-" << dag_start_position + ref.size()
                     << endl;
                */
                //exit(1);
            } else {

                // check first variant: while a variant sits at or before the
                // first base of the window, slide the window start back by one
                // and query again
                if (vcffile.getNextVariant(var)) {
                    while (var.position <= dag_start_position + 1) {
                        //cerr << "var position == dag_start_position " << endl;
                        dag_start_position -= 1;
                        vcffile.setRegion(seqname,
                                          dag_start_position + 1,
                                          dag_start_position + dag_window_size);
                        if (!vcffile.getNextVariant(var)) { break; }
                    }
                }

                // restart iteration over the (possibly shifted) window
                vcffile.setRegion(seqname,
                                  dag_start_position + 1,
                                  dag_start_position + dag_window_size);

                while (vcffile.getNextVariant(var)) {
                    if (params.debug) cerr << "getting variant at " << var.sequenceName << ":" << var.position << endl;
                    //cerr << var.position << " + " << var.ref.length() << " <= " << dag_start_position << " + " << dag_window_size << endl;
                    //cerr << var.position << " >= " << dag_start_position << endl;
                    // keep only variants whose REF allele lies fully inside the window
                    if (var.position + var.ref.length() <= dag_start_position + dag_window_size
                        && var.position >= dag_start_position) {
                        variants.push_back(var);
                    }
                }

            }

            //cerr << "dag_start_position " << dag_start_position << endl;
            ref = reference.getSubSequence(seqname,
                                           max((long int) 0, dag_start_position),
                                           dag_window_size); // 0/1 conversion

            // clear graph and metadata
            ref_map.clear();
            //cigars.clear();
            //refpositions.clear();
            gssw_graph_destroy(graph);

            if (params.debug) { cerr << "constructing DAG" << endl; }
            // and build the DAG
            graph = gssw_graph_create(0);
            constructDAGProgressive(graph,
                                    ref_map,
                                    ref,
                                    seqname,
                                    variants,
                                    dag_start_position,
                                    nt_table,
                                    mat,
                                    params.flat_input_vcf);

            if (params.debug) {
                cerr << "graph has " << graph->size << " nodes" << endl;
                cerr << "DAG generated from input variants over "
                     << seqname << ":" << dag_start_position << "-" << dag_start_position + dag_window_size
                     << endl;
            }
            if (params.display_dag) {
                gssw_graph_print(graph);
                /*
                for (Backbone::iterator b = backbone.begin(); b != backbone.end(); ++b) {
                    cout << b->first << " "
                         << b->first->id << " "
                         << b->second.ref_position << " "
                         << b->second.cigar << endl
                         << b->first->seq << endl;
                }
                */
            }

            // parentheses only make the existing && / || grouping explicit
            if ((graph->size == 1 && allN(ref)) || graph->size == 0) {
                if (params.debug) {
                    cerr << "DAG is empty (1 node, all N). Alignment is irrelevant." << endl;
                }
                emptyDAG = true;
            } else {
                emptyDAG = false;
            }

        }

        AlignmentStats stats_before;
        bool was_mapped = alignment.IsMapped();
        bool has_realigned = false;
        if (was_mapped) {
            // the read runs past the window: re-fetch a reference long enough to cover it
            if (dag_start_position + dag_window_size < alignment.GetEndPosition()) {
                ref = reference.getSubSequence(seqname,
                                               max((long int) 0, dag_start_position),
                                               alignment.GetEndPosition() - dag_start_position); // 0/1 conversion
            }
        }

        if (params.debug) {
            if (emptyDAG) {
                cerr << "cannot realign against empty (all-N single node) graph" << endl;
            }
        }

        if (!emptyDAG && shouldRealign(alignment, ref, dag_start_position, params, stats_before)) {

            ++total_realigned;

            if (params.debug) {
                cerr << "realigning: " << alignment.Name
                     << " " << alignment.QueryBases << endl
                     << " aligned @ " << alignment.Position
                     << " to variant graph over "
                     << seqname
                     << ":" << dag_start_position
                     << "-" << dag_start_position + dag_window_size << endl;
            }

            // Graph mapping returned by gswalign().  Declared outside the try
            // block so it can be released both after its last use and when an
            // exception interrupts processing.
            gssw_graph_mapping* gm = 0;

            try {

                Cigar flat_cigar;
                string read = alignment.QueryBases;
                string qualities = alignment.Qualities;
                int score;
                long int position;
                string strand;
                gm = gswalign(graph, ref_map, read, qualities, params, position, score, flat_cigar, strand, nt_table, mat);

                if (params.dry_run) {

                    if (strand == "-" && !alignment.IsMapped()) {
                        read = reverseComplement(read);
                    }
                    cout << read << endl;
                    cout << graph_mapping_to_string(gm) << endl;
                    cout << score << " " << strand << " " << position << " " << flat_cigar << endl;

                } else {

                    /*
                    if (strand == "-") {
                        read = reverseComplement(trace_report.read);
                    }
                    */
                    // TODO the qualities are not on the right side of the read
                    if (strand == "-" && alignment.IsMapped()) {
                        // if we're realigning, this is always true unless we swapped strands
                        alignment.SetIsReverseStrand(true);
                        //reverse(alignment.Qualities.begin(), alignment.Qualities.end()); // reverse qualities
                    }
                    //alignment.QueryBases = reverseComplement(trace_report.read);
                    alignment.QueryBases = read;
                    alignment.Qualities = qualities;

                    alignment.Position = position;// + 1;// + 1;//(trace_report.node->position - 1) + trace_report.x;
                    alignment.SetIsMapped(true);
                    if (!alignment.MapQuality) {
                        alignment.MapQuality = 20; // horrible hack... at least approximate with alignment mismatches against graph
                    }

                    // check if somehow we've ended up with an indel at the ends
                    // if so, grab the reference sequence right beyond it and add
                    // a single match to the cigar, allowing variant detection methods
                    // to run on the results without internal modification
                    // (cigar.at() throws on a too-short CIGAR; that lands in the
                    // catch block below and the original alignment is kept)
                    Cigar& cigar = flat_cigar;
                    //cerr << flat_cigar << " " << flat_cigar.readLen() << " " << flat_cigar.refLen() << endl;
                    int flankSize = params.flatten_flank;
                    if (cigar.front().isIndel() ||
                        (cigar.front().isSoftclip()
                         && cigar.at(1).isIndel())) {
                        alignment.Position -= flankSize;
                        string refBase = reference.getSubSequence(seqname, alignment.Position, flankSize);
                        if (cigar.front().isSoftclip()) {
                            alignment.QueryBases.erase(alignment.QueryBases.begin(),
                                                       alignment.QueryBases.begin()+cigar.front().length);
                            alignment.Qualities.erase(alignment.Qualities.begin(),
                                                      alignment.Qualities.begin()+cigar.front().length);
                            cigar.erase(cigar.begin());
                        }
                        alignment.QueryBases.insert(0, refBase);
                        alignment.Qualities.insert(0, string(flankSize, shortInt2QualityChar(30)));
                        Cigar newCigar; newCigar.push_back(CigarElement(flankSize, 'M'));
                        newCigar.append(flat_cigar);
                        flat_cigar = newCigar;
                    }
                    if (cigar.back().isIndel() ||
                        (cigar.back().isSoftclip()
                         && cigar.at(cigar.size()-2).isIndel())) {
                        string refBase = reference.getSubSequence(seqname,
                                                                  alignment.Position
                                                                  + flat_cigar.refLen(),
                                                                  flankSize);
                        if (cigar.back().isSoftclip()) {
                            alignment.QueryBases.erase(alignment.QueryBases.end()-cigar.back().length,
                                                       alignment.QueryBases.end());
                            alignment.Qualities.erase(alignment.Qualities.end()-cigar.back().length,
                                                      alignment.Qualities.end());
                            cigar.pop_back();
                        }
                        Cigar newCigar; newCigar.push_back(CigarElement(flankSize, 'M'));
                        flat_cigar.append(newCigar);
                        //flat_cigar.append(newCigar);
                        alignment.QueryBases.append(refBase);
                        alignment.Qualities.append(string(flankSize, shortInt2QualityChar(30)));
                    }

                    flat_cigar.toCigarData(alignment.CigarData);
                    //cerr << flat_cigar << " " << flat_cigar.readLen() << " " << flat_cigar.refLen() << endl;

                    if (dag_start_position + dag_window_size < alignment.GetEndPosition()) {
                        ref = reference.getSubSequence(seqname,
                                                       max((long int) 0, dag_start_position),
                                                       alignment.GetEndPosition() - dag_start_position); // 0/1 conversion
                    }

                    AlignmentStats stats_after;
                    countMismatchesAndGaps(alignment, flat_cigar, ref, dag_start_position, stats_after, params.debug);
                    /*
                    if ((!was_mapped || (stats_before.softclip_qsum >= stats_after.softclip_qsum
                                         && stats_before.mismatch_qsum >= stats_after.mismatch_qsum))
                         && acceptRealignment(alignment, ref, dag_start_position, params, stats_after)) {
                    */
                    /*
                    if ((!was_mapped || (stats_before.softclip_qsum + stats_before.mismatch_qsum
                                         >= stats_after.softclip_qsum + stats_after.mismatch_qsum))
                         && acceptRealignment(alignment, ref, dag_start_position, params, stats_after)) {
                    */

                    // NOTE(review): because && binds tighter than ||, a read that
                    // was previously unmapped is accepted here without consulting
                    // acceptRealignment(); the two older conditions kept above
                    // applied acceptRealignment() to unmapped reads as well.
                    // Confirm which behaviour is intended before changing it.

                    // we accept the new alignment if...
                    if (!was_mapped  // it wasn't mapped previously
                        // or if we have removed soft clips or mismatches (per quality) from the alignment
                        //|| ((stats_before.softclip_qsum >= stats_after.softclip_qsum
                        //     && stats_before.mismatch_qsum >= stats_after.mismatch_qsum)
                        || ((stats_before.softclip_qsum + stats_before.mismatch_qsum
                             >= stats_after.softclip_qsum + stats_after.mismatch_qsum)
                            // and if we have added gaps, we have added them to remove mismatches or softclips
                            && (stats_before.gaps >= stats_after.gaps // accept any time we reduce gaps while not increasing softclips/mismatches
                                || (stats_before.gaps < stats_after.gaps // and allow gap increases when they improve the alignment
                                    && (stats_before.softclip_qsum + stats_before.mismatch_qsum
                                        > stats_after.softclip_qsum + stats_after.mismatch_qsum))))
                        // and the alignment must not have more than the acceptable number of gaps, softclips, or mismatches
                        // as provided in input parameters
                        && acceptRealignment(alignment, ref, dag_start_position, params, stats_after)) {

                        // keep the alignment
                        // TODO require threshold of softclips to keep alignment (or count of gaps, mismatches,...)
                        if (params.debug) {
                            cerr << "realigned " << alignment.Name << " to graph, which it maps to with "
                                 << stats_after.mismatch_qsum << "q in mismatches and "
                                 << stats_after.softclip_qsum << "q in soft clips" << endl;
                        }
                        ++total_improved;
                        has_realigned = true;
                    } else {
                        // reset to old version of alignment
                        if (params.debug) {
                            cerr << "failed realignment of " << alignment.Name << " to graph, which it maps to with: "
                                 << stats_after.mismatch_qsum << "q in mismatches "
                                 << "(vs " << stats_before.mismatch_qsum << "q before), and "
                                 << stats_after.softclip_qsum << "q in soft clips "
                                 << "(vs " << stats_before.softclip_qsum << "q before) " << endl;
                        }
                        has_realigned = false;
                        alignment = originalAlignment;
                    }
                }

                // The mapping is no longer needed on either path: release it
                // here, after the dry-run print that reads it, so it is neither
                // leaked nor used after being freed.
                if (gm) {
                    gssw_graph_mapping_destroy(gm);
                    gm = 0;
                }

            } catch (...) {
                // do not leak the mapping when processing was interrupted
                if (gm) {
                    gssw_graph_mapping_destroy(gm);
                    gm = 0;
                }
                cerr << "exception when realigning " << alignment.Name
                     << " at position " << referenceIDToName[alignment.RefID]
                     << ":" << alignment.Position
                     << " " << alignment.QueryBases << endl;
                // reset to original alignment
                has_realigned = false;
                alignment = originalAlignment;
            }
        }

        // ensure correct order if alignments move
        long int maxOutputPos = initialAlignmentPosition - dag_window_size;
        // if we switched sequences we need to flush out all the reads from the previous one
        if (seqname != currentSeqname) {
            // so the max output position is set past the end of the last chromosome
            if (!currentSeqname.empty()) {
                maxOutputPos = reference.sequenceLength(currentSeqname) + dag_window_size;
            }
            currentSeqname = seqname;
        }

        if (!params.dry_run) {
            map<long int, vector<BamAlignment> >::iterator p = alignmentSortQueue.begin();
            for ( ; p != alignmentSortQueue.end(); ++p) {
                // except if we are running in unsorted mode, stop when we are at the window size
                if (!params.unsorted_output && p->first > maxOutputPos) {
                    break; // no more to do
                } else {
                    for (vector<BamAlignment>::iterator a = p->second.begin(); a != p->second.end(); ++a) {
                        writer.SaveAlignment(*a);
                    }
                }
            }
            // drop everything that has just been written
            if (p != alignmentSortQueue.begin()) {
                alignmentSortQueue.erase(alignmentSortQueue.begin(), p);
            }
            if (!params.only_realigned || has_realigned) {
                alignmentSortQueue[alignment.Position].push_back(alignment);
            }
        }
    } // end GetNextAlignment loop

    // flush whatever is still queued
    if (!params.dry_run) {
        map<long int, vector<BamAlignment> >::iterator p = alignmentSortQueue.begin();
        for ( ; p != alignmentSortQueue.end(); ++p) {
            for (vector<BamAlignment>::iterator a = p->second.begin(); a != p->second.end(); ++a)
                writer.SaveAlignment(*a);
        }
    }

    gssw_graph_destroy(graph);
    free(nt_table);
    free(mat);

    reader.Close();
    writer.Close();

    if (params.debug) {
        cerr << "total reads:\t" << total_reads << endl;
        cerr << "realigned:\t" << total_realigned << endl;
        cerr << "improved:\t" << total_improved << endl;
    }

}
int main(int argc, char** argv) { int c; FastaReference reference; bool has_ref = false; bool suppress_output = false; bool debug = false; bool isuncompressed = true; int maxiterations = 50; if (argc < 2) { printUsage(argv); exit(1); } while (true) { static struct option long_options[] = { {"help", no_argument, 0, 'h'}, {"debug", no_argument, 0, 'd'}, {"fasta-reference", required_argument, 0, 'f'}, {"max-iterations", required_argument, 0, 'm'}, {"suppress-output", no_argument, 0, 's'}, {"compressed", no_argument, 0, 'c'}, {0, 0, 0, 0} }; int option_index = 0; c = getopt_long (argc, argv, "hdcsf:m:", long_options, &option_index); /* Detect the end of the options. */ if (c == -1) break; switch (c) { case 'f': reference.open(optarg); // will exit on open failure has_ref = true; break; case 'm': maxiterations = atoi(optarg); break; case 'd': debug = true; break; case 's': suppress_output = true; break; case 'c': isuncompressed = false; break; case 'h': printUsage(argv); exit(0); break; case '?': printUsage(argv); exit(1); break; default: abort(); break; } } if (!has_ref) { cerr << "no FASTA reference provided, cannot realign" << endl; exit(1); } BAMSINGLEREADER reader; if (!reader.Open(STDIN)) { cerr << "could not open stdin for reading" << endl; exit(1); } #ifdef HAVE_BAMTOOLS BamWriter writer; if (isuncompressed) { writer.SetCompressionMode(BamWriter::Uncompressed); } if (!suppress_output && !writer.Open("stdout", reader.GetHeaderText(), reader.GetReferenceData())) { cerr << "could not open stdout for writing" << endl; exit(1); } #else SeqLib::BamWriter writer(isuncompressed ? 
SeqLib::SAM : SeqLib::BAM); SeqLib::BamHeader hdr = reader.Header(); if (hdr.isEmpty()) { cerr << "could not open header for input" << endl; exit(1); } writer.SetHeader(hdr); if (!suppress_output && !writer.Open("-")) { cerr << "could not open stdout for writing" << endl; exit(1); } #endif // store the names of all the reference sequences in the BAM file map<int, string> referenceIDToName; REFVEC referenceSequences = reader.GETREFDATA; int i = 0; for (REFVEC::iterator r = referenceSequences.begin(); r != referenceSequences.end(); ++r) { referenceIDToName[i] = r->REFNAME; ++i; } BAMALIGN alignment; while (GETNEXT(reader, alignment)) { DEBUG("--------------------------- read --------------------------" << endl); DEBUG("| " << referenceIDToName[alignment.REFID] << ":" << alignment.POSITION << endl); DEBUG("| " << alignment.QNAME << ":" << alignment.ENDPOSITION << endl); DEBUG("| " << alignment.QNAME << ":" << (alignment.ISMAPPED ? " mapped" : " unmapped") << endl); DEBUG("| " << alignment.QNAME << ":" << " cigar data size: " << alignment.GETCIGAR.size() << endl); DEBUG("--------------------------- realigned --------------------------" << endl); // skip unmapped alignments, as they cannot be left-realigned without CIGAR data if (alignment.ISMAPPED) { int endpos = alignment.ENDPOSITION; int length = endpos - alignment.POSITION + 1; if (alignment.POSITION >= 0 && length > 0) { if (!stablyLeftAlign(alignment, reference.getSubSequence( referenceIDToName[alignment.REFID], alignment.POSITION, length), maxiterations, debug)) { cerr << "unstable realignment of " << alignment.QNAME << " at " << referenceIDToName[alignment.REFID] << ":" << alignment.POSITION << endl << alignment.QUERYBASES << endl; } } } DEBUG("----------------------------------------------------------------" << endl); DEBUG(endl); if (!suppress_output) WRITEALIGNMENT(writer, alignment); } reader.Close(); if (!suppress_output) writer.Close(); return 0; }