//****************************************************************************** // ExtractDNA //****************************************************************************** void Bed2Fa::ExtractDNA() { /* Make sure that we can oen all of the files successfully*/ // open the fasta database for reading ifstream faDb(_dbFile.c_str(), ios::in); if ( !faDb ) { cerr << "Error: The requested fasta database file (" << _dbFile << ") could not be opened. Exiting!" << endl; exit (1); } // open and memory-map genome file FastaReference *fr = new FastaReference; bool memmap = true; fr->open(_dbFile, memmap); BED bed, nullBed; string sequence; _bed->Open(); while (_bed->GetNextBed(bed)) { if (_bed->_status == BED_VALID) { // make sure we are extracting >= 1 bp if (bed.zeroLength == false) { size_t seqLength = fr->sequenceLength(bed.chrom); // seqLength > 0 means chrom was found in index. // seqLength == 0 otherwise. if (seqLength) { // make sure this feature will not exceed the end of the chromosome. if ( (bed.start <= seqLength) && (bed.end <= seqLength) ) { int length = bed.end - bed.start; sequence = fr->getSubSequence(bed.chrom, bed.start, length); ReportDNA(bed, sequence); } else { cerr << "Feature (" << bed.chrom << ":" << bed.start << "-" << bed.end << ") beyond the length of " << bed.chrom << " size (" << seqLength << " bp). Skipping." << endl; } } else { cerr << "WARNING. chromosome (" << bed.chrom << ") was not found in the FASTA file. Skipping."<< endl; } } // handle zeroLength else { cerr << "Feature (" << bed.chrom << ":" << bed.start+1 << "-" << bed.end-1 << ") has length = 0, Skipping." << endl; } bed = nullBed; } } _bed->Close(); }
int main(int argc, char** argv) { string vcfFileName; string fastaFileName; int windowsize = 30; if (argc == 1) printSummary(argv); int c; while (true) { static struct option long_options[] = { /* These options set a flag. */ //{"verbose", no_argument, &verbose_flag, 1}, {"help", no_argument, 0, 'h'}, {"window-size", required_argument, 0, 'w'}, {"reference", required_argument, 0, 'r'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; c = getopt_long (argc, argv, "hw:r:", long_options, &option_index); if (c == -1) break; switch (c) { case 'w': windowsize = atoi(optarg); break; case 'r': fastaFileName = string(optarg); break; case 'h': printSummary(argv); break; case '?': printSummary(argv); exit(1); break; default: abort (); } } VariantCallFile variantFile; string inputFilename; if (optind == argc - 1) { inputFilename = argv[optind]; variantFile.open(inputFilename); } else { variantFile.open(std::cin); } if (!variantFile.is_open()) { cerr << "could not open VCF file" << endl; exit(1); } FastaReference reference; if (fastaFileName.empty()) { cerr << "a reference is required for haplotype allele generation" << endl; exit(1); } reference.open(fastaFileName); // pattern // when variants are within windowSize from each other, build up local haplotypes // establish all the haplotypes which exist within the window using genotypes+allele#+position map // generate a haplotype allele string for each unique haplotype // for completeness retain phasing information in the genotypes // write a new VCF record in which there are haplotype alleles and correctly described genotypes for each sample // if the variants are outside of the windowSize, just write out the record Variant var(variantFile); Variant outputVar(variantFile); cout << variantFile.header << endl; // get the first distances vector<Variant> cluster; while (variantFile.getNextVariant(var) || !cluster.empty()) { bool haplotypeCluster = false; if (variantFile.done()) { if 
(cluster.size() >= 1) { haplotypeCluster = true; } else { cout << cluster.front() << endl; cluster.clear(); } } else if (isPhased(var)) { if (cluster.empty() || cluster.back().sequenceName == var.sequenceName && var.position - cluster.back().position + cluster.back().ref.size() - 1 <= windowsize) { cluster.push_back(var); } else { if (cluster.size() == 1) { cout << cluster.front() << endl; cluster.clear(); if (!variantFile.done()) { cluster.push_back(var); } } else { haplotypeCluster = true; } } } else { // not phased if (cluster.empty()) { cout << var << endl; } else if (cluster.size() == 1) { cout << cluster.front() << endl; cout << var << endl; } else { haplotypeCluster = true; } } // we need to deal with the current cluster, as our next var is outside of bounds // process the last cluster if it's more than 1 var if (haplotypeCluster) { /* cerr << "cluster: "; for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) { cerr << " " << v->position; } cerr << endl; */ // generate haplotype alleles and genotypes! 
// get the reference sequence across the haplotype in question string referenceHaplotype = reference.getSubSequence(cluster.front().sequenceName, cluster.front().position - 1, cluster.back().position + cluster.back().ref.size() - cluster.front().position); // establish what haplotypes there are by parsing the (phased) genotypes across the samples over these records map<string, vector<vector<int> > > sampleHaplotypes; for (vector<string>::iterator s = var.sampleNames.begin(); s != var.sampleNames.end(); ++s) { // build the haplotype using the genotype fields in the variant cluster // only build haplotypes for samples with complete information string& sampleName = *s; vector<vector<int> >& haplotypes = sampleHaplotypes[sampleName]; bool completeCoverage = true; // ensure complete genotype coverage over the haplotype cluster for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) { if (v->samples.find(sampleName) == v->samples.end() || v->samples[sampleName].find("GT") == v->samples[sampleName].end()) { completeCoverage = false; break; } } if (!completeCoverage) { continue; // skip samples without complete coverage } // what's the ploidy? 
{ string& gt = cluster.front().samples[sampleName]["GT"].front(); vector<string> gtspec = split(gt, "|"); for (vector<string>::iterator g = gtspec.begin(); g != gtspec.end(); ++g) { vector<int> haplotype; haplotypes.push_back(haplotype); } } for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) { string& gt = v->samples[sampleName]["GT"].front(); vector<string> gtspec = split(gt, "|"); vector<string>::iterator g = gtspec.begin(); for (vector<vector<int> >::iterator h = haplotypes.begin(); h != haplotypes.end(); ++h, ++g) { int j; convert(*g, j); h->push_back(j); } } } set<vector<int> > uniqueHaplotypes; for (map<string, vector<vector<int> > >::iterator hs = sampleHaplotypes.begin(); hs != sampleHaplotypes.end(); ++hs) { vector<vector<int> >& haps = hs->second; for (vector<vector<int> >::iterator h = haps.begin(); h != haps.end(); ++h) { uniqueHaplotypes.insert(*h); } } // write new haplotypes map<vector<int>, string> haplotypeSeqs; map<vector<int>, int> haplotypeIndexes; map<int, string> alleles; int impossibleHaplotypes = 0; // always include the reference haplotype as 0 // when we come to it in the haplotypes, we'll ignore it int alleleIndex = 1; for (set<vector<int> >::iterator u = uniqueHaplotypes.begin(); u != uniqueHaplotypes.end(); ++u) { /* for (vector<int>::const_iterator z = u->begin(); z != u->end(); ++z) { cerr << *z; } cerr << endl; */ string haplotype = referenceHaplotype; bool isreference = true; bool impossibleHaplotype = false; int referenceInsertOffset = 0; int j = 0; // index into variant cluster int lastpos = 0; int lastrefend = 0; for (vector<int>::const_iterator z = u->begin(); z != u->end(); ++z, ++j) { int i = *z; if (i != 0) { isreference = false; Variant& vartoInsert = cluster.at(j); string& alternate = vartoInsert.alleles.at(i); if (vartoInsert.position < lastrefend) { cerr << "impossible haplotype, overlapping alleles at " << vartoInsert.sequenceName << ":" << vartoInsert.position << endl; impossibleHaplotype = 
true; break; } else { //cerr << vartoInsert.position << " " << cluster.front().position + referenceInsertOffset << endl; //cerr << "replacing " << vartoInsert.ref << " at " << vartoInsert.position - cluster.front().position + referenceInsertOffset << " with " << alternate << endl; haplotype.replace(vartoInsert.position - cluster.front().position + referenceInsertOffset, vartoInsert.ref.size(), alternate); if (alternate.size() != vartoInsert.ref.size()) { referenceInsertOffset += alternate.size() - vartoInsert.ref.size(); } lastpos = vartoInsert.position; lastrefend = vartoInsert.position + vartoInsert.ref.size(); } } } if (impossibleHaplotype) { ++impossibleHaplotypes; haplotypeIndexes[*u] = -1; // indicates impossible haplotype impossibleHaplotype = false; } else if (isreference) { alleles[0] = haplotype; haplotypeIndexes[*u] = 0; } else { alleles[alleleIndex] = haplotype; haplotypeIndexes[*u] = alleleIndex; ++alleleIndex; } haplotypeSeqs[*u] = haplotype; // if there's not a reference allele, add it if (alleles.find(0) == alleles.end()) { alleles[0] = referenceHaplotype; // nb, there is no reference haplotype among // the samples, so we don't have to add it to // the haplotypeIndexes } } outputVar.ref = alleles[0]; outputVar.alt.clear(); for (int i = 1; i < alleleIndex; ++i) { outputVar.alt.push_back(alleles[i]); } outputVar.sequenceName = cluster.front().sequenceName; outputVar.position = cluster.front().position; outputVar.filter = "."; outputVar.id = "."; outputVar.info = cluster.front().info; outputVar.samples.clear(); outputVar.format = cluster.front().format; // now the genotypes for (vector<string>::iterator s = var.sampleNames.begin(); s != var.sampleNames.end(); ++s) { string& sampleName = *s; vector<string> gt; vector<vector<int> > & hs = sampleHaplotypes[sampleName]; for (vector<vector<int> >::iterator h = hs.begin(); h != hs.end(); ++h) { int hi = haplotypeIndexes[*h]; if (hi != -1) { gt.push_back(convert(hi)); } else { // nonexistent or impossible 
haplotype gt.push_back("."); } } if (gt.size() != 0) { outputVar.samples[sampleName]["GT"].push_back(join(gt, "|")); } } if (cluster.size() - impossibleHaplotypes < 2) { for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) { cout << *v << endl; } } else { if (!outputVar.alt.empty()) { cout << outputVar << endl; } else { cerr << "no alternate alleles remain at " << outputVar.sequenceName << ":" << outputVar.position << " after haplotype validation" << endl; } } cluster.clear(); if (!variantFile.done()) cluster.push_back(var); } } exit(0); // why? return 0; }
int main (int argc, char** argv) { std::string command; std::string fastaFileName; std::string seqname; std::string longseqname; bool dump = false; bool buildIndex = false; // flag to force index building bool printEntropy = false; // entropy printing bool readRegionsFromStdin = false; std::string region; int c; while (true) { static struct option long_options[] = { /* These options set a flag. */ {"help", no_argument, 0, 'h'}, {"index", no_argument, 0, 'i'}, {"entropy", no_argument, 0, 'e'}, {"region", required_argument, 0, 'r'}, {"stdin", no_argument, 0, 'c'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; c = getopt_long (argc, argv, "hciedr:", long_options, &option_index); /* Detect the end of the options. */ if (c == -1) break; switch (c) { case 0: /* If this option set a flag, do nothing else now. */ if (long_options[option_index].flag != 0) break; printf ("option %s", long_options[option_index].name); if (optarg) printf (" with arg %s", optarg); printf ("\n"); break; case 'e': printEntropy = true; break; case 'c': readRegionsFromStdin = true; break; case 'i': buildIndex = true; break; case 'r': region = optarg; break; case 'd': dump = true; break; case 'h': printSummary(); exit(0); break; case '?': /* getopt_long already printed an error message. */ printSummary(); exit(1); break; default: abort (); } } /* Print any remaining command line arguments (not options). */ if (optind < argc) { //cerr << "fasta file: " << argv[optind] << std::endl; fastaFileName = argv[optind]; } else { std::cerr << "Please specify a FASTA file." 
<< std::endl; printSummary(); exit(1); } if (buildIndex) { FastaIndex* fai = new FastaIndex(); //cerr << "generating fasta index file for " << fastaFileName << std::endl; fai->indexReference(fastaFileName); fai->writeIndexFile((std::string) fastaFileName + fai->indexFileExtension()); } std::string sequence; // holds sequence so we can optionally process it FastaReference fr; fr.open(fastaFileName); if (dump) { for (vector<std::string>::iterator s = fr.index->sequenceNames.begin(); s != fr.index->sequenceNames.end(); ++s) { std::cout << *s << "\t" << fr.getSequence(*s) << std::endl; } return 0; } if (region != "") { FastaRegion target(region); sequence = fr.getTargetSubSequence(target); } if (readRegionsFromStdin) { std::string regionstr; while (getline(cin, regionstr)) { FastaRegion target(regionstr); if (target.startPos == -1) { std::cout << fr.getSequence(target.startSeq) << std::endl; } else { std::cout << fr.getSubSequence(target.startSeq, target.startPos - 1, target.length()) << std::endl; } } } else { if (sequence != "") { if (printEntropy) { if (sequence.size() > 0) { std::cout << shannon_H((char*) sequence.c_str(), sequence.size()) << std::endl; } else { std::cerr << "please specify a region or sequence for which to calculate the shannon entropy" << std::endl; } } else { // if no statistical processing is requested, just print the sequence std::cout << sequence << std::endl; } } } return 0; }
int main(int argc, char** argv) { int window = 150; VariantCallFile variantFile; string fastaFileName; int c; while (true) { static struct option long_options[] = { /* These options set a flag. */ //{"verbose", no_argument, &verbose_flag, 1}, {"help", no_argument, 0, 'h'}, {"reference", required_argument, 0, 'r'}, {"window", required_argument, 0, 'w'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; c = getopt_long (argc, argv, "hw:r:", long_options, &option_index); if (c == -1) break; switch (c) { case 'r': fastaFileName = optarg; break; case 'w': window = atoi(optarg); break; case '?': printSummary(argv); exit(1); break; case 'h': printSummary(argv); break; default: abort (); } } if (optind < argc) { string filename = argv[optind]; variantFile.open(filename); } else { variantFile.open(std::cin); } if (!variantFile.is_open()) { cerr << "could not open VCF file" << endl; exit(1); } FastaReference fastaReference; if (fastaFileName.empty()) { cerr << "a reference is required" << endl; exit(1); } else { fastaReference.open(fastaFileName); } /* variantFile.addHeaderLine("##INFO=<ID=TYPE,Number=A,Type=String,Description=\"The type of allele, either snp, mnp, ins, del, or complex.\">"); variantFile.addHeaderLine("##INFO=<ID=LEN,Number=A,Type=Integer,Description=\"allele length\">"); if (!parseFlag.empty()) { variantFile.addHeaderLine("##INFO=<ID="+parseFlag+",Number=0,Type=Flag,Description=\"The allele was parsed using vcfallelicprimitives.\">"); } */ cout << variantFile.header << endl; Variant var(variantFile); while (variantFile.getNextVariant(var)) { // if there is no indel, there is nothing to realign bool hasIndel = false; for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { if (a->size() != var.ref.size()) { hasIndel = true; break; } } if (!hasIndel) { cout << var << endl; continue; } vector<AltAlignment> alignments; string ref; // determine window size to prevent mismapping with SW algorithm int 
currentWindow = window; int scale = 2; if (var.ref.size()*scale > currentWindow) currentWindow = var.ref.size()*scale; for (vector<string>::iterator a = var.alleles.begin(); a != var.alleles.end(); ++a) { if (a->size()*scale > currentWindow) { currentWindow = a->size()*scale; } } // while the entropy of either flank is < some target entropy (~1 is fine), increase the flank sizes while (currentWindow < 2000) { // limit to one step > than this string refTarget = fastaReference.getSubSequence(var.sequenceName, var.position - 1 - currentWindow/2, currentWindow); if (entropy(refTarget.substr(0, refTarget.size()/2)) < 1 || entropy(refTarget.substr(refTarget.size()/2)) < 1) { currentWindow *= scale; } else { break; } } // do the alignments getAlignment(var, fastaReference, ref, alignments, currentWindow); // stably left align the alignments for (vector<AltAlignment>::iterator a = alignments.begin(); a != alignments.end(); ++a) { Cigar cigarBefore = a->cigar; //cerr << a->seq << endl; //cerr << "before : " << a->pos << " " << joinCigar(a->cigar) << endl; long int prev = a->pos; stablyLeftAlign(a->seq, ref, a->cigar, 20, false); //cerr << "after : " << a->pos << " " << joinCigar(a->cigar) << endl; if (a->pos != prev) cerr << "modified alignment @ " << var << endl; } //cout << var << endl; // transform the mappings // chop off leading matching bases // find the range of bp in the alleles // make the new ref allele // make the new alt alleles // emit the var long int newPosition = var.position+currentWindow/2; long int newEndPosition = var.position-currentWindow/2; // check for no-indel case int newLength = var.ref.size(); bool giveUp = false; for (vector<AltAlignment>::iterator a = alignments.begin(); a != alignments.end() && !giveUp; ++a) { // get the first mismatching position Cigar::iterator c = a->cigar.begin(); int rp = 0; int sp = 0; bool hitMismatch = false; int matchingBpAtStart = 0; int matchingBpAtEnd = 0; // will be set to true if the first reference position 
match is broken by a SNP, not an indel bool leadingSNP = false; while (c != a->cigar.end()) { char op = c->second[0]; if (c == a->cigar.begin()) { if (op != 'M') { cerr << "alignment does not start on matched sequence" << endl; cerr << var << endl; exit(1); } int i = 0; for ( ; i < c->first; ++i) { if (ref[i] != a->seq[i]) { leadingSNP = true; break; } } matchingBpAtStart = i; } if (!leadingSNP && c == (a->cigar.begin()+1)) { // if the first thing we run into is an indel, step back, per VCF spec if (op == 'D' || op == 'I') { --matchingBpAtStart; } } if (c == (a->cigar.end()-1)) { if (op != 'M') { // soft clip at end // it'll be hard to interpret this // the alignments sometimes generate this // best thing to do is to move on //cerr << "alignment does not end on matched sequence" << endl; //cout << var << endl; //exit(1); giveUp = true; break; } int i = 0; for ( ; i < c->first; ++i) { if (ref[ref.size()-1-i] != a->seq[a->seq.size()-1-i]) { break; } } matchingBpAtEnd = i; } ++c; } int altMismatchLength = a->seq.size() - matchingBpAtEnd - matchingBpAtStart; int refMismatchLength = (var.ref.size() + currentWindow) - matchingBpAtEnd - matchingBpAtStart; //cerr << "alt mismatch length " << altMismatchLength << endl // << "ref mismatch length " << refMismatchLength << endl; long int newStart = var.position - currentWindow/2 + matchingBpAtStart; long int newEnd = newStart + refMismatchLength; //cerr << "ref should run from " << newStart << " to " << newStart + refMismatchLength << endl; newPosition = min(newStart, newPosition); newEndPosition = max(newEnd, newEndPosition); //cerr << newPosition << " " << newEndPosition << endl; //if (newRefSize < refMismatchLength) newRefSize = refMismatchLength; } // the alignment failed for some reason, continue if (giveUp) { cout << var << endl; continue; } //cerr << "new ref start " << newPosition << " and end " << newEndPosition << " was " << var.position << "," << var.position + var.ref.size() << endl; int newRefSize = newEndPosition 
- newPosition; string newRef = fastaReference.getSubSequence(var.sequenceName, newPosition-1, newRefSize); // get the number of bp to strip from the alts int stripFromStart = currentWindow/2 - (var.position - newPosition); int stripFromEnd = (currentWindow + newRefSize) - (stripFromStart + newRefSize) + (var.ref.size() - newRefSize); //cerr << "strip from start " << stripFromStart << endl; //cerr << "strip from end " << stripFromEnd << endl; vector<string> newAlt; vector<string>::iterator l = var.alt.begin(); bool failedAlt = false; for (vector<AltAlignment>::iterator a = alignments.begin(); a != alignments.end(); ++a, ++l) { int diff = newRef.size() - l->size(); string alt = a->seq.substr(stripFromStart, a->seq.size() - (stripFromEnd + stripFromStart)); newAlt.push_back(alt); if (alt.empty()) failedAlt = true; } // check the before/after haplotypes bool brokenRealignment = false; if (!newRef.empty() && !failedAlt) { int slop = 50; // 50 extra bp! int haplotypeStart = min(var.position, newPosition) - slop; int haplotypeEnd = max(var.position + var.ref.size(), newPosition + newRef.size()) + slop; string referenceHaplotype = fastaReference.getSubSequence(var.sequenceName, haplotypeStart - 1, haplotypeEnd - haplotypeStart); vector<string>::iterator o = var.alt.begin(); vector<string>::iterator n = newAlt.begin(); for ( ; o != var.alt.end() ; ++o, ++n) { // map the haplotypes string oldHaplotype = referenceHaplotype; string newHaplotype = referenceHaplotype; oldHaplotype.replace(var.position - haplotypeStart, var.ref.size(), *o); newHaplotype.replace(newPosition - haplotypeStart, newRef.size(), *n); if (oldHaplotype != newHaplotype) { cerr << "broken left alignment!" 
<< endl << "old " << oldHaplotype << endl << "new " << newHaplotype << endl; cerr << "was: " << var << endl; brokenRealignment = true; } } } // *if* everything is OK, update the variant if (!brokenRealignment && !newRef.empty() && !failedAlt) { var.ref = newRef; var.alt = newAlt; var.position = newPosition; } cout << var << endl; // for each parsedalternate, get the position // build a new vcf record for that position // unless we are already at the position ! // take everything which is unique to that allele (records) and append it to the new record // then handle genotypes; determine the mapping between alleleic primitives and convert to phased haplotypes // this means taking all the parsedAlternates and, for each one, generating a pattern of allele indecies corresponding to it //for (vector<Variant>::iterator v = variants.begin(); v != variants.end(); ++v) { } return 0; }
int main(int argc, char** argv) { int c; string fastaRef; bool keepFailures = false; bool excludeFailures = false; if (argc == 1) printSummary(argv); while (true) { static struct option long_options[] = { /* These options set a flag. */ //{"verbose", no_argument, &verbose_flag, 1}, {"help", no_argument, 0, 'h'}, {"fasta-reference", required_argument, 0, 'f'}, {"exclude-failures", no_argument, 0, 'x'}, {"keep-failures", no_argument, 0, 'k'}, //{"length", no_argument, &printLength, true}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; c = getopt_long (argc, argv, "hxkf:", long_options, &option_index); /* Detect the end of the options. */ if (c == -1) break; switch (c) { case 0: /* If this option set a flag, do nothing else now. */ if (long_options[option_index].flag != 0) break; printf ("option %s", long_options[option_index].name); if (optarg) printf (" with arg %s", optarg); printf ("\n"); break; case 'f': fastaRef = optarg; break; case 'x': excludeFailures = true; break; case 'k': keepFailures = true; break; case 'h': printSummary(argv); exit(0); break; case '?': /* getopt_long already printed an error message. 
*/ printSummary(argv); exit(1); break; default: abort (); } } if (fastaRef.empty()) { cerr << "a FASTA reference sequence must be specified" << endl; exit(1); } FastaReference ref; ref.open(fastaRef); VariantCallFile variantFile; string inputFilename; if (optind == argc - 1) { inputFilename = argv[optind]; variantFile.open(inputFilename); } else { variantFile.open(std::cin); } if (!variantFile.is_open()) { return 1; } if (keepFailures || excludeFailures) { cout << variantFile.header << endl; } Variant var(variantFile); while (variantFile.getNextVariant(var)) { int refstart = var.position - 1; // convert to 0-based string matchedRef = ref.getSubSequence(var.sequenceName, refstart, var.ref.size()); if (var.ref != matchedRef) { if (keepFailures) { cout << var << endl; } else if (!excludeFailures) { cout << "mismatched reference " << var.ref << " should be " << matchedRef << " at " << var.sequenceName << ":" << var.position << endl; } } else if (excludeFailures) { cout << var << endl; } } return 0; }
int main(int argc, char** argv) { string vcfFileName; string fastaFileName; int windowsize = 100; bool includePreviousBaseForIndels = false; bool useMNPs = true; int altwindowsize = 50; // constants for SmithWaterman algorithm float matchScore = 10.0f; float mismatchScore = -9.0f; float gapOpenPenalty = 15.0f; float gapExtendPenalty = 6.66f; bool useEntropy = false; bool useRepeatGapExtendPenalty = false; float repeatGapExtendPenalty = 1; bool adjustVcf = false; string adjustedTag = "remappedCIGAR"; if (argc == 1) printSummary(argv); int c; while (true) { static struct option long_options[] = { /* These options set a flag. */ //{"verbose", no_argument, &verbose_flag, 1}, {"help", no_argument, 0, 'h'}, {"ref-window-size", required_argument, 0, 'w'}, {"reference", required_argument, 0, 'r'}, {"match-score", required_argument, 0, 'm'}, {"mismatch-score", required_argument, 0, 'x'}, {"gap-open-penalty", required_argument, 0, 'o'}, {"gap-extend-penalty", required_argument, 0, 'e'}, {"alt-window-size", required_argument, 0, 's'}, {"entropy-gap-open", no_argument, 0, 'z'}, {"repeat-gap-extend", no_argument, 0, 'R'}, {"adjust-vcf", required_argument, 0, 'a'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. 
*/ int option_index = 0; c = getopt_long (argc, argv, "hza:w:r:m:x:o:e:s:R:", long_options, &option_index); if (c == -1) break; switch (c) { case 'w': windowsize = atoi(optarg); break; case 'a': adjustVcf = true; adjustedTag = optarg; break; case 'r': fastaFileName = string(optarg); break; case 'h': printSummary(argv); break; case 'm': matchScore = atof(optarg); break; case 'x': mismatchScore = atof(optarg); break; case 'o': gapOpenPenalty = atof(optarg); break; case 'e': gapExtendPenalty = atof(optarg); break; case 's': altwindowsize = atoi(optarg); break; case 'z': useEntropy = true; break; case 'R': useRepeatGapExtendPenalty = true; repeatGapExtendPenalty = atof(optarg); break; case '?': printSummary(argv); exit(1); break; default: abort (); } } VariantCallFile variantFile; string inputFilename; if (optind == argc - 1) { inputFilename = argv[optind]; variantFile.open(inputFilename); } else { variantFile.open(std::cin); } if (!variantFile.is_open()) { cerr << "could not open VCF file" << endl; exit(1); } FastaReference freference; if (fastaFileName.empty()) { cerr << "a reference is required" << endl; exit(1); } else { freference.open(fastaFileName); } if (adjustVcf) { vector<string> commandline; for (int i = 0; i < argc; ++i) commandline.push_back(argv[i]); variantFile.addHeaderLine("##INFO=<ID=" + adjustedTag + ",Number=A,Type=String,Description=\"CIGAR when remapped using"+ join(commandline, " ") +"\">"); } cout << variantFile.header << endl; Variant var(variantFile); while (variantFile.getNextVariant(var)) { //if (!adjustVcf) { cout << endl; cout << var << endl; //} map<string, vector<VariantAllele> > variantAlleles; vector<vector<pair<int, char> > > cigars; vector<int> positionDiffs; for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { //if (!adjustVcf) cout << endl; cout << endl; // try to remap locally string reference = freference.getSubSequence(var.sequenceName, var.position - 1 - windowsize, windowsize * 2 + var.ref.size()); // 
passed to sw align unsigned int referencePos; string cigar; string& alternate = *a; vector<VariantAllele>& variants = variantAlleles[alternate]; string alternateQuery = reference.substr(windowsize - altwindowsize, altwindowsize) + alternate + reference.substr(reference.size() - windowsize, altwindowsize); //cout << "REF:\t" << reference << endl; //cout << "ALT:\t" << string(windowsize - altwindowsize, ' ') << alternateQuery << endl; CSmithWatermanGotoh sw(matchScore, mismatchScore, gapOpenPenalty, gapExtendPenalty); if (useEntropy) sw.EnableEntropyGapPenalty(1); if (useRepeatGapExtendPenalty) sw.EnableRepeatGapExtensionPenalty(repeatGapExtendPenalty); sw.Align(referencePos, cigar, reference, alternateQuery); int altpos = 0; int refpos = 0; int len; string slen; vector<pair<int, char> > cigarData; string ref = reference.substr(referencePos); positionDiffs.push_back(referencePos); // TODO this... is borked stringstream refss; stringstream altss; if (!adjustVcf) cout << cigar << endl; cout << cigar << endl; for (string::iterator c = cigar.begin(); c != cigar.end(); ++c) { switch (*c) { case 'I': len = atoi(slen.c_str()); slen.clear(); if (altpos < altwindowsize) { cigarData.push_back(make_pair(len, 'M')); } else { cigarData.push_back(make_pair(len, *c)); } altss << alternateQuery.substr(altpos, len); refss << string(len, '-'); altpos += len; break; case 'D': len = atoi(slen.c_str()); slen.clear(); if (altpos < altwindowsize) { } else { cigarData.push_back(make_pair(len, *c)); } refss << ref.substr(refpos, len); altss << string(len, '-'); refpos += len; break; case 'M': len = atoi(slen.c_str()); slen.clear(); { for (int i = 0; i < len; ++i) { if (ref.at(refpos + i) == alternateQuery.at(altpos + i)) { if (!cigarData.empty() && cigarData.back().second == 'M') { cigarData.back().first++; } else { cigarData.push_back(make_pair(1, 'M')); } } else { if (!cigarData.empty() && cigarData.back().second == 'X') { cigarData.back().first++; } else { cigarData.push_back(make_pair(1, 
'X')); } } } } refss << ref.substr(refpos, len); altss << alternateQuery.substr(altpos, len); refpos += len; altpos += len; break; case 'S': len = atoi(slen.c_str()); slen.clear(); cigarData.push_back(make_pair(len, *c)); refss << ref.substr(refpos, len); //altss << alternateQuery.substr(altpos, len); // TODO deal with soft clipping, weird behavior refpos += len; altpos += len; break; default: len = 0; slen += *c; break; } } if (!adjustVcf) { cout << "ref:\t" << refss.str() << endl; cout << "alt:\t" << altss.str() << endl; } else { cout << "ref:\t" << refss.str() << endl; cout << "alt:\t" << altss.str() << endl; cigars.push_back(cigarData); } } if (adjustVcf) { int substart = cigars.front().front().first; int subend = cigars.front().back().first; // find the min and max match for (vector<vector<pair<int, char> > >::iterator c = cigars.begin(); c != cigars.end(); ++c) { if (c->front().second == 'M' && c->front().first <= substart) { substart = c->front().first; if (c->size() > 1 && c->at(1).second != 'X') { --substart; } } if (c->back().second == 'M' && c->back().first <= subend) { subend = c->back().first; } } // adjust the cigars and get the new reference length int reflen = 0; for (vector<vector<pair<int, char> > >::iterator c = cigars.begin(); c != cigars.end(); ++c) { c->front().first -= substart; c->back().first -= subend; int crf = cigarRefLen(*c); if (crf > reflen) reflen = crf; var.info[adjustedTag].push_back(joinCigar(*c)); } // find the lowest positional difference int pdiff = 0; for (vector<int>::iterator d = positionDiffs.begin(); d != positionDiffs.end(); ++d) { if (*d + altwindowsize < pdiff) pdiff = *d + altwindowsize; } // adjust the reference string var.position += pdiff; // adjust the variant position var.ref = freference.getSubSequence(var.sequenceName, var.position - 1, reflen); cout << var << endl; } } return 0; }
int main(int argc, char** argv) { string bedFileName; string vcfFileName; string fastaFileName; bool intersecting = false; bool unioning = false; bool invert = false; bool contained = true; bool overlapping = false; int windowsize = 30; if (argc == 1) printSummary(argv); int c; while (true) { static struct option long_options[] = { /* These options set a flag. */ //{"verbose", no_argument, &verbose_flag, 1}, {"help", no_argument, 0, 'h'}, {"bed", required_argument, 0, 'b'}, {"invert", no_argument, 0, 'v'}, {"intersect-vcf", required_argument, 0, 'i'}, {"union-vcf", required_argument, 0, 'u'}, {"contained", no_argument, 0, 'c'}, {"overlapping", no_argument, 0, 'o'}, {"window-size", required_argument, 0, 'w'}, {"reference", required_argument, 0, 'r'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; c = getopt_long (argc, argv, "hvcob:i:u:w:r:", long_options, &option_index); if (c == -1) break; switch (c) { case 'w': windowsize = atoi(optarg); break; case 'b': bedFileName = string(optarg); break; case 'i': intersecting = true; vcfFileName = string(optarg); break; case 'u': unioning = true; vcfFileName = string(optarg); break; case 'r': fastaFileName = string(optarg); break; case 'v': invert = true; break; case 'c': contained = true; break; case 'o': overlapping = true; break; case 'h': printSummary(argv); break; case '?': printSummary(argv); exit(1); break; default: abort (); } } bool usingBED = false; if (!bedFileName.empty()) { usingBED = true; } BedReader bed; if (usingBED) { bed.open(bedFileName); } VariantCallFile variantFile; string inputFilename; if (optind == argc - 1) { inputFilename = argv[optind]; variantFile.open(inputFilename); } else { variantFile.open(std::cin); } if (!variantFile.is_open()) { cerr << "could not open VCF file" << endl; exit(1); } if (usingBED) { variantFile.parseSamples = false; } VariantCallFile otherVariantFile; if (!vcfFileName.empty()) { otherVariantFile.open(vcfFileName); if 
(!otherVariantFile.is_open()) { cerr << "could not open VCF file " << vcfFileName << endl; exit(1); } } FastaReference reference; if (unioning || intersecting) { if (fastaFileName.empty()) { cerr << "a reference is required for haplotype-based intersection and unioniong" << endl; exit(1); } reference.open(fastaFileName); } if (!unioning && !intersecting) { variantFile.parseSamples = false; // faster, as when we are // only bed-intersecting we // can do position-only // output and don't have to // manipulate specific // alleles } // read the VCF file for union or intersection into an interval tree // indexed using some proximity window map<string, IntervalTree<Variant*> > variantIntervals; map<string, list<Variant> > otherVariants; map<string, vector<Interval<Variant*> > > otherVariantIntervals; if (unioning || intersecting) { Variant ovar(otherVariantFile); while (otherVariantFile.getNextVariant(ovar)) { long int left = ovar.position; long int right = left + ovar.ref.size(); // this should be 1-past the end otherVariants[ovar.sequenceName].push_back(ovar); Variant* v = &otherVariants[ovar.sequenceName].back(); otherVariantIntervals[ovar.sequenceName].push_back(Interval<Variant*>(left, right, v)); } for (map<string, vector<Interval<Variant*> > >::iterator j = otherVariantIntervals.begin(); j != otherVariantIntervals.end(); ++j) { variantIntervals[j->first] = IntervalTree<Variant*>(j->second); } } set<Variant*> outputVariants; long unsigned int lastOutputPosition = 0; string lastSequenceName; cout << variantFile.header; Variant var(variantFile); while (variantFile.getNextVariant(var)) { if (lastSequenceName.empty()) { lastSequenceName = var.sequenceName; } else if (lastSequenceName != var.sequenceName) { if (unioning) { vector<Interval<Variant*> > previousRecords; long int lastSeqLength = reference.sequenceLength(lastSequenceName); variantIntervals[lastSequenceName].findContained(lastOutputPosition, lastSeqLength, previousRecords); for (vector<Interval<Variant*> 
>::iterator r = previousRecords.begin(); r != previousRecords.end(); ++r) { Variant* v = r->value; if (outputVariants.find(v) == outputVariants.end()) { outputVariants.insert(v); cout << *v << endl; // does this output everything in correct order? } } lastSequenceName = var.sequenceName; lastOutputPosition = 0; } } if (usingBED) { BedTarget record(var.sequenceName, var.position, var.position + var.ref.size(), ""); vector<BedTarget*> overlaps = bed.targetsOverlapping(record); if (!invert && !overlaps.empty()) { cout << variantFile.line << endl; } else if (invert && overlaps.empty()) { cout << variantFile.line << endl; } } else if (unioning || intersecting) { // TODO check overlaps with union/intersection // hmm... for unioning, you might need to step through the original VCF records // but the idea is to exclude the haplotype-based duplicates vector<Interval<Variant*> > results; variantIntervals[var.sequenceName].findContained(var.position - windowsize, var.position + var.ref.size() + windowsize, results); vector<Variant*> overlapping; for (vector<Interval<Variant*> >::iterator r = results.begin(); r != results.end(); ++r) { overlapping.push_back(r->value); } if (unioning) { // unioning strategy // write out all the records from the last file // between the last one printed out and the first // one we're about to print out vector<Interval<Variant*> > previousRecords; variantIntervals[var.sequenceName].findOverlapping(lastOutputPosition, var.position - windowsize, previousRecords); map<long int, vector<Variant*> > variants; for (vector<Interval<Variant*> >::iterator r = previousRecords.begin(); r != previousRecords.end(); ++r) { Variant* v = r->value; if (outputVariants.find(v) == outputVariants.end()) { outputVariants.insert(v); variants[v->position].push_back(v); } } for (map<long int, vector<Variant*> >::iterator v = variants.begin(); v != variants.end(); ++v) { for (vector<Variant*>::iterator o = v->second.begin(); o != v->second.end(); ++o) { cout << **o << 
endl; lastOutputPosition = max(lastOutputPosition, (*o)->position); } } // TODO find the duplicates for the other file } if (overlapping.empty()) { if (unioning || (intersecting && invert)) { cout << var << endl; lastOutputPosition = max(lastOutputPosition, var.position); } } else { // get the min and max of the overlaps int haplotypeStart = var.position; int haplotypeEnd = var.position + var.ref.size(); for (vector<Variant*>::iterator v = overlapping.begin(); v != overlapping.end(); ++v) { haplotypeStart = min((*v)->position, (long unsigned int) haplotypeStart); haplotypeEnd = max((*v)->position + (*v)->ref.size(), (long unsigned int) haplotypeEnd); } // for everything overlapping and the current variant, construct the local haplotype within the bounds // if there is an exact match, the alllele in the current VCF does intersect string referenceHaplotype = reference.getSubSequence(var.sequenceName, haplotypeStart - 1, haplotypeEnd - haplotypeStart); map<string, vector<Variant*> > haplotypes; for (vector<Variant*>::iterator v = overlapping.begin(); v != overlapping.end(); ++v) { Variant& variant = **v; for (vector<string>::iterator a = variant.alt.begin(); a != variant.alt.end(); ++a) { string haplotype = referenceHaplotype; // get the relative start and end coordinates for the variant alternate allele int relativeStart = variant.position - haplotypeStart; haplotype.replace(relativeStart, variant.ref.size(), *a); haplotypes[haplotype].push_back(*v); } } // determine the non-intersecting alts vector<string> altsToRemove; for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { string haplotype = referenceHaplotype; int relativeStart = var.position - haplotypeStart; haplotype.replace(relativeStart, var.ref.size(), *a); map<string, vector<Variant*> >::iterator h = haplotypes.find(haplotype); if ((intersecting && !invert && h == haplotypes.end()) || (intersecting && invert && h != haplotypes.end()) || (unioning && h != haplotypes.end())) { 
altsToRemove.push_back(*a); } } // remove the non-overlapping (intersecting) or overlapping (unioning) alts for (vector<string>::iterator a = altsToRemove.begin(); a != altsToRemove.end(); ++a) { var.removeAlt(*a); } if (unioning) { // somehow sort the records and combine them? map<long int, vector<Variant*> > variants; for (vector<Variant*>::iterator o = overlapping.begin(); o != overlapping.end(); ++o) { if ((*o)->position <= var.position && // check ensures proper ordering of variants on output outputVariants.find(*o) == outputVariants.end()) { outputVariants.insert(*o); variants[(*o)->position].push_back(*o); } } // add in the current variant, if it has alts left if (!var.alt.empty()) { variants[var.position].push_back(&var); } for (map<long int, vector<Variant*> >::iterator v = variants.begin(); v != variants.end(); ++v) { for (vector<Variant*>::iterator o = v->second.begin(); o != v->second.end(); ++o) { cout << **o << endl; lastOutputPosition = max(lastOutputPosition, (*o)->position); } } } else { // if any alts remain, output the variant record if (!var.alt.empty()) { cout << var << endl; lastOutputPosition = max(lastOutputPosition, var.position); } } } } } // if unioning, and any variants remain, output them if (unioning) { for (map<string, list<Variant> >::iterator chrom = otherVariants.find(lastSequenceName); chrom != otherVariants.end(); ++chrom) { for (list<Variant>::iterator v = chrom->second.begin(); v != chrom->second.end(); ++v) { Variant* variant = &*v; if (outputVariants.find(variant) == outputVariants.end()) { outputVariants.insert(variant); cout << *variant << endl; // TODO guarantee sorting } } } } exit(0); // why? return 0; }
int main(int argc, char** argv) { string ref_file = ""; vector<string> insertion_files; int max_interval = -1; bool replace_sequences = true; int c = 0; while (true) { static struct option long_options[] = { {"insertions", no_argument, 0, 'i'}, {"help", no_argument, 0, 'h'}, {"reference", required_argument, 0, 'r'}, {"no-replace-sequences", no_argument, 0, 's'}, {0, 0, 0, 0} }; int option_index = 0; c = getopt_long (argc, argv, "sr:i:h", long_options, &option_index); if (c == -1) break; /* Detect the end of the options. */ switch(c){ case 's': replace_sequences = false; break; case 'r': ref_file = optarg; break; case 'i': insertion_files.push_back(optarg); break; case 'h': case '?': print_help(argv); exit(1); default: print_help(argv); abort(); } } if (argc < 2){ print_help(argv); exit(1); } VariantCallFile variantFile; string filename = argv[argc - 1]; variantFile.open(filename); if (!variantFile.is_open()) { return 1; } vector<FastaReference*> insertions; if (!insertion_files.empty()){ for (auto x : insertion_files){ FastaReference* ins = new FastaReference(); insertions.push_back(ins); ins->open(x); } } FastaReference ref; if(!ref_file.empty()){ ref.open(ref_file); } cout << variantFile.header << endl; Variant var; while (variantFile.getNextVariant(var)) { bool valid = var.canonicalize_sv(ref, insertions, replace_sequences, max_interval); if (!valid){ cerr << "Variant could not be normalized" << var << endl; } cout << var << endl; } return 0; }
int main(int argc, char** argv) { int c; string fastaRef; int windowSize = 0; if (argc == 1) printSummary(argv); while (true) { static struct option long_options[] = { /* These options set a flag. */ //{"verbose", no_argument, &verbose_flag, 1}, {"help", no_argument, 0, 'h'}, {"fasta-reference", required_argument, 0, 'f'}, {"window-size", required_argument, 0, 'w'}, //{"length", no_argument, &printLength, true}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; c = getopt_long (argc, argv, "hf:w:", long_options, &option_index); /* Detect the end of the options. */ if (c == -1) break; switch (c) { case 0: /* If this option set a flag, do nothing else now. */ if (long_options[option_index].flag != 0) break; printf ("option %s", long_options[option_index].name); if (optarg) printf (" with arg %s", optarg); printf ("\n"); break; case 'f': fastaRef = optarg; break; case 'w': windowSize = atoi(optarg); break; case 'h': printSummary(argv); exit(0); break; case '?': /* getopt_long already printed an error message. 
*/ printSummary(argv); exit(1); break; default: abort (); } } if (windowSize == 0) { cerr << "a window size must be specified" << endl; exit(1); } if (fastaRef.empty()) { cerr << "a FASTA reference sequence must be specified" << endl; exit(1); } FastaReference ref; ref.open(fastaRef); VariantCallFile variantFile; string inputFilename; if (optind == argc - 1) { inputFilename = argv[optind]; variantFile.open(inputFilename); } else { variantFile.open(std::cin); } if (!variantFile.is_open()) { return 1; } variantFile.addHeaderLine("##INFO=<ID=EntropyLeft,Number=1,Type=Float,Description=\"Entropy of left-flanking sequence of "+ convert(windowSize) +"bp\">"); variantFile.addHeaderLine("##INFO=<ID=EntropyCenter,Number=1,Type=Float,Description=\"Entropy of centered sequence of "+ convert(windowSize) +"bp\">"); variantFile.addHeaderLine("##INFO=<ID=EntropyRight,Number=1,Type=Float,Description=\"Entropy of right-flanking sequence of "+ convert(windowSize) +"bp\">"); variantFile.addHeaderLine("##INFO=<ID=EntropyRef,Number=1,Type=Float,Description=\"Entropy of REF allele\">"); variantFile.addHeaderLine("##INFO=<ID=EntropyAlt,Number=A,Type=Float,Description=\"Entropy of each ALT allele\">"); cout << variantFile.header << endl; Variant var(variantFile); while (variantFile.getNextVariant(var)) { // get the ref start and end positions int refstart = var.position - 1; // convert to 0-based int refend = var.position + var.ref.size() - 1; string leftseq = ref.getSubSequence(var.sequenceName, refstart - windowSize, windowSize); string rightseq = ref.getSubSequence(var.sequenceName, refend, windowSize); string centerseq = ref.getSubSequence(var.sequenceName, refstart - windowSize/2, windowSize); double entropyLeft = shannon_H((char*) &leftseq[0], windowSize); double entropyRight = shannon_H((char*) &rightseq[0], windowSize); double entropyCenter = shannon_H((char*) ¢erseq[0], windowSize); double entropyRef = shannon_H((char*) var.ref.c_str(), var.ref.size()); 
var.info["EntropyLeft"].clear(); var.info["EntropyRight"].clear(); var.info["EntropyCenter"].clear(); var.info["EntropyRef"].clear(); var.info["EntropyAlt"].clear(); var.info["EntropyLeft"].push_back(convert(entropyLeft)); var.info["EntropyRight"].push_back(convert(entropyRight)); var.info["EntropyCenter"].push_back(convert(entropyCenter)); var.info["EntropyRef"].push_back(convert(entropyRef)); for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { double entropyAlt = shannon_H((char*) a->c_str(), a->size()); var.info["EntropyAlt"].push_back(convert(entropyAlt)); } cout << var << endl; } return 0; }
int main_construct(int argc, char** argv) { if (argc == 2) { help_construct(argv); return 1; } // Make a constructor to fill in Constructor constructor; // We also parse some arguments separately. vector<string> fasta_filenames; vector<string> vcf_filenames; string region; bool region_is_chrom = false; int c; optind = 2; // force optind past command positional argument while (true) { static struct option long_options[] = { /* These options set a flag. */ //{"verbose", no_argument, &verbose_flag, 1}, {"vcf", required_argument, 0, 'v'}, {"reference", required_argument, 0, 'r'}, {"rename", required_argument, 0, 'n'}, {"alt-paths", no_argument, 0, 'a'}, {"progress", no_argument, 0, 'p'}, {"region-size", required_argument, 0, 'z'}, {"threads", required_argument, 0, 't'}, {"region", required_argument, 0, 'R'}, {"region-is-chrom", no_argument, 0, 'C'}, {"node-max", required_argument, 0, 'm'},\ {"flat-alts", no_argument, 0, 'f'}, {0, 0, 0, 0} }; int option_index = 0; c = getopt_long (argc, argv, "v:r:n:ph?z:t:R:m:as:Cf", long_options, &option_index); /* Detect the end of the options. 
*/ if (c == -1) break; switch (c) { case 'v': vcf_filenames.push_back(optarg); break; case 'r': fasta_filenames.push_back(optarg); break; case 'n': { // Parse the rename old=new string key_value(optarg); auto found = key_value.find('='); if (found == string::npos || found == 0 || found + 1 == key_value.size()) { cerr << "error:[vg construct] could not parse rename " << key_value << endl; exit(1); } // Parse out the two parts string vcf_contig = key_value.substr(0, found); string fasta_contig = key_value.substr(found + 1); // Add the name mapping constructor.add_name_mapping(vcf_contig, fasta_contig); } break; case 'a': constructor.alt_paths = true; break; case 'p': constructor.show_progress = true; break; case 'z': constructor.vars_per_chunk = atoi(optarg); break; case 'R': region = optarg; break; case 'C': region_is_chrom = true; break; case 't': omp_set_num_threads(atoi(optarg)); break; case 'm': constructor.max_node_size = atoi(optarg); break; case 'f': constructor.flat = true; break; case 'h': case '?': /* getopt_long already printed an error message. */ help_construct(argv); exit(1); break; default: abort (); } } if (constructor.max_node_size == 0) { // Make sure we can actually make nodes cerr << "error:[vg construct] max node size cannot be 0" << endl; exit(1); } if (!region.empty()) { // We want to limit to a certain region if (!region_is_chrom) { // We are allowed to parse the region. 
// Break out sequence name and region bounds string seq_name; int start_pos = 0, stop_pos = 0; parse_region(region, seq_name, start_pos, stop_pos); if (start_pos != 0 && stop_pos != 0) { // These are 0-based, so if both are nonzero we got a real set of coordinates if (constructor.show_progress) { cerr << "Restricting to " << seq_name << " from " << start_pos << " to " << stop_pos << endl; } constructor.allowed_vcf_names.insert(seq_name); // Make sure to correct the coordinates to 0-based exclusive-end, from 1-based inclusive-end constructor.allowed_vcf_regions[seq_name] = make_pair(start_pos - 1, stop_pos); } else if (start_pos == 0 && stop_pos == 0) { // We just got a name cerr << "Restricting to " << seq_name << " from 1 to end" << endl; constructor.allowed_vcf_names.insert(seq_name); } else { // This doesn't make sense. Does it have like one coordinate? cerr << "error:[vg construct] could not parse " << region << endl; exit(1); } } else { // We have been told not to parse the region cerr << "Restricting to " << region << " from 1 to end" << endl; constructor.allowed_vcf_names.insert(region); } } // This will own all the VCF files vector<unique_ptr<vcflib::VariantCallFile>> variant_files; for (auto& vcf_filename : vcf_filenames) { // Make sure each VCF file exists. Otherwise Tabix++ may exit with a non- // helpful message. // We can't invoke stat woithout a place for it to write. But all we // really want is its return value. 
struct stat temp; if(stat(vcf_filename.c_str(), &temp)) { cerr << "error:[vg construct] file \"" << vcf_filename << "\" not found" << endl; return 1; } vcflib::VariantCallFile* variant_file = new vcflib::VariantCallFile(); variant_files.emplace_back(variant_file); variant_file->open(vcf_filename); if (!variant_file->is_open()) { cerr << "error:[vg construct] could not open" << vcf_filename << endl; return 1; } } if (fasta_filenames.empty()) { cerr << "error:[vg construct] a reference is required for graph construction" << endl; return 1; } vector<unique_ptr<FastaReference>> references; for (auto& fasta_filename : fasta_filenames) { // Open each FASTA file FastaReference* reference = new FastaReference(); references.emplace_back(reference); reference->open(fasta_filename); } // We need a callback to handle pieces of graph as they are produced. auto callback = [&](Graph& big_chunk) { // TODO: these chunks may be too big to (de)serialize directly. For now, // just serialize them directly anyway. #pragma omp critical (cout) stream::write(cout, 1, std::function<Graph(uint64_t)>([&](uint64_t chunk_number) -> Graph { assert(chunk_number == 0); // Just spit out our one chunk return big_chunk; })); }; // Make vectors of just bare pointers vector<vcflib::VariantCallFile*> vcf_pointers; for(auto& vcf : variant_files) { vcf_pointers.push_back(vcf.get()); } vector<FastaReference*> fasta_pointers; for(auto& fasta : references) { fasta_pointers.push_back(fasta.get()); } // Construct the graph. constructor.construct_graph(fasta_pointers, vcf_pointers, callback); // NB: If you worry about "still reachable but possibly lost" warnings in valgrind, // this would free all the memory used by protobuf: //ShutdownProtobufLibrary(); return 0; }
int main (int argc, char** argv) { double snp_mutation_rate = 0.001; double indel_mutation_rate = 0.0001; double het_rate = 0.5; double afs_alpha = 1; double indel_alpha = 3; double microsatellite_afs_alpha = 1; double microsatellite_len_alpha = 1.7; double microsatellite_mutation_rate = 0.0001; double mnp_ratio = 0.01; double tstv_ratio = 2.5; double deamination_ratio = 1.8; int microsatellite_min_length = 1; int indel_max = 1000; int ploidy = 1; int population_size = 1; int sample_id_max_digits = 1; int seed = time(NULL); string fastaFileName; string file_prefix = ""; string sample_prefix = ""; bool dry_run = false; int repeat_size_max = 20; bool uniform_indel_distribution = false; double p, lambda, shape, mu, sigma; string command_line = argv[0]; for (int i = 1; i < argc; ++i) { command_line += " "; command_line += argv[i]; } int c; while (true) { static struct option long_options[] = { /* These options set a flag. */ //{"verbose", no_argument, &verbose_flag, 1}, //{"brief", no_argument, &verbose_flag, 0}, {"help", no_argument, 0, 'h'}, {"snp-rate", required_argument, 0, 's'}, {"mnp-ratio", required_argument, 0, 'M'}, {"indel-rate", required_argument, 0, 'i'}, {"indel-alpha", required_argument, 0, 'z'}, {"indel-max", required_argument, 0, 'X'}, {"repeat-size-max", required_argument, 0, 'q'}, {"microsat-rate", required_argument, 0, 'm'}, {"microsat-afs-alpha", required_argument, 0, 't'}, {"microsat-len-alpha", required_argument, 0, 'j'}, {"microsat-min-len", required_argument, 0, 'l'}, {"afs-alpha", required_argument, 0, 'a'}, {"ploidy", required_argument, 0, 'p'}, {"population-size", required_argument, 0, 'n'}, {"file-prefix", required_argument, 0, 'P'}, {"sample-prefix", required_argument, 0, 'S'}, {"random-seed", required_argument, 0, 'g'}, {"dry-run", no_argument, 0, 'd'}, {"uniform-indels", no_argument, 0, 'U'}, {"ts-tv-ratio", required_argument, 0, 'T'}, {"deamination-ratio", required_argument, 0, 'D'}, {0, 0, 0, 0} }; /* getopt_long stores the option index 
here. */ int option_index = 0; c = getopt_long (argc, argv, "hdUa:z:s:i:q:p:n:M:X:t:m:P:S:g:l:j:T:", long_options, &option_index); /* Detect the end of the options. */ if (c == -1) break; switch (c) { case 0: /* If this option set a flag, do nothing else now. */ if (long_options[option_index].flag != 0) break; printf ("option %s", long_options[option_index].name); if (optarg) printf (" with arg %s", optarg); printf ("\n"); break; case 'd': dry_run = true; break; case 'U': uniform_indel_distribution = true; break; case 'q': if (!convert(optarg, repeat_size_max)) { cerr << "could not read -q, --repeat-size-max" << endl; exit(1); } break; case 's': if (!convert(optarg, snp_mutation_rate)) { cerr << "could not read -s, --snp-rate" << endl; exit(1); } break; case 'i': if (!convert(optarg, indel_mutation_rate)) { cerr << "could not read -i, --indel-rate" << endl; exit(1); } break; case 'a': if (!convert(optarg, afs_alpha)) { cerr << "could not read -a, --afs-alpha" << endl; exit(1); } break; case 'z': if (!convert(optarg, indel_alpha)) { cerr << "could not read -z, --indel-alpha" << endl; exit(1); } break; case 'X': if (!convert(optarg, indel_max)) { cerr << "could not read -M, --indel-max" << endl; exit(1); } break; case 'M': if (!convert(optarg, mnp_ratio)) { cerr << "could not read -m, --mnp-ratio" << endl; exit(1); } break; case 'm': if (!convert(optarg, microsatellite_mutation_rate)) { cerr << "could not read -m, --microsat-rate" << endl; exit(1); } break; case 'T': if (!convert(optarg, tstv_ratio)) { cerr << "could not read -T, --ts-tv-ratio" << endl; exit(1); } break; case 't': if (!convert(optarg, microsatellite_afs_alpha)) { cerr << "could not read -m, --microsatellite-afs-alpha" << endl; exit(1); } break; case 'j': if (!convert(optarg, microsatellite_len_alpha)) { cerr << "could not read -m, --microsatellite-len-alpha" << endl; exit(1); } break; case 'l': if (!convert(optarg, microsatellite_min_length)) { cerr << "could not read -l, --microsat-min-len" << endl; 
exit(1); } break; case 'p': if (!convert(optarg, ploidy)) { cerr << "could not read -p, --ploidy" << endl; exit(1); } break; case 'P': file_prefix = optarg; break; case 'S': sample_prefix = optarg; break; case 'n': if (!convert(optarg, population_size)) { cerr << "could not read -n, --population-size" << endl; exit(1); } sample_id_max_digits = strlen(optarg); break; case 'g': if (!convert(optarg, seed)) { cerr << "could not read -g, --random-seed" << endl; exit(1); } break; case 'h': printSummary(); exit(0); break; case '?': /* getopt_long already printed an error message. */ printSummary(); exit(1); break; default: abort (); } } /* Print any remaining command line arguments (not options). */ if (optind < argc) { //cerr << "fasta file: " << argv[optind] << endl; fastaFileName = argv[optind]; } else { cerr << "please specify a fasta file" << endl; printSummary(); exit(1); } init_genrand(seed); // seed mt with current time //mt19937 eng(seed); int bpPerHaplotypeMean = 1000; double bpPerHaplotypeSigma = 200; normal_distribution<double> normal(mu, sigma); //lambda = 7.0; //poisson_distribution<int> poisson(lambda); //poisson(eng); string seqname; string sequence; // holds sequence so we can process it FastaReference fr; fr.open(fastaFileName); string bases = "ATGC"; vcf::VariantCallFile vcfFile; // write the VCF header stringstream headerss; headerss << "##fileformat=VCFv4.1" << endl << "##fileDate=" << dateStr() << endl << "##source=mutatrix population genome simulator" << endl << "##seed=" << seed << endl << "##reference=" << fastaFileName << endl << "##phasing=true" << endl << "##commandline=" << command_line << endl << "##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Alternate allele count\">" << endl << "##INFO=<ID=TYPE,Number=A,Type=String,Description=\"Type of each allele (snp, ins, del, mnp, complex)\">" << endl << "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of samples at the site\">" << endl << 
"##INFO=<ID=NA,Number=1,Type=Integer,Description=\"Number of alternate alleles\">" << endl << "##INFO=<ID=LEN,Number=A,Type=Integer,Description=\"Length of each alternate allele\">" << endl << "##INFO=<ID=MICROSAT,Number=0,Type=Flag,Description=\"Generated at a sequence repeat loci\">" << endl << "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">" << endl << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"; vector<string> samples; for (int i = 0; i < population_size; ++i) { stringstream sampless; sampless << sample_prefix << setfill('0') << setw(sample_id_max_digits) << i + 1; // one-based sample names samples.push_back(sampless.str()); headerss << "\t" << sampless.str(); } // and set up our VCF output file string header = headerss.str(); vcfFile.openForOutput(header); cout << vcfFile.header << endl; int copies = ploidy * population_size; map<string, vector<SampleFastaFile*> > sequencesByRefseq; if (!dry_run) { for (FastaIndex::iterator s = fr.index->begin(); s != fr.index->end(); ++s) { FastaIndexEntry& indexEntry = s->second; seqname = indexEntry.name; vector<SampleFastaFile*>& sequences = sequencesByRefseq[seqname]; for (int i = 0; i < population_size; ++i) { stringstream sname; sname << sample_prefix << setfill('0') << setw(sample_id_max_digits) << i + 1; string samplename = sname.str(); for (int j = 0; j < ploidy; ++j) { stringstream cname; cname << j; string chromname = cname.str(); string fullname = samplename + ":" + seqname + ":" + chromname; string filename = file_prefix + fullname + ".fa"; //sequences.push_back(SampleFastaFile(filename, seqname)); sequences.push_back(new SampleFastaFile(filename, seqname)); } } } } for (FastaIndex::iterator s = fr.index->begin(); s != fr.index->end(); ++s) { FastaIndexEntry& indexEntry = s->second; seqname = indexEntry.name; sequence = fr.getSequence(s->first); vector<SampleFastaFile*>& sequences = sequencesByRefseq[seqname]; //sequences.resize(copies); long int pos = 0; long int 
microsatellite_end_pos = 0; while (pos < sequence.size()) { //cout << pos + 1 << " microsat end pos " << microsatellite_end_pos << endl; string ref = sequence.substr(pos, 1); // by default, ref is just the current base // skip non-DNA sequence information if (!(ref == "A" || ref == "T" || ref == "C" || ref == "G")) { pos += ref.size(); for (vector<SampleFastaFile*>::iterator s = sequences.begin(); s != sequences.end(); ++s) { (*s)->write(ref); } continue; } vector<Allele> alleles; // establish if we are in a repeat // and what motif is being repeated, how many times int len = 1; // get reference repeats // if we have a repeat, adjust the mutation rate // using length and direction-dependent // formula from "Likelihood-Based Estimation of Microsatellite Mutation Rates" // http://www.genetics.org/cgi/content/full/164/2/781#T1 if (pos > microsatellite_end_pos) { map<string, int> repeats = repeatCounts(pos + 1, (const string&) sequence, repeat_size_max); string seq; int repeat_count = 0; // get the "biggest" repeat, the most likely ms allele at this site for (map<string, int>::iterator r = repeats.begin(); r != repeats.end(); ++r) { if (repeat_count < r->second) { repeat_count = r->second; seq = r->first; } } //cout << pos + 1 << " " << sequence.substr(pos + 1, seq.size() * repeat_count) << " ?= " << seq * repeat_count << endl; // guard ensures that we are in a pure repeat situoation, tandem-tandem repeats are not handled presently if (repeats.size() > 0 && sequence.substr(pos + 1, seq.size() * repeat_count) == seq * repeat_count) { int microsatellite_length = repeat_count * seq.size(); // record end of microsatellite so we don't generate more mutations until we pass it microsatellite_end_pos = pos + microsatellite_length - 1; if (microsatellite_length > microsatellite_min_length //&& genrand_real1() / copies // < microsatellite_mutation_rate * repeat_count) { && genrand_real1() > pow(1 - (microsatellite_mutation_rate * repeat_count), log(copies) * 2)) { // establish 
the relative rate of ins and del events /* long double repeatMutationDelProbability = microsatelliteDelProb(repeat_count); long double repeatMutationInsProbability = microsatelliteInsProb(repeat_count); long double indel_balance = 1; if (repeatMutationInsProbability > repeatMutationDelProbability) { indel_balance = repeatMutationInsProbability / repeatMutationDelProbability; } else { indel_balance = 1 - (repeatMutationInsProbability / repeatMutationDelProbability); } */ double indel_balance = 0.5; // how many alleles at the site? //int numalleles = min((int) floor(zetarandom(microsatellite_afs_alpha)), (int) ((double) repeat_count * indel_balance)); int numalleles = random_allele_frequency(repeat_count, microsatellite_afs_alpha); //cout << "repeat_count: " << repeat_count << " numalleles: " << numalleles << endl; map<int, bool> allele_lengths; // lengths of the alleles while (allele_lengths.size() < numalleles) { int allele_length; // TODO adjust length so that shorter events are more likely... 
if (genrand_real1() > indel_balance) { allele_length = -1 * min((int) floor(zetarandom(microsatellite_len_alpha)), repeat_count); } else { allele_length = min((int) floor(zetarandom(microsatellite_len_alpha)), repeat_count); } //cout << allele_length << endl; map<int, bool>::iterator f = allele_lengths.find(allele_length); if (f == allele_lengths.end()) { allele_lengths[allele_length] = true; } } // generate alleles for (map<int, bool>::iterator f = allele_lengths.begin(); f != allele_lengths.end(); ++f) { int allele_length = f->first; int c = abs(f->first); string alt = seq; for (int i = 1; i < c; ++i) alt += seq; if (allele_length > 0) { alleles.push_back(Allele(ref, ref + alt, "MICROSAT")); } else { alleles.push_back(Allele(ref + alt, ref, "MICROSAT")); } //cout << pos + 1 << " " << microsatellite_length << " " << alleles.back() << endl; } //cout << "alleles.size() == " << alleles.size() << endl; } } } // snp case if (genrand_real1() > pow(1 - snp_mutation_rate, log(max(copies, 2)) * 2)) { // make an alternate allele /* string alt = ref; while (alt == ref) { alt = string(1, bases.at(genrand_int32() % 4)); } */ string alt = ref; if (genrand_real1() > 1 / (1 + tstv_ratio)) { if (ref == "A") { alt = "G"; } else if (ref == "G") { alt = "A"; } else if (ref == "C") { alt = "T"; } else if (ref == "T") { alt = "C"; } } else { while (alt == ref || isTransition(ref, alt)) { alt = string(1, bases.at(genrand_int32() % 4)); } } if (genrand_real1() < mnp_ratio) { int i = 1; do { ref += sequence.substr(pos + i, 1); alt += sequence.substr(pos + i, 1); ++i; while (alt.at(alt.size() - 1) == ref.at(ref.size() - 1)) { alt.at(alt.size() - 1) = bases.at(genrand_int32() % 4); } } while (genrand_real1() < mnp_ratio); len = alt.size(); } alleles.push_back(Allele(ref, alt)); } // indel case if (genrand_real1() > pow(1 - indel_mutation_rate, log(max(copies, 2)) * 2)) { // how many bp? 
if (uniform_indel_distribution) { len = (int) floor(genrand_real1() * indel_max); } else { len = (int) floor(zetarandom(indel_alpha)); } // guard against out-of-sequence indels if (pos + len < sequence.size() && len <= indel_max) { if (genrand_int32() % 2 == 0) { // deletion alleles.push_back(Allele(sequence.substr(pos, 1 + len), sequence.substr(pos, 1))); } else { string alt = ref; // insertion? // insert some random de novo bases while (alt.length() < len + 1) { alt += string(1, bases.at(genrand_int32() % 4)); } alleles.push_back(Allele(ref, alt)); } } else { // fall through } } // no mutation generated if (alleles.empty()) { for (int i = 0; i < copies; ++i) { if (!dry_run) { sequences.at(i)->write(ref); } } pos += ref.size(); } else { // TODO randomly distribute all the alleles throughout the population // generate allele frequencies for each // fun times... string genotype; vector<bool> alts; random_shuffle(alleles.begin(), alleles.end()); vector<Allele*> population_alleles; list<Allele> present_alleles; // filtered for AFS > 0 in the sample // AFS simulation int remaining_copies = copies; while (remaining_copies > 0 && !alleles.empty()) { Allele allele = alleles.back(); alleles.pop_back(); int allele_freq = random_allele_frequency(remaining_copies, afs_alpha); if (allele_freq > 0) { present_alleles.push_back(allele); Allele* allelePtr = &present_alleles.back(); for (int i = 0; i < allele_freq; ++i) { population_alleles.push_back(allelePtr); } remaining_copies -= allele_freq; } } if (present_alleles.empty()) { for (int i = 0; i < copies; ++i) { if (!dry_run) { sequences.at(i)->write(ref); } } pos += ref.size(); continue; } reverse(present_alleles.begin(), present_alleles.end()); // establish the correct reference sequence and alternate allele set for (list<Allele>::iterator a = present_alleles.begin(); a != present_alleles.end(); ++a) { Allele& allele = *a; //cout << allele << endl; if (allele.ref.size() > ref.size()) { ref = allele.ref; } } // reference 
alleles take up the rest Allele reference_allele = Allele(ref, ref); for (int i = 0; i < remaining_copies; ++i) { population_alleles.push_back(&reference_allele); } vector<string> altstrs; // now the reference allele is the largest possible, adjust the alt allele strings to reflect this // if we have indels, add the base before, set the position back one for (list<Allele>::iterator a = present_alleles.begin(); a != present_alleles.end(); ++a) { Allele& allele = *a; string alleleStr = ref; if (allele.ref.size() == allele.alt.size()) { alleleStr.replace(0, allele.alt.size(), allele.alt); } else { alleleStr.replace(0, allele.ref.size(), allele.alt); } allele.ref = ref; allele.alt = alleleStr; altstrs.push_back(alleleStr); } assert(population_alleles.size() == copies); // shuffle the alleles around the population random_shuffle(population_alleles.begin(), population_alleles.end()); vcf::Variant var(vcfFile); var.sequenceName = seqname; var.position = pos + 1; var.quality = 99; var.id = "."; var.filter = "."; var.info["NS"].push_back(convert(population_size)); var.info["NA"].push_back(convert(present_alleles.size())); var.format.push_back("GT"); var.ref = ref; var.alt = altstrs; // debugging, uncomment to see sequence context //cout << sequence.substr(pos - 10, 10) << "*" << ref << "*" << sequence.substr(pos + 1, 9) << endl; map<string, int> alleleIndexes; alleleIndexes[convert(reference_allele)] = 0; // XXX should we handle this differently, by adding the reference allele to present_alleles? 
int i = 1; for (list<Allele>::iterator a = present_alleles.begin(); a != present_alleles.end(); ++a, ++i) { Allele& allele = *a; //cout << allele << " " << i << endl; alleleIndexes[convert(allele)] = i; //cout << allele << " " << i << endl; } //for (map<string, int>::iterator a = alleleIndexes.begin(); a != alleleIndexes.end(); ++a) { // cout << a->first << " = " << a->second << endl; //} int j = 0; for (vector<string>::iterator s = samples.begin(); s != samples.end(); ++s, ++j) { string& sample = *s; vector<string> genotype; // XXX hack, maybe this should get stored in another map for easier access? for (int i = 0; i < ploidy; ++i) { int l = (j * ploidy) + i; //cout << l << " " << population_alleles.at(l) << " " << alleleIndexes[convert(population_alleles.at(l))] << endl; genotype.push_back(convert(alleleIndexes[convert(*population_alleles.at(l))])); } var.samples[sample]["GT"].push_back(join(genotype, "|")); //cout << var.samples[sample]["GT"].front() << endl; } // XXX THIS IS BROKEN BECAUSE YOUR REFERENCE ALLELE CHANGES // LENGTH WITH DELETIONS. // // IT'S POSSIBLE TO GET COMPLEX ALLELES AT THE INTERSECTIONS // BETWEEN ONE ALLELIC VARIANT AND ANOTHER. THIS IS BROKEN! 
// // TO FIX--- BUILD HAPLOTYPES, THEN DISTRIBUTE THEM WITHIN THE POPULATION // // now write out our sequence data (FASTA files) for (int j = 0; j < population_size; ++j) { for (int i = 0; i < ploidy; ++i) { int l = (j * ploidy) + i; Allele* allele = population_alleles.at(l); if (!dry_run) { sequences.at(l)->write(allele->alt); } } } // tabulate allele frequency, and write some details to the VCF for (list<Allele>::iterator a = present_alleles.begin(); a != present_alleles.end(); ++a) { Allele& allele = *a; Allele* allelePtr = &*a; vector<string> genotypes; genotypes.resize(population_size); int allele_freq = 0; // obtain allele frequencies and output FASTA sequence data // for each simulated sample for (int j = 0; j < population_size; ++j) { for (int i = 0; i < ploidy; ++i) { int l = (j * ploidy) + i; if (population_alleles.at(l) == allelePtr) { ++allele_freq; } } } // set up the allele-specific INFO fields in the VCF record var.info["AC"].push_back(convert(allele_freq)); int delta = allele.alt.size() - allele.ref.size(); if (delta == 0) { if (allele.ref.size() == 1) { var.info["TYPE"].push_back("snp"); var.info["LEN"].push_back(convert(allele.ref.size())); } else { var.info["TYPE"].push_back("mnp");; var.info["LEN"].push_back(convert(allele.ref.size())); } } else if (delta > 0) { var.info["TYPE"].push_back("ins");; var.info["LEN"].push_back(convert(abs(delta))); } else { var.info["TYPE"].push_back("del");; var.info["LEN"].push_back(convert(abs(delta))); } if (!allele.type.empty()) { var.infoFlags[allele.type] = true; } } // write the VCF record to stdout cout << var << endl; int largest_ref = 1; // enforce one pos for (list<Allele>::iterator a = present_alleles.begin(); a != present_alleles.end(); ++a) { if (a->ref.size() > largest_ref) { largest_ref = a->ref.size(); } } pos += largest_ref; // step by the size of the last event } } } // close, clean up files for (map<string, vector<SampleFastaFile*> >::iterator s = sequencesByRefseq.begin(); s != 
sequencesByRefseq.end(); ++s) { vector<SampleFastaFile*>& files = s->second; for (vector<SampleFastaFile*>::iterator f = files.begin(); f != files.end(); ++f) { delete *f; } files.clear(); } return 0; }
// Realign BAM alignments read from stdin against a variant graph (DAG)
// built from the reference FASTA and the variants in params.vcf_file.
//
// For each alignment:
//   - rebuild the DAG when the read is on a new reference sequence or has
//     moved past 3/4 of the current DAG window
//   - if shouldRealign() says so, align the read to the DAG with gswalign()
//   - keep the new alignment only if it passes the acceptance test below,
//     otherwise restore the original alignment
//   - push the read onto a position-keyed queue so output can be re-sorted
//     while streaming (realignment can shift positions)
//
// Parameters read from params: fasta_reference, vcf_file, dag_window_size,
// match, mism, flat_input_vcf, flatten_flank, dry_run, debug, display_dag,
// unsorted_output, only_realigned.
//
// Output: alignments are written to stdout as BAM unless params.dry_run is
// set, in which case the read, graph mapping, score, strand, position and
// flattened CIGAR of each realigned read are printed to stdout instead.
//
// Exits with status 1 if stdin/stdout cannot be opened, if no VCF file was
// given, or if the VCF file cannot be opened.
void realign_bam(Parameters& params) {

    FastaReference reference;
    reference.open(params.fasta_reference);

    int dag_window_size = params.dag_window_size;

    // open BAM file
    BamReader reader;
    if (!reader.Open("stdin")) {
        cerr << "could not open stdin for reading" << endl;
        exit(1);
    }

    BamWriter writer;
    if (!params.dry_run && !writer.Open("stdout", reader.GetHeaderText(), reader.GetReferenceData())) {
        cerr << "could not open stdout for writing" << endl;
        exit(1);
    }

    // store the names of all the reference sequences in the BAM file
    map<int, string> referenceIDToName;
    vector<RefData> referenceSequences = reader.GetReferenceData();
    int i = 0;
    for (RefVector::iterator r = referenceSequences.begin(); r != referenceSequences.end(); ++r) {
        referenceIDToName[i] = r->RefName;
        ++i;
    }

    vcf::VariantCallFile vcffile;
    if (!params.vcf_file.empty()) {
        if (!vcffile.open(params.vcf_file)) {
            cerr << "could not open VCF file " << params.vcf_file << endl;
            exit(1);
        }
    } else {
        cerr << "realignment requires VCF file" << endl;
        exit(1);
    }

    vcf::Variant var(vcffile);

    BamAlignment alignment;
    // alignments waiting to be written, keyed by (possibly updated) position
    map<long int, vector<BamAlignment> > alignmentSortQueue;

    long int dag_start_position = 0;
    string currentSeqname;
    string ref;
    ReferenceMappings ref_map;
    gssw_graph* graph = gssw_graph_create(0);
    int8_t* nt_table = gssw_create_nt_table();
    int8_t* mat = gssw_create_score_matrix(params.match, params.mism);

    int total_reads = 0;
    int total_realigned = 0;
    int total_improved = 0;
    bool emptyDAG = false; // if the dag is constructed over empty sequence
                           // such as when realigning reads mapped to all-N sequence

    if (params.debug) {
        cerr << "about to start processing alignments" << endl;
    }

    while (reader.GetNextAlignment(alignment)) {

        string& seqname = referenceIDToName[alignment.RefID];

        if (params.debug) {
            cerr << "--------------------------------------------" << endl
                 << "processing alignment " << alignment.Name << " at "
                 << seqname << ":" << alignment.Position << endl;
        }

        ++total_reads;

        BamAlignment originalAlignment = alignment;
        long unsigned int initialAlignmentPosition = alignment.Position;

        // should we construct a new DAG?  do so when 3/4 of the way through the current one
        // center on current position + 1/2 dag window
        // TODO check this scheme using some scribbles on paper
        if ((seqname != currentSeqname
             || ((alignment.Position + (alignment.QueryBases.size()/2)
                  > (3*dag_window_size/4) + dag_start_position)))
            && alignment.Position < reference.sequenceLength(seqname)) {

            if (seqname != currentSeqname) {
                if (params.debug) {
                    cerr << "switched ref seqs" << endl;
                }
                // recenter DAG
                dag_start_position = max((long int) 0, (long int) (alignment.GetEndPosition() - dag_window_size/2));
            } else if (!ref_map.empty()) {
                dag_start_position = dag_start_position + dag_window_size/2;
                dag_start_position = max(dag_start_position, (long int) (alignment.GetEndPosition() - dag_window_size/2));
            } else {
                dag_start_position = alignment.Position - dag_window_size/2;
            }
            dag_start_position = max((long int)0, dag_start_position);

            // TODO get sequence length and use to bound noted window size (edge case)

            // get variants for new DAG
            vector<vcf::Variant> variants;
            if (!vcffile.setRegion(seqname,
                                   dag_start_position + 1,
                                   dag_start_position + dag_window_size)) {
                // this is not necessarily an error; there should be a better way to check for VCF file validity
            } else {

                // check first variant: back the window start off until the
                // first variant no longer sits at (or before) the window start
                if (vcffile.getNextVariant(var)) {
                    while (var.position <= dag_start_position + 1) {
                        dag_start_position -= 1;
                        vcffile.setRegion(seqname,
                                          dag_start_position + 1,
                                          dag_start_position + dag_window_size);
                        if (!vcffile.getNextVariant(var)) {
                            break;
                        }
                    }
                }

                // rewind to the start of the (possibly adjusted) window and
                // collect every variant that fits entirely inside it
                vcffile.setRegion(seqname,
                                  dag_start_position + 1,
                                  dag_start_position + dag_window_size);
                while (vcffile.getNextVariant(var)) {
                    if (params.debug) cerr << "getting variant at " << var.sequenceName << ":" << var.position << endl;
                    if (var.position + var.ref.length() <= dag_start_position + dag_window_size
                        && var.position >= dag_start_position) {
                        variants.push_back(var);
                    }
                }

            }

            ref = reference.getSubSequence(seqname,
                                           max((long int) 0, dag_start_position),
                                           dag_window_size); // 0/1 conversion

            // clear graph and metadata
            ref_map.clear();
            gssw_graph_destroy(graph);

            if (params.debug) {
                cerr << "constructing DAG" << endl;
            }

            // and build the DAG
            graph = gssw_graph_create(0);
            constructDAGProgressive(graph,
                                    ref_map,
                                    ref,
                                    seqname,
                                    variants,
                                    dag_start_position,
                                    nt_table,
                                    mat,
                                    params.flat_input_vcf);

            if (params.debug) {
                cerr << "graph has " << graph->size << " nodes" << endl;
                cerr << "DAG generated from input variants over "
                     << seqname << ":" << dag_start_position << "-" << dag_start_position + dag_window_size
                     << endl;
            }
            if (params.display_dag) {
                gssw_graph_print(graph);
            }

            // explicit grouping; same evaluation as the unparenthesised form
            if ((graph->size == 1 && allN(ref)) || graph->size == 0) {
                if (params.debug) {
                    cerr << "DAG is empty (1 node, all N). Alignment is irrelevant." << endl;
                }
                emptyDAG = true;
            } else {
                emptyDAG = false;
            }

        }

        AlignmentStats stats_before;
        bool was_mapped = alignment.IsMapped();
        bool has_realigned = false;
        if (was_mapped) {
            // the read runs past the DAG window: extend the reference string to cover it
            if (dag_start_position + dag_window_size < alignment.GetEndPosition()) {
                ref = reference.getSubSequence(seqname,
                                               max((long int) 0, dag_start_position),
                                               alignment.GetEndPosition() - dag_start_position); // 0/1 conversion
            }
        }

        if (params.debug) {
            if (emptyDAG) {
                cerr << "cannot realign against empty (all-N single node) graph" << endl;
            }
        }

        if (!emptyDAG && shouldRealign(alignment, ref, dag_start_position, params, stats_before)) {

            ++total_realigned;

            if (params.debug) {
                cerr << "realigning: " << alignment.Name
                     << " " << alignment.QueryBases << endl
                     << " aligned @ " << alignment.Position
                     << " to variant graph over "
                     << seqname
                     << ":" << dag_start_position
                     << "-" << dag_start_position + dag_window_size << endl;
            }

            try {

                Cigar flat_cigar;
                string read = alignment.QueryBases;
                string qualities = alignment.Qualities;
                int score;
                long int position;
                string strand;
                gssw_graph_mapping* gm = gswalign(graph, ref_map, read, qualities, params,
                                                  position, score, flat_cigar, strand, nt_table, mat);

                if (params.dry_run) {

                    if (strand == "-" && !alignment.IsMapped()) {
                        read = reverseComplement(read);
                    }
                    cout << read << endl;
                    cout << graph_mapping_to_string(gm) << endl;
                    cout << score << " " << strand << " "
                         << position << " "
                         << flat_cigar << endl;

                    // the report above is the last use of the mapping
                    gssw_graph_mapping_destroy(gm);

                } else {

                    // the mapping is only needed for the dry-run report; release it
                    // before any of the work below has a chance to throw
                    gssw_graph_mapping_destroy(gm);

                    // TODO the qualities are not on the right side of the read
                    if (strand == "-" && alignment.IsMapped()) { // if we're realigning, this is always true unless we swapped strands
                        alignment.SetIsReverseStrand(true);
                    }
                    alignment.QueryBases = read;
                    alignment.Qualities = qualities;

                    alignment.Position = position;
                    alignment.SetIsMapped(true);
                    if (!alignment.MapQuality) {
                        alignment.MapQuality = 20; // horrible hack... at least approximate with alignment mismatches against graph
                    }

                    // check if somehow we've ended up with an indel at the ends
                    // if so, grab the reference sequence right beyond it and add
                    // a match of params.flatten_flank bases to the cigar, allowing
                    // variant detection methods to run on the results without
                    // internal modification
                    Cigar& cigar = flat_cigar;
                    int flankSize = params.flatten_flank;
                    if (cigar.front().isIndel() ||
                        (cigar.front().isSoftclip()
                         && cigar.at(1).isIndel())) {
                        alignment.Position -= flankSize;
                        string refBase = reference.getSubSequence(seqname, alignment.Position, flankSize);
                        if (cigar.front().isSoftclip()) {
                            alignment.QueryBases.erase(alignment.QueryBases.begin(),
                                                       alignment.QueryBases.begin()+cigar.front().length);
                            alignment.Qualities.erase(alignment.Qualities.begin(),
                                                      alignment.Qualities.begin()+cigar.front().length);
                            cigar.erase(cigar.begin());
                        }
                        alignment.QueryBases.insert(0, refBase);
                        alignment.Qualities.insert(0, string(flankSize, shortInt2QualityChar(30)));
                        Cigar newCigar;
                        newCigar.push_back(CigarElement(flankSize, 'M'));
                        newCigar.append(flat_cigar);
                        flat_cigar = newCigar;
                    }
                    if (cigar.back().isIndel() ||
                        (cigar.back().isSoftclip()
                         && cigar.at(cigar.size()-2).isIndel())) {
                        string refBase = reference.getSubSequence(seqname,
                                                                  alignment.Position
                                                                  + flat_cigar.refLen(),
                                                                  flankSize);
                        if (cigar.back().isSoftclip()) {
                            alignment.QueryBases.erase(alignment.QueryBases.end()-cigar.back().length,
                                                       alignment.QueryBases.end());
                            alignment.Qualities.erase(alignment.Qualities.end()-cigar.back().length,
                                                      alignment.Qualities.end());
                            cigar.pop_back();
                        }
                        Cigar newCigar;
                        newCigar.push_back(CigarElement(flankSize, 'M'));
                        flat_cigar.append(newCigar);
                        alignment.QueryBases.append(refBase);
                        alignment.Qualities.append(string(flankSize, shortInt2QualityChar(30)));
                    }

                    flat_cigar.toCigarData(alignment.CigarData);

                    if (dag_start_position + dag_window_size < alignment.GetEndPosition()) {
                        ref = reference.getSubSequence(seqname,
                                                       max((long int) 0, dag_start_position),
                                                       alignment.GetEndPosition() - dag_start_position); // 0/1 conversion
                    }

                    AlignmentStats stats_after;
                    countMismatchesAndGaps(alignment, flat_cigar, ref, dag_start_position, stats_after, params.debug);

                    // the new alignment is a candidate if...
                    bool candidate =
                        !was_mapped  // it wasn't mapped previously
                        // or if we have not increased soft clips + mismatches (per quality)
                        || ((stats_before.softclip_qsum + stats_before.mismatch_qsum
                             >= stats_after.softclip_qsum + stats_after.mismatch_qsum)
                            // and if we have added gaps, we have added them to remove mismatches or softclips
                            && (stats_before.gaps >= stats_after.gaps // accept any time we reduce gaps while not increasing softclips/mismatches
                                || (stats_before.gaps < stats_after.gaps // and allow gap increases when they improve the alignment
                                    && (stats_before.softclip_qsum + stats_before.mismatch_qsum
                                        >
                                        stats_after.softclip_qsum + stats_after.mismatch_qsum))));

                    // ...and in every case the alignment must not have more than the
                    // acceptable number of gaps, softclips, or mismatches as provided
                    // in input parameters.  Written as "a || b && c" this check was
                    // skipped for previously unmapped reads because && binds tighter
                    // than ||; the grouping is now explicit.
                    if (candidate
                        && acceptRealignment(alignment, ref, dag_start_position, params, stats_after)) {

                        // keep the alignment
                        // TODO require threshold of softclips to keep alignment (or count of gaps, mismatches,...)
                        if (params.debug) {
                            cerr << "realigned " << alignment.Name << " to graph, which it maps to with "
                                 << stats_after.mismatch_qsum << "q in mismatches and "
                                 << stats_after.softclip_qsum << "q in soft clips" << endl;
                        }
                        ++total_improved;
                        has_realigned = true;

                    } else {
                        // reset to old version of alignment
                        if (params.debug) {
                            cerr << "failed realignment of " << alignment.Name << " to graph, which it maps to with: "
                                 << stats_after.mismatch_qsum << "q in mismatches " << "(vs " << stats_before.mismatch_qsum << "q before), and "
                                 << stats_after.softclip_qsum << "q in soft clips " << "(vs " << stats_before.softclip_qsum << "q before) " << endl;
                        }
                        has_realigned = false;
                        alignment = originalAlignment;
                    }
                }
            } catch (...) {
                cerr << "exception when realigning " << alignment.Name
                     << " at position " << referenceIDToName[alignment.RefID]
                     << ":" << alignment.Position
                     << " " << alignment.QueryBases << endl;
                // reset to original alignment
                has_realigned = false;
                alignment = originalAlignment;
            }
        }

        // ensure correct order if alignments move
        long int maxOutputPos = initialAlignmentPosition - dag_window_size;
        // if we switched sequences we need to flush out all the reads from the previous one
        if (seqname != currentSeqname) {
            // so the max output position is set past the end of the last chromosome
            if (!currentSeqname.empty()) {
                maxOutputPos = reference.sequenceLength(currentSeqname) + dag_window_size;
            }
            currentSeqname = seqname;
        }

        if (!params.dry_run) {
            map<long int, vector<BamAlignment> >::iterator p = alignmentSortQueue.begin();
            for ( ; p != alignmentSortQueue.end(); ++p) {
                // except if we are running in unsorted mode, stop when we are at the window size
                if (!params.unsorted_output && p->first > maxOutputPos) {
                    break; // no more to do
                } else {
                    for (vector<BamAlignment>::iterator a = p->second.begin(); a != p->second.end(); ++a) {
                        writer.SaveAlignment(*a);
                    }
                }
            }
            // drop everything that was just written
            if (p != alignmentSortQueue.begin()) {
                alignmentSortQueue.erase(alignmentSortQueue.begin(), p);
            }
            if (!params.only_realigned || has_realigned) {
                alignmentSortQueue[alignment.Position].push_back(alignment);
            }
        }
    } // end GetNextAlignment loop

    // flush whatever is still queued
    if (!params.dry_run) {
        map<long int, vector<BamAlignment> >::iterator p = alignmentSortQueue.begin();
        for ( ; p != alignmentSortQueue.end(); ++p) {
            for (vector<BamAlignment>::iterator a = p->second.begin(); a != p->second.end(); ++a) {
                writer.SaveAlignment(*a);
            }
        }
    }

    gssw_graph_destroy(graph);
    free(nt_table);
    free(mat);

    reader.Close();
    writer.Close();

    if (params.debug) {
        cerr << "total reads:\t" << total_reads << endl;
        cerr << "realigned:\t" << total_realigned << endl;
        cerr << "improved:\t" << total_improved << endl;
    }
}
// one-off void construct_dag_and_align_single_sequence(Parameters& params) { if (params.debug) { cout << "read: " << params.read_input << endl; //cout << "fastq file:" << params.fastq_file << endl; cout << "fasta reference:" << params.fasta_reference << endl; cout << "vcf file " << params.vcf_file << endl; cout << "target " << params.target << endl; cout << endl; } // get sequence of target FastaReference reference; reference.open(params.fasta_reference); FastaRegion target(params.target); string targetSequence = reference.getTargetSubSequence(target); // get variants in target vector<vcf::Variant> variants; vcf::VariantCallFile vcffile; if (!params.vcf_file.empty()) { vcffile.open(params.vcf_file); vcf::Variant var(vcffile); vcffile.setRegion(params.target); while (vcffile.getNextVariant(var)) { if (var.position + var.ref.length() <= target.stopPos) { variants.push_back(var); } } } long offset = max(target.startPos, 1); // start is -1 when coordinates are not specified // Declare the target DAG to align against. 
//vector<Cigar> cigars; //vector<long int> refpositions; ReferenceMappings ref_map; gssw_graph* graph = gssw_graph_create(0); int8_t* nt_table = gssw_create_nt_table(); int8_t* mat = gssw_create_score_matrix(params.match, params.mism); constructDAGProgressive(graph, ref_map, targetSequence, target.startSeq, variants, offset, nt_table, mat, params.flat_input_vcf); if (params.display_dag) { cout << "DAG generated from input variants:" << endl; } // run the alignment string read = params.read_input; string qualities(read.size(), shortInt2QualityChar(30)); int score; long int position; string strand; Cigar flat_cigar; gssw_graph_mapping* gm = gswalign(graph, ref_map, read, qualities, params, position, score, flat_cigar, strand, nt_table, mat); cerr << graph_mapping_to_string(gm) << endl; gssw_graph_mapping_destroy(gm); /* cout << score << " " << strand << " " << (trace_report.node->position - 1) + trace_report.x << " " << trace_report.fcigar << " seq:" << trace_report.x << " read:" << trace_report.y << " " << trace_report.gcigar << " " << trace_report.fcigar << endl; if (params.display_alignment) { string refseq; for (vector<sn*>::iterator n = trace_report.node_list.begin(); n != trace_report.node_list.end(); ++n) { refseq.append((*n)->sequence); } refseq = refseq.substr(trace_report.x, read.size()); cout << refseq << endl; if (strand == "+") { cout << read << endl; } else { cout << reverseComplement(read) << endl; } } */ }
int main(int argc, char** argv) { int c; FastaReference reference; bool has_ref = false; bool suppress_output = false; bool debug = false; bool isuncompressed = true; int maxiterations = 50; if (argc < 2) { printUsage(argv); exit(1); } while (true) { static struct option long_options[] = { {"help", no_argument, 0, 'h'}, {"debug", no_argument, 0, 'd'}, {"fasta-reference", required_argument, 0, 'f'}, {"max-iterations", required_argument, 0, 'm'}, {"suppress-output", no_argument, 0, 's'}, {"compressed", no_argument, 0, 'c'}, {0, 0, 0, 0} }; int option_index = 0; c = getopt_long (argc, argv, "hdcsf:m:", long_options, &option_index); /* Detect the end of the options. */ if (c == -1) break; switch (c) { case 'f': reference.open(optarg); // will exit on open failure has_ref = true; break; case 'm': maxiterations = atoi(optarg); break; case 'd': debug = true; break; case 's': suppress_output = true; break; case 'c': isuncompressed = false; break; case 'h': printUsage(argv); exit(0); break; case '?': printUsage(argv); exit(1); break; default: abort(); break; } } if (!has_ref) { cerr << "no FASTA reference provided, cannot realign" << endl; exit(1); } BAMSINGLEREADER reader; if (!reader.Open(STDIN)) { cerr << "could not open stdin for reading" << endl; exit(1); } #ifdef HAVE_BAMTOOLS BamWriter writer; if (isuncompressed) { writer.SetCompressionMode(BamWriter::Uncompressed); } if (!suppress_output && !writer.Open("stdout", reader.GetHeaderText(), reader.GetReferenceData())) { cerr << "could not open stdout for writing" << endl; exit(1); } #else SeqLib::BamWriter writer(isuncompressed ? 
SeqLib::SAM : SeqLib::BAM); SeqLib::BamHeader hdr = reader.Header(); if (hdr.isEmpty()) { cerr << "could not open header for input" << endl; exit(1); } writer.SetHeader(hdr); if (!suppress_output && !writer.Open("-")) { cerr << "could not open stdout for writing" << endl; exit(1); } #endif // store the names of all the reference sequences in the BAM file map<int, string> referenceIDToName; REFVEC referenceSequences = reader.GETREFDATA; int i = 0; for (REFVEC::iterator r = referenceSequences.begin(); r != referenceSequences.end(); ++r) { referenceIDToName[i] = r->REFNAME; ++i; } BAMALIGN alignment; while (GETNEXT(reader, alignment)) { DEBUG("--------------------------- read --------------------------" << endl); DEBUG("| " << referenceIDToName[alignment.REFID] << ":" << alignment.POSITION << endl); DEBUG("| " << alignment.QNAME << ":" << alignment.ENDPOSITION << endl); DEBUG("| " << alignment.QNAME << ":" << (alignment.ISMAPPED ? " mapped" : " unmapped") << endl); DEBUG("| " << alignment.QNAME << ":" << " cigar data size: " << alignment.GETCIGAR.size() << endl); DEBUG("--------------------------- realigned --------------------------" << endl); // skip unmapped alignments, as they cannot be left-realigned without CIGAR data if (alignment.ISMAPPED) { int endpos = alignment.ENDPOSITION; int length = endpos - alignment.POSITION + 1; if (alignment.POSITION >= 0 && length > 0) { if (!stablyLeftAlign(alignment, reference.getSubSequence( referenceIDToName[alignment.REFID], alignment.POSITION, length), maxiterations, debug)) { cerr << "unstable realignment of " << alignment.QNAME << " at " << referenceIDToName[alignment.REFID] << ":" << alignment.POSITION << endl << alignment.QUERYBASES << endl; } } } DEBUG("----------------------------------------------------------------" << endl); DEBUG(endl); if (!suppress_output) WRITEALIGNMENT(writer, alignment); } reader.Close(); if (!suppress_output) writer.Close(); return 0; }