int main(int argc, char** argv) { if (argc != 3) { cerr << "usage: " << argv[0] << " <vcf file> <vcf file>" << endl << "Adds info fields from the second file which are not present in the first vcf file." << endl; return 1; } string filenameA = argv[1]; string filenameB = argv[2]; if (filenameA == filenameB) { cerr << "it won't help to add info data from the same file!" << endl; return 1; } VariantCallFile variantFileA; if (filenameA == "-") { variantFileA.open(std::cin); } else { variantFileA.open(filenameA); } VariantCallFile variantFileB; if (filenameB == "-") { variantFileB.open(std::cin); } else { variantFileB.open(filenameB); } if (!variantFileA.is_open() || !variantFileB.is_open()) { return 1; } Variant varA(variantFileA); Variant varB(variantFileB); // while the first file doesn't match the second positionally, // step forward, annotating each genotype record with an empty genotype // when the two match, iterate through the genotypes from the first file // and get the genotypes reported in the second file variantFileA.getNextVariant(varA); variantFileB.getNextVariant(varB); variantFileA.header = unionInfoHeaderLines(variantFileA.header, variantFileB.header); cout << variantFileA.header << endl; do { while (!variantFileB.done() && (varB.sequenceName < varA.sequenceName || (varB.sequenceName == varA.sequenceName && varB.position < varA.position)) ) { variantFileB.getNextVariant(varB); } while (!variantFileA.done() && (varA.sequenceName < varB.sequenceName || (varA.sequenceName == varB.sequenceName && varA.position < varB.position)) ) { cout << varA << endl; variantFileA.getNextVariant(varA); } while (!variantFileB.done() && (varB.sequenceName < varA.sequenceName || (varB.sequenceName == varA.sequenceName && varB.position < varA.position)) ) { variantFileB.getNextVariant(varB); } while (!variantFileA.done() && varA.sequenceName == varB.sequenceName && varA.position == varB.position) { addInfo(varA, varB); cout << varA << endl; variantFileA.getNextVariant(varA); variantFileB.getNextVariant(varB); } } while (!variantFileA.done() && !variantFileB.done()); if (!variantFileA.done()) { cout << varA << endl; while (variantFileA.getNextVariant(varA)) { cout << varA << endl; } } return 0; }
int main(int argc, char** argv) { string vcfFileName; string fastaFileName; int windowsize = 30; if (argc == 1) printSummary(argv); int c; while (true) { static struct option long_options[] = { /* These options set a flag. */ //{"verbose", no_argument, &verbose_flag, 1}, {"help", no_argument, 0, 'h'}, {"window-size", required_argument, 0, 'w'}, {"reference", required_argument, 0, 'r'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; c = getopt_long (argc, argv, "hw:r:", long_options, &option_index); if (c == -1) break; switch (c) { case 'w': windowsize = atoi(optarg); break; case 'r': fastaFileName = string(optarg); break; case 'h': printSummary(argv); break; case '?': printSummary(argv); exit(1); break; default: abort (); } } VariantCallFile variantFile; string inputFilename; if (optind == argc - 1) { inputFilename = argv[optind]; variantFile.open(inputFilename); } else { variantFile.open(std::cin); } if (!variantFile.is_open()) { cerr << "could not open VCF file" << endl; exit(1); } FastaReference reference; if (fastaFileName.empty()) { cerr << "a reference is required for haplotype allele generation" << endl; exit(1); } reference.open(fastaFileName); // pattern // when variants are within windowSize from each other, build up local haplotypes // establish all the haplotypes which exist within the window using genotypes+allele#+position map // generate a haplotype allele string for each unique haplotype // for completeness retain phasing information in the genotypes // write a new VCF record in which there are haplotype alleles and correctly described genotypes for each sample // if the variants are outside of the windowSize, just write out the record Variant var(variantFile); Variant outputVar(variantFile); cout << variantFile.header << endl; // get the first distances vector<Variant> cluster; while (variantFile.getNextVariant(var) || !cluster.empty()) { bool haplotypeCluster = false; if (variantFile.done()) { if (cluster.size() >= 1) { haplotypeCluster = true; } else { cout << cluster.front() << endl; cluster.clear(); } } else if (isPhased(var)) { if (cluster.empty() || cluster.back().sequenceName == var.sequenceName && var.position - cluster.back().position + cluster.back().ref.size() - 1 <= windowsize) { cluster.push_back(var); } else { if (cluster.size() == 1) { cout << cluster.front() << endl; cluster.clear(); if (!variantFile.done()) { cluster.push_back(var); } } else { haplotypeCluster = true; } } } else { // not phased if (cluster.empty()) { cout << var << endl; } else if (cluster.size() == 1) { cout << cluster.front() << endl; cout << var << endl; } else { haplotypeCluster = true; } } // we need to deal with the current cluster, as our next var is outside of bounds // process the last cluster if it's more than 1 var if (haplotypeCluster) { /* cerr << "cluster: "; for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) { cerr << " " << v->position; } cerr << endl; */ // generate haplotype alleles and genotypes! // get the reference sequence across the haplotype in question string referenceHaplotype = reference.getSubSequence(cluster.front().sequenceName, cluster.front().position - 1, cluster.back().position + cluster.back().ref.size() - cluster.front().position); // establish what haplotypes there are by parsing the (phased) genotypes across the samples over these records map<string, vector<vector<int> > > sampleHaplotypes; for (vector<string>::iterator s = var.sampleNames.begin(); s != var.sampleNames.end(); ++s) { // build the haplotype using the genotype fields in the variant cluster // only build haplotypes for samples with complete information string& sampleName = *s; vector<vector<int> >& haplotypes = sampleHaplotypes[sampleName]; bool completeCoverage = true; // ensure complete genotype coverage over the haplotype cluster for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) { if (v->samples.find(sampleName) == v->samples.end() || v->samples[sampleName].find("GT") == v->samples[sampleName].end()) { completeCoverage = false; break; } } if (!completeCoverage) { continue; // skip samples without complete coverage } // what's the ploidy? { string& gt = cluster.front().samples[sampleName]["GT"].front(); vector<string> gtspec = split(gt, "|"); for (vector<string>::iterator g = gtspec.begin(); g != gtspec.end(); ++g) { vector<int> haplotype; haplotypes.push_back(haplotype); } } for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) { string& gt = v->samples[sampleName]["GT"].front(); vector<string> gtspec = split(gt, "|"); vector<string>::iterator g = gtspec.begin(); for (vector<vector<int> >::iterator h = haplotypes.begin(); h != haplotypes.end(); ++h, ++g) { int j; convert(*g, j); h->push_back(j); } } } set<vector<int> > uniqueHaplotypes; for (map<string, vector<vector<int> > >::iterator hs = sampleHaplotypes.begin(); hs != sampleHaplotypes.end(); ++hs) { vector<vector<int> >& haps = hs->second; for (vector<vector<int> >::iterator h = haps.begin(); h != haps.end(); ++h) { uniqueHaplotypes.insert(*h); } } // write new haplotypes map<vector<int>, string> haplotypeSeqs; map<vector<int>, int> haplotypeIndexes; map<int, string> alleles; int impossibleHaplotypes = 0; // always include the reference haplotype as 0 // when we come to it in the haplotypes, we'll ignore it int alleleIndex = 1; for (set<vector<int> >::iterator u = uniqueHaplotypes.begin(); u != uniqueHaplotypes.end(); ++u) { /* for (vector<int>::const_iterator z = u->begin(); z != u->end(); ++z) { cerr << *z; } cerr << endl; */ string haplotype = referenceHaplotype; bool isreference = true; bool impossibleHaplotype = false; int referenceInsertOffset = 0; int j = 0; // index into variant cluster int lastpos = 0; int lastrefend = 0; for (vector<int>::const_iterator z = u->begin(); z != u->end(); ++z, ++j) { int i = *z; if (i != 0) { isreference = false; Variant& vartoInsert = cluster.at(j); string& alternate = vartoInsert.alleles.at(i); if (vartoInsert.position < lastrefend) { cerr << "impossible haplotype, overlapping alleles at " << vartoInsert.sequenceName << ":" << vartoInsert.position << endl; impossibleHaplotype = true; break; } else { //cerr << vartoInsert.position << " " << cluster.front().position + referenceInsertOffset << endl; //cerr << "replacing " << vartoInsert.ref << " at " << vartoInsert.position - cluster.front().position + referenceInsertOffset << " with " << alternate << endl; haplotype.replace(vartoInsert.position - cluster.front().position + referenceInsertOffset, vartoInsert.ref.size(), alternate); if (alternate.size() != vartoInsert.ref.size()) { referenceInsertOffset += alternate.size() - vartoInsert.ref.size(); } lastpos = vartoInsert.position; lastrefend = vartoInsert.position + vartoInsert.ref.size(); } } } if (impossibleHaplotype) { ++impossibleHaplotypes; haplotypeIndexes[*u] = -1; // indicates impossible haplotype impossibleHaplotype = false; } else if (isreference) { alleles[0] = haplotype; haplotypeIndexes[*u] = 0; } else { alleles[alleleIndex] = haplotype; haplotypeIndexes[*u] = alleleIndex; ++alleleIndex; } haplotypeSeqs[*u] = haplotype; // if there's not a reference allele, add it if (alleles.find(0) == alleles.end()) { alleles[0] = referenceHaplotype; // nb, there is no reference haplotype among // the samples, so we don't have to add it to // the haplotypeIndexes } } outputVar.ref = alleles[0]; outputVar.alt.clear(); for (int i = 1; i < alleleIndex; ++i) { outputVar.alt.push_back(alleles[i]); } outputVar.sequenceName = cluster.front().sequenceName; outputVar.position = cluster.front().position; outputVar.filter = "."; outputVar.id = "."; outputVar.info = cluster.front().info; outputVar.samples.clear(); outputVar.format = cluster.front().format; // now the genotypes for (vector<string>::iterator s = var.sampleNames.begin(); s != var.sampleNames.end(); ++s) { string& sampleName = *s; vector<string> gt; vector<vector<int> > & hs = sampleHaplotypes[sampleName]; for (vector<vector<int> >::iterator h = hs.begin(); h != hs.end(); ++h) { int hi = haplotypeIndexes[*h]; if (hi != -1) { gt.push_back(convert(hi)); } else { // nonexistent or impossible haplotype gt.push_back("."); } } if (gt.size() != 0) { outputVar.samples[sampleName]["GT"].push_back(join(gt, "|")); } } if (cluster.size() - impossibleHaplotypes < 2) { for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) { cout << *v << endl; } } else { if (!outputVar.alt.empty()) { cout << outputVar << endl; } else { cerr << "no alternate alleles remain at " << outputVar.sequenceName << ":" << outputVar.position << " after haplotype validation" << endl; } } cluster.clear(); if (!variantFile.done()) cluster.push_back(var); } } exit(0); // why? return 0; }
int main(int argc, char** argv) { if (argc != 4) { cerr << "usage: " << argv[0] << " <annotation-tag> <vcf file> <vcf file>" << endl << "annotates genotypes in the first file with genotypes in the second" << endl << "adding the genotype as another flag to each sample filed in the first file." << endl << "annotation-tag is the name of the sample flag which is added to store the annotation." << endl << "also adds a 'has_variant' flag for sites where the second file has a variant." << endl; return 1; } string annotag = argv[1]; string filenameA = argv[2]; string filenameB = argv[3]; if (filenameA == filenameB) { cerr << "it won't help to annotate samples with their own genotypes!" << endl; return 1; } VariantCallFile variantFileA; if (filenameA == "-") { variantFileA.open(std::cin); } else { variantFileA.open(filenameA); } VariantCallFile variantFileB; if (filenameB == "-") { variantFileB.open(std::cin); } else { variantFileB.open(filenameB); } if (!variantFileA.is_open() || !variantFileB.is_open()) { return 1; } Variant varA(variantFileA); Variant varB(variantFileB); // while the first file doesn't match the second positionally, // step forward, annotating each genotype record with an empty genotype // when the two match, iterate through the genotypes from the first file // and get the genotypes reported in the second file variantFileA.getNextVariant(varA); variantFileB.getNextVariant(varB); string line = "##INFO=<ID=" + annotag + ".has_variant,Number=0,Type=Flag,Description=\"True if " + annotag + " has a called alternate among samples under comparison.\">"; variantFileA.addHeaderLine(line); line = "##FORMAT=<ID=" + annotag + ",Number=1,Type=String,Description=\"Genotype from " + annotag + ".\">"; variantFileA.addHeaderLine(line); cout << variantFileA.header << endl; do { // this is broken. to do it right, it'll be necessary to get reference ids from the fasta reference used to make the alignments... // if B is NOT done, and is less than A, read new B. if (!variantFileB.done() && (varB.sequenceName != varA.sequenceName || (varB.sequenceName == varA.sequenceName && varB.position < varA.position) || variantFileA.done()) ) { variantFileB.getNextVariant(varB); } // if A is not done- and A is less than B, read A. // should also read if variant B is done. if (!variantFileA.done() && (varA.sequenceName != varB.sequenceName || (varA.sequenceName == varB.sequenceName && varA.position < varB.position) || variantFileB.done()) ) { annotateWithBlankGenotypes(varA, annotag); cout << varA << endl; variantFileA.getNextVariant(varA); } vector<Variant> varsA; vector<Variant> varsB; bool hasMultipleAlts = false; long int thisPosition = 0; string thisSequenceName; if (varA.position == varB.position && varA.sequenceName == varB.sequenceName) { thisPosition = varA.position; thisSequenceName = varA.sequenceName; } while (!variantFileA.done() && !variantFileB.done() && thisPosition == varA.position && thisSequenceName == varA.sequenceName && varA.sequenceName == varB.sequenceName && varA.position == varB.position) { // accumulate all the alts at the current position varsA.push_back(varA); varsB.push_back(varB); if (varA.alt.size() > 1 || varB.alt.size() > 1) hasMultipleAlts = true; variantFileA.getNextVariant(varA); variantFileB.getNextVariant(varB); } // multiple lines per position if (!hasMultipleAlts && (varsA.size() > 1 || varsB.size() > 1)) { map<pair<string, string>, Variant> varsAParsed; map<pair<string, string>, Variant> varsBParsed; for (vector<Variant>::iterator v = varsA.begin(); v != varsA.end(); ++v) { varsAParsed[make_pair(v->ref, v->alt.front())] = *v; } for (vector<Variant>::iterator v = varsB.begin(); v != varsB.end(); ++v) { varsBParsed[make_pair(v->ref, v->alt.front())] = *v; } for (map<pair<string, string>, Variant>::iterator vs = varsAParsed.begin(); vs != varsAParsed.end(); ++vs) { Variant& varA = vs->second; annotateWithBlankGenotypes(varA, annotag); if (varsBParsed.find(make_pair(varA.ref, varA.alt.front())) != varsBParsed.end()) { Variant& varB = varsBParsed[make_pair(varA.ref, varA.alt.front())]; // TODO cleanup annotateWithGenotypes(varA, varB, annotag); varA.infoFlags[annotag + ".has_variant"] = true; } cout << varA << endl; } } else if (!varsA.empty() && !varsB.empty()) { // one line per multi-allelic Variant& varA = varsA.front(); annotateWithBlankGenotypes(varA, annotag); Variant& varB = varsB.front(); annotateWithGenotypes(varA, varB, annotag); // XXX TODO, and also allow for records with multiple alts // XXX assume that if the other file has a corresponding record, some kind of variation was detected at the same site varA.infoFlags[annotag + ".has_variant"] = true; cout << varA << endl; } else { for (vector<Variant>::iterator v = varsA.begin(); v != varsA.end(); ++v) { Variant& varA = *v; annotateWithBlankGenotypes(varA, annotag); cout << varA << endl; } } } while (!variantFileA.done() || !variantFileB.done()); return 0; }