int main(int argc, char** argv) { VariantCallFile variantFile; if (argc > 1) { string filename = argv[1];; } else {; } if (!variantFile.is_open()) { return 1; } variantFile.addHeaderLine("##FORMAT=<ID=SN,Number=1,Type=String,Description=\"The name of the sample.\">"); cout << variantFile.header << endl; Variant var(variantFile); while (variantFile.getNextVariant(var)) { var.format.push_back("SN"); for (map<string, map<string, vector<string> > >::iterator s = var.samples.begin(); s != var.samples.end(); ++s) { s->second["SN"].clear(); s->second["SN"].push_back(s->first); } cout << var << endl; } return 0; }
int main(int argc, char** argv) { if (argc != 2) { cerr << "usage: " << argv[0] << " <vcf file>" << endl << "outputs the het/hom ratio for each individual in the file" << endl; return 1; } string filename = argv[1]; VariantCallFile variantFile; if (filename == "-") {; } else {; } if (!variantFile.is_open()) { cerr << "could not open " << filename << endl; return 1; } map<string, unsigned int> hetCounts; map<string, unsigned int> homCounts; for (vector<string>::iterator s = variantFile.sampleNames.begin(); s != variantFile.sampleNames.end(); ++s) { hetCounts[*s] = 0; homCounts[*s] = 0; } Variant var(variantFile); while (variantFile.getNextVariant(var)) { //cout << var << endl; for (map<string, map<string, vector<string> > >::iterator s = var.samples.begin(); s != var.samples.end(); ++s) { string name = s->first; map<string, vector<string> >& sample = s->second; string& gt = sample["GT"].front(); map<int, int> genotype = decomposeGenotype(gt); if (isHet(genotype)) { ++hetCounts[name]; } else if (isHomNonRef(genotype)) { ++homCounts[name]; } } } for (vector<string>::iterator s = variantFile.sampleNames.begin(); s != variantFile.sampleNames.end(); ++s) { cout << (s == variantFile.sampleNames.begin() ? "" : "\t") << *s; } cout << endl; for (vector<string>::iterator s = variantFile.sampleNames.begin(); s != variantFile.sampleNames.end(); ++s) { cout << (s == variantFile.sampleNames.begin() ? "" : "\t") << (double) hetCounts[*s] / (double) homCounts[*s]; } cout << endl; return 0; }
int main(int argc, char** argv) { if (argc < 3) { cerr << "usage: " << argv[0] << " <vcf file> [FIELD1] [FIELD2] ..." << endl << "outputs each record in the vcf file, removing INFO fields not listed on the command line" << endl; return 1; } string filename = argv[1]; set<string> fieldsToKeep; for (int i = 2; i < argc; ++i) { fieldsToKeep.insert(argv[i]); } VariantCallFile variantFile; if (filename == "-") {; } else {; } if (!variantFile.is_open()) { return 1; } Variant var(variantFile); vector<string> fieldsToErase; vector<string> infoIds = variantFile.infoIds(); for (vector<string>::iterator i = infoIds.begin(); i != infoIds.end(); ++i) { if (!fieldsToKeep.count(*i)) { fieldsToErase.push_back(*i); variantFile.removeInfoHeaderLine(*i); } } // write the header cout << variantFile.header << endl; // print the records, filtering is done via the setting of varA's output sample names while (variantFile.getNextVariant(var)) { for (vector<string>::iterator f = fieldsToErase.begin(); f != fieldsToErase.end(); ++f) {*f); var.infoFlags.erase(*f); } cout << var << endl; } return 0; }
int main(int argc, char** argv) { if (argc < 3) { cerr << "usage: " << argv[0] << " <vcf file> [SAMPLE1] [SAMPLE2] ..." << endl << "outputs each record in the vcf file, removing samples listed on the command line" << endl; return 1; } string filename = argv[1]; vector<string> samplesToRemove; for (int i = 2; i < argc; ++i) { samplesToRemove.push_back(argv[i]); } VariantCallFile variantFile; if (filename == "-") {; } else {; } if (!variantFile.is_open()) { return 1; } Variant var(variantFile); vector<string> samplesToKeep = removeElems(samplesToRemove, variantFile.sampleNames); // update sample list in header variantFile.updateSamples(samplesToKeep); // and restrict the output sample names in the variant to those we are keeping var.setOutputSampleNames(samplesToKeep); // write the new header cout << variantFile.header << endl; // print the records, filtering is done via the setting of varA's output sample names while (variantFile.getNextVariant(var)) { cout << var << endl; } return 0; }
int main(int argc, char** argv) { VariantCallFile variantFile; if (argc > 1) { string filename = argv[1];; } else {; } if (!variantFile.is_open()) { return 1; } //cout << variantFile.header << endl; Variant var(variantFile); while (variantFile.getNextVariant(var)) { //cout << var << endl; double afref = 1; map<double, vector<string> > allelesByAf; vector<double> afd; vector<string>& afstr =["AF"]; for (vector<string>::iterator af = afstr.begin(); af != afstr.end(); ++af) { double r; convert(*af, r); afd.push_back(r); } vector<double>::iterator af = afd.begin(); for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a, ++af) { afref -= *af; allelesByAf[*af].push_back(*a); } cout << var.ref; for (map<double, vector<string> >::reverse_iterator a = allelesByAf.rbegin(); a != allelesByAf.rend(); ++a) { cout << " -> " << join(a->second, ", "); } cout << endl; } return 0; }
int main(int argc, char** argv) { VariantCallFile variantFile; if (argc > 1) { string filename = argv[1];; } else {; } if (!variantFile.is_open()) { return 1; } cout << variantFile.header << endl; string lastsn; long int lastpos; string lastref; vector<string> lastalt; variantFile.parseSamples = false; Variant var(variantFile); while (variantFile.getNextVariant(var)) { if (!lastsn.empty() && (lastsn == var.sequenceName && lastpos == var.position && lastref == var.ref && lastalt == var.alt)) { continue; } else { lastsn = var.sequenceName; lastpos = var.position; lastref = var.ref; lastalt = var.alt; cout << var.originalLine << endl; } } return 0; }
int main(int argc, char** argv) { VariantCallFile variantFile; if (argc > 1) { string filename = argv[1];; } else {; } if (!variantFile.is_open()) { return 1; } variantFile.addHeaderLine("##INFO=<ID=length,Number=A,Type=Integer,Description=\"length(ALT) - length(REF) for each ALT\">"); variantFile.addHeaderLine("##INFO=<ID=length.ref,Number=1,Type=Integer,Description=\"length(REF)\">"); variantFile.addHeaderLine("##INFO=<ID=length.alt,Number=A,Type=Integer,Description=\"length(ALT) for each ALT\">"); cout << variantFile.header << endl; Variant var(variantFile); while (variantFile.getNextVariant(var)) { vector<string>& lengths =["length"]; lengths.clear(); for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { lengths.push_back(convert((int) a->size() - (int) var.ref.size())); } vector<string>& lengthsRef =["length.ref"]; lengthsRef.clear(); lengthsRef.push_back(convert(var.ref.size())); vector<string>& lengthsAlt =["length.alt"]; lengthsAlt.clear(); for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { lengthsAlt.push_back(convert((int) a->size())); } cout << var << endl; } return 0; }
int main(int argc, char** argv) { VariantCallFile variantFile; if (argc > 1) { string filename = argv[1];; } else {; } if (!variantFile.is_open()) { return 1; } cout << variantFile.header; Variant var(variantFile); while (variantFile.getNextVariant(var)) { map<string, vector<VariantAllele> > variants = var.parsedAlternates(); cout << var << endl; for (map<string, vector<VariantAllele> >::iterator va = variants.begin(); va != variants.end(); ++va) { cout << " ( " << va->first << " :: "; vector<VariantAllele>& vars = va->second; vector<VariantAllele>::iterator g = vars.begin(); for (; g != vars.end(); ++g) { cout << *g << "; "; } cout << " ) "; } cout << endl; } return 0; }
int main(int argc, char** argv) { string ref_file = ""; vector<string> insertion_files; int max_interval = -1; bool replace_sequences = true; int c = 0; while (true) { static struct option long_options[] = { {"insertions", no_argument, 0, 'i'}, {"help", no_argument, 0, 'h'}, {"reference", required_argument, 0, 'r'}, {"no-replace-sequences", no_argument, 0, 's'}, {0, 0, 0, 0} }; int option_index = 0; c = getopt_long (argc, argv, "sr:i:h", long_options, &option_index); if (c == -1) break; /* Detect the end of the options. */ switch(c){ case 's': replace_sequences = false; break; case 'r': ref_file = optarg; break; case 'i': insertion_files.push_back(optarg); break; case 'h': case '?': print_help(argv); exit(1); default: print_help(argv); abort(); } } if (argc < 2){ print_help(argv); exit(1); } VariantCallFile variantFile; string filename = argv[argc - 1];; if (!variantFile.is_open()) { return 1; } vector<FastaReference*> insertions; if (!insertion_files.empty()){ for (auto x : insertion_files){ FastaReference* ins = new FastaReference(); insertions.push_back(ins); ins->open(x); } } FastaReference ref; if(!ref_file.empty()){; } cout << variantFile.header << endl; Variant var; while (variantFile.getNextVariant(var)) { bool valid = var.canonicalize_sv(ref, insertions, replace_sequences, max_interval); if (!valid){ cerr << "Variant could not be normalized" << var << endl; } cout << var << endl; } return 0; }
int main(int argc, char** argv) { if (argc != 2) { cerr << "usage: " << argv[0] << " <annotation-tag> <vcf file> <vcf file>" << endl << "adds a tag (BasesToNextVariant) to each variant record which indicates" << endl << "the distance to the nearest variant" << endl; return 1; } string filename = argv[1]; VariantCallFile variantFile; if (filename == "-") {; } else {; } if (!variantFile.is_open()) { return 1; } Variant varA(variantFile); Variant varB(variantFile); Variant varC(variantFile); vector<Variant*> vars; vars.push_back(&varA); vars.push_back(&varB); vars.push_back(&varC); for (vector<Variant*>::iterator v = vars.begin(); v != vars.end(); ++v) { variantFile.getNextVariant(**v); } string tag = "BasesToClosestVariant"; string line = "##INFO=<ID=" + tag + ",Number=1,Type=Integer,Description=\"" \ + "Number of bases to the closest variant in the file.\">"; variantFile.addHeaderLine(line); cout << variantFile.header << endl; // get the first distances if (>sequenceName ==>sequenceName) {>info[tag].push_back(convert(>position ->position)); } while (variantFile.getNextVariant(*vars.back())) { if (>sequenceName ==>sequenceName &&>sequenceName ==>sequenceName) {>info[tag].push_back(convert(min(>position ->position,>position ->position))); } else if (>sequenceName ==>sequenceName) {>info[tag].push_back(convert(>position ->position)); } else if (>sequenceName ==>sequenceName) {>info[tag].push_back(convert(>position ->position)); } else { // don't add the tag } cout << *vars.front() << endl; // rotate Variant* v =; =; =; = v; } // assign the last distances if (>sequenceName ==>sequenceName) {>info[tag].push_back(convert(>position ->position)); cout << * << endl;>info[tag].push_back(convert(>position ->position)); cout << * << endl; } return 0; }
int main(int argc, char** argv) { int window = 150; VariantCallFile variantFile; string fastaFileName; int c; while (true) { static struct option long_options[] = { /* These options set a flag. */ //{"verbose", no_argument, &verbose_flag, 1}, {"help", no_argument, 0, 'h'}, {"reference", required_argument, 0, 'r'}, {"window", required_argument, 0, 'w'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; c = getopt_long (argc, argv, "hw:r:", long_options, &option_index); if (c == -1) break; switch (c) { case 'r': fastaFileName = optarg; break; case 'w': window = atoi(optarg); break; case '?': printSummary(argv); exit(1); break; case 'h': printSummary(argv); break; default: abort (); } } if (optind < argc) { string filename = argv[optind];; } else {; } if (!variantFile.is_open()) { cerr << "could not open VCF file" << endl; exit(1); } FastaReference fastaReference; if (fastaFileName.empty()) { cerr << "a reference is required" << endl; exit(1); } else {; } /* variantFile.addHeaderLine("##INFO=<ID=TYPE,Number=A,Type=String,Description=\"The type of allele, either snp, mnp, ins, del, or complex.\">"); variantFile.addHeaderLine("##INFO=<ID=LEN,Number=A,Type=Integer,Description=\"allele length\">"); if (!parseFlag.empty()) { variantFile.addHeaderLine("##INFO=<ID="+parseFlag+",Number=0,Type=Flag,Description=\"The allele was parsed using vcfallelicprimitives.\">"); } */ cout << variantFile.header << endl; Variant var(variantFile); while (variantFile.getNextVariant(var)) { // if there is no indel, there is nothing to realign bool hasIndel = false; for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { if (a->size() != var.ref.size()) { hasIndel = true; break; } } if (!hasIndel) { cout << var << endl; continue; } vector<AltAlignment> alignments; string ref; // determine window size to prevent mismapping with SW algorithm int currentWindow = window; int scale = 2; if (var.ref.size()*scale > currentWindow) currentWindow 
= var.ref.size()*scale; for (vector<string>::iterator a = var.alleles.begin(); a != var.alleles.end(); ++a) { if (a->size()*scale > currentWindow) { currentWindow = a->size()*scale; } } // while the entropy of either flank is < some target entropy (~1 is fine), increase the flank sizes while (currentWindow < 2000) { // limit to one step > than this string refTarget = fastaReference.getSubSequence(var.sequenceName, var.position - 1 - currentWindow/2, currentWindow); if (entropy(refTarget.substr(0, refTarget.size()/2)) < 1 || entropy(refTarget.substr(refTarget.size()/2)) < 1) { currentWindow *= scale; } else { break; } } // do the alignments getAlignment(var, fastaReference, ref, alignments, currentWindow); // stably left align the alignments for (vector<AltAlignment>::iterator a = alignments.begin(); a != alignments.end(); ++a) { Cigar cigarBefore = a->cigar; //cerr << a->seq << endl; //cerr << "before : " << a->pos << " " << joinCigar(a->cigar) << endl; long int prev = a->pos; stablyLeftAlign(a->seq, ref, a->cigar, 20, false); //cerr << "after : " << a->pos << " " << joinCigar(a->cigar) << endl; if (a->pos != prev) cerr << "modified alignment @ " << var << endl; } //cout << var << endl; // transform the mappings // chop off leading matching bases // find the range of bp in the alleles // make the new ref allele // make the new alt alleles // emit the var long int newPosition = var.position+currentWindow/2; long int newEndPosition = var.position-currentWindow/2; // check for no-indel case int newLength = var.ref.size(); bool giveUp = false; for (vector<AltAlignment>::iterator a = alignments.begin(); a != alignments.end() && !giveUp; ++a) { // get the first mismatching position Cigar::iterator c = a->cigar.begin(); int rp = 0; int sp = 0; bool hitMismatch = false; int matchingBpAtStart = 0; int matchingBpAtEnd = 0; // will be set to true if the first reference position match is broken by a SNP, not an indel bool leadingSNP = false; while (c != a->cigar.end()) { char 
op = c->second[0]; if (c == a->cigar.begin()) { if (op != 'M') { cerr << "alignment does not start on matched sequence" << endl; cerr << var << endl; exit(1); } int i = 0; for ( ; i < c->first; ++i) { if (ref[i] != a->seq[i]) { leadingSNP = true; break; } } matchingBpAtStart = i; } if (!leadingSNP && c == (a->cigar.begin()+1)) { // if the first thing we run into is an indel, step back, per VCF spec if (op == 'D' || op == 'I') { --matchingBpAtStart; } } if (c == (a->cigar.end()-1)) { if (op != 'M') { // soft clip at end // it'll be hard to interpret this // the alignments sometimes generate this // best thing to do is to move on //cerr << "alignment does not end on matched sequence" << endl; //cout << var << endl; //exit(1); giveUp = true; break; } int i = 0; for ( ; i < c->first; ++i) { if (ref[ref.size()-1-i] != a->seq[a->seq.size()-1-i]) { break; } } matchingBpAtEnd = i; } ++c; } int altMismatchLength = a->seq.size() - matchingBpAtEnd - matchingBpAtStart; int refMismatchLength = (var.ref.size() + currentWindow) - matchingBpAtEnd - matchingBpAtStart; //cerr << "alt mismatch length " << altMismatchLength << endl // << "ref mismatch length " << refMismatchLength << endl; long int newStart = var.position - currentWindow/2 + matchingBpAtStart; long int newEnd = newStart + refMismatchLength; //cerr << "ref should run from " << newStart << " to " << newStart + refMismatchLength << endl; newPosition = min(newStart, newPosition); newEndPosition = max(newEnd, newEndPosition); //cerr << newPosition << " " << newEndPosition << endl; //if (newRefSize < refMismatchLength) newRefSize = refMismatchLength; } // the alignment failed for some reason, continue if (giveUp) { cout << var << endl; continue; } //cerr << "new ref start " << newPosition << " and end " << newEndPosition << " was " << var.position << "," << var.position + var.ref.size() << endl; int newRefSize = newEndPosition - newPosition; string newRef = fastaReference.getSubSequence(var.sequenceName, newPosition-1, 
newRefSize); // get the number of bp to strip from the alts int stripFromStart = currentWindow/2 - (var.position - newPosition); int stripFromEnd = (currentWindow + newRefSize) - (stripFromStart + newRefSize) + (var.ref.size() - newRefSize); //cerr << "strip from start " << stripFromStart << endl; //cerr << "strip from end " << stripFromEnd << endl; vector<string> newAlt; vector<string>::iterator l = var.alt.begin(); bool failedAlt = false; for (vector<AltAlignment>::iterator a = alignments.begin(); a != alignments.end(); ++a, ++l) { int diff = newRef.size() - l->size(); string alt = a->seq.substr(stripFromStart, a->seq.size() - (stripFromEnd + stripFromStart)); newAlt.push_back(alt); if (alt.empty()) failedAlt = true; } // check the before/after haplotypes bool brokenRealignment = false; if (!newRef.empty() && !failedAlt) { int slop = 50; // 50 extra bp! int haplotypeStart = min(var.position, newPosition) - slop; int haplotypeEnd = max(var.position + var.ref.size(), newPosition + newRef.size()) + slop; string referenceHaplotype = fastaReference.getSubSequence(var.sequenceName, haplotypeStart - 1, haplotypeEnd - haplotypeStart); vector<string>::iterator o = var.alt.begin(); vector<string>::iterator n = newAlt.begin(); for ( ; o != var.alt.end() ; ++o, ++n) { // map the haplotypes string oldHaplotype = referenceHaplotype; string newHaplotype = referenceHaplotype; oldHaplotype.replace(var.position - haplotypeStart, var.ref.size(), *o); newHaplotype.replace(newPosition - haplotypeStart, newRef.size(), *n); if (oldHaplotype != newHaplotype) { cerr << "broken left alignment!" 
<< endl << "old " << oldHaplotype << endl << "new " << newHaplotype << endl; cerr << "was: " << var << endl; brokenRealignment = true; } } } // *if* everything is OK, update the variant if (!brokenRealignment && !newRef.empty() && !failedAlt) { var.ref = newRef; var.alt = newAlt; var.position = newPosition; } cout << var << endl; // for each parsedalternate, get the position // build a new vcf record for that position // unless we are already at the position ! // take everything which is unique to that allele (records) and append it to the new record // then handle genotypes; determine the mapping between alleleic primitives and convert to phased haplotypes // this means taking all the parsedAlternates and, for each one, generating a pattern of allele indecies corresponding to it //for (vector<Variant>::iterator v = variants.begin(); v != variants.end(); ++v) { } return 0; }
int main(int argc, char** argv) { int c; string fastaRef; bool keepFailures = false; bool excludeFailures = false; if (argc == 1) printSummary(argv); while (true) { static struct option long_options[] = { /* These options set a flag. */ //{"verbose", no_argument, &verbose_flag, 1}, {"help", no_argument, 0, 'h'}, {"fasta-reference", required_argument, 0, 'f'}, {"exclude-failures", no_argument, 0, 'x'}, {"keep-failures", no_argument, 0, 'k'}, //{"length", no_argument, &printLength, true}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; c = getopt_long (argc, argv, "hxkf:", long_options, &option_index); /* Detect the end of the options. */ if (c == -1) break; switch (c) { case 0: /* If this option set a flag, do nothing else now. */ if (long_options[option_index].flag != 0) break; printf ("option %s", long_options[option_index].name); if (optarg) printf (" with arg %s", optarg); printf ("\n"); break; case 'f': fastaRef = optarg; break; case 'x': excludeFailures = true; break; case 'k': keepFailures = true; break; case 'h': printSummary(argv); exit(0); break; case '?': /* getopt_long already printed an error message. 
*/ printSummary(argv); exit(1); break; default: abort (); } } if (fastaRef.empty()) { cerr << "a FASTA reference sequence must be specified" << endl; exit(1); } FastaReference ref;; VariantCallFile variantFile; string inputFilename; if (optind == argc - 1) { inputFilename = argv[optind];; } else {; } if (!variantFile.is_open()) { return 1; } if (keepFailures || excludeFailures) { cout << variantFile.header << endl; } Variant var(variantFile); while (variantFile.getNextVariant(var)) { int refstart = var.position - 1; // convert to 0-based string matchedRef = ref.getSubSequence(var.sequenceName, refstart, var.ref.size()); if (var.ref != matchedRef) { if (keepFailures) { cout << var << endl; } else if (!excludeFailures) { cout << "mismatched reference " << var.ref << " should be " << matchedRef << " at " << var.sequenceName << ":" << var.position << endl; } } else if (excludeFailures) { cout << var << endl; } } return 0; }
/*
 * bFst entry point: a per-site MCMC estimate of Fst between a "target" and a
 * "background" group of samples in a VCF file (see the --help text below for
 * the output columns).
 *
 * NOTE(review): this copy of the function has lost its line breaks and several
 * operands, so it does not compile as shown. Examples: "if( == -1 || == -1)",
 * "abs( -;", "parameters.push_back(;", "<< << \"\\t\"", and the bare ";" where
 * the VCF file is presumably opened. Presumably these were member accesses on
 * popt / popb / popTotal and a variantFile open call - TODO restore from the
 * upstream source; the code below is kept exactly as found.
 *
 * What the visible code does:
 *   - seeds rand() from the current time;
 *   - parses --version/-v, --help/-h, --file/-f, --target/-t,
 *     --background/-b and --deltaaf/-d with getopt_long;
 *   - returns 1 when deltaaf or the file name was not given, or when the
 *     VCF is not open;
 *   - for each record with at most one ALT, collects the samples whose first
 *     GT value is not "./." into target / background / total by column
 *     index, and skips the site unless both groups have at least two samples;
 *   - runs 15000 update rounds over six parameters, accumulating sums and
 *     the Fst samples (parameters[5]) only for rounds with i > 4999;
 *   - sorts the 10000 Fst samples with qsort and reports entries 500 and
 *     9500 as the interval bounds.
 *
 * NOTE(review), visible in the code and worth confirming:
 *   - case 't' loads the indices into ib and case 'b' into it, while the
 *     per-site loop uses it for "target" and ib for "background"; this
 *     looks swapped.
 *   - "default: break;" is followed by statements that can never run, so
 *     unknown options are silently ignored.
 *   - the it.size() < 2 and ib.size() < 2 checks print a FATAL message but
 *     do not return, and both messages say "target".
 */
int main(int argc, char** argv) { // set the random seed for MCMC srand((unsigned)time(NULL)); // the filename string filename = "NA"; // using vcflib; thanks to Erik Garrison VariantCallFile variantFile ; // zero based index for the target and background indivudals map<int, int> it, ib; // deltaaf is the difference of allele frequency we bother to look at string deltaaf ; double daf = -1; const struct option longopts[] = { {"version" , 0, 0, 'v'}, {"help" , 0, 0, 'h'}, {"file" , 1, 0, 'f'}, {"target" , 1, 0, 't'}, {"background", 1, 0, 'b'}, {"deltaaf" , 1, 0, 'd'}, {0,0,0,0} }; int index; int iarg = 0; while(iarg != -1) { iarg = getopt_long(argc, argv, "d:t:b:f:hv", longopts, &index); switch (iarg) { case 0: break; case 'h': cerr << endl; cerr << "INFO: help: " << endl << endl; cerr << " bFst is a Bayesian approach to Fst. Importantly bFst account for genotype uncertainty in the model using genotype likelihoods." << endl; cerr << " For a more detailed description see: Holsinger et al. Molecular Ecology Vol 11, issue 7 2002. The likelihood function has been " << endl; cerr << " modified to use genotype likelihoods provided by variant callers. There are five free parameters estimated in the model: each " << endl; cerr << " subpopulation's allele frequency and Fis (fixation index, within each subpopulation), a free parameter for the total population\'s " << endl; cerr << " allele frequency, and Fst. " << endl << endl; cerr << "Output : 11 columns : " << endl; cerr << " 1. Seqid " << endl; cerr << " 2. Position " << endl; cerr << " 3. Observed allele frequency in target. " << endl; cerr << " 4. Estimated allele frequency in target. " << endl; cerr << " 5. Observed allele frequency in background. " << endl; cerr << " 6. Estimated allele frequency in background. " << endl; cerr << " 7. Observed allele frequency combined. " << endl; cerr << " 8. Estimated allele frequency in combined. " << endl; cerr << " 9. ML estimate of Fst (mean) " << endl; cerr << " 10. 
Lower bound of the 95% credible interval " << endl; cerr << " 11. Upper bound of the 95% credible interval " << endl << endl; cerr << "INFO: usage: bFst --target 0,1,2,3,4,5,6,7 --background 11,12,13,16,17,19,22 --file my.vcf --deltaaf 0.1" << endl; cerr << endl; cerr << "INFO: required: t,target -- a zero bases comma separated list of target individuals corrisponding to VCF columns" << endl; cerr << "INFO: required: b,background -- a zero bases comma separated list of background individuals corrisponding to VCF columns" << endl; cerr << "INFO: required: f,file a -- a proper formatted VCF file. the FORMAT field MUST contain \"PL\"" << endl; cerr << "INFO: required: d,deltaaf -- skip sites were the difference in allele frequency is less than deltaaf" << endl; cerr << endl; printVersion(); cerr << endl << endl; return 0; case 'v': printVersion(); return 0; case 't': loadIndices(ib, optarg); cerr << "INFO: There are " << ib.size() << " individuals in the target" << endl; break; case 'b': loadIndices(it, optarg); cerr << "INFO: There are " << it.size() << " individuals in the background" << endl; break; case 'f': cerr << "INFO: File: " << optarg << endl; filename = optarg; break; case 'd': cerr << "INFO: difference in allele frequency : " << optarg << endl; deltaaf = optarg; daf = atof(deltaaf.c_str()); break; default: break; cerr << endl; cerr << "FATAL: unknown command line option " << optarg << endl << endl ; cerr << "INFO: please use bFst --help " << endl; cerr << endl; return(1); } } if(daf == -1){ cerr << endl; cerr << "FATAL: did not specify deltaaf" << endl; cerr << "INFO: please use bFst --help " << endl; cerr << endl; return(1); } if(filename == "NA"){ cerr << endl; cerr << "FATAL: did not specify VCF file" << endl; cerr << "INFO: please use bFst --help " << endl; cerr << endl; return(1); }; if (!variantFile.is_open()) { cerr << endl; cerr << "FATAL: could not open VCF file" << endl; cerr << "INFO: please use bFst --help" << endl; cerr << endl; return(1); } 
if(it.size() < 2){ cerr << endl; cerr << "FATAL: target not specified or less than two indviduals" << endl; cerr << "INFO: please use bFst --help " << endl; cerr << endl; } if(ib.size() < 2){ cerr << endl; cerr << "FATAL: target not specified or less than two indviduals"<< endl; cerr << "INFO: please use bFst --help " << endl; cerr << endl; } Variant var(variantFile); vector<string> samples = variantFile.sampleNames; int nsamples = samples.size(); while (variantFile.getNextVariant(var)) { // biallelic sites naturally if(var.alt.size() > 1){ continue; } vector < map< string, vector<string> > > target, background, total; int index = 0; for(int nsamp = 0; nsamp < nsamples; nsamp++){ map<string, vector<string> > sample = var.samples[ samples[nsamp]]; if(sample["GT"].front() != "./."){ if(it.find(index) != it.end() ){ target.push_back(sample); total.push_back(sample); } if(ib.find(index) != ib.end()){ background.push_back(sample); total.push_back(sample); } } index += 1; } if(target.size() < 2 || background.size() < 2 ){ continue; } pop popt, popb, popTotal; initPop(popt); initPop(popb); initPop(popTotal); loadPop(target, popt); loadPop(background, popb); loadPop(total, popTotal); if( == -1 || == -1){ continue; } if( == 1 && == 1){ continue; } if( == 0 && == 0){ continue; } double afdiff = abs( -; if(afdiff < daf){ continue; } cerr << "INFO: target has " << popt.questionable.size() << " questionable genotypes " << endl; cerr << "INFO: background has " << popb.questionable.size() << " questionable genotypes " << endl; // Parameters- targetAf backgroundAf targetFis backgroundFis totalAf fst vector<double> parameters; parameters.push_back(; parameters.push_back(; parameters.push_back(popt.fis); parameters.push_back(popb.fis); parameters.push_back(; parameters.push_back(0.1); parameters.push_back(; double sums [6] = {0}; double fsts [10000] ; for(int i = 0; i < 15000; i++){ // update each of j parameters for(int j = 0; j < 6; j++ ){ updateParameters(popt, popb, parameters, 
j); if(i > 4999){ sums[j] += parameters[j]; } } if(i > 4999){ fsts[i - 5000] = parameters[5]; } for(vector<int>::iterator itt = popt.questionable.begin(); itt != popt.questionable.end(); itt++){ updateGenotypes(popt, popb, parameters, (*itt), 0); } for(vector<int>::iterator itb = popb.questionable.begin(); itb != popb.questionable.end(); itb++){ updateGenotypes(popt, popb, parameters, (*itb) , 1); } } qsort (fsts, sizeof(fsts)/sizeof(fsts[0]), sizeof(fsts[0]), cmp ); double lcredint = fsts[500]; double hcredint = fsts[9500]; cout << var.sequenceName << "\t" << var.position << "\t" << << "\t" << sums[0]/10000 << "\t" << << "\t" << sums[1]/10000 << "\t" << << "\t" << sums[4]/10000 << "\t" << sums[5]/10000 << "\t" << lcredint << "\t" << hcredint << endl; } return 0; }
int main(int argc, char** argv) { string bedFileName; string annotationInfoKey; string defaultAnnotationValue; if (argc == 1) printSummary(argv); int c; while (true) { static struct option long_options[] = { /* These options set a flag. */ //{"verbose", no_argument, &verbose_flag, 1}, {"help", no_argument, 0, 'h'}, {"bed", required_argument, 0, 'b'}, {"key", required_argument, 0, 'k'}, {"default", required_argument, 0, 'd'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; c = getopt_long (argc, argv, "hb:k:d:", long_options, &option_index); if (c == -1) break; switch (c) { case 'b': bedFileName = string(optarg); break; case 'k': annotationInfoKey = string(optarg); break; case 'd': defaultAnnotationValue = string(optarg); break; case 'h': printSummary(argv); break; case '?': printSummary(argv); exit(1); break; default: abort (); } } if (bedFileName.empty()) { cerr << "a BED file is required when intersecting" << endl; exit(1); } BedReader bed(bedFileName); VariantCallFile variantFile; string inputFilename; if (optind == argc - 1) { inputFilename = argv[optind];; } else {; } if (!variantFile.is_open()) { cout << "could not open VCF file" << endl; return 1; } string line = "##INFO=<ID=" + annotationInfoKey + ",Number=1,Type=String,Description=\"Annotation from " + bedFileName + " delimited by ':'\">"; variantFile.addHeaderLine(line); cout << variantFile.header << endl; Variant var(variantFile); while (variantFile.getNextVariant(var)) { BedTarget record(var.sequenceName, var.position, var.position + var.ref.size() - 1, ""); vector<BedTarget*> overlaps = bed.targetsOverlapping(record); vector<string> annotations; if (!overlaps.empty()) { for (vector<BedTarget*>::iterator t = overlaps.begin(); t != overlaps.end(); ++t) { annotations.push_back((*t)->desc); }[annotationInfoKey].push_back(join(annotations, ":")); } else if (!defaultAnnotationValue.empty()) {[annotationInfoKey].push_back(defaultAnnotationValue); } cout << var << endl; } 
return 0; }
int main(int argc, char** argv) { if (argc > 1 && (argv[1] == "-h" || argv[1] == "--help")) { cerr << "usage: " << argv[0] << " <vcf file>" << endl << "outputs a VCF stream where AC and NS have been generated for each record using sample genotypes" << endl; return 1; } VariantCallFile variantFile; if (argc == 1 || (argc == 2 && argv[1] == "-")) {; if (!variantFile.is_open()) { cerr << "vcffixup: could not open stdin" << endl; return 1; } } else { string filename = argv[1];; if (!variantFile.is_open()) { cerr << "vcffixup: could not open " << filename << endl; return 1; } } Variant var(variantFile); // remove header lines we're going to add variantFile.removeInfoHeaderLine("AC"); variantFile.removeInfoHeaderLine("AF"); variantFile.removeInfoHeaderLine("NS"); variantFile.removeInfoHeaderLine("AN"); // and add them back, so as not to duplicate them if they are already there variantFile.addHeaderLine("##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Total number of alternate alleles in called genotypes\">"); variantFile.addHeaderLine("##INFO=<ID=AF,Number=A,Type=Float,Description=\"Estimated allele frequency in the range (0,1]\">"); variantFile.addHeaderLine("##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of samples with data\">"); variantFile.addHeaderLine("##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">"); // write the new header cout << variantFile.header << endl; // print the records, filtering is done via the setting of varA's output sample names while (variantFile.getNextVariant(var)) { stringstream ns; ns << var.samples.size();["NS"].clear();["NS"].push_back(ns.str());["AC"].clear();["AF"].clear();["AN"].clear(); int allelecount = countAlleles(var); stringstream an; an << allelecount;["AN"].push_back(an.str()); for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { string& allele = *a; int altcount = countAlts(var, var.getAltAlleleIndex(allele) + 1); stringstream ac; ac << 
altcount;["AC"].push_back(ac.str()); stringstream af; af << (double) altcount / (double) allelecount;["AF"].push_back(af.str()); } cout << var << endl; } return 0; }
int main(int argc, char** argv) { globalOpts.threads = 1 ; = 0.05; // zero based index for the target and background indivudals map<int, int> it, ib; const struct option longopts[] = { {"version" , 0, 0, 'v'}, {"help" , 0, 0, 'h'}, {"file" , 1, 0, 'f'}, {"target" , 1, 0, 't'}, {"region" , 1, 0, 'r'}, {"gen" , 1, 0, 'g'}, {"type" , 1, 0, 'y'}, {"threads" , 1, 0, 'x'}, {"af" , 1, 0, 'a'}, {"pos" , 1, 0, 'p'}, {0,0,0,0} }; int findex; int iarg=0; while(iarg != -1) { iarg = getopt_long(argc, argv, "a:x:g:y:r:d:t:b:f:p:hv", longopts, &findex); switch (iarg) { case 'p': { globalOpts.pos = atoi(optarg); break; } case 'a': { = atof(optarg); break; } case 'x': { globalOpts.threads = atoi(optarg); break; } case 'g': { globalOpts.geneticMapFile = optarg; break; } case 'h': { printHelp(); break; } case 'v': { printVersion(); break; } case 'y': { globalOpts.type = optarg; break; } case 't': { loadIndices(it, optarg); cerr << "INFO: there are " << it.size() << " individuals in the target" << endl; cerr << "INFO: target ids: " << optarg << endl; break; } case 'f': { cerr << "INFO: file: " << optarg << endl; globalOpts.filename = optarg; break; } case 'r': { cerr << "INFO: set seqid region to : " << optarg << endl; globalOpts.region = optarg; break; default: break; } } } #if defined HAS_OPENMP omp_set_num_threads(globalOpts.threads); #endif map<string, int> okayGenotypeLikelihoods; okayGenotypeLikelihoods["PL"] = 1; okayGenotypeLikelihoods["GL"] = 1; okayGenotypeLikelihoods["GP"] = 1; okayGenotypeLikelihoods["GT"] = 1; // add an option for dumping // for(std::map<int, double>::iterator gm = geneticMap.begin(); gm != geneticMap.end(); gm++){ // cerr << "pos: " << gm->first << " cm: " << gm->second << endl; // } if(globalOpts.type.empty()){ cerr << "FATAL: failed to specify genotype likelihood format : PL or GL" << endl; printHelp(); exit(1); } if(okayGenotypeLikelihoods.find(globalOpts.type) == okayGenotypeLikelihoods.end()){ cerr << "FATAL: genotype likelihood is incorrectly 
formatted, only use: PL or GL" << endl; printHelp(); exit(1); } if(globalOpts.filename.empty()){ cerr << "FATAL: did not specify a file" << endl; printHelp(); exit(1); } if(it.size() < 2){ cerr << "FATAL: target option is required -- or -- less than two individuals in target\n"; printHelp(); exit(1); } // using vcflib; thanksErik VariantCallFile variantFile;; if(globalOpts.region.empty()){ cerr << "FATAL: region required" << endl; exit(1); } if(! variantFile.setRegion(globalOpts.region)){ cerr <<"FATAL: unable to set region" << endl; exit(1); } if (!variantFile.is_open()) { exit(1); } Variant var( variantFile ); vector<int> target_h, background_h; int index = 0; int indexi = 0; vector<string> samples = variantFile.sampleNames; int nsamples = samples.size(); for(vector<string>::iterator samp = samples.begin(); samp != samples.end(); samp++){ string sampleName = (*samp); if(it.find(index) != it.end() ){ target_h.push_back(indexi); indexi++; } index++; } vector<long int> positions; vector<double> afs; string **haplotypes = new string*[target_h.size()]; for (int i = 0; i < target_h.size(); i++) { haplotypes[i] = new string[2]; } while (variantFile.getNextVariant(var)) { globalOpts.seqid = var.sequenceName; if(!var.isPhased()){ cerr << "FATAL: Found an unphased variant. All genotypes must be phased!" 
<< endl; exit(1); } if(var.alleles.size() > 2){ continue; } vector < map< string, vector<string> > > target, background, total; int sindex = 0; for(int nsamp = 0; nsamp < nsamples; nsamp++){ map<string, vector<string> > sample = var.samples[ samples[nsamp]]; if(it.find(sindex) != it.end() ){ target.push_back(sample); } sindex += 1; } genotype * populationTarget ; if(globalOpts.type == "PL"){ populationTarget = new pl(); } if(globalOpts.type == "GL"){ populationTarget = new gl(); } if(globalOpts.type == "GP"){ populationTarget = new gp(); } if(globalOpts.type == "GT"){ populationTarget = new gt(); } populationTarget->loadPop(target, var.sequenceName, var.position); if(populationTarget->af <= || populationTarget->nref < 2 || populationTarget->nalt < 2){ delete populationTarget; continue; } positions.push_back(var.position); afs.push_back(populationTarget->af); loadPhased(haplotypes, populationTarget, populationTarget->gts.size()); populationTarget = NULL; delete populationTarget; } if(!globalOpts.geneticMapFile.empty()){ cerr << "INFO: loading genetics map" << endl; loadGeneticMap(positions.front(), positions.back()); cerr << "INFO: finished loading genetics map" << endl; } calc(haplotypes, target_h.size(), afs, positions, target_h, background_h, globalOpts.seqid); clearHaplotypes(haplotypes, target_h.size()); exit(0); }
// vcfallelicprimitives entry point (the tool name appears in the warnings and
// header text below).
//
// Options (parsed with getopt_long):
//   -m / --use-mnps     passed to var.parsedAlternates() as useMNPs
//   -L / --max-length   records whose single ALT or REF is longer are echoed untouched (default 200)
//   -t / --tag-parsed   name of an INFO flag set on every record produced by decomposition
//   -k / --keep-info    carry INFO values over to the decomposed records
//   -g / --keep-geno    carry the FORMAT list and all sample fields over
//   -h / --help         calls printSummary(argv)
//
// For each input record the alternates are parsed into VariantAllele pieces,
// one output record is built per piece position (`variants`, keyed by
// position), TYPE/LEN (and AF/AC when present) are attached, genotypes are
// re-expressed per position and written as '|'-joined GT strings, and the new
// records are printed in position order.
//
// NOTE(review): many expressions appear to have been lost from this copy of the
// file (subscripts without a receiver, empty iterator bounds, conditions with a
// missing operand). They are left exactly as found and flagged inline; the
// missing receiver is presumably a record's INFO map -- confirm upstream.
int main(int argc, char** argv) {

    bool includePreviousBaseForIndels = true;
    bool useMNPs = false;
    string parseFlag;
    int maxLength = 200;
    bool keepInfo = false;
    bool keepGeno = false;

    VariantCallFile variantFile;

    int c;
    while (true) {
        static struct option long_options[] =
        {
            /* These options set a flag. */
            //{"verbose", no_argument, &verbose_flag, 1},
            {"help", no_argument, 0, 'h'},
            {"use-mnps", no_argument, 0, 'm'},
            {"max-length", required_argument, 0, 'L'},
            {"tag-parsed", required_argument, 0, 't'},
            {"keep-info", no_argument, 0, 'k'},
            {"keep-geno", no_argument, 0, 'g'},
            {0, 0, 0, 0}
        };
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long (argc, argv, "hmkgt:L:",
                         long_options, &option_index);

        // -1 signals that every option has been consumed.
        if (c == -1)
            break;

        switch (c) {

            case 'm':
                useMNPs = true;
                break;

            case 'k':
                keepInfo = true;
                break;

            case 'g':
                keepGeno = true;
                break;

            case 'h':
                printSummary(argv);
                break;

            case 't':
                parseFlag = optarg;
                break;

            case 'L':
                maxLength = atoi(optarg);
                break;

            case '?':
                printSummary(argv);
                exit(1);
                break;

            default:
                abort ();
        }
    }

    // NOTE(review): both branches end in an empty statement; the call that
    // opens variantFile (named file vs. fallback input) seems to be missing.
    if (optind < argc) {
        string filename = argv[optind];;
    } else {;
    }

    if (!variantFile.is_open()) {
        return 1;
    }

    // Declare the INFO fields this tool adds before echoing the header.
    variantFile.addHeaderLine("##INFO=<ID=TYPE,Number=A,Type=String,Description=\"The type of allele, either snp, mnp, ins, del, or complex.\">");
    variantFile.addHeaderLine("##INFO=<ID=LEN,Number=A,Type=Integer,Description=\"allele length\">");
    if (!parseFlag.empty()) {
        variantFile.addHeaderLine("##INFO=<ID="+parseFlag+",Number=0,Type=Flag,Description=\"The allele was parsed using vcfallelicprimitives.\">");
    }
    cout << variantFile.header << endl;

    Variant var(variantFile);
    while (variantFile.getNextVariant(var)) {

        // we can't decompose *1* bp events, these are already in simplest-form whether SNPs or indels
        // we also don't handle anything larger than maxLength bp
        if (var.alt.size() == 1
            && ( var.alt.front().size() == 1
                 || var.ref.size() == 1
                 || var.alt.front().size() > maxLength
                 || var.ref.size() > maxLength
                )) {
            // nothing to do
            cout << var << endl;
            continue;
        }

        // for each parsedalternate, get the position
        // build a new vcf record for that position
        // unless we are already at the position !
        // take everything which is unique to that allele (records) and append it to the new record
        // then handle genotypes; determine the mapping between alleleic primitives and convert to phased haplotypes
        // this means taking all the parsedAlternates and, for each one, generating a pattern of allele indecies corresponding to it

        map<string, vector<VariantAllele> > varAlleles = var.parsedAlternates(includePreviousBaseForIndels, useMNPs);
        set<VariantAllele> alleles;

        // collect unique alleles
        for (map<string, vector<VariantAllele> >::iterator a = varAlleles.begin(); a != varAlleles.end(); ++a) {
            for (vector<VariantAllele>::iterator va = a->second.begin(); va != a->second.end(); ++va) {
                alleles.insert(*va);
            }
        }

        // Count the pieces that actually differ from the reference.
        int altcount = 0;
        for (set<VariantAllele>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
            if (a->ref != a->alt) {
                ++altcount;
            }
        }

        if (altcount == 1 && var.alt.size() == 1 && var.alt.front().size() == 1) { // if biallelic SNP
            cout << var << endl;
            continue;
        }

        // collect variant allele indexed membership
        map<string, vector<int> > variantAlleleIndexes; // from serialized VariantAllele to indexes
        for (map<string, vector<VariantAllele> >::iterator a = varAlleles.begin(); a != varAlleles.end(); ++a) {
            int index = var.altAlleleIndexes[a->first] + 1; // make non-relative
            for (vector<VariantAllele>::iterator va = a->second.begin(); va != a->second.end(); ++va) {
                variantAlleleIndexes[va->repr].push_back(index);
            }
        }

        map<VariantAllele, double> alleleFrequencies;
        map<VariantAllele, int> alleleCounts;
        map<VariantAllele, map<string, string> > alleleInfos;
        map<VariantAllele, map<string, map<string, string> > > alleleGenos;

        // Sum the AF of every original ALT that contains a given piece.
        bool hasAf = false;
        // NOTE(review): the condition below has lost both the lookup call and
        // its comparison operand (presumably "is AF present in INFO").
        if ("AF") != {
            hasAf = true;
            for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
                vector<VariantAllele>& vars = varAlleles[*a];
                for (vector<VariantAllele>::iterator va = vars.begin(); va != vars.end(); ++va) {
                    double freq;
                    try {
                        // .at() throws when AF has fewer entries than ALT; the
                        // catch below turns that into a warning.
                        // NOTE(review): receiver of ["AF"] is missing.
                        convert(["AF"].at(var.altAlleleIndexes[*a]), freq);
                        alleleFrequencies[*va] += freq;
                    } catch (...) {
                        cerr << "vcfallelicprimitives WARNING: AF does not have count == alts @ "
                             << var.sequenceName << ":" << var.position << endl;
                    }
                }
            }
        }

        // Same accumulation for AC.
        bool hasAc = false;
        // NOTE(review): condition incomplete, as for AF above.
        if ("AC") != {
            hasAc = true;
            for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
                vector<VariantAllele>& vars = varAlleles[*a];
                for (vector<VariantAllele>::iterator va = vars.begin(); va != vars.end(); ++va) {
                    int freq;
                    try {
                        // NOTE(review): receiver of ["AC"] is missing.
                        convert(["AC"].at(var.altAlleleIndexes[*a]), freq);
                        alleleCounts[*va] += freq;
                    } catch (...) {
                        cerr << "vcfallelicprimitives WARNING: AC does not have count == alts @ "
                             << var.sequenceName << ":" << var.position << endl;
                    }
                }
            }
        }

        // Remember, per piece, the INFO value that applies to it.
        if (keepInfo) {
            // NOTE(review): the iterator's begin/end expressions are missing.
            for (map<string, vector<string> >::iterator infoit =;
                 infoit !=; ++infoit) {
                string key = infoit->first;
                for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
                    vector<VariantAllele>& vars = varAlleles[*a];
                    for (vector<VariantAllele>::iterator va = vars.begin(); va != vars.end(); ++va) {
                        string val;
                        // NOTE(review): receiver of [key] is missing.
                        vector<string>& vals =[key];
                        if (vals.size() == var.alt.size()) { // allele count for info
                            // NOTE(review): the expression being indexed by
                            // [*a] is missing (note the unmatched ')').
                            val =[*a]);
                        } else if (vals.size() == 1) { // site-wise count
                            val = vals.front();
                        } // don't handle other multiples... how would we do this without going crazy?
                        if (!val.empty()) {
                            alleleInfos[*va][key] = val;
                        }
                    }
                }
            }
        }

        /*
        if (keepGeno) {
            for (map<string, map<string, vector<string> > >::iterator sampleit = var.samples.begin();
                 sampleit != var.samples.end(); ++sampleit) {
                string& sampleName = sampleit->first;
                map<string, vector<string> >& sampleValues = var.samples[sampleName];

            }
        }
        */

        // from old allele index to a new series across the unpacked positions
        map<int, map<long unsigned int, int> > unpackedAlleleIndexes;

        // Output records keyed by position, so they print in position order.
        map<long unsigned int, Variant> variants;
        //vector<Variant> variants;
        for (set<VariantAllele>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
            if (a->ref == a->alt) {
                // ref allele
                continue;
            }

            // Classify the piece and compute its LEN value.
            string type;
            int len = 0;
            // NOTE(review): both operands of == are truncated and the closing
            // ')' is missing.
            if (a-> == a-> { // well-behaved indels
                if (a->ref.size() > a->alt.size()) {
                    type = "del";
                    len = a->ref.size() - a->alt.size();
                } else if (a->ref.size() < a->alt.size()) {
                    len = a->alt.size() - a->ref.size();
                    type = "ins";
                }
            } else {
                if (a->ref.size() == a->alt.size()) {
                    len = a->ref.size();
                    if (a->ref.size() == 1) {
                        type = "snp";
                    } else {
                        type = "mnp";
                    }
                } else {
                    len = abs((int) a->ref.size() - (int) a->alt.size());
                    type = "complex";
                }
            }

            // Create the output record for this position on first use.
            if (variants.find(a->position) == variants.end()) {
                Variant newvar(variantFile);
                variants[a->position] = newvar;
            }

            Variant& v = variants[a->position]; // guaranteed to exist

            if (!parseFlag.empty()) {
                v.infoFlags[parseFlag] = true;
            }
            v.quality = var.quality;
            v.filter = var.filter;
            // NOTE(review): the assignment target is missing here.
            = ".";
            //v.format = var.format;
            vector<string> gtonlyformat;
            gtonlyformat.push_back("GT");
            v.format = gtonlyformat;
            // NOTE(review): every subscript from here to the end of the
            // keepInfo block has lost its receiver.
            ["TYPE"].push_back(type);
            ["LEN"].push_back(convert(len));
            if (hasAf) {
                ["AF"].push_back(convert(alleleFrequencies[*a]));
            }
            if (hasAc) {
                ["AC"].push_back(convert(alleleCounts[*a]));
            }
            if (keepInfo) {
                // NOTE(review): iterator begin/end expressions are missing.
                for (map<string, vector<string> >::iterator infoit =;
                     infoit !=; ++infoit) {
                    string key = infoit->first;
                    if (key != "AF" && key != "AC" && key != "TYPE" && key != "LEN") { // don't clobber previous
                        [key].push_back(alleleInfos[*a][key]);
                    }
                }
            }

            // now, keep all the other infos if we are asked to

            v.sequenceName = var.sequenceName;
            v.position = a->position; // ... by definition, this should be == if the variant was found
            // When this piece has a longer REF than the record built so far,
            // extend the ALTs already stored with the extra reference bases
            // and adopt the longer REF.
            if (v.ref.size() < a->ref.size()) {
                for (vector<string>::iterator va = v.alt.begin(); va != v.alt.end(); ++va) {
                    *va += a->ref.substr(v.ref.size());
                }
                v.ref = a->ref;
            }
            v.alt.push_back(a->alt);

            // 1-based index of the ALT just appended to the output record.
            int alleleIndex = v.alt.size();
            vector<int>& originalIndexes = variantAlleleIndexes[a->repr];
            for (vector<int>::iterator i = originalIndexes.begin(); i != originalIndexes.end(); ++i) {
                unpackedAlleleIndexes[*i][v.position] = alleleIndex;
            }
            // add null allele
            unpackedAlleleIndexes[ALLELE_NULL][v.position] = ALLELE_NULL;

        }

        // genotypes
        for (vector<string>::iterator s = var.sampleNames.begin(); s != var.sampleNames.end(); ++s) {
            string& sampleName = *s;
            if (var.samples.find(sampleName) == var.samples.end()) {
                continue;
            }
            map<string, vector<string> >& sample = var.samples[sampleName];
            if (sample.find("GT") == sample.end()) {
                continue;
            }
            string& genotype = sample["GT"].front();
            vector<string> genotypeStrs = split(genotype, "|/");
            // Allele indexes of the original genotype; anything that does not
            // convert to an int becomes ALLELE_NULL.
            vector<int> genotypeIndexes;
            for (vector<string>::iterator s = genotypeStrs.begin(); s != genotypeStrs.end(); ++s) {
                int i;
                if (!convert(*s, i)) {
                    genotypeIndexes.push_back(ALLELE_NULL);
                } else {
                    genotypeIndexes.push_back(i);
                }
            }
            // For every output position, the translated allele index of each
            // allele in the original genotype, in the original order.
            map<long unsigned int, vector<int> > positionIndexes;
            for (vector<int>::iterator g = genotypeIndexes.begin(); g != genotypeIndexes.end(); ++g) {
                int oldIndex = *g;
                for (map<long unsigned int, Variant>::iterator v = variants.begin(); v != variants.end(); ++v) {
                    const long unsigned int& p = v->first;
                    if (oldIndex == 0) { // reference
                        positionIndexes[p].push_back(0);
                    } else {
                        // operator[] default-inserts 0 when this old allele has
                        // no entry at position p.
                        positionIndexes[p].push_back(unpackedAlleleIndexes[oldIndex][p]);
                    }
                }
            }
            for (map<long unsigned int, Variant>::iterator v = variants.begin(); v != variants.end(); ++v) {
                Variant& variant = v->second;
                vector<int>& gtints = positionIndexes[v->first];
                vector<string> gtstrs;
                for (vector<int>::iterator i = gtints.begin(); i != gtints.end(); ++i) {
                    if (*i != ALLELE_NULL) {
                        gtstrs.push_back(convert(*i));
                    } else {
                        gtstrs.push_back(".");
                    }
                }
                string genotype = join(gtstrs, "|");
                // if we are keeping the geno info, pull it over here
                if (keepGeno) {
                    variant.format = var.format;
                    variant.samples[sampleName] = var.samples[sampleName];
                }
                // note that this will replace the old geno, but otherwise it is the same
                variant.samples[sampleName]["GT"].clear();
                variant.samples[sampleName]["GT"].push_back(genotype);
            }
        }

        //for (vector<Variant>::iterator v = variants.begin(); v != variants.end(); ++v) {
        for (map<long unsigned int, Variant>::iterator v = variants.begin(); v != variants.end(); ++v) {
            cout << v->second << endl;
        }
    }

    return 0;
}
// Entry point of a VCF-to-tab-separated converter.
//
// Options (parsed with getopt_long):
//   -n / --null-value  text handed to loadInfoSS as `nullval` (default ".")
//   -g / --genotypes   also emit a SAMPLE column and one column per FORMAT field
//   -h / --help        calls printSummary(argv)
//
// Writes a header row made of the seven fixed VCF columns, every INFO key known
// to the file (variantFile.infoTypes) and, with -g, "SAMPLE" plus every FORMAT
// key; each record is then rendered by loadInfoSS() and written to stdout.
//
// NOTE(review): the expressions that open the input appear to have been lost
// from this copy of the file; they are left as found and flagged inline.
int main(int argc, char** argv) {

    string nullval = ".";
    bool genotypes = false;

    int c;
    while (true) {
        static struct option long_options[] =
        {
            /* These options set a flag. */
            //{"verbose", no_argument, &verbose_flag, 1},
            {"help", no_argument, 0, 'h'},
            {"null-value", required_argument, 0, 'n'},
            {"genotypes", no_argument, 0, 'g'},
            {0, 0, 0, 0}
        };
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long (argc, argv, "hn:g",
                         long_options, &option_index);

        // -1 signals that every option has been consumed.
        if (c == -1)
            break;

        switch (c) {

            case 'n':
                nullval = optarg;
                break;

            case 'g':
                genotypes = true;
                break;

            case 'h':
                printSummary(argv);
                break;

            case '?':
                printSummary(argv);
                exit(1);
                break;

            default:
                abort ();
        }
    }

    VariantCallFile variantFile;
    bool usingstdin = false;
    string inputFilename;
    // A single trailing positional argument is taken as the input file name.
    if (optind == argc - 1) {
        // NOTE(review): the trailing empty statement suggests the call opening
        // inputFilename is missing.
        inputFilename = argv[optind];;
    } else {
        // NOTE(review): the condition is truncated (presumably "could not open
        // stdin", judging by the message below) -- confirm upstream.
        if (! {
            if (argc == 1) {
                printSummary(argv);
            } else {
                cerr << "could not open stdin for reading as VCF" << endl;
                exit(1);
            }
        }
        usingstdin = true;
    }

    if (!variantFile.is_open()) {
        return 1;
    }

    // obtain all possible field names
    // true means it a bool field flag
    std::map<std::string, bool> keepFields;
    for (map<string, VariantFieldType>::iterator i = variantFile.infoTypes.begin();
         i != variantFile.infoTypes.end(); ++i) {
        if (i->second == FIELD_BOOL) {
            keepFields[i->first] = true;
        } else {
            keepFields[i->first] = false;
        }
    }

    // FORMAT keys are only collected when per-sample output was requested.
    vector<string> formatfields;
    if (genotypes) {
        for (map<string, VariantFieldType>::iterator f = variantFile.formatTypes.begin();
             f != variantFile.formatTypes.end(); ++f) {
            formatfields.push_back(f->first);
        }
    }

    // write header

    // defaults
    std::cout << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER";

    // INFO columns, in the key order of keepFields (a std::map).
    for (std::map<std::string, bool>::iterator i = keepFields.begin(); i != keepFields.end(); ++i) {
        cout << "\t" << i->first;
    }

    if (genotypes) {
        cout << "\t" << "SAMPLE";
        for (vector<string>::iterator f = formatfields.begin(); f != formatfields.end(); ++f) {
            cout << "\t" << *f;
        }
    }
    std::cout << std::endl;

    Variant var(variantFile);
    while (variantFile.getNextVariant(var)) {
        // loadInfoSS fills the buffer; it is written without an added newline.
        stringstream outputRecord;
        loadInfoSS(outputRecord, keepFields, var, variantFile, nullval, formatfields, genotypes);
        std::cout << outputRecord.str() ;
    }

    return 0;
}
// gl-XPEHH entry point (the tool name comes from the help text below).
//
// Options (getopt_long): --file/-f, --region/-r, --target/-t, --background/-b,
// --mutation/-m, --phased/-p, --deltaaf/-d (accepted but not handled in the
// switch), --help/-h, --version/-v.
//
// Flow: parse options, restrict the VCF to the region, map sample indices to
// haplotype slots, then for each record with a single ALT load target,
// background and combined populations, filter on allele frequency, and phase
// (localPhase) or load (loadPhased) the queued sites in batches of 15 before
// calling calc().
//
// NOTE(review): several expressions appear to have been lost from this copy of
// the file (the open call, and the operands of the allele-frequency filters).
// They are left exactly as found and flagged inline.
int main(int argc, char** argv) {

    // set the random seed for MCMC
    srand((unsigned)time(NULL));

    // the filename
    string filename = "NA";

    // set region to scaffold
    string region = "NA";

    // using vcflib; thanks to Erik Garrison
    VariantCallFile variantFile;

    // zero based index for the target and background indivudals
    map<int, int> it, ib;

    // deltaaf is the difference of allele frequency we bother to look at

    // ancestral state is set to zero by default
    string mut = "1";

    int counts = 0;

    // phased
    int phased = 0;

    const struct option longopts[] =
    {
        {"version" , 0, 0, 'v'},
        {"help" , 0, 0, 'h'},
        {"file" , 1, 0, 'f'},
        {"target" , 1, 0, 't'},
        {"background", 1, 0, 'b'},
        {"deltaaf" , 1, 0, 'd'},
        {"region" , 1, 0, 'r'},
        {"mutation" , 1, 0, 'm'},
        {"phased" , 1, 0, 'p'},
        {0,0,0,0}
    };

    int findex;
    int iarg=0;

    while(iarg != -1)
    {
        iarg = getopt_long(argc, argv, "p:m:r:d:t:b:f:hv", longopts, &findex);

        switch (iarg)
        {
            case 'h':
                cerr << endl << endl;
                cerr << "INFO: help" << endl;
                cerr << "INFO: description:" << endl;
                cerr << " gl-XPEHH estimates haplotype decay between the target and background populations. SNVs are integrated " << endl;
                cerr << " until EHH in the target and background is less than 0.05. The score is the itegrated EHH (target) / integrated EHH (background). " << endl;
                cerr << " gl-XPEHH does NOT integrate over genetic distance, as genetic maps are not availible for most non-model organisms. " << endl;
                cerr << " gl-XPEHH phases genotypes, imuputes missing genotypes, and changes poor quality genotypes. Phasing is done in a sliding window " << endl;
                cerr << " with a stochastic search, therefore, every time gl-XPEHH is run it will generate slightly different results. " << endl;
                cerr << "Output : 4 columns : " << endl;
                cerr << " 1. seqid " << endl;
                cerr << " 2. position " << endl;
                cerr << " 3. xp-ehh " << endl;
                cerr << " 4. iHS " << endl << endl;
                cerr << "INFO: gl-XPEHH --target 0,1,2,3,4,5,6,7 --background 11,12,13,16,17,19,22 --file my.vcf --deltaaf 0.1 --ancestral 0 " << endl;
                cerr << endl;
                cerr << "INFO: required: r,region -- a genomice range to calculate gl-XPEHH on in the format : \"seqid:start-end]\" or \"seqid\" " << endl;
                cerr << "INFO: required: t,target -- a zero base comma seperated list of target individuals corrisponding to VCF columns " << endl;
                cerr << "INFO: required: b,background -- a zero base comma seperated list of background individuals corrisponding to VCF columns " << endl;
                cerr << "INFO: required: f,file a -- proper formatted VCF. the FORMAT field MUST contain \"PL\" if option phased == 0 " << endl;
                cerr << "INFO: optional: m,mutation -- which state is derived in vcf [0,1] default is 1 " << endl;
                cerr << "INFO: optional: p,phased -- phasing flag [0,1] 0 = phase vcf, 1 = vcf is already phased " << endl;
                cerr << endl;
                cerr << "INFO: version 1.0.1 ; date: April 2014 ; author: Zev Kronenberg; email : [email protected] " << endl;
                cerr << endl << endl;
                return 0;
            case 'v':
                cerr << endl << endl;
                cerr << "INFO: version 1.0.1 ; date: April 2014 ; author: Zev Kronenberg; email : [email protected] " << endl;
                return 0;
            case 'p':
                phased = atoi(optarg);
                cerr << "INFO: setting phase to: " << phased << endl;
                break;
            case 'm':
                mut = optarg;
                cerr << "INFO: derived state set to " << mut << endl;
                break;
            case 't':
                loadIndices(it, optarg);
                cerr << "INFO: there are " << it.size() << " individuals in the target" << endl;
                cerr << "INFO: target ids: " << optarg << endl;
                break;
            case 'b':
                loadIndices(ib, optarg);
                cerr << "INFO: there are " << ib.size() << " individuals in the background" << endl;
                cerr << "INFO: background ids: " << optarg << endl;
                break;
            case 'f':
                cerr << "INFO: file: " << optarg << endl;
                filename = optarg;
                break;
            case 'r':
                cerr << "INFO: set seqid region to : " << optarg << endl;
                region = optarg;
                break;
            default:
                // -1 (end of options) and unhandled letters such as 'd' land here.
                break;
        }
    }

    if(filename == "NA"){
        cerr << "FATAL: did not specify a file" << endl;
        cerr << "INFO: please use gl-XPEHH --help" << endl;
        return(1);
    }

    // NOTE(review): only an empty statement remains here; the call that opens
    // variantFile with `filename` seems to be missing from this copy.
    ;

    // NOTE(review): a missing region is reported as FATAL but execution
    // continues (there is no return in this branch).
    if(region == "NA"){
        cerr << "FATAL: did not specify a region" << endl;
        cerr << "INFO: please use gl-XPEHH --help" << endl;
    }

    if(region != "NA"){
        variantFile.setRegion(region);
    }

    if (!variantFile.is_open()) {
        return 1;
    }

    Variant var(variantFile);

    vector<string> samples = variantFile.sampleNames;
    vector<int> target_h, background_h;

    // NOTE(review): only indexi is initialised by this declaration; `index` is
    // read in the loop below without ever being assigned a starting value.
    int index, indexi = 0;

    cerr << "INFO: there are " << samples.size() << " individuals in the VCF" << endl;

    if(samples.size() == 0){
        cerr << "FATAL: too few samples or no VCF header" << endl;
        cerr << "INFO: please use gl-XPEHH --help" << endl;
        return(1);
    }

    // Give every selected sample the next haplotype slot (indexi), recording
    // the slot under target_h and/or background_h.
    for(vector<string>::iterator samp = samples.begin(); samp != samples.end(); samp++){

        if(it.find(index) != it.end() ){
            target_h.push_back(indexi);
            indexi++;
        }
        if(ib.find(index) != ib.end()){
            background_h.push_back(indexi);
            indexi++;
        }
        index++;
    }

    list< pop > tdat, bdat, zdat;

    vector<long int> positions;

    // NOTE(review): the first array bound is a runtime value, which relies on
    // a compiler extension rather than standard C++.
    string haplotypes [it.size() + ib.size()][2];

    string seqid;

    while (variantFile.getNextVariant(var)) {
        map<string, map<string, vector<string> > >::iterator s = var.samples.begin();
        map<string, map<string, vector<string> > >::iterator sEnd = var.samples.end();

        // biallelic sites naturally

        if(var.alt.size() > 1){
            continue;
        }

        vector < map< string, vector<string> > > target, background, total;

        // NOTE(review): sindex counts entries in map iteration order (sorted by
        // sample name), while the index loop above walks variantFile.sampleNames;
        // confirm the two orders agree.
        int sindex = 0;

        for (; s != sEnd; s++) {

            map<string, vector<string> >& sample = s->second;

            if(it.find(sindex) != it.end() ){
                target.push_back(sample);
                total.push_back(sample);
            }
            if(ib.find(sindex) != ib.end()){
                background.push_back(sample);
                total.push_back(sample);
            }

            sindex += 1;
        }

        seqid = var.sequenceName;

        pop popt, popb, popz;

        initPop(popt);
        initPop(popb);
        initPop(popz);

        loadPop(target, popt, var.sequenceName, var.position, phased );
        loadPop(background, popb, var.sequenceName, var.position, phased );
        loadPop(total, popz, var.sequenceName, var.position, phased );

        // NOTE(review): the left-hand operands of the four filters below are
        // missing from this copy (presumably allele-frequency members of
        // popt / popb / popz -- confirm upstream). Each filter skips the site.
        if( == -1 || == -1){
            continue;
        }
        if( > 0.95 || < 0.05){
            continue;
        }
        if( == 0 && == 1){
            continue;
        }
        if( == 1 && == 0){
            continue;
        }

        tdat.push_back(popt);
        bdat.push_back(popb);
        zdat.push_back(popz);

        positions.push_back(var.position);

        // Progress message every 1000 retained sites.
        counts += 1;
        if(counts >= 1000){
            cerr << "INFO: processed " << haplotypes[0][0].size() << " SNPs; current location : " << var.position << endl;
            counts = 0;
        }

        // Once 15 sites are queued, phase/load them and empty the queue.
        while(zdat.size() >= 15 && !zdat.empty()){
            if(phased == 0){
                localPhase(haplotypes, zdat, (it.size() + ib.size()));
            }
            else{
                loadPhased(haplotypes, zdat, (it.size() + ib.size()));
            }
            while(!zdat.empty()){
                zdat.pop_front();
            }
        }
    }

    // Handle whatever is still queued after the last record.
    if(phased == 0){
        localPhase(haplotypes, zdat, (it.size() + ib.size()));
    }
    else{
        loadPhased(haplotypes, zdat, (it.size() + ib.size()));
    }
    while(!zdat.empty()){
        zdat.pop_front();
    }

    cerr << "INFO: phasing done" << endl;

    calc(haplotypes, (it.size() + ib.size()), positions, target_h, background_h, mut, seqid);

    cerr << "INFO: gl-XPEHH finished" << endl;

    return 0;
}
// Entry point of a sequence-entropy annotation tool.
//
// Options (parsed with getopt_long):
//   -f / --fasta-reference  FASTA reference (required)
//   -w / --window-size      window length in bp (required, must be non-zero)
//   -h / --help             calls printSummary(argv) and exits with status 0
//
// For each record the tool computes shannon_H() over the window to the left of
// the variant, the window to its right, a window starting windowSize/2 before
// it, the REF allele and every ALT allele, stores the results under the
// EntropyLeft / EntropyRight / EntropyCenter / EntropyRef / EntropyAlt keys, and
// writes the record to stdout.
//
// NOTE(review): several expressions appear to have been lost from this copy of
// the file (open calls, the receiver of every `["Entropy..."]` subscript) and
// one token is mis-encoded; they are left exactly as found and flagged inline.
int main(int argc, char** argv) {

    int c;
    string fastaRef;
    int windowSize = 0;

    if (argc == 1)
        printSummary(argv);

    while (true) {
        static struct option long_options[] =
        {
            /* These options set a flag. */
            //{"verbose", no_argument, &verbose_flag, 1},
            {"help", no_argument, 0, 'h'},
            {"fasta-reference", required_argument, 0, 'f'},
            {"window-size", required_argument, 0, 'w'},
            //{"length", no_argument, &printLength, true},
            {0, 0, 0, 0}
        };
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long (argc, argv, "hf:w:",
                         long_options, &option_index);

        /* Detect the end of the options. */
        if (c == -1)
            break;

        switch (c)
        {
            case 0:
                /* If this option set a flag, do nothing else now. */
                if (long_options[option_index].flag != 0)
                    break;
                printf ("option %s", long_options[option_index].name);
                if (optarg)
                    printf (" with arg %s", optarg);
                printf ("\n");
                break;

            case 'f':
                fastaRef = optarg;
                break;

            case 'w':
                windowSize = atoi(optarg);
                break;

            case 'h':
                printSummary(argv);
                exit(0);
                break;

            case '?':
                /* getopt_long already printed an error message. */
                printSummary(argv);
                exit(1);
                break;

            default:
                abort ();
        }
    }

    if (windowSize == 0) {
        cerr << "a window size must be specified" << endl;
        exit(1);
    }
    if (fastaRef.empty()) {
        cerr << "a FASTA reference sequence must be specified" << endl;
        exit(1);
    }

    // NOTE(review): the doubled ';' suggests the call that opens `ref` with
    // fastaRef is missing from this copy.
    FastaReference ref;;

    VariantCallFile variantFile;
    string inputFilename;
    // A single trailing positional argument is taken as the input file name.
    // NOTE(review): both branches end in an empty statement; the call that
    // opens variantFile seems to be missing.
    if (optind == argc - 1) {
        inputFilename = argv[optind];;
    } else {;
    }

    if (!variantFile.is_open()) {
        return 1;
    }

    // Declare the INFO fields this tool adds before echoing the header.
    variantFile.addHeaderLine("##INFO=<ID=EntropyLeft,Number=1,Type=Float,Description=\"Entropy of left-flanking sequence of "+ convert(windowSize) +"bp\">");
    variantFile.addHeaderLine("##INFO=<ID=EntropyCenter,Number=1,Type=Float,Description=\"Entropy of centered sequence of "+ convert(windowSize) +"bp\">");
    variantFile.addHeaderLine("##INFO=<ID=EntropyRight,Number=1,Type=Float,Description=\"Entropy of right-flanking sequence of "+ convert(windowSize) +"bp\">");
    variantFile.addHeaderLine("##INFO=<ID=EntropyRef,Number=1,Type=Float,Description=\"Entropy of REF allele\">");
    variantFile.addHeaderLine("##INFO=<ID=EntropyAlt,Number=A,Type=Float,Description=\"Entropy of each ALT allele\">");

    cout << variantFile.header << endl;

    Variant var(variantFile);
    while (variantFile.getNextVariant(var)) {

        // get the ref start and end positions
        int refstart = var.position - 1; // convert to 0-based
        int refend = var.position + var.ref.size() - 1;
        // NOTE(review): the start offsets refstart - windowSize and
        // refstart - windowSize/2 are negative for variants close to the start
        // of a sequence; how getSubSequence treats that is not visible here.
        string leftseq = ref.getSubSequence(var.sequenceName, refstart - windowSize, windowSize);
        string rightseq = ref.getSubSequence(var.sequenceName, refend, windowSize);
        string centerseq = ref.getSubSequence(var.sequenceName, refstart - windowSize/2, windowSize);

        // NOTE(review): shannon_H is always told to read windowSize characters;
        // this assumes each returned string is at least that long -- confirm.
        double entropyLeft = shannon_H((char*) &leftseq[0], windowSize);
        double entropyRight = shannon_H((char*) &rightseq[0], windowSize);
        // NOTE(review): `¢erseq` looks like a mis-encoded `&centerseq`.
        double entropyCenter = shannon_H((char*) ¢erseq[0], windowSize);
        double entropyRef = shannon_H((char*) var.ref.c_str(), var.ref.size());

        // Clear any previous values, then store the new ones.
        // NOTE(review): the object being subscripted is missing on every line
        // below (presumably the record's INFO map -- confirm upstream).
        ["EntropyLeft"].clear();
        ["EntropyRight"].clear();
        ["EntropyCenter"].clear();
        ["EntropyRef"].clear();
        ["EntropyAlt"].clear();

        ["EntropyLeft"].push_back(convert(entropyLeft));
        ["EntropyRight"].push_back(convert(entropyRight));
        ["EntropyCenter"].push_back(convert(entropyCenter));
        ["EntropyRef"].push_back(convert(entropyRef));

        // One EntropyAlt value per ALT allele, in ALT order.
        for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
            double entropyAlt = shannon_H((char*) a->c_str(), a->size());
            ["EntropyAlt"].push_back(convert(entropyAlt));
        }

        cout << var << endl;
    }

    return 0;
}
int main(int argc, char** argv) { if (argc != 2) { cerr << "usage: " << argv[0] << " <vcf file>" << endl << "unphases and sorts the genotypes in the file" << endl; return 1; } string filename = argv[1]; VariantCallFile variantFile; if (filename == "-") {; } else {; } if (!variantFile.is_open()) { return 1; } cout << variantFile.header << endl; Variant var(variantFile); while (variantFile.getNextVariant(var)) { map<string, map<string, vector<string> > >::iterator s = var.samples.begin(); map<string, map<string, vector<string> > >::iterator sEnd = var.samples.end(); for (; s != sEnd; ++s) { map<string, vector<string> >& sample = s->second; string& genotype = sample["GT"].front(); vector<string> gt = split(genotype, "|/"); // now let's sort the genotype vector<int> gti; for (vector<string>::iterator g = gt.begin(); g != gt.end(); ++g) { if (*g == ".") { gti.push_back(-1); } else { gti.push_back(atoi(g->c_str())); } } std::sort(gti.begin(), gti.end()); stringstream gts; for (vector<int>::iterator g = gti.begin(); g != gti.end(); ++g) { if (g != gti.begin()) { gts << "/"; } if (*g == -1) { gts << "."; } else { gts << *g; } } genotype = gts.str(); } cout << var << endl; } return 0; }
int main(int argc, char** argv) { string bedFileName; string vcfFileName; string fastaFileName; bool intersecting = false; bool unioning = false; bool invert = false; bool contained = true; bool overlapping = false; int windowsize = 30; if (argc == 1) printSummary(argv); int c; while (true) { static struct option long_options[] = { /* These options set a flag. */ //{"verbose", no_argument, &verbose_flag, 1}, {"help", no_argument, 0, 'h'}, {"bed", required_argument, 0, 'b'}, {"invert", no_argument, 0, 'v'}, {"intersect-vcf", required_argument, 0, 'i'}, {"union-vcf", required_argument, 0, 'u'}, {"contained", no_argument, 0, 'c'}, {"overlapping", no_argument, 0, 'o'}, {"window-size", required_argument, 0, 'w'}, {"reference", required_argument, 0, 'r'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; c = getopt_long (argc, argv, "hvcob:i:u:w:r:", long_options, &option_index); if (c == -1) break; switch (c) { case 'w': windowsize = atoi(optarg); break; case 'b': bedFileName = string(optarg); break; case 'i': intersecting = true; vcfFileName = string(optarg); break; case 'u': unioning = true; vcfFileName = string(optarg); break; case 'r': fastaFileName = string(optarg); break; case 'v': invert = true; break; case 'c': contained = true; break; case 'o': overlapping = true; break; case 'h': printSummary(argv); break; case '?': printSummary(argv); exit(1); break; default: abort (); } } bool usingBED = false; if (!bedFileName.empty()) { usingBED = true; } BedReader bed; if (usingBED) {; } VariantCallFile variantFile; string inputFilename; if (optind == argc - 1) { inputFilename = argv[optind];; } else {; } if (!variantFile.is_open()) { cerr << "could not open VCF file" << endl; exit(1); } if (usingBED) { variantFile.parseSamples = false; } VariantCallFile otherVariantFile; if (!vcfFileName.empty()) {; if (!otherVariantFile.is_open()) { cerr << "could not open VCF file " << vcfFileName << endl; exit(1); } } FastaReference reference; if 
(unioning || intersecting) { if (fastaFileName.empty()) { cerr << "a reference is required for haplotype-based intersection and unioniong" << endl; exit(1); }; } if (!unioning && !intersecting) { variantFile.parseSamples = false; // faster, as when we are // only bed-intersecting we // can do position-only // output and don't have to // manipulate specific // alleles } // read the VCF file for union or intersection into an interval tree // indexed using some proximity window map<string, IntervalTree<Variant*> > variantIntervals; map<string, list<Variant> > otherVariants; map<string, vector<Interval<Variant*> > > otherVariantIntervals; if (unioning || intersecting) { Variant ovar(otherVariantFile); while (otherVariantFile.getNextVariant(ovar)) { long int left = ovar.position; long int right = left + ovar.ref.size(); // this should be 1-past the end otherVariants[ovar.sequenceName].push_back(ovar); Variant* v = &otherVariants[ovar.sequenceName].back(); otherVariantIntervals[ovar.sequenceName].push_back(Interval<Variant*>(left, right, v)); } for (map<string, vector<Interval<Variant*> > >::iterator j = otherVariantIntervals.begin(); j != otherVariantIntervals.end(); ++j) { variantIntervals[j->first] = IntervalTree<Variant*>(j->second); } } set<Variant*> outputVariants; long unsigned int lastOutputPosition = 0; string lastSequenceName; cout << variantFile.header; Variant var(variantFile); while (variantFile.getNextVariant(var)) { if (lastSequenceName.empty()) { lastSequenceName = var.sequenceName; } else if (lastSequenceName != var.sequenceName) { if (unioning) { vector<Interval<Variant*> > previousRecords; long int lastSeqLength = reference.sequenceLength(lastSequenceName); variantIntervals[lastSequenceName].findContained(lastOutputPosition, lastSeqLength, previousRecords); for (vector<Interval<Variant*> >::iterator r = previousRecords.begin(); r != previousRecords.end(); ++r) { Variant* v = r->value; if (outputVariants.find(v) == outputVariants.end()) { 
outputVariants.insert(v); cout << *v << endl; // does this output everything in correct order? } } lastSequenceName = var.sequenceName; lastOutputPosition = 0; } } if (usingBED) { BedTarget record(var.sequenceName, var.position, var.position + var.ref.size(), ""); vector<BedTarget*> overlaps = bed.targetsOverlapping(record); if (!invert && !overlaps.empty()) { cout << variantFile.line << endl; } else if (invert && overlaps.empty()) { cout << variantFile.line << endl; } } else if (unioning || intersecting) { // TODO check overlaps with union/intersection // hmm... for unioning, you might need to step through the original VCF records // but the idea is to exclude the haplotype-based duplicates vector<Interval<Variant*> > results; variantIntervals[var.sequenceName].findContained(var.position - windowsize, var.position + var.ref.size() + windowsize, results); vector<Variant*> overlapping; for (vector<Interval<Variant*> >::iterator r = results.begin(); r != results.end(); ++r) { overlapping.push_back(r->value); } if (unioning) { // unioning strategy // write out all the records from the last file // between the last one printed out and the first // one we're about to print out vector<Interval<Variant*> > previousRecords; variantIntervals[var.sequenceName].findOverlapping(lastOutputPosition, var.position - windowsize, previousRecords); map<long int, vector<Variant*> > variants; for (vector<Interval<Variant*> >::iterator r = previousRecords.begin(); r != previousRecords.end(); ++r) { Variant* v = r->value; if (outputVariants.find(v) == outputVariants.end()) { outputVariants.insert(v); variants[v->position].push_back(v); } } for (map<long int, vector<Variant*> >::iterator v = variants.begin(); v != variants.end(); ++v) { for (vector<Variant*>::iterator o = v->second.begin(); o != v->second.end(); ++o) { cout << **o << endl; lastOutputPosition = max(lastOutputPosition, (*o)->position); } } // TODO find the duplicates for the other file } if (overlapping.empty()) { if 
(unioning || (intersecting && invert)) { cout << var << endl; lastOutputPosition = max(lastOutputPosition, var.position); } } else { // get the min and max of the overlaps int haplotypeStart = var.position; int haplotypeEnd = var.position + var.ref.size(); for (vector<Variant*>::iterator v = overlapping.begin(); v != overlapping.end(); ++v) { haplotypeStart = min((*v)->position, (long unsigned int) haplotypeStart); haplotypeEnd = max((*v)->position + (*v)->ref.size(), (long unsigned int) haplotypeEnd); } // for everything overlapping and the current variant, construct the local haplotype within the bounds // if there is an exact match, the alllele in the current VCF does intersect string referenceHaplotype = reference.getSubSequence(var.sequenceName, haplotypeStart - 1, haplotypeEnd - haplotypeStart); map<string, vector<Variant*> > haplotypes; for (vector<Variant*>::iterator v = overlapping.begin(); v != overlapping.end(); ++v) { Variant& variant = **v; for (vector<string>::iterator a = variant.alt.begin(); a != variant.alt.end(); ++a) { string haplotype = referenceHaplotype; // get the relative start and end coordinates for the variant alternate allele int relativeStart = variant.position - haplotypeStart; haplotype.replace(relativeStart, variant.ref.size(), *a); haplotypes[haplotype].push_back(*v); } } // determine the non-intersecting alts vector<string> altsToRemove; for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { string haplotype = referenceHaplotype; int relativeStart = var.position - haplotypeStart; haplotype.replace(relativeStart, var.ref.size(), *a); map<string, vector<Variant*> >::iterator h = haplotypes.find(haplotype); if ((intersecting && !invert && h == haplotypes.end()) || (intersecting && invert && h != haplotypes.end()) || (unioning && h != haplotypes.end())) { altsToRemove.push_back(*a); } } // remove the non-overlapping (intersecting) or overlapping (unioning) alts for (vector<string>::iterator a = 
altsToRemove.begin(); a != altsToRemove.end(); ++a) { var.removeAlt(*a); } if (unioning) { // somehow sort the records and combine them? map<long int, vector<Variant*> > variants; for (vector<Variant*>::iterator o = overlapping.begin(); o != overlapping.end(); ++o) { if ((*o)->position <= var.position && // check ensures proper ordering of variants on output outputVariants.find(*o) == outputVariants.end()) { outputVariants.insert(*o); variants[(*o)->position].push_back(*o); } } // add in the current variant, if it has alts left if (!var.alt.empty()) { variants[var.position].push_back(&var); } for (map<long int, vector<Variant*> >::iterator v = variants.begin(); v != variants.end(); ++v) { for (vector<Variant*>::iterator o = v->second.begin(); o != v->second.end(); ++o) { cout << **o << endl; lastOutputPosition = max(lastOutputPosition, (*o)->position); } } } else { // if any alts remain, output the variant record if (!var.alt.empty()) { cout << var << endl; lastOutputPosition = max(lastOutputPosition, var.position); } } } } } // if unioning, and any variants remain, output them if (unioning) { for (map<string, list<Variant> >::iterator chrom = otherVariants.find(lastSequenceName); chrom != otherVariants.end(); ++chrom) { for (list<Variant>::iterator v = chrom->second.begin(); v != chrom->second.end(); ++v) { Variant* variant = &*v; if (outputVariants.find(variant) == outputVariants.end()) { outputVariants.insert(variant); cout << *variant << endl; // TODO guarantee sorting } } } } exit(0); // why? return 0; }
int main(int argc, char** argv) { if (argc != 3) { cerr << "usage: " << argv[0] << " <other-genotype-tag> <vcf file>" << endl << "adds statistics to the INFO field of the vcf file describing the" << endl << "amount of discrepancy between the genotypes (GT) in the vcf file and the" << endl << "genotypes reported in the <other-genotype-tag>. use this after" << endl << "vcfannotategenotypes to get correspondence statistics for two vcfs." << endl; return 1; } string otherGenoTag = argv[1]; string filename = argv[2]; VariantCallFile variantFile; if (filename == "-") {; } else {; } if (!variantFile.is_open()) { return 1; } vector<string> specs; specs.push_back("AA_AA"); specs.push_back("AA_AR"); specs.push_back("AA_RR"); specs.push_back("AA_NN"); specs.push_back("AR_AA"); specs.push_back("AR_AR"); specs.push_back("AR_RR"); specs.push_back("AR_NN"); specs.push_back("RR_AA"); specs.push_back("RR_AR"); specs.push_back("RR_RR"); specs.push_back("RR_NN"); specs.push_back("NN_AA"); specs.push_back("NN_AR"); specs.push_back("NN_RR"); specs.push_back("NN_NN"); for (vector<string>::iterator spec = specs.begin(); spec != specs.end(); ++spec) { string line = "##INFO=<ID=" + otherGenoTag + ".genotypes." 
+ *spec + ",Number=1,Type=Integer,Description=\"Number of genotypes with " + *spec + " relationship with " + otherGenoTag + "\">"; variantFile.addHeaderLine(line); } string line; line = "##INFO=<ID=" + otherGenoTag + ".genotypes.count,Number=1,Type=Integer,Description=\"Count of genotypes under comparison.\">"; variantFile.addHeaderLine(line); line = "##INFO=<ID=" + otherGenoTag + ".genotypes.alternate_count,Number=1,Type=Integer,Description=\"Count of alternate genotypes in the first file.\">"; variantFile.addHeaderLine(line); line = "##INFO=<ID=" + otherGenoTag + ".site.alternate_positive_discrepancy,Number=1,Type=Integer,Description=\"Estimated positive discrepancy rate of " + otherGenoTag + " genotypes, where positive discrepancies are all cases where an alternate allele is called GT " + " but none is represented in " + otherGenoTag + " or " + otherGenoTag + " is null/no-call\">"; variantFile.addHeaderLine(line); line = "##INFO=<ID=" + otherGenoTag + ".site.alternate_negative_discrepancy,Number=1,Type=Integer,Description=\"Estimated negative discrepancy rate of " + otherGenoTag + " genotypes, where negative discrepancies are all cases where no alternate allele is called in " + " GT but an alternate is represented in " + otherGenoTag + ", including no-calls or partly null genotypes\">"; variantFile.addHeaderLine(line); line = "##INFO=<ID=" + otherGenoTag + ".site.alternate_null_discrepancy,Number=1,Type=Integer,Description=\"Estimated null discrepancy rate of " + otherGenoTag + " genotypes, where null discrepancies are all cases where GT is specified and contains an alternate but " + otherGenoTag + " is null. 
Cases where GT is null or partly null are excluded.\">"; variantFile.addHeaderLine(line); line = "##INFO=<ID=" + otherGenoTag + ".site.call_discrepancy,Number=1,Type=Integer,Description=\"Estimated call discrepancy rate of " + otherGenoTag + " genotypes (het->hom, hom->het) between " + otherGenoTag + " and GT\">"; variantFile.addHeaderLine(line); line = "##INFO=<ID=" + otherGenoTag + ".site.call_concordance,Number=1,Type=Integer,Description=\"Estimated call concorndance rate of " + otherGenoTag + " genotypes between " + otherGenoTag + " and GT\">"; variantFile.addHeaderLine(line); line = "##INFO=<ID=" + otherGenoTag + ".site.non_reference_discrepancy,Number=1,Type=Float,Description=\"Estimated non-reference discrepancy relative to " + otherGenoTag + " genotypes,\">"; variantFile.addHeaderLine(line); line = "##INFO=<ID=" + otherGenoTag + ".site.non_reference_discrepancy.count,Number=1,Type=Int,Description=\"non-reference discrepancy normalizer relative to " + otherGenoTag + " genotypes,\">"; variantFile.addHeaderLine(line); line = "##INFO=<ID=" + otherGenoTag + ".site.non_reference_discrepancy.normalizer,Number=1,Type=Int,Description=\"non-reference discrepancy count relative to " + otherGenoTag + " genotypes,\">"; variantFile.addHeaderLine(line); line = "##INFO=<ID=" + otherGenoTag + ".site.non_reference_sensitivity,Number=1,Type=Float,Description=\"Estimated non-reference sensitivity relative to " + otherGenoTag + " genotypes,\">"; variantFile.addHeaderLine(line); line = "##INFO=<ID=" + otherGenoTag + ".site.non_reference_sensitivity.count,Number=1,Type=Int,Description=\"non-reference sensitivity normalizer relative to " + otherGenoTag + " genotypes,\">"; variantFile.addHeaderLine(line); line = "##INFO=<ID=" + otherGenoTag + ".site.non_reference_sensitivity.normalizer,Number=1,Type=Int,Description=\"non-reference sensitivity count relative to " + otherGenoTag + " genotypes,\">"; variantFile.addHeaderLine(line); cout << variantFile.header << endl; Variant 
var(variantFile); while (variantFile.getNextVariant(var)) { //cout << "next: " << var << endl; // for each sample, check GT against <other-genotype-tag> // tally stats, and append to info map<string, map<string, vector<string> > >::iterator s = var.samples.begin(); map<string, map<string, vector<string> > >::iterator sEnd = var.samples.end(); map<string, int> genotypeComparisonCounts; int gtCount = var.samples.size(); int gtAltCount = 0; // number of alternate-containing genotypes in the first file int pdCount = 0; // positive discrepancy count int ndCount = 0; // negative discrepancy count int nnCount = 0; // null discrepancy count int cdCount = 0; // call discrepancy count int ccCount = 0; // call concordance count int nrdCount = 0; // non-reference discrepancy count int nrdNormalizer = 0; // divisor for nrd rate int nrsCount = 0; // non-reference sensitivity count int nrsNormalizer = 0; // divisor for nrs rate for (; s != sEnd; ++s) { map<string, vector<string> >& sample = s->second; const string& name = s->first; // decompose genotypes into counts of strings // to facilitate comparison string gtA; if (sample.find("GT") == sample.end()) { gtA = "./."; } else { gtA = sample["GT"].front(); } string gtB; if (sample.find(otherGenoTag) == sample.end()) { gtB = "./."; } else { gtB = sample[otherGenoTag].front(); } map<int, int> genotypeA = decomposeGenotype(gtA); map<int, int> genotypeB = decomposeGenotype(gtB); string gtspecA = genotypeSpec(genotypeA); string gtspecB = genotypeSpec(genotypeB); //cout << gtA << " " << gtB << endl; //cout << gtspecA << " " << gtspecB << endl; ++genotypeComparisonCounts[gtspecA + "_" + gtspecB]; if (hasNonRef(genotypeA)) { ++gtAltCount; } if (genotypeA != genotypeB) { if (isNull(genotypeA)) { // TODO handle this somehow, maybe via a different flag? 
if (!isNull(genotypeB)) { ++nnCount; // null discrepancy, the second set makes a call, this one does not } } else if (hasNonRef(genotypeA)) { if (!isNull(genotypeB) && hasNonRef(genotypeB)) { // they cannot be the same, but they both represent an alternate ++cdCount; // the calls are discrepant } else { // the other call does not have an alternate ++pdCount; // it is also null if (isNull(genotypeB)) { ++nnCount; } } } else { // the current genotype has no non-ref alternate if (!isNull(genotypeB) && hasNonRef(genotypeB)) { ++ndCount; } if (isNull(genotypeB)) { ++nnCount; } } } else { if (!isNull(genotypeA)) { ++ccCount; } } if (!(isNull(genotypeA) || isNull(genotypeB)) && !(isHomRef(genotypeA) && isHomRef(genotypeB))) { ++nrdNormalizer; if (genotypeA != genotypeB) { ++nrdCount; } } if (!(isNull(genotypeB) || isHomRef(genotypeB))) { ++nrsNormalizer; if (!(isNull(genotypeA) || isHomRef(genotypeA))) { ++nrsCount; } } } for (map<string, int>::iterator g = genotypeComparisonCounts.begin(); g != genotypeComparisonCounts.end(); ++g) { stringstream c; c << g->second; vector<string>& t =[otherGenoTag + ".genotypes." 
+ g->first]; t.clear(); t.push_back(c.str()); } stringstream gtc; gtc << gtCount;[otherGenoTag + ".genotypes.count"].push_back(gtc.str()); stringstream gtac; gtac << gtAltCount;[otherGenoTag + ".genotypes.alternate_count"].push_back(gtac.str()); stringstream pd; pd << pdCount;[otherGenoTag + ".site.alternate_positive_discrepancy"].push_back(pd.str()); stringstream nd; nd << ndCount;[otherGenoTag + ".site.alternate_negative_discrepancy"].push_back(nd.str()); stringstream nn; nn << nnCount;[otherGenoTag + ".site.alternate_null_discrepancy"].push_back(nn.str()); stringstream cd; cd << cdCount;[otherGenoTag + ".site.call_discrepancy"].push_back(cd.str()); stringstream cc; cc << ccCount;[otherGenoTag + ".site.call_concordance"].push_back(cc.str()); stringstream nrdc; nrdc << nrdCount;[otherGenoTag + ".site.non_reference_discrepancy.count"].push_back(nrdc.str()); stringstream nrdn; nrdn << nrdNormalizer;[otherGenoTag + ".site.non_reference_discrepancy.normalizer"].push_back(nrdn.str()); if (nrdNormalizer > 0) { stringstream nrd; nrd << (double) nrdCount / (double) nrdNormalizer;[otherGenoTag + ".site.non_reference_discrepancy"].push_back(nrd.str()); } stringstream nrsc; nrsc << nrsCount;[otherGenoTag + ".site.non_reference_sensitivity.count"].push_back(nrsc.str()); stringstream nrsn; nrsn << nrsNormalizer;[otherGenoTag + ".site.non_reference_sensitivity.normalizer"].push_back(nrsn.str()); if (nrsNormalizer > 0) { stringstream nrs; nrs << (double) nrsCount / (double) nrsNormalizer;[otherGenoTag + ".site.non_reference_sensitivity"].push_back(nrs.str()); } cout << var << endl; } return 0; }
int main(int argc, char** argv) { string nullval; bool genotypes = false; int c; while (true) { static struct option long_options[] = { /* These options set a flag. */ //{"verbose", no_argument, &verbose_flag, 1}, {"help", no_argument, 0, 'h'}, {"null-value", required_argument, 0, 'n'}, {"genotypes", no_argument, 0, 'g'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; c = getopt_long (argc, argv, "hn:g", long_options, &option_index); if (c == -1) break; switch (c) { case 'n': nullval = optarg; break; case 'g': genotypes = true; break; case 'h': printSummary(argv); break; case '?': printSummary(argv); exit(1); break; default: abort (); } } VariantCallFile variantFile; bool usingstdin = false; string inputFilename; if (optind == argc - 1) { inputFilename = argv[optind];; } else { if (! { if (argc == 1) { printSummary(argv); } else { cerr << "could not open stdin for reading as VCF" << endl; exit(1); } } usingstdin = true; } if (!variantFile.is_open()) { return 1; } // obtain all possible field names vector<string> infofields; vector<string> infoflags; for (map<string, VariantFieldType>::iterator i = variantFile.infoTypes.begin(); i != variantFile.infoTypes.end(); ++i) { if (i->second == FIELD_BOOL) { infoflags.push_back(i->first); } else { infofields.push_back(i->first); } } vector<string> formatfields; if (genotypes) { for (map<string, VariantFieldType>::iterator f = variantFile.formatTypes.begin(); f != variantFile.formatTypes.end(); ++f) { formatfields.push_back(f->first); } } // write header // defaults cout << "CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER"; // configurable info field for (vector<string>::iterator i = infofields.begin(); i != infofields.end(); ++i) { cout << "\t" << *i; } for (vector<string>::iterator i = infoflags.begin(); i != infoflags.end(); ++i) { cout << "\t" << *i; } if (genotypes) { cout << "\t" << "SAMPLE"; for (vector<string>::iterator f = formatfields.begin(); f != formatfields.end(); ++f) { cout << 
"\t" << *f; } } cout << endl; Variant var(variantFile); while (variantFile.getNextVariant(var)) { if (!genotypes) { int altindex = 0; for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a, ++altindex) { string& altallele = *a; cout << var.sequenceName << "\t" << var.position << "\t" << << "\t" << var.ref << "\t" << altallele << "\t" << var.quality << "\t" << var.filter; for (vector<string>::iterator i = infofields.begin(); i != infofields.end(); ++i) { vector<string> value; string& name = *i; map<string, vector<string> >::iterator f =; if (f != { value = f->second; if (value.size() == 1) { cout << "\t" << value.front(); } else if (value.size() == var.alt.size()) { cout << "\t" <<; } else { cout << "\t" << nullval; // null } } else { cout << "\t" << nullval; // null } } for (vector<string>::iterator i = infoflags.begin(); i != infoflags.end(); ++i) { string value; string& name = *i; map<string, bool>::iterator f = var.infoFlags.find(name); cout << "\t"; if (f != var.infoFlags.end()) { cout << 1; } else { cout << 0; } } cout << endl; } } else { stringstream o; // per-genotype output o << var.sequenceName << "\t" << var.position << "\t" << << "\t" << var.ref << "\t" << join(var.alt, ",") << "\t" << var.quality << "\t" << var.filter; for (vector<string>::iterator i = infofields.begin(); i != infofields.end(); ++i) { vector<string> value; string& name = *i; map<string, vector<string> >::iterator f =; if (f != { value = f->second; if (value.size() == 1) { o << "\t" << value.front(); } else if (value.size() == var.alt.size()) { o << "\t" << join(value, ","); } else { o << "\t" << nullval; // null } } else { o << "\t" << nullval; // null } } for (vector<string>::iterator i = infoflags.begin(); i != infoflags.end(); ++i) { string value; string& name = *i; map<string, bool>::iterator f = var.infoFlags.find(name); o << "\t"; if (f != var.infoFlags.end()) { o << 1; } else { o << 0; } } string siteinfo = o.str(); for (map<string, map<string, vector<string> 
> >::iterator s = var.samples.begin(); s != var.samples.end(); ++s) { cout << siteinfo; const string& sampleName = s->first; cout << "\t" << sampleName; map<string, vector<string> >& sample = s->second; for (vector<string>::iterator f = formatfields.begin(); f != formatfields.end(); ++f) { if (sample.find(*f) != sample.end()) { cout << "\t" << join(sample[*f], ","); } else { cout << "\t" << nullval; } } cout << endl; } } } return 0; }
int main(int argc, char** argv) { string vcfFileName; string fastaFileName; int windowsize = 30; if (argc == 1) printSummary(argv); int c; while (true) { static struct option long_options[] = { /* These options set a flag. */ //{"verbose", no_argument, &verbose_flag, 1}, {"help", no_argument, 0, 'h'}, {"window-size", required_argument, 0, 'w'}, {"reference", required_argument, 0, 'r'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; c = getopt_long (argc, argv, "hw:r:", long_options, &option_index); if (c == -1) break; switch (c) { case 'w': windowsize = atoi(optarg); break; case 'r': fastaFileName = string(optarg); break; case 'h': printSummary(argv); break; case '?': printSummary(argv); exit(1); break; default: abort (); } } VariantCallFile variantFile; string inputFilename; if (optind == argc - 1) { inputFilename = argv[optind];; } else {; } if (!variantFile.is_open()) { cerr << "could not open VCF file" << endl; exit(1); } FastaReference reference; if (fastaFileName.empty()) { cerr << "a reference is required for haplotype allele generation" << endl; exit(1); }; // pattern // when variants are within windowSize from each other, build up local haplotypes // establish all the haplotypes which exist within the window using genotypes+allele#+position map // generate a haplotype allele string for each unique haplotype // for completeness retain phasing information in the genotypes // write a new VCF record in which there are haplotype alleles and correctly described genotypes for each sample // if the variants are outside of the windowSize, just write out the record Variant var(variantFile); Variant outputVar(variantFile); cout << variantFile.header << endl; // get the first distances vector<Variant> cluster; while (variantFile.getNextVariant(var) || !cluster.empty()) { bool haplotypeCluster = false; if (variantFile.done()) { if (cluster.size() >= 1) { haplotypeCluster = true; } else { cout << cluster.front() << endl; 
cluster.clear(); } } else if (isPhased(var)) { if (cluster.empty() || cluster.back().sequenceName == var.sequenceName && var.position - cluster.back().position + cluster.back().ref.size() - 1 <= windowsize) { cluster.push_back(var); } else { if (cluster.size() == 1) { cout << cluster.front() << endl; cluster.clear(); if (!variantFile.done()) { cluster.push_back(var); } } else { haplotypeCluster = true; } } } else { // not phased if (cluster.empty()) { cout << var << endl; } else if (cluster.size() == 1) { cout << cluster.front() << endl; cout << var << endl; } else { haplotypeCluster = true; } } // we need to deal with the current cluster, as our next var is outside of bounds // process the last cluster if it's more than 1 var if (haplotypeCluster) { /* cerr << "cluster: "; for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) { cerr << " " << v->position; } cerr << endl; */ // generate haplotype alleles and genotypes! // get the reference sequence across the haplotype in question string referenceHaplotype = reference.getSubSequence(cluster.front().sequenceName, cluster.front().position - 1, cluster.back().position + cluster.back().ref.size() - cluster.front().position); // establish what haplotypes there are by parsing the (phased) genotypes across the samples over these records map<string, vector<vector<int> > > sampleHaplotypes; for (vector<string>::iterator s = var.sampleNames.begin(); s != var.sampleNames.end(); ++s) { // build the haplotype using the genotype fields in the variant cluster // only build haplotypes for samples with complete information string& sampleName = *s; vector<vector<int> >& haplotypes = sampleHaplotypes[sampleName]; bool completeCoverage = true; // ensure complete genotype coverage over the haplotype cluster for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) { if (v->samples.find(sampleName) == v->samples.end() || v->samples[sampleName].find("GT") == v->samples[sampleName].end()) { 
completeCoverage = false; break; } } if (!completeCoverage) { continue; // skip samples without complete coverage } // what's the ploidy? { string& gt = cluster.front().samples[sampleName]["GT"].front(); vector<string> gtspec = split(gt, "|"); for (vector<string>::iterator g = gtspec.begin(); g != gtspec.end(); ++g) { vector<int> haplotype; haplotypes.push_back(haplotype); } } for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) { string& gt = v->samples[sampleName]["GT"].front(); vector<string> gtspec = split(gt, "|"); vector<string>::iterator g = gtspec.begin(); for (vector<vector<int> >::iterator h = haplotypes.begin(); h != haplotypes.end(); ++h, ++g) { int j; convert(*g, j); h->push_back(j); } } } set<vector<int> > uniqueHaplotypes; for (map<string, vector<vector<int> > >::iterator hs = sampleHaplotypes.begin(); hs != sampleHaplotypes.end(); ++hs) { vector<vector<int> >& haps = hs->second; for (vector<vector<int> >::iterator h = haps.begin(); h != haps.end(); ++h) { uniqueHaplotypes.insert(*h); } } // write new haplotypes map<vector<int>, string> haplotypeSeqs; map<vector<int>, int> haplotypeIndexes; map<int, string> alleles; int impossibleHaplotypes = 0; // always include the reference haplotype as 0 // when we come to it in the haplotypes, we'll ignore it int alleleIndex = 1; for (set<vector<int> >::iterator u = uniqueHaplotypes.begin(); u != uniqueHaplotypes.end(); ++u) { /* for (vector<int>::const_iterator z = u->begin(); z != u->end(); ++z) { cerr << *z; } cerr << endl; */ string haplotype = referenceHaplotype; bool isreference = true; bool impossibleHaplotype = false; int referenceInsertOffset = 0; int j = 0; // index into variant cluster int lastpos = 0; int lastrefend = 0; for (vector<int>::const_iterator z = u->begin(); z != u->end(); ++z, ++j) { int i = *z; if (i != 0) { isreference = false; Variant& vartoInsert =; string& alternate =; if (vartoInsert.position < lastrefend) { cerr << "impossible haplotype, overlapping alleles 
at " << vartoInsert.sequenceName << ":" << vartoInsert.position << endl; impossibleHaplotype = true; break; } else { //cerr << vartoInsert.position << " " << cluster.front().position + referenceInsertOffset << endl; //cerr << "replacing " << vartoInsert.ref << " at " << vartoInsert.position - cluster.front().position + referenceInsertOffset << " with " << alternate << endl; haplotype.replace(vartoInsert.position - cluster.front().position + referenceInsertOffset, vartoInsert.ref.size(), alternate); if (alternate.size() != vartoInsert.ref.size()) { referenceInsertOffset += alternate.size() - vartoInsert.ref.size(); } lastpos = vartoInsert.position; lastrefend = vartoInsert.position + vartoInsert.ref.size(); } } } if (impossibleHaplotype) { ++impossibleHaplotypes; haplotypeIndexes[*u] = -1; // indicates impossible haplotype impossibleHaplotype = false; } else if (isreference) { alleles[0] = haplotype; haplotypeIndexes[*u] = 0; } else { alleles[alleleIndex] = haplotype; haplotypeIndexes[*u] = alleleIndex; ++alleleIndex; } haplotypeSeqs[*u] = haplotype; // if there's not a reference allele, add it if (alleles.find(0) == alleles.end()) { alleles[0] = referenceHaplotype; // nb, there is no reference haplotype among // the samples, so we don't have to add it to // the haplotypeIndexes } } outputVar.ref = alleles[0]; outputVar.alt.clear(); for (int i = 1; i < alleleIndex; ++i) { outputVar.alt.push_back(alleles[i]); } outputVar.sequenceName = cluster.front().sequenceName; outputVar.position = cluster.front().position; outputVar.filter = "."; = "."; = cluster.front().info; outputVar.samples.clear(); outputVar.format = cluster.front().format; // now the genotypes for (vector<string>::iterator s = var.sampleNames.begin(); s != var.sampleNames.end(); ++s) { string& sampleName = *s; vector<string> gt; vector<vector<int> > & hs = sampleHaplotypes[sampleName]; for (vector<vector<int> >::iterator h = hs.begin(); h != hs.end(); ++h) { int hi = haplotypeIndexes[*h]; if (hi != -1) { 
gt.push_back(convert(hi)); } else { // nonexistent or impossible haplotype gt.push_back("."); } } if (gt.size() != 0) { outputVar.samples[sampleName]["GT"].push_back(join(gt, "|")); } } if (cluster.size() - impossibleHaplotypes < 2) { for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) { cout << *v << endl; } } else { if (!outputVar.alt.empty()) { cout << outputVar << endl; } else { cerr << "no alternate alleles remain at " << outputVar.sequenceName << ":" << outputVar.position << " after haplotype validation" << endl; } } cluster.clear(); if (!variantFile.done()) cluster.push_back(var); } } exit(0); // why? return 0; }
int main(int argc, char** argv) { if (argc != 3) { cerr << "usage: " << argv[0] << " <vcf file> <vcf file>" << endl << "Adds info fields from the second file which are not present in the first vcf file." << endl; return 1; } string filenameA = argv[1]; string filenameB = argv[2]; if (filenameA == filenameB) { cerr << "it won't help to add info data from the same file!" << endl; return 1; } VariantCallFile variantFileA; if (filenameA == "-") {; } else {; } VariantCallFile variantFileB; if (filenameB == "-") {; } else {; } if (!variantFileA.is_open() || !variantFileB.is_open()) { return 1; } Variant varA(variantFileA); Variant varB(variantFileB); // while the first file doesn't match the second positionally, // step forward, annotating each genotype record with an empty genotype // when the two match, iterate through the genotypes from the first file // and get the genotypes reported in the second file variantFileA.getNextVariant(varA); variantFileB.getNextVariant(varB); variantFileA.header = unionInfoHeaderLines(variantFileA.header, variantFileB.header); cout << variantFileA.header << endl; do { while (!variantFileB.done() && (varB.sequenceName < varA.sequenceName || (varB.sequenceName == varA.sequenceName && varB.position < varA.position)) ) { variantFileB.getNextVariant(varB); } while (!variantFileA.done() && (varA.sequenceName < varB.sequenceName || (varA.sequenceName == varB.sequenceName && varA.position < varB.position)) ) { cout << varA << endl; variantFileA.getNextVariant(varA); } while (!variantFileB.done() && (varB.sequenceName < varA.sequenceName || (varB.sequenceName == varA.sequenceName && varB.position < varA.position)) ) { variantFileB.getNextVariant(varB); } while (!variantFileA.done() && varA.sequenceName == varB.sequenceName && varA.position == varB.position) { addInfo(varA, varB); cout << varA << endl; variantFileA.getNextVariant(varA); variantFileB.getNextVariant(varB); } } while (!variantFileA.done() && !variantFileB.done()); if 
(!variantFileA.done()) { cout << varA << endl; while (variantFileA.getNextVariant(varA)) { cout << varA << endl; } } return 0; }
int main(int argc, char** argv) { if (argc < 5) { printSummary(argv); exit(0); } bool strict = false; int c; while (true) { static struct option long_options[] = { /* These options set a flag. */ //{"verbose", no_argument, &verbose_flag, 1}, {"help", no_argument, 0, 'h'}, {"strict", no_argument, 0, 's'}, //{"length", no_argument, &printLength, true}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; c = getopt_long (argc, argv, "hs", long_options, &option_index); /* Detect the end of the options. */ if (c == -1) break; switch (c) { case 0: /* If this option set a flag, do nothing else now. */ if (long_options[option_index].flag != 0) break; printf ("option %s", long_options[option_index].name); if (optarg) printf (" with arg %s", optarg); printf ("\n"); break; case 's': strict = true; break; case 'h': printSummary(argv); exit(0); break; case '?': /* getopt_long already printed an error message. */ printSummary(argv); exit(1); break; default: abort (); } } string tag = argv[optind]; vector<string> samples; for (int i = optind+1; i < argc - 1; ++i) { samples.push_back(argv[i]); } string filename = argv[argc-1]; VariantCallFile variantFile; if (filename == "-") {; } else {; } if (!variantFile.is_open()) { cerr << "could not open " << filename << endl; return 1; } assert(samples.size() == 2); Variant var(variantFile); // TODO check if AC is present // ensure that AC is listed as an info field string line = "##INFO=<ID=" + tag + ",Number=1,Type=String,Description=\"Samples"; for (vector<string>::iterator s = samples.begin(); s != samples.end(); ++s) { line += " " + *s; } line += " have different genotypes\">"; variantFile.addHeaderLine(line); variantFile.addHeaderLine("##INFO=<ID=SSC,Number=1,Type=Float,Description=\"Somatic variant score (phred-scaled probability that the somatic variant call is correct).\">"); // write the new header cout << variantFile.header << endl; // print the records, filtering is done via the setting of 
varA's output sample names while (variantFile.getNextVariant(var)) { if (var.samples.find(samples.front()) != var.samples.end() && var.samples.find(samples.back()) != var.samples.end()) { map<string, vector<string> >& germline = var.samples[samples.front()]; map<string, vector<string> >& somatic = var.samples[samples.back()]; map<int, int> gtGermline = decomposeGenotype(germline["GT"].front()); map<int, int> gtSomatic = decomposeGenotype(somatic["GT"].front()); int germlineAltCount = 0; convert(germline["AO"].front(), germlineAltCount);[tag].clear(); // remove previous if (gtGermline == gtSomatic) {[tag].push_back("germline"); } else { //if (isHet(gtGermline) && isHom(gtSomatic)) { //[tag].push_back("loh"); if (isHet(gtGermline) && isHomNonRef(gtSomatic) || isHomRef(gtGermline) && (isHet(gtSomatic) || isHomNonRef(gtSomatic))) { if (!strict || strict && germlineAltCount == 0) {[tag].push_back("somatic"); } } else if (isHom(gtGermline) && isHet(gtSomatic)) { if (var.alt.size() == 1) {[tag].push_back("reversion"); } else {[tag].push_back("somatic"); } } } if (germline.find("GQ") != germline.end() && somatic.find("GQ") != somatic.end()) { double germlineGQ; convert(germline["GQ"].front(), germlineGQ); double somaticGQ; convert(somatic["GQ"].front(), somaticGQ); double somaticScore = min(var.quality, min(germlineGQ, somaticGQ));["SSC"].clear();["SSC"].push_back(convert(somaticScore)); } } cout << var << endl; } return 0; }
int main(int argc, char** argv) { vector<string> regions; int c; while (true) { static struct option long_options[] = { /* These options set a flag. */ //{"verbose", no_argument, &verbose_flag, 1}, {"help", no_argument, 0, 'h'}, {"region", required_argument, 0, 'r'}, //{"length", no_argument, &printLength, true}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; c = getopt_long (argc, argv, "hr:", long_options, &option_index); /* Detect the end of the options. */ if (c == -1) break; switch (c) { case 0: /* If this option set a flag, do nothing else now. */ if (long_options[option_index].flag != 0) break; printf ("option %s", long_options[option_index].name); if (optarg) printf (" with arg %s", optarg); printf ("\n"); break; case 'h': printSummary(argv); exit(0); break; case 'r': regions.push_back(optarg); break; default: abort (); } } VariantCallFile variantFile; string inputFilename; if (optind == argc - 1) { inputFilename = argv[optind];; } else {; } if (!variantFile.is_open()) { return 1; } Variant var(variantFile); vector<string>::iterator regionItr = regions.begin(); int variantAlleles = 0; int variantSites = 0; int snps = 0; int transitions = 0; int transversions = 0; int totalinsertions = 0; int totaldeletions = 0; int insertedbases = 0; int deletedbases = 0; int totalmnps = 0; int totalcomplex = 0; map<int, int> insertions; map<int, int> deletions; map<int, int> mnps; map<int, int> complexsubs; do { if (!inputFilename.empty() && !regions.empty()) { string regionStr = *regionItr++; variantFile.setRegion(regionStr); } while (variantFile.getNextVariant(var)) { ++variantSites; map<string, vector<VariantAllele> > alternates = var.parsedAlternates(); for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { ++variantAlleles; string& alternate = *a; if (var.ref.size() == alternate.size()) { if (var.ref.size() == 1) { ++snps; if (isTransition(var.ref, alternate)) { ++transitions; } else { ++transversions; } } 
else { ++totalmnps; if (alternates[alternate].size() > 1) { } else { VariantAllele& va = alternates[alternate].front(); ++mnps[va.alt.size()]; // not entirely correct } } } else if (var.ref.size() > alternate.size()) { int diff = var.ref.size() - alternate.size(); deletedbases += diff; if (alternates[alternate].size() > 1) { ++totalcomplex; ++complexsubs[-diff]; } else { ++totaldeletions; ++deletions[diff]; } } else { int diff = alternate.size() - var.ref.size(); insertedbases += diff; if (alternates[alternate].size() > 1) { ++totalcomplex; ++complexsubs[diff]; } else { ++totalinsertions; ++insertions[diff]; } } } } } while (regionItr != regions.end()); // find the maximum indel size int maxindel = 0; for (map<int, int>::iterator i = insertions.begin(); i != insertions.end(); ++i) { if (i->first > maxindel) { maxindel = i->first; } } for (map<int, int>::iterator i = deletions.begin(); i != deletions.end(); ++i) { if (i->first > maxindel) { maxindel = i->first; } } // and maximum mnp int maxmnp = 0; for (map<int, int>::iterator i = mnps.begin(); i != mnps.end(); ++i) { if (i->first > maxmnp) { maxmnp = i->first; } } // now print the results cout << "total variant sites:\t" << variantSites << endl << "total variant alleles:\t" << variantAlleles << endl << endl << "snps:\t" << snps << endl << "indels:\t" << totalinsertions + totaldeletions << endl << "mnps:\t" << totalmnps << endl << "complex:\t" << totalcomplex << endl << endl << "ts/tv ratio:\t" << (double) transitions / (double) transversions << endl << endl << "ins/del length frequency distribution" << endl << "length\tins\tdel\tins/del" << endl; for (int i = 1; i <= maxindel; ++i) { int ins = insertions[i]; int del = deletions[i]; cout << i << "\t" << (ins > 0 ? convert(ins) : "" ) << "\t" << (del > 0 ? convert(del) : "") << "\t" << (ins > 0 && del > 0 ? 
convert((double) ins / (double) del) : "") << endl; } cout << endl << "insertion alleles / deletion alleles:\t" << (double) totalinsertions / (double) totaldeletions << endl << "inserted bases / deleted bases:\t" << (double) insertedbases / (double) deletedbases << endl << endl << "mnp length frequency distribution" << endl << "length\tcount" << endl; for (int i = 2; i <= maxmnp; ++i) { int mnp = mnps[i]; cout << i << "\t" << (mnp > 0 ? convert(mnp) : "") << endl; } cout << endl; cout << "complex event frequency distribution" << endl << "length\tcount" << endl; for (map<int, int>::iterator i = complexsubs.begin(); i != complexsubs.end(); ++i) { cout << i->first << "\t" << i->second << endl; } return 0; }
int main(int argc, char** argv) { int c; bool invert = false; bool logicalOr = false; bool filterSites = false; vector<string> infofilterStrs; vector<VariantFilter> infofilters; vector<string> genofilterStrs; vector<VariantFilter> genofilters; string tag = ""; string filterSpec; string alleleTag; vector<string> regions; if (argc == 1) printSummary(argv); while (true) { static struct option long_options[] = { /* These options set a flag. */ //{"verbose", no_argument, &verbose_flag, 1}, {"help", no_argument, 0, 'h'}, {"filter-sites", no_argument, 0, 's'}, {"info-filter", required_argument, 0, 'f'}, {"genotype-filter", required_argument, 0, 'g'}, {"tag", required_argument, 0, 't'}, {"allele-tag", required_argument, 0, 'a'}, {"invert", no_argument, 0, 'v'}, {"or", no_argument, 0, 'o'}, {"region", required_argument, 0, 'r'}, //{"length", no_argument, &printLength, true}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; c = getopt_long (argc, argv, "hvsof:g:t:r:a:", long_options, &option_index); /* Detect the end of the options. */ if (c == -1) break; switch (c) { case 0: /* If this option set a flag, do nothing else now. */ if (long_options[option_index].flag != 0) break; printf ("option %s", long_options[option_index].name); if (optarg) printf (" with arg %s", optarg); printf ("\n"); break; case 'f': filterSpec += " " + string(optarg); infofilterStrs.push_back(string(optarg)); break; case 's': filterSites = true; break; case 'a': alleleTag = optarg; break; case 'g': filterSpec += " genotypes filtered with: " + string(optarg); genofilterStrs.push_back(string(optarg)); break; case 't': tag = optarg; break; case 'h': printSummary(argv); exit(0); break; case 'v': invert = true; break; case 'o': logicalOr = true; break; case 'r': regions.push_back(optarg); break; case '?': /* getopt_long already printed an error message. 
*/ printSummary(argv); exit(1); break; default: abort (); } } filterSpec = filterSpec.substr(1); // strip leading " " VariantCallFile variantFile; string inputFilename; if (optind == argc - 1) { inputFilename = argv[optind];; } else {; } if (!variantFile.is_open()) { return 1; } for (vector<string>::iterator f = infofilterStrs.begin(); f != infofilterStrs.end(); ++f) { infofilters.push_back(VariantFilter(*f, VariantFilter::RECORD, variantFile.infoTypes)); } for (vector<string>::iterator f = genofilterStrs.begin(); f != genofilterStrs.end(); ++f) { genofilters.push_back(VariantFilter(*f, VariantFilter::SAMPLE, variantFile.formatTypes)); } vector<string> headerlines = split(variantFile.header, "\n"); variantFile.header.clear(); for (vector<string>::iterator l = headerlines.begin(); l != headerlines.end(); ++l) { if (!filterSpec.empty() && (l->find("INFO") != string::npos || l + 1 == headerlines.end())) { variantFile.header += "##filter=\"" + filterSpec + "\"\n"; filterSpec.clear(); } variantFile.header += *l + ((l + 1 == headerlines.end()) ? "" : "\n"); } if (!alleleTag.empty()) { variantFile.addHeaderLine("##INFO=<ID="+ alleleTag +",Number=A,Type=String,Description=\"" + tag + " if this allele passes the filters, '.' 
if not, filters are: " + filterSpec + ".\">"); } cout << variantFile.header << endl; /* if (genofilters.empty() && tag.empty()) { variantFile.parseSamples = false; } */ Variant var(variantFile); vector<string>::iterator regionItr = regions.begin(); do { if (!inputFilename.empty() && !regions.empty()) { string regionStr = *regionItr++; variantFile.setRegion(regionStr); } while (variantFile.getNextVariant(var)) { if (!genofilters.empty()) { for (vector<VariantFilter>::iterator f = genofilters.begin(); f != genofilters.end(); ++f) { f->removeFilteredGenotypes(var); } } if (!infofilters.empty()) { if (filterSites) { bool passes = passesFilters(var, infofilters, logicalOr); if (invert) { passes = !passes; } if (passes) { if (!tag.empty()) { if (alleleTag.empty()) { var.addFilter(tag); } else {[alleleTag].clear(); for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {[alleleTag].push_back(tag); } } cout << var << endl; } else { cout << var << endl; } } else if (!tag.empty()) { cout << var << endl; } } else { // filter out alleles which pass // removes the failing alleles vector<string> failingAlts; vector<string> passingAlts; vector<bool> passes; for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { if (!passesFilters(var, infofilters, logicalOr, *a)) { failingAlts.push_back(*a); passes.push_back(false); } else { passingAlts.push_back(*a); passes.push_back(true); } } if (tag.empty()) { // if there is no specified tag, just remove the failing alts if (failingAlts.size() < var.alt.size()) { for (vector<string>::iterator a = failingAlts.begin(); a != failingAlts.end(); ++a) { var.removeAlt(*a); } cout << var << endl; } } else { // otherwise, apply the tag if (alleleTag.empty()) { if (!passingAlts.empty()) { var.addFilter(tag); } } else {[alleleTag].clear(); for (vector<bool>::iterator p = passes.begin(); p != passes.end(); ++p) { if (*p) {[alleleTag].push_back(tag); } else {[alleleTag].push_back("."); } } } cout << var << endl; 
} } } else { if (genofilters.empty()) { cout << variantFile.line << endl; } else { cout << var << endl; } } } } while (regionItr != regions.end()); return 0; }
int main(int argc, char** argv) { string vcfFileName; string fastaFileName; int windowsize = 100; bool includePreviousBaseForIndels = false; bool useMNPs = true; int altwindowsize = 50; // constants for SmithWaterman algorithm float matchScore = 10.0f; float mismatchScore = -9.0f; float gapOpenPenalty = 15.0f; float gapExtendPenalty = 6.66f; bool useEntropy = false; bool useRepeatGapExtendPenalty = false; float repeatGapExtendPenalty = 1; bool adjustVcf = false; string adjustedTag = "remappedCIGAR"; if (argc == 1) printSummary(argv); int c; while (true) { static struct option long_options[] = { /* These options set a flag. */ //{"verbose", no_argument, &verbose_flag, 1}, {"help", no_argument, 0, 'h'}, {"ref-window-size", required_argument, 0, 'w'}, {"reference", required_argument, 0, 'r'}, {"match-score", required_argument, 0, 'm'}, {"mismatch-score", required_argument, 0, 'x'}, {"gap-open-penalty", required_argument, 0, 'o'}, {"gap-extend-penalty", required_argument, 0, 'e'}, {"alt-window-size", required_argument, 0, 's'}, {"entropy-gap-open", no_argument, 0, 'z'}, {"repeat-gap-extend", no_argument, 0, 'R'}, {"adjust-vcf", required_argument, 0, 'a'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. 
*/ int option_index = 0; c = getopt_long (argc, argv, "hza:w:r:m:x:o:e:s:R:", long_options, &option_index); if (c == -1) break; switch (c) { case 'w': windowsize = atoi(optarg); break; case 'a': adjustVcf = true; adjustedTag = optarg; break; case 'r': fastaFileName = string(optarg); break; case 'h': printSummary(argv); break; case 'm': matchScore = atof(optarg); break; case 'x': mismatchScore = atof(optarg); break; case 'o': gapOpenPenalty = atof(optarg); break; case 'e': gapExtendPenalty = atof(optarg); break; case 's': altwindowsize = atoi(optarg); break; case 'z': useEntropy = true; break; case 'R': useRepeatGapExtendPenalty = true; repeatGapExtendPenalty = atof(optarg); break; case '?': printSummary(argv); exit(1); break; default: abort (); } } VariantCallFile variantFile; string inputFilename; if (optind == argc - 1) { inputFilename = argv[optind];; } else {; } if (!variantFile.is_open()) { cerr << "could not open VCF file" << endl; exit(1); } FastaReference freference; if (fastaFileName.empty()) { cerr << "a reference is required" << endl; exit(1); } else {; } if (adjustVcf) { vector<string> commandline; for (int i = 0; i < argc; ++i) commandline.push_back(argv[i]); variantFile.addHeaderLine("##INFO=<ID=" + adjustedTag + ",Number=A,Type=String,Description=\"CIGAR when remapped using"+ join(commandline, " ") +"\">"); } cout << variantFile.header << endl; Variant var(variantFile); while (variantFile.getNextVariant(var)) { //if (!adjustVcf) { cout << endl; cout << var << endl; //} map<string, vector<VariantAllele> > variantAlleles; vector<vector<pair<int, char> > > cigars; vector<int> positionDiffs; for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { //if (!adjustVcf) cout << endl; cout << endl; // try to remap locally string reference = freference.getSubSequence(var.sequenceName, var.position - 1 - windowsize, windowsize * 2 + var.ref.size()); // passed to sw align unsigned int referencePos; string cigar; string& alternate = *a; 
vector<VariantAllele>& variants = variantAlleles[alternate]; string alternateQuery = reference.substr(windowsize - altwindowsize, altwindowsize) + alternate + reference.substr(reference.size() - windowsize, altwindowsize); //cout << "REF:\t" << reference << endl; //cout << "ALT:\t" << string(windowsize - altwindowsize, ' ') << alternateQuery << endl; CSmithWatermanGotoh sw(matchScore, mismatchScore, gapOpenPenalty, gapExtendPenalty); if (useEntropy) sw.EnableEntropyGapPenalty(1); if (useRepeatGapExtendPenalty) sw.EnableRepeatGapExtensionPenalty(repeatGapExtendPenalty); sw.Align(referencePos, cigar, reference, alternateQuery); int altpos = 0; int refpos = 0; int len; string slen; vector<pair<int, char> > cigarData; string ref = reference.substr(referencePos); positionDiffs.push_back(referencePos); // TODO this... is borked stringstream refss; stringstream altss; if (!adjustVcf) cout << cigar << endl; cout << cigar << endl; for (string::iterator c = cigar.begin(); c != cigar.end(); ++c) { switch (*c) { case 'I': len = atoi(slen.c_str()); slen.clear(); if (altpos < altwindowsize) { cigarData.push_back(make_pair(len, 'M')); } else { cigarData.push_back(make_pair(len, *c)); } altss << alternateQuery.substr(altpos, len); refss << string(len, '-'); altpos += len; break; case 'D': len = atoi(slen.c_str()); slen.clear(); if (altpos < altwindowsize) { } else { cigarData.push_back(make_pair(len, *c)); } refss << ref.substr(refpos, len); altss << string(len, '-'); refpos += len; break; case 'M': len = atoi(slen.c_str()); slen.clear(); { for (int i = 0; i < len; ++i) { if ( + i) == + i)) { if (!cigarData.empty() && cigarData.back().second == 'M') { cigarData.back().first++; } else { cigarData.push_back(make_pair(1, 'M')); } } else { if (!cigarData.empty() && cigarData.back().second == 'X') { cigarData.back().first++; } else { cigarData.push_back(make_pair(1, 'X')); } } } } refss << ref.substr(refpos, len); altss << alternateQuery.substr(altpos, len); refpos += len; altpos += 
len; break; case 'S': len = atoi(slen.c_str()); slen.clear(); cigarData.push_back(make_pair(len, *c)); refss << ref.substr(refpos, len); //altss << alternateQuery.substr(altpos, len); // TODO deal with soft clipping, weird behavior refpos += len; altpos += len; break; default: len = 0; slen += *c; break; } } if (!adjustVcf) { cout << "ref:\t" << refss.str() << endl; cout << "alt:\t" << altss.str() << endl; } else { cout << "ref:\t" << refss.str() << endl; cout << "alt:\t" << altss.str() << endl; cigars.push_back(cigarData); } } if (adjustVcf) { int substart = cigars.front().front().first; int subend = cigars.front().back().first; // find the min and max match for (vector<vector<pair<int, char> > >::iterator c = cigars.begin(); c != cigars.end(); ++c) { if (c->front().second == 'M' && c->front().first <= substart) { substart = c->front().first; if (c->size() > 1 && c->at(1).second != 'X') { --substart; } } if (c->back().second == 'M' && c->back().first <= subend) { subend = c->back().first; } } // adjust the cigars and get the new reference length int reflen = 0; for (vector<vector<pair<int, char> > >::iterator c = cigars.begin(); c != cigars.end(); ++c) { c->front().first -= substart; c->back().first -= subend; int crf = cigarRefLen(*c); if (crf > reflen) reflen = crf;[adjustedTag].push_back(joinCigar(*c)); } // find the lowest positional difference int pdiff = 0; for (vector<int>::iterator d = positionDiffs.begin(); d != positionDiffs.end(); ++d) { if (*d + altwindowsize < pdiff) pdiff = *d + altwindowsize; } // adjust the reference string var.position += pdiff; // adjust the variant position var.ref = freference.getSubSequence(var.sequenceName, var.position - 1, reflen); cout << var << endl; } } return 0; }