int main(int argc, char** argv) { VariantCallFile variantFile; if (argc > 1) { string filename = argv[1]; variantFile.open(filename); } else { variantFile.open(std::cin); } if (!variantFile.is_open()) { return 1; } variantFile.addHeaderLine("##FORMAT=<ID=SN,Number=1,Type=String,Description=\"The name of the sample.\">"); cout << variantFile.header << endl; Variant var(variantFile); while (variantFile.getNextVariant(var)) { var.format.push_back("SN"); for (map<string, map<string, vector<string> > >::iterator s = var.samples.begin(); s != var.samples.end(); ++s) { s->second["SN"].clear(); s->second["SN"].push_back(s->first); } cout << var << endl; } return 0; }
int main(int argc, char** argv) { VariantCallFile variantFile; if (argc > 1) { string filename = argv[1]; variantFile.open(filename); } else { variantFile.open(std::cin); } if (!variantFile.is_open()) { return 1; } variantFile.addHeaderLine("##INFO=<ID=length,Number=A,Type=Integer,Description=\"length(ALT) - length(REF) for each ALT\">"); variantFile.addHeaderLine("##INFO=<ID=length.ref,Number=1,Type=Integer,Description=\"length(REF)\">"); variantFile.addHeaderLine("##INFO=<ID=length.alt,Number=A,Type=Integer,Description=\"length(ALT) for each ALT\">"); cout << variantFile.header << endl; Variant var(variantFile); while (variantFile.getNextVariant(var)) { vector<string>& lengths = var.info["length"]; lengths.clear(); for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { lengths.push_back(convert((int) a->size() - (int) var.ref.size())); } vector<string>& lengthsRef = var.info["length.ref"]; lengthsRef.clear(); lengthsRef.push_back(convert(var.ref.size())); vector<string>& lengthsAlt = var.info["length.alt"]; lengthsAlt.clear(); for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { lengthsAlt.push_back(convert((int) a->size())); } cout << var << endl; } return 0; }
int main(int argc, char** argv) { int c; bool invert = false; bool logicalOr = false; bool filterSites = false; vector<string> infofilterStrs; vector<VariantFilter> infofilters; vector<string> genofilterStrs; vector<VariantFilter> genofilters; string tag = ""; string filterSpec; string alleleTag; vector<string> regions; if (argc == 1) printSummary(argv); while (true) { static struct option long_options[] = { /* These options set a flag. */ //{"verbose", no_argument, &verbose_flag, 1}, {"help", no_argument, 0, 'h'}, {"filter-sites", no_argument, 0, 's'}, {"info-filter", required_argument, 0, 'f'}, {"genotype-filter", required_argument, 0, 'g'}, {"tag", required_argument, 0, 't'}, {"allele-tag", required_argument, 0, 'a'}, {"invert", no_argument, 0, 'v'}, {"or", no_argument, 0, 'o'}, {"region", required_argument, 0, 'r'}, //{"length", no_argument, &printLength, true}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; c = getopt_long (argc, argv, "hvsof:g:t:r:a:", long_options, &option_index); /* Detect the end of the options. */ if (c == -1) break; switch (c) { case 0: /* If this option set a flag, do nothing else now. */ if (long_options[option_index].flag != 0) break; printf ("option %s", long_options[option_index].name); if (optarg) printf (" with arg %s", optarg); printf ("\n"); break; case 'f': filterSpec += " " + string(optarg); infofilterStrs.push_back(string(optarg)); break; case 's': filterSites = true; break; case 'a': alleleTag = optarg; break; case 'g': filterSpec += " genotypes filtered with: " + string(optarg); genofilterStrs.push_back(string(optarg)); break; case 't': tag = optarg; break; case 'h': printSummary(argv); exit(0); break; case 'v': invert = true; break; case 'o': logicalOr = true; break; case 'r': regions.push_back(optarg); break; case '?': /* getopt_long already printed an error message. */ printSummary(argv); exit(1); break; default: abort (); } } filterSpec = filterSpec.substr(1); // strip leading " " VariantCallFile variantFile; string inputFilename; if (optind == argc - 1) { inputFilename = argv[optind]; variantFile.open(inputFilename); } else { variantFile.open(std::cin); } if (!variantFile.is_open()) { return 1; } for (vector<string>::iterator f = infofilterStrs.begin(); f != infofilterStrs.end(); ++f) { infofilters.push_back(VariantFilter(*f, VariantFilter::RECORD, variantFile.infoTypes)); } for (vector<string>::iterator f = genofilterStrs.begin(); f != genofilterStrs.end(); ++f) { genofilters.push_back(VariantFilter(*f, VariantFilter::SAMPLE, variantFile.formatTypes)); } vector<string> headerlines = split(variantFile.header, "\n"); variantFile.header.clear(); for (vector<string>::iterator l = headerlines.begin(); l != headerlines.end(); ++l) { if (!filterSpec.empty() && (l->find("INFO") != string::npos || l + 1 == headerlines.end())) { variantFile.header += "##filter=\"" + filterSpec + "\"\n"; filterSpec.clear(); } variantFile.header += *l + ((l + 1 == headerlines.end()) ? "" : "\n"); } if (!alleleTag.empty()) { variantFile.addHeaderLine("##INFO=<ID="+ alleleTag +",Number=A,Type=String,Description=\"" + tag + " if this allele passes the filters, '.' if not, filters are: " + filterSpec + ".\">"); } cout << variantFile.header << endl; /* if (genofilters.empty() && tag.empty()) { variantFile.parseSamples = false; } */ Variant var(variantFile); vector<string>::iterator regionItr = regions.begin(); do { if (!inputFilename.empty() && !regions.empty()) { string regionStr = *regionItr++; variantFile.setRegion(regionStr); } while (variantFile.getNextVariant(var)) { if (!genofilters.empty()) { for (vector<VariantFilter>::iterator f = genofilters.begin(); f != genofilters.end(); ++f) { f->removeFilteredGenotypes(var); } } if (!infofilters.empty()) { if (filterSites) { bool passes = passesFilters(var, infofilters, logicalOr); if (invert) { passes = !passes; } if (passes) { if (!tag.empty()) { if (alleleTag.empty()) { var.addFilter(tag); } else { var.info[alleleTag].clear(); for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { var.info[alleleTag].push_back(tag); } } cout << var << endl; } else { cout << var << endl; } } else if (!tag.empty()) { cout << var << endl; } } else { // filter out alleles which pass // removes the failing alleles vector<string> failingAlts; vector<string> passingAlts; vector<bool> passes; for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { if (!passesFilters(var, infofilters, logicalOr, *a)) { failingAlts.push_back(*a); passes.push_back(false); } else { passingAlts.push_back(*a); passes.push_back(true); } } if (tag.empty()) { // if there is no specified tag, just remove the failing alts if (failingAlts.size() < var.alt.size()) { for (vector<string>::iterator a = failingAlts.begin(); a != failingAlts.end(); ++a) { var.removeAlt(*a); } cout << var << endl; } } else { // otherwise, apply the tag if (alleleTag.empty()) { if (!passingAlts.empty()) { var.addFilter(tag); } } else { var.info[alleleTag].clear(); for (vector<bool>::iterator p = passes.begin(); p != passes.end(); ++p) { if (*p) { var.info[alleleTag].push_back(tag); } else { var.info[alleleTag].push_back("."); } } } cout << var << endl; } } } else { if (genofilters.empty()) { cout << variantFile.line << endl; } else { cout << var << endl; } } } } while (regionItr != regions.end()); return 0; }
int main(int argc, char** argv) { if (argc < 5) { printSummary(argv); exit(0); } bool strict = false; int c; while (true) { static struct option long_options[] = { /* These options set a flag. */ //{"verbose", no_argument, &verbose_flag, 1}, {"help", no_argument, 0, 'h'}, {"strict", no_argument, 0, 's'}, //{"length", no_argument, &printLength, true}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; c = getopt_long (argc, argv, "hs", long_options, &option_index); /* Detect the end of the options. */ if (c == -1) break; switch (c) { case 0: /* If this option set a flag, do nothing else now. */ if (long_options[option_index].flag != 0) break; printf ("option %s", long_options[option_index].name); if (optarg) printf (" with arg %s", optarg); printf ("\n"); break; case 's': strict = true; break; case 'h': printSummary(argv); exit(0); break; case '?': /* getopt_long already printed an error message. */ printSummary(argv); exit(1); break; default: abort (); } } string tag = argv[optind]; vector<string> samples; for (int i = optind+1; i < argc - 1; ++i) { samples.push_back(argv[i]); } string filename = argv[argc-1]; VariantCallFile variantFile; if (filename == "-") { variantFile.open(std::cin); } else { variantFile.open(filename); } if (!variantFile.is_open()) { cerr << "could not open " << filename << endl; return 1; } assert(samples.size() == 2); Variant var(variantFile); // TODO check if AC is present // ensure that AC is listed as an info field string line = "##INFO=<ID=" + tag + ",Number=1,Type=String,Description=\"Samples"; for (vector<string>::iterator s = samples.begin(); s != samples.end(); ++s) { line += " " + *s; } line += " have different genotypes\">"; variantFile.addHeaderLine(line); variantFile.addHeaderLine("##INFO=<ID=SSC,Number=1,Type=Float,Description=\"Somatic variant score (phred-scaled probability that the somatic variant call is correct).\">"); // write the new header cout << variantFile.header << endl; // print the records, filtering is done via the setting of varA's output sample names while (variantFile.getNextVariant(var)) { if (var.samples.find(samples.front()) != var.samples.end() && var.samples.find(samples.back()) != var.samples.end()) { map<string, vector<string> >& germline = var.samples[samples.front()]; map<string, vector<string> >& somatic = var.samples[samples.back()]; map<int, int> gtGermline = decomposeGenotype(germline["GT"].front()); map<int, int> gtSomatic = decomposeGenotype(somatic["GT"].front()); int germlineAltCount = 0; convert(germline["AO"].front(), germlineAltCount); var.info[tag].clear(); // remove previous if (gtGermline == gtSomatic) { var.info[tag].push_back("germline"); } else { //if (isHet(gtGermline) && isHom(gtSomatic)) { // var.info[tag].push_back("loh"); if (isHet(gtGermline) && isHomNonRef(gtSomatic) || isHomRef(gtGermline) && (isHet(gtSomatic) || isHomNonRef(gtSomatic))) { if (!strict || strict && germlineAltCount == 0) { var.info[tag].push_back("somatic"); } } else if (isHom(gtGermline) && isHet(gtSomatic)) { if (var.alt.size() == 1) { var.info[tag].push_back("reversion"); } else { var.info[tag].push_back("somatic"); } } } if (germline.find("GQ") != germline.end() && somatic.find("GQ") != somatic.end()) { double germlineGQ; convert(germline["GQ"].front(), germlineGQ); double somaticGQ; convert(somatic["GQ"].front(), somaticGQ); double somaticScore = min(var.quality, min(germlineGQ, somaticGQ)); var.info["SSC"].clear(); var.info["SSC"].push_back(convert(somaticScore)); } } cout << var << endl; } return 0; }
int main(int argc, char** argv) { string bedFileName; string annotationInfoKey; string defaultAnnotationValue; if (argc == 1) printSummary(argv); int c; while (true) { static struct option long_options[] = { /* These options set a flag. */ //{"verbose", no_argument, &verbose_flag, 1}, {"help", no_argument, 0, 'h'}, {"bed", required_argument, 0, 'b'}, {"key", required_argument, 0, 'k'}, {"default", required_argument, 0, 'd'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; c = getopt_long (argc, argv, "hb:k:d:", long_options, &option_index); if (c == -1) break; switch (c) { case 'b': bedFileName = string(optarg); break; case 'k': annotationInfoKey = string(optarg); break; case 'd': defaultAnnotationValue = string(optarg); break; case 'h': printSummary(argv); break; case '?': printSummary(argv); exit(1); break; default: abort (); } } if (bedFileName.empty()) { cerr << "a BED file is required when intersecting" << endl; exit(1); } BedReader bed(bedFileName); VariantCallFile variantFile; string inputFilename; if (optind == argc - 1) { inputFilename = argv[optind]; variantFile.open(inputFilename); } else { variantFile.open(std::cin); } if (!variantFile.is_open()) { cout << "could not open VCF file" << endl; return 1; } string line = "##INFO=<ID=" + annotationInfoKey + ",Number=1,Type=String,Description=\"Annotation from " + bedFileName + " delimited by ':'\">"; variantFile.addHeaderLine(line); cout << variantFile.header << endl; Variant var(variantFile); while (variantFile.getNextVariant(var)) { BedTarget record(var.sequenceName, var.position, var.position + var.ref.size() - 1, ""); vector<BedTarget*> overlaps = bed.targetsOverlapping(record); vector<string> annotations; if (!overlaps.empty()) { for (vector<BedTarget*>::iterator t = overlaps.begin(); t != overlaps.end(); ++t) { annotations.push_back((*t)->desc); } var.info[annotationInfoKey].push_back(join(annotations, ":")); } else if (!defaultAnnotationValue.empty()) { var.info[annotationInfoKey].push_back(defaultAnnotationValue); } cout << var << endl; } return 0; }
int main(int argc, char** argv) { if (argc != 3) { cerr << "usage: " << argv[0] << " <other-genotype-tag> <vcf file>" << endl << "adds statistics to the INFO field of the vcf file describing the" << endl << "amount of discrepancy between the genotypes (GT) in the vcf file and the" << endl << "genotypes reported in the <other-genotype-tag>. use this after" << endl << "vcfannotategenotypes to get correspondence statistics for two vcfs." << endl; return 1; } string otherGenoTag = argv[1]; string filename = argv[2]; VariantCallFile variantFile; if (filename == "-") { variantFile.open(std::cin); } else { variantFile.open(filename); } if (!variantFile.is_open()) { return 1; } vector<string> specs; specs.push_back("AA_AA"); specs.push_back("AA_AR"); specs.push_back("AA_RR"); specs.push_back("AA_NN"); specs.push_back("AR_AA"); specs.push_back("AR_AR"); specs.push_back("AR_RR"); specs.push_back("AR_NN"); specs.push_back("RR_AA"); specs.push_back("RR_AR"); specs.push_back("RR_RR"); specs.push_back("RR_NN"); specs.push_back("NN_AA"); specs.push_back("NN_AR"); specs.push_back("NN_RR"); specs.push_back("NN_NN"); for (vector<string>::iterator spec = specs.begin(); spec != specs.end(); ++spec) { string line = "##INFO=<ID=" + otherGenoTag + ".genotypes." + *spec + ",Number=1,Type=Integer,Description=\"Number of genotypes with " + *spec + " relationship with " + otherGenoTag + "\">"; variantFile.addHeaderLine(line); } string line; line = "##INFO=<ID=" + otherGenoTag + ".genotypes.count,Number=1,Type=Integer,Description=\"Count of genotypes under comparison.\">"; variantFile.addHeaderLine(line); line = "##INFO=<ID=" + otherGenoTag + ".genotypes.alternate_count,Number=1,Type=Integer,Description=\"Count of alternate genotypes in the first file.\">"; variantFile.addHeaderLine(line); line = "##INFO=<ID=" + otherGenoTag + ".site.alternate_positive_discrepancy,Number=1,Type=Integer,Description=\"Estimated positive discrepancy rate of " + otherGenoTag + " genotypes, where positive discrepancies are all cases where an alternate allele is called GT " + " but none is represented in " + otherGenoTag + " or " + otherGenoTag + " is null/no-call\">"; variantFile.addHeaderLine(line); line = "##INFO=<ID=" + otherGenoTag + ".site.alternate_negative_discrepancy,Number=1,Type=Integer,Description=\"Estimated negative discrepancy rate of " + otherGenoTag + " genotypes, where negative discrepancies are all cases where no alternate allele is called in " + " GT but an alternate is represented in " + otherGenoTag + ", including no-calls or partly null genotypes\">"; variantFile.addHeaderLine(line); line = "##INFO=<ID=" + otherGenoTag + ".site.alternate_null_discrepancy,Number=1,Type=Integer,Description=\"Estimated null discrepancy rate of " + otherGenoTag + " genotypes, where null discrepancies are all cases where GT is specified and contains an alternate but " + otherGenoTag + " is null. Cases where GT is null or partly null are excluded.\">"; variantFile.addHeaderLine(line); line = "##INFO=<ID=" + otherGenoTag + ".site.call_discrepancy,Number=1,Type=Integer,Description=\"Estimated call discrepancy rate of " + otherGenoTag + " genotypes (het->hom, hom->het) between " + otherGenoTag + " and GT\">"; variantFile.addHeaderLine(line); line = "##INFO=<ID=" + otherGenoTag + ".site.call_concordance,Number=1,Type=Integer,Description=\"Estimated call concorndance rate of " + otherGenoTag + " genotypes between " + otherGenoTag + " and GT\">"; variantFile.addHeaderLine(line); line = "##INFO=<ID=" + otherGenoTag + ".site.non_reference_discrepancy,Number=1,Type=Float,Description=\"Estimated non-reference discrepancy relative to " + otherGenoTag + " genotypes,\">"; variantFile.addHeaderLine(line); line = "##INFO=<ID=" + otherGenoTag + ".site.non_reference_discrepancy.count,Number=1,Type=Int,Description=\"non-reference discrepancy normalizer relative to " + otherGenoTag + " genotypes,\">"; variantFile.addHeaderLine(line); line = "##INFO=<ID=" + otherGenoTag + ".site.non_reference_discrepancy.normalizer,Number=1,Type=Int,Description=\"non-reference discrepancy count relative to " + otherGenoTag + " genotypes,\">"; variantFile.addHeaderLine(line); line = "##INFO=<ID=" + otherGenoTag + ".site.non_reference_sensitivity,Number=1,Type=Float,Description=\"Estimated non-reference sensitivity relative to " + otherGenoTag + " genotypes,\">"; variantFile.addHeaderLine(line); line = "##INFO=<ID=" + otherGenoTag + ".site.non_reference_sensitivity.count,Number=1,Type=Int,Description=\"non-reference sensitivity normalizer relative to " + otherGenoTag + " genotypes,\">"; variantFile.addHeaderLine(line); line = "##INFO=<ID=" + otherGenoTag + ".site.non_reference_sensitivity.normalizer,Number=1,Type=Int,Description=\"non-reference sensitivity count relative to " + otherGenoTag + " genotypes,\">"; variantFile.addHeaderLine(line); cout << variantFile.header << endl; Variant var(variantFile); while (variantFile.getNextVariant(var)) { //cout << "next: " << var << endl; // for each sample, check GT against <other-genotype-tag> // tally stats, and append to info map<string, map<string, vector<string> > >::iterator s = var.samples.begin(); map<string, map<string, vector<string> > >::iterator sEnd = var.samples.end(); map<string, int> genotypeComparisonCounts; int gtCount = var.samples.size(); int gtAltCount = 0; // number of alternate-containing genotypes in the first file int pdCount = 0; // positive discrepancy count int ndCount = 0; // negative discrepancy count int nnCount = 0; // null discrepancy count int cdCount = 0; // call discrepancy count int ccCount = 0; // call concordance count int nrdCount = 0; // non-reference discrepancy count int nrdNormalizer = 0; // divisor for nrd rate int nrsCount = 0; // non-reference sensitivity count int nrsNormalizer = 0; // divisor for nrs rate for (; s != sEnd; ++s) { map<string, vector<string> >& sample = s->second; const string& name = s->first; // decompose genotypes into counts of strings // to facilitate comparison string gtA; if (sample.find("GT") == sample.end()) { gtA = "./."; } else { gtA = sample["GT"].front(); } string gtB; if (sample.find(otherGenoTag) == sample.end()) { gtB = "./."; } else { gtB = sample[otherGenoTag].front(); } map<int, int> genotypeA = decomposeGenotype(gtA); map<int, int> genotypeB = decomposeGenotype(gtB); string gtspecA = genotypeSpec(genotypeA); string gtspecB = genotypeSpec(genotypeB); //cout << gtA << " " << gtB << endl; //cout << gtspecA << " " << gtspecB << endl; ++genotypeComparisonCounts[gtspecA + "_" + gtspecB]; if (hasNonRef(genotypeA)) { ++gtAltCount; } if (genotypeA != genotypeB) { if (isNull(genotypeA)) { // TODO handle this somehow, maybe via a different flag? if (!isNull(genotypeB)) { ++nnCount; // null discrepancy, the second set makes a call, this one does not } } else if (hasNonRef(genotypeA)) { if (!isNull(genotypeB) && hasNonRef(genotypeB)) { // they cannot be the same, but they both represent an alternate ++cdCount; // the calls are discrepant } else { // the other call does not have an alternate ++pdCount; // it is also null if (isNull(genotypeB)) { ++nnCount; } } } else { // the current genotype has no non-ref alternate if (!isNull(genotypeB) && hasNonRef(genotypeB)) { ++ndCount; } if (isNull(genotypeB)) { ++nnCount; } } } else { if (!isNull(genotypeA)) { ++ccCount; } } if (!(isNull(genotypeA) || isNull(genotypeB)) && !(isHomRef(genotypeA) && isHomRef(genotypeB))) { ++nrdNormalizer; if (genotypeA != genotypeB) { ++nrdCount; } } if (!(isNull(genotypeB) || isHomRef(genotypeB))) { ++nrsNormalizer; if (!(isNull(genotypeA) || isHomRef(genotypeA))) { ++nrsCount; } } } for (map<string, int>::iterator g = genotypeComparisonCounts.begin(); g != genotypeComparisonCounts.end(); ++g) { stringstream c; c << g->second; vector<string>& t = var.info[otherGenoTag + ".genotypes." + g->first]; t.clear(); t.push_back(c.str()); } stringstream gtc; gtc << gtCount; var.info[otherGenoTag + ".genotypes.count"].push_back(gtc.str()); stringstream gtac; gtac << gtAltCount; var.info[otherGenoTag + ".genotypes.alternate_count"].push_back(gtac.str()); stringstream pd; pd << pdCount; var.info[otherGenoTag + ".site.alternate_positive_discrepancy"].push_back(pd.str()); stringstream nd; nd << ndCount; var.info[otherGenoTag + ".site.alternate_negative_discrepancy"].push_back(nd.str()); stringstream nn; nn << nnCount; var.info[otherGenoTag + ".site.alternate_null_discrepancy"].push_back(nn.str()); stringstream cd; cd << cdCount; var.info[otherGenoTag + ".site.call_discrepancy"].push_back(cd.str()); stringstream cc; cc << ccCount; var.info[otherGenoTag + ".site.call_concordance"].push_back(cc.str()); stringstream nrdc; nrdc << nrdCount; var.info[otherGenoTag + ".site.non_reference_discrepancy.count"].push_back(nrdc.str()); stringstream nrdn; nrdn << nrdNormalizer; var.info[otherGenoTag + ".site.non_reference_discrepancy.normalizer"].push_back(nrdn.str()); if (nrdNormalizer > 0) { stringstream nrd; nrd << (double) nrdCount / (double) nrdNormalizer; var.info[otherGenoTag + ".site.non_reference_discrepancy"].push_back(nrd.str()); } stringstream nrsc; nrsc << nrsCount; var.info[otherGenoTag + ".site.non_reference_sensitivity.count"].push_back(nrsc.str()); stringstream nrsn; nrsn << nrsNormalizer; var.info[otherGenoTag + ".site.non_reference_sensitivity.normalizer"].push_back(nrsn.str()); if (nrsNormalizer > 0) { stringstream nrs; nrs << (double) nrsCount / (double) nrsNormalizer; var.info[otherGenoTag + ".site.non_reference_sensitivity"].push_back(nrs.str()); } cout << var << endl; } return 0; }
int main(int argc, char** argv) { if (argc > 1 && (argv[1] == "-h" || argv[1] == "--help")) { cerr << "usage: " << argv[0] << " <vcf file>" << endl << "outputs a VCF stream where AC and NS have been generated for each record using sample genotypes" << endl; return 1; } VariantCallFile variantFile; if (argc == 1 || (argc == 2 && argv[1] == "-")) { variantFile.open(std::cin); if (!variantFile.is_open()) { cerr << "vcffixup: could not open stdin" << endl; return 1; } } else { string filename = argv[1]; variantFile.open(filename); if (!variantFile.is_open()) { cerr << "vcffixup: could not open " << filename << endl; return 1; } } Variant var(variantFile); // remove header lines we're going to add variantFile.removeInfoHeaderLine("AC"); variantFile.removeInfoHeaderLine("AF"); variantFile.removeInfoHeaderLine("NS"); variantFile.removeInfoHeaderLine("AN"); // and add them back, so as not to duplicate them if they are already there variantFile.addHeaderLine("##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Total number of alternate alleles in called genotypes\">"); variantFile.addHeaderLine("##INFO=<ID=AF,Number=A,Type=Float,Description=\"Estimated allele frequency in the range (0,1]\">"); variantFile.addHeaderLine("##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of samples with data\">"); variantFile.addHeaderLine("##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">"); // write the new header cout << variantFile.header << endl; // print the records, filtering is done via the setting of varA's output sample names while (variantFile.getNextVariant(var)) { stringstream ns; ns << var.samples.size(); var.info["NS"].clear(); var.info["NS"].push_back(ns.str()); var.info["AC"].clear(); var.info["AF"].clear(); var.info["AN"].clear(); int allelecount = countAlleles(var); stringstream an; an << allelecount; var.info["AN"].push_back(an.str()); for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { string& allele = *a; int altcount = countAlts(var, var.getAltAlleleIndex(allele) + 1); stringstream ac; ac << altcount; var.info["AC"].push_back(ac.str()); stringstream af; af << (double) altcount / (double) allelecount; var.info["AF"].push_back(af.str()); } cout << var << endl; } return 0; }
int main(int argc, char** argv) { if (argc != 2) { cerr << "usage: " << argv[0] << " <annotation-tag> <vcf file> <vcf file>" << endl << "adds a tag (BasesToNextVariant) to each variant record which indicates" << endl << "the distance to the nearest variant" << endl; return 1; } string filename = argv[1]; VariantCallFile variantFile; if (filename == "-") { variantFile.open(std::cin); } else { variantFile.open(filename); } if (!variantFile.is_open()) { return 1; } Variant varA(variantFile); Variant varB(variantFile); Variant varC(variantFile); vector<Variant*> vars; vars.push_back(&varA); vars.push_back(&varB); vars.push_back(&varC); for (vector<Variant*>::iterator v = vars.begin(); v != vars.end(); ++v) { variantFile.getNextVariant(**v); } string tag = "BasesToClosestVariant"; string line = "##INFO=<ID=" + tag + ",Number=1,Type=Integer,Description=\"" \ + "Number of bases to the closest variant in the file.\">"; variantFile.addHeaderLine(line); cout << variantFile.header << endl; // get the first distances if (vars.at(0)->sequenceName == vars.at(1)->sequenceName) { vars.at(0)->info[tag].push_back(convert(vars.at(1)->position - vars.at(0)->position)); } while (variantFile.getNextVariant(*vars.back())) { if (vars.at(1)->sequenceName == vars.at(0)->sequenceName && vars.at(1)->sequenceName == vars.at(2)->sequenceName) { vars.at(1)->info[tag].push_back(convert(min(vars.at(1)->position - vars.at(0)->position, vars.at(2)->position - vars.at(1)->position))); } else if (vars.at(1)->sequenceName == vars.at(0)->sequenceName) { vars.at(1)->info[tag].push_back(convert(vars.at(1)->position - vars.at(0)->position)); } else if (vars.at(2)->sequenceName == vars.at(1)->sequenceName) { vars.at(1)->info[tag].push_back(convert(vars.at(2)->position - vars.at(1)->position)); } else { // don't add the tag } cout << *vars.front() << endl; // rotate Variant* v = vars.at(0); vars.at(0) = vars.at(1); vars.at(1) = vars.at(2); vars.at(2) = v; } // assign the last distances if (vars.at(0)->sequenceName == vars.at(1)->sequenceName) { vars.at(0)->info[tag].push_back(convert(vars.at(1)->position - vars.at(0)->position)); cout << *vars.at(0) << endl; vars.at(1)->info[tag].push_back(convert(vars.at(1)->position - vars.at(0)->position)); cout << *vars.at(1) << endl; } return 0; }
int main(int argc, char** argv) { string vcfFileName; string fastaFileName; int windowsize = 100; bool includePreviousBaseForIndels = false; bool useMNPs = true; int altwindowsize = 50; // constants for SmithWaterman algorithm float matchScore = 10.0f; float mismatchScore = -9.0f; float gapOpenPenalty = 15.0f; float gapExtendPenalty = 6.66f; bool useEntropy = false; bool useRepeatGapExtendPenalty = false; float repeatGapExtendPenalty = 1; bool adjustVcf = false; string adjustedTag = "remappedCIGAR"; if (argc == 1) printSummary(argv); int c; while (true) { static struct option long_options[] = { /* These options set a flag. */ //{"verbose", no_argument, &verbose_flag, 1}, {"help", no_argument, 0, 'h'}, {"ref-window-size", required_argument, 0, 'w'}, {"reference", required_argument, 0, 'r'}, {"match-score", required_argument, 0, 'm'}, {"mismatch-score", required_argument, 0, 'x'}, {"gap-open-penalty", required_argument, 0, 'o'}, {"gap-extend-penalty", required_argument, 0, 'e'}, {"alt-window-size", required_argument, 0, 's'}, {"entropy-gap-open", no_argument, 0, 'z'}, {"repeat-gap-extend", no_argument, 0, 'R'}, {"adjust-vcf", required_argument, 0, 'a'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; c = getopt_long (argc, argv, "hza:w:r:m:x:o:e:s:R:", long_options, &option_index); if (c == -1) break; switch (c) { case 'w': windowsize = atoi(optarg); break; case 'a': adjustVcf = true; adjustedTag = optarg; break; case 'r': fastaFileName = string(optarg); break; case 'h': printSummary(argv); break; case 'm': matchScore = atof(optarg); break; case 'x': mismatchScore = atof(optarg); break; case 'o': gapOpenPenalty = atof(optarg); break; case 'e': gapExtendPenalty = atof(optarg); break; case 's': altwindowsize = atoi(optarg); break; case 'z': useEntropy = true; break; case 'R': useRepeatGapExtendPenalty = true; repeatGapExtendPenalty = atof(optarg); break; case '?': printSummary(argv); exit(1); break; default: abort (); } } VariantCallFile variantFile; string inputFilename; if (optind == argc - 1) { inputFilename = argv[optind]; variantFile.open(inputFilename); } else { variantFile.open(std::cin); } if (!variantFile.is_open()) { cerr << "could not open VCF file" << endl; exit(1); } FastaReference freference; if (fastaFileName.empty()) { cerr << "a reference is required" << endl; exit(1); } else { freference.open(fastaFileName); } if (adjustVcf) { vector<string> commandline; for (int i = 0; i < argc; ++i) commandline.push_back(argv[i]); variantFile.addHeaderLine("##INFO=<ID=" + adjustedTag + ",Number=A,Type=String,Description=\"CIGAR when remapped using"+ join(commandline, " ") +"\">"); } cout << variantFile.header << endl; Variant var(variantFile); while (variantFile.getNextVariant(var)) { //if (!adjustVcf) { cout << endl; cout << var << endl; //} map<string, vector<VariantAllele> > variantAlleles; vector<vector<pair<int, char> > > cigars; vector<int> positionDiffs; for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { //if (!adjustVcf) cout << endl; cout << endl; // try to remap locally string reference = freference.getSubSequence(var.sequenceName, var.position - 1 - windowsize, windowsize * 2 + var.ref.size()); // passed to sw align unsigned int referencePos; string cigar; string& alternate = *a; vector<VariantAllele>& variants = variantAlleles[alternate]; string alternateQuery = reference.substr(windowsize - altwindowsize, altwindowsize) + alternate + reference.substr(reference.size() - windowsize, altwindowsize); //cout << "REF:\t" << reference << endl; //cout << "ALT:\t" << string(windowsize - altwindowsize, ' ') << alternateQuery << endl; CSmithWatermanGotoh sw(matchScore, mismatchScore, gapOpenPenalty, gapExtendPenalty); if (useEntropy) sw.EnableEntropyGapPenalty(1); if (useRepeatGapExtendPenalty) sw.EnableRepeatGapExtensionPenalty(repeatGapExtendPenalty); sw.Align(referencePos, cigar, reference, alternateQuery); int altpos = 0; int refpos = 0; int len; string slen; vector<pair<int, char> > cigarData; string ref = reference.substr(referencePos); positionDiffs.push_back(referencePos); // TODO this... is borked stringstream refss; stringstream altss; if (!adjustVcf) cout << cigar << endl; cout << cigar << endl; for (string::iterator c = cigar.begin(); c != cigar.end(); ++c) { switch (*c) { case 'I': len = atoi(slen.c_str()); slen.clear(); if (altpos < altwindowsize) { cigarData.push_back(make_pair(len, 'M')); } else { cigarData.push_back(make_pair(len, *c)); } altss << alternateQuery.substr(altpos, len); refss << string(len, '-'); altpos += len; break; case 'D': len = atoi(slen.c_str()); slen.clear(); if (altpos < altwindowsize) { } else { cigarData.push_back(make_pair(len, *c)); } refss << ref.substr(refpos, len); altss << string(len, '-'); refpos += len; break; case 'M': len = atoi(slen.c_str()); slen.clear(); { for (int i = 0; i < len; ++i) { if (ref.at(refpos + i) == alternateQuery.at(altpos + i)) { if (!cigarData.empty() && cigarData.back().second == 'M') { cigarData.back().first++; } else { cigarData.push_back(make_pair(1, 'M')); } } else { if (!cigarData.empty() && cigarData.back().second == 'X') { cigarData.back().first++; } else { cigarData.push_back(make_pair(1, 'X')); } } } } refss << ref.substr(refpos, len); altss << alternateQuery.substr(altpos, len); refpos += len; altpos += len; break; case 'S': len = atoi(slen.c_str()); slen.clear(); cigarData.push_back(make_pair(len, *c)); refss << ref.substr(refpos, len); //altss << alternateQuery.substr(altpos, len); // TODO deal with soft clipping, weird behavior refpos += len; altpos += len; break; default: len = 0; slen += *c; break; } } if (!adjustVcf) { cout << "ref:\t" << refss.str() << endl; cout << "alt:\t" << altss.str() << endl; } else { cout << "ref:\t" << refss.str() << endl; cout << "alt:\t" << altss.str() << endl; cigars.push_back(cigarData); } } if (adjustVcf) { int substart = cigars.front().front().first; int subend = cigars.front().back().first; // find the min and max match for (vector<vector<pair<int, char> > >::iterator c = cigars.begin(); c != cigars.end(); ++c) { if (c->front().second == 'M' && c->front().first <= substart) { substart = c->front().first; if (c->size() > 1 && c->at(1).second != 'X') { --substart; } } if (c->back().second == 'M' && c->back().first <= subend) { subend = c->back().first; } } // adjust the cigars and get the new reference length int reflen = 0; for (vector<vector<pair<int, char> > >::iterator c = cigars.begin(); c != cigars.end(); ++c) { c->front().first -= substart; c->back().first -= subend; int crf = cigarRefLen(*c); if (crf > reflen) reflen = crf; var.info[adjustedTag].push_back(joinCigar(*c)); } // find the lowest positional difference int pdiff = 0; for (vector<int>::iterator d = positionDiffs.begin(); d != positionDiffs.end(); ++d) { if (*d + altwindowsize < pdiff) pdiff = *d + altwindowsize; } // adjust the reference string var.position += pdiff; // adjust the variant position var.ref = freference.getSubSequence(var.sequenceName, var.position - 1, reflen); cout << var << endl; } } return 0; }
int main(int argc, char** argv) { int c; string fastaRef; int windowSize = 0; if (argc == 1) printSummary(argv); while (true) { static struct option long_options[] = { /* These options set a flag. */ //{"verbose", no_argument, &verbose_flag, 1}, {"help", no_argument, 0, 'h'}, {"fasta-reference", required_argument, 0, 'f'}, {"window-size", required_argument, 0, 'w'}, //{"length", no_argument, &printLength, true}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; c = getopt_long (argc, argv, "hf:w:", long_options, &option_index); /* Detect the end of the options. */ if (c == -1) break; switch (c) { case 0: /* If this option set a flag, do nothing else now. */ if (long_options[option_index].flag != 0) break; printf ("option %s", long_options[option_index].name); if (optarg) printf (" with arg %s", optarg); printf ("\n"); break; case 'f': fastaRef = optarg; break; case 'w': windowSize = atoi(optarg); break; case 'h': printSummary(argv); exit(0); break; case '?': /* getopt_long already printed an error message. */ printSummary(argv); exit(1); break; default: abort (); } } if (windowSize == 0) { cerr << "a window size must be specified" << endl; exit(1); } if (fastaRef.empty()) { cerr << "a FASTA reference sequence must be specified" << endl; exit(1); } FastaReference ref; ref.open(fastaRef); VariantCallFile variantFile; string inputFilename; if (optind == argc - 1) { inputFilename = argv[optind]; variantFile.open(inputFilename); } else { variantFile.open(std::cin); } if (!variantFile.is_open()) { return 1; } variantFile.addHeaderLine("##INFO=<ID=EntropyLeft,Number=1,Type=Float,Description=\"Entropy of left-flanking sequence of "+ convert(windowSize) +"bp\">"); variantFile.addHeaderLine("##INFO=<ID=EntropyCenter,Number=1,Type=Float,Description=\"Entropy of centered sequence of "+ convert(windowSize) +"bp\">"); variantFile.addHeaderLine("##INFO=<ID=EntropyRight,Number=1,Type=Float,Description=\"Entropy of right-flanking sequence of "+ convert(windowSize) +"bp\">"); variantFile.addHeaderLine("##INFO=<ID=EntropyRef,Number=1,Type=Float,Description=\"Entropy of REF allele\">"); variantFile.addHeaderLine("##INFO=<ID=EntropyAlt,Number=A,Type=Float,Description=\"Entropy of each ALT allele\">"); cout << variantFile.header << endl; Variant var(variantFile); while (variantFile.getNextVariant(var)) { // get the ref start and end positions int refstart = var.position - 1; // convert to 0-based int refend = var.position + var.ref.size() - 1; string leftseq = ref.getSubSequence(var.sequenceName, refstart - windowSize, windowSize); string rightseq = ref.getSubSequence(var.sequenceName, refend, windowSize); string centerseq = ref.getSubSequence(var.sequenceName, refstart - windowSize/2, windowSize); double entropyLeft = shannon_H((char*) &leftseq[0], windowSize); double entropyRight = shannon_H((char*) &rightseq[0], windowSize); double entropyCenter = shannon_H((char*) ¢erseq[0], windowSize); double entropyRef = shannon_H((char*) var.ref.c_str(), var.ref.size()); var.info["EntropyLeft"].clear(); var.info["EntropyRight"].clear(); var.info["EntropyCenter"].clear(); var.info["EntropyRef"].clear(); var.info["EntropyAlt"].clear(); var.info["EntropyLeft"].push_back(convert(entropyLeft)); var.info["EntropyRight"].push_back(convert(entropyRight)); var.info["EntropyCenter"].push_back(convert(entropyCenter)); var.info["EntropyRef"].push_back(convert(entropyRef)); for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { double entropyAlt = shannon_H((char*) a->c_str(), a->size()); var.info["EntropyAlt"].push_back(convert(entropyAlt)); } cout << var << endl; } return 0; }
int main(int argc, char** argv) { bool includePreviousBaseForIndels = true; bool useMNPs = false; string parseFlag; int maxLength = 200; bool keepInfo = false; bool keepGeno = false; VariantCallFile variantFile; int c; while (true) { static struct option long_options[] = { /* These options set a flag. */ //{"verbose", no_argument, &verbose_flag, 1}, {"help", no_argument, 0, 'h'}, {"use-mnps", no_argument, 0, 'm'}, {"max-length", required_argument, 0, 'L'}, {"tag-parsed", required_argument, 0, 't'}, {"keep-info", no_argument, 0, 'k'}, {"keep-geno", no_argument, 0, 'g'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; c = getopt_long (argc, argv, "hmkgt:L:", long_options, &option_index); if (c == -1) break; switch (c) { case 'm': useMNPs = true; break; case 'k': keepInfo = true; break; case 'g': keepGeno = true; break; case 'h': printSummary(argv); break; case 't': parseFlag = optarg; break; case 'L': maxLength = atoi(optarg); break; case '?': printSummary(argv); exit(1); break; default: abort (); } } if (optind < argc) { string filename = argv[optind]; variantFile.open(filename); } else { variantFile.open(std::cin); } if (!variantFile.is_open()) { return 1; } variantFile.addHeaderLine("##INFO=<ID=TYPE,Number=A,Type=String,Description=\"The type of allele, either snp, mnp, ins, del, or complex.\">"); variantFile.addHeaderLine("##INFO=<ID=LEN,Number=A,Type=Integer,Description=\"allele length\">"); if (!parseFlag.empty()) { variantFile.addHeaderLine("##INFO=<ID="+parseFlag+",Number=0,Type=Flag,Description=\"The allele was parsed using vcfallelicprimitives.\">"); } cout << variantFile.header << endl; Variant var(variantFile); while (variantFile.getNextVariant(var)) { // we can't decompose *1* bp events, these are already in simplest-form whether SNPs or indels // we also don't handle anything larger than maxLength bp if (var.alt.size() == 1 && ( var.alt.front().size() == 1 || var.ref.size() == 1 || var.alt.front().size() > maxLength || var.ref.size() > maxLength )) { // nothing to do cout << var << endl; continue; } // for each parsedalternate, get the position // build a new vcf record for that position // unless we are already at the position ! // take everything which is unique to that allele (records) and append it to the new record // then handle genotypes; determine the mapping between alleleic primitives and convert to phased haplotypes // this means taking all the parsedAlternates and, for each one, generating a pattern of allele indecies corresponding to it map<string, vector<VariantAllele> > varAlleles = var.parsedAlternates(includePreviousBaseForIndels, useMNPs); set<VariantAllele> alleles; // collect unique alleles for (map<string, vector<VariantAllele> >::iterator a = varAlleles.begin(); a != varAlleles.end(); ++a) { for (vector<VariantAllele>::iterator va = a->second.begin(); va != a->second.end(); ++va) { alleles.insert(*va); } } int altcount = 0; for (set<VariantAllele>::iterator a = alleles.begin(); a != alleles.end(); ++a) { if (a->ref != a->alt) { ++altcount; } } if (altcount == 1 && var.alt.size() == 1 && var.alt.front().size() == 1) { // if biallelic SNP cout << var << endl; continue; } // collect variant allele indexed membership map<string, vector<int> > variantAlleleIndexes; // from serialized VariantAllele to indexes for (map<string, vector<VariantAllele> >::iterator a = varAlleles.begin(); a != varAlleles.end(); ++a) { int index = var.altAlleleIndexes[a->first] + 1; // make non-relative for (vector<VariantAllele>::iterator va = a->second.begin(); va != a->second.end(); ++va) { variantAlleleIndexes[va->repr].push_back(index); } } map<VariantAllele, double> alleleFrequencies; map<VariantAllele, int> alleleCounts; map<VariantAllele, map<string, string> > alleleInfos; map<VariantAllele, map<string, map<string, string> > > alleleGenos; bool hasAf = false; if (var.info.find("AF") != var.info.end()) { hasAf = true; for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { vector<VariantAllele>& vars = varAlleles[*a]; for (vector<VariantAllele>::iterator va = vars.begin(); va != vars.end(); ++va) { double freq; try { convert(var.info["AF"].at(var.altAlleleIndexes[*a]), freq); alleleFrequencies[*va] += freq; } catch (...) { cerr << "vcfallelicprimitives WARNING: AF does not have count == alts @ " << var.sequenceName << ":" << var.position << endl; } } } } bool hasAc = false; if (var.info.find("AC") != var.info.end()) { hasAc = true; for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { vector<VariantAllele>& vars = varAlleles[*a]; for (vector<VariantAllele>::iterator va = vars.begin(); va != vars.end(); ++va) { int freq; try { convert(var.info["AC"].at(var.altAlleleIndexes[*a]), freq); alleleCounts[*va] += freq; } catch (...) { cerr << "vcfallelicprimitives WARNING: AC does not have count == alts @ " << var.sequenceName << ":" << var.position << endl; } } } } if (keepInfo) { for (map<string, vector<string> >::iterator infoit = var.info.begin(); infoit != var.info.end(); ++infoit) { string key = infoit->first; for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { vector<VariantAllele>& vars = varAlleles[*a]; for (vector<VariantAllele>::iterator va = vars.begin(); va != vars.end(); ++va) { string val; vector<string>& vals = var.info[key]; if (vals.size() == var.alt.size()) { // allele count for info val = vals.at(var.altAlleleIndexes[*a]); } else if (vals.size() == 1) { // site-wise count val = vals.front(); } // don't handle other multiples... how would we do this without going crazy? if (!val.empty()) { alleleInfos[*va][key] = val; } } } } } /* if (keepGeno) { for (map<string, map<string, vector<string> > >::iterator sampleit = var.samples.begin(); sampleit != var.samples.end(); ++sampleit) { string& sampleName = sampleit->first; map<string, vector<string> >& sampleValues = var.samples[sampleName]; } } */ // from old allele index to a new series across the unpacked positions map<int, map<long unsigned int, int> > unpackedAlleleIndexes; map<long unsigned int, Variant> variants; //vector<Variant> variants; for (set<VariantAllele>::iterator a = alleles.begin(); a != alleles.end(); ++a) { if (a->ref == a->alt) { // ref allele continue; } string type; int len = 0; if (a->ref.at(0) == a->alt.at(0)) { // well-behaved indels if (a->ref.size() > a->alt.size()) { type = "del"; len = a->ref.size() - a->alt.size(); } else if (a->ref.size() < a->alt.size()) { len = a->alt.size() - a->ref.size(); type = "ins"; } } else { if (a->ref.size() == a->alt.size()) { len = a->ref.size(); if (a->ref.size() == 1) { type = "snp"; } else { type = "mnp"; } } else { len = abs((int) a->ref.size() - (int) a->alt.size()); type = "complex"; } } if (variants.find(a->position) == variants.end()) { Variant newvar(variantFile); variants[a->position] = newvar; } Variant& v = variants[a->position]; // guaranteed to exist if (!parseFlag.empty()) { v.infoFlags[parseFlag] = true; } v.quality = var.quality; v.filter = var.filter; v.id = "."; //v.format = var.format; vector<string> gtonlyformat; gtonlyformat.push_back("GT"); v.format = gtonlyformat; v.info["TYPE"].push_back(type); v.info["LEN"].push_back(convert(len)); if (hasAf) { v.info["AF"].push_back(convert(alleleFrequencies[*a])); } if (hasAc) { v.info["AC"].push_back(convert(alleleCounts[*a])); } if (keepInfo) { for (map<string, vector<string> >::iterator infoit = var.info.begin(); infoit != var.info.end(); ++infoit) { string key = infoit->first; if (key != "AF" && key != "AC" && key != "TYPE" && key != "LEN") { // don't clobber previous v.info[key].push_back(alleleInfos[*a][key]); } } } // now, keep all the other infos if we are asked to v.sequenceName = var.sequenceName; v.position = a->position; // ... by definition, this should be == if the variant was found if (v.ref.size() < a->ref.size()) { for (vector<string>::iterator va = v.alt.begin(); va != v.alt.end(); ++va) { *va += a->ref.substr(v.ref.size()); } v.ref = a->ref; } v.alt.push_back(a->alt); int alleleIndex = v.alt.size(); vector<int>& originalIndexes = variantAlleleIndexes[a->repr]; for (vector<int>::iterator i = originalIndexes.begin(); i != originalIndexes.end(); ++i) { unpackedAlleleIndexes[*i][v.position] = alleleIndex; } // add null allele unpackedAlleleIndexes[ALLELE_NULL][v.position] = ALLELE_NULL; } // genotypes for (vector<string>::iterator s = var.sampleNames.begin(); s != var.sampleNames.end(); ++s) { string& sampleName = *s; if (var.samples.find(sampleName) == var.samples.end()) { continue; } map<string, vector<string> >& sample = var.samples[sampleName]; if (sample.find("GT") == sample.end()) { continue; } string& genotype = sample["GT"].front(); vector<string> genotypeStrs = split(genotype, "|/"); vector<int> genotypeIndexes; for (vector<string>::iterator s = genotypeStrs.begin(); s != genotypeStrs.end(); ++s) { int i; if (!convert(*s, i)) { genotypeIndexes.push_back(ALLELE_NULL); } else { genotypeIndexes.push_back(i); } } map<long unsigned int, vector<int> > positionIndexes; for (vector<int>::iterator g = genotypeIndexes.begin(); g != genotypeIndexes.end(); ++g) { int oldIndex = *g; for (map<long unsigned int, Variant>::iterator v = variants.begin(); v != variants.end(); ++v) { const long unsigned int& p = v->first; if (oldIndex == 0) { // reference positionIndexes[p].push_back(0); } else { positionIndexes[p].push_back(unpackedAlleleIndexes[oldIndex][p]); } } } for (map<long unsigned int, Variant>::iterator v = variants.begin(); v != variants.end(); ++v) { Variant& variant = v->second; vector<int>& gtints = positionIndexes[v->first]; vector<string> gtstrs; for (vector<int>::iterator i = gtints.begin(); i != gtints.end(); ++i) { if (*i != ALLELE_NULL) { gtstrs.push_back(convert(*i)); } else { gtstrs.push_back("."); } } string genotype = join(gtstrs, "|"); // if we are keeping the geno info, pull it over here if (keepGeno) { variant.format = var.format; variant.samples[sampleName] = var.samples[sampleName]; } // note that this will replace the old geno, but otherwise it is the same variant.samples[sampleName]["GT"].clear(); variant.samples[sampleName]["GT"].push_back(genotype); } } //for (vector<Variant>::iterator v = variants.begin(); v != variants.end(); ++v) { for (map<long unsigned int, Variant>::iterator v = variants.begin(); v != variants.end(); ++v) { cout << v->second << endl; } } return 0; }
int main(int argc, char** argv) { vector<string> regions; bool addTags = false; bool addType = false; bool lengthFrequency = true; // constants for SmithWaterman algorithm float matchScore = 10.0f; float mismatchScore = -9.0f; float gapOpenPenalty = 15.0f; float gapExtendPenalty = 6.66f; bool useReferenceAlignment = false; int c; while (true) { static struct option long_options[] = { /* These options set a flag. */ //{"verbose", no_argument, &verbose_flag, 1}, {"help", no_argument, 0, 'h'}, {"region", required_argument, 0, 'r'}, {"add-info", no_argument, 0, 'a'}, {"add-type", no_argument, 0, 't'}, {"no-length-frequency", no_argument, 0, 'l'}, {"match-score", required_argument, 0, 'm'}, {"mismatch-score", required_argument, 0, 'x'}, {"gap-open-penalty", required_argument, 0, 'o'}, {"gap-extend-penalty", required_argument, 0, 'e'}, //{"length", no_argument, &printLength, true}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; c = getopt_long (argc, argv, "hlatr:m:x:o:e:", long_options, &option_index); /* Detect the end of the options. */ if (c == -1) break; switch (c) { case 0: /* If this option set a flag, do nothing else now. */ if (long_options[option_index].flag != 0) break; printf ("option %s", long_options[option_index].name); if (optarg) printf (" with arg %s", optarg); printf ("\n"); break; case 'h': printSummary(argv); exit(0); break; case 'r': regions.push_back(optarg); break; case 'l': lengthFrequency = false; break; case 'a': addTags = true; break; case 't': addType = true; break; case 'm': matchScore = atof(optarg); break; case 'x': mismatchScore = atof(optarg); break; case 'o': gapOpenPenalty = atof(optarg); break; case 'e': gapExtendPenalty = atof(optarg); break; default: abort (); } } VariantCallFile variantFile; string inputFilename; if (optind == argc - 1) { inputFilename = argv[optind]; variantFile.open(inputFilename); } else { variantFile.open(std::cin); } if (!variantFile.is_open()) { return 1; } if (addType && !addTags) { variantFile.addHeaderLine("##INFO=<ID=type,Number=A,Type=String,Description=\"The type of the allele, either snp, ins, del, complex, or ref.\">"); variantFile.addHeaderLine("##INFO=<ID=cigar,Number=A,Type=String,Description=\"The CIGAR-style representation of the alternate allele as aligned to the reference\">"); cout << variantFile.header << endl; } if (addTags) { variantFile.addHeaderLine("##INFO=<ID=transitions,Number=A,Type=Integer,Description=\"Total number of transitions in the alternate allele\">"); variantFile.addHeaderLine("##INFO=<ID=transversions,Number=A,Type=Integer,Description=\"Total number of transversions in the alternate allele\">"); variantFile.addHeaderLine("##INFO=<ID=deaminations,Number=A,Type=Integer,Description=\"Total number of deaminations in the alternate allele\">"); variantFile.addHeaderLine("##INFO=<ID=aminations,Number=A,Type=Integer,Description=\"Total number of aminations in the alternate allele\">"); variantFile.addHeaderLine("##INFO=<ID=mismatches,Number=A,Type=Integer,Description=\"Total number of mismatches in the alternate allele\">"); variantFile.addHeaderLine("##INFO=<ID=insertions,Number=A,Type=Integer,Description=\"Total number of inserted bases in the alternate allele\">"); variantFile.addHeaderLine("##INFO=<ID=deletions,Number=A,Type=Integer,Description=\"Total number of deleted bases in the alternate allele\">"); variantFile.addHeaderLine("##INFO=<ID=cigar,Number=A,Type=String,Description=\"The CIGAR-style representation of the alternate allele as aligned to the reference\">"); variantFile.addHeaderLine("##INFO=<ID=type,Number=A,Type=String,Description=\"The type of the allele, either snp, ins, del, complex, or ref.\">"); variantFile.addHeaderLine("##INFO=<ID=reflen,Number=1,Type=Integer,Description=\"The length of the reference allele\">"); variantFile.addHeaderLine("##INFO=<ID=altlen,Number=A,Type=Integer,Description=\"The length of the alternate allele\">"); cout << variantFile.header << endl; } Variant var(variantFile); vector<string>::iterator regionItr = regions.begin(); int variantAlleles = 0; int uniqueVariantAlleles = 0; int variantSites = 0; int snps = 0; int transitions = 0; int transversions = 0; int deaminations = 0; int aminations = 0; int totalinsertions = 0; int totaldeletions = 0; int insertedbases = 0; int deletedbases = 0; int totalmnps = 0; int totalcomplex = 0; int mismatchbases = 0; int mnpbases = 0; int biallelics = 0; int multiallelics = 0; map<int, int> insertions; map<int, int> deletions; map<int, int> mnps; map<int, int> complexsubs; bool includePreviousBaseForIndels = false; bool useMNPs = true; bool useEntropy = false; AlleleStats biallelicSNPs; // todo, add biallelic snp dialog to output and ts/tv for snps and mnps do { if (!inputFilename.empty() && !regions.empty()) { string regionStr = *regionItr++; variantFile.setRegion(regionStr); } while (variantFile.getNextVariant(var)) { ++variantSites; if (var.alt.size() > 1) { ++multiallelics; } else { ++biallelics; } map<string, vector<VariantAllele> > alternates = var.parsedAlternates(includePreviousBaseForIndels, useMNPs, useEntropy, matchScore, mismatchScore, gapOpenPenalty, gapExtendPenalty); map<VariantAllele, vector<string> > uniqueVariants; vector<string> cigars; for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { string& alternate = *a; if (addTags) var.info["altlen"].push_back(convert(alternate.size())); vector<VariantAllele>& vav = alternates[alternate]; if (vav.size() > 1) { // check that there are actually multiple non-reference alleles int nonRefAlleles = 0; for (vector<VariantAllele>::iterator z = vav.begin(); z != vav.end(); ++z) { if (z->ref != z->alt) ++nonRefAlleles; } if (nonRefAlleles > 1) ++totalcomplex; } for (vector<VariantAllele>::iterator v = vav.begin(); v != vav.end(); ++v) { uniqueVariants[*v].push_back(alternate); } if (addTags || addType) { string cigar; pair<int, string> element; for (vector<VariantAllele>::iterator v = vav.begin(); v != vav.end(); ++v) { VariantAllele& va = *v; if (va.ref != va.alt) { if (element.second == "M") { cigar += convert(element.first) + element.second; element.second = ""; element.first = 0; } if (va.ref.size() == va.alt.size()) { cigar += convert(va.ref.size()) + "X"; } else if (va.ref.size() > va.alt.size()) { cigar += convert(va.ref.size() - va.alt.size()) + "D"; } else { cigar += convert(va.alt.size() - va.ref.size()) + "I"; } } else { if (element.second == "M") { element.first += va.ref.size(); } else { element = make_pair(va.ref.size(), "M"); } } } if (element.second == "M") { cigar += convert(element.first) + element.second; } element.second = ""; element.first = 0; cigars.push_back(cigar); } } if (addTags) { var.info["cigar"] = cigars; var.info["reflen"].push_back(convert(var.ref.size())); } else if (addType) { var.info["cigar"] = cigars; } variantAlleles += var.alt.size(); map<string, AlleleStats> alleleStats; for (map<VariantAllele, vector<string> >::iterator v = uniqueVariants.begin(); v != uniqueVariants.end(); ++v) { const VariantAllele& va = v->first; vector<string>& alternates = v->second; if (!(addTags || addType)) { // don't add any tag information if we're not going to output it alternates.clear(); } if (va.ref != va.alt) { ++uniqueVariantAlleles; if (va.ref.size() == va.alt.size()) { if (va.ref.size() == 1) { ++snps; ++mismatchbases; for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) { ++alleleStats[*a].mismatches; } if (isTransition(va.ref, va.alt)) { ++transitions; for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) { ++alleleStats[*a].transitions; } } else { ++transversions; for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) { ++alleleStats[*a].transversions; } } if (isAmination(va.ref, va.alt)) { ++aminations; for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) { ++alleleStats[*a].aminations; } } if (isDeamination(va.ref, va.alt)) { ++deaminations; for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) { ++alleleStats[*a].deaminations; } } } else { ++totalmnps; ++mnps[va.alt.size()]; // not entirely correct for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) { alleleStats[*a].mismatches += va.alt.size(); } string::const_iterator r = va.ref.begin(); for (string::const_iterator a = va.alt.begin(); a != va.alt.end(); ++a, ++r) { string rstr = string(1, *r); string astr = string(1, *a); if (rstr == astr) { continue; } if (isTransition(rstr, astr)) { ++transitions; for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) { ++alleleStats[*a].transitions; } } else { ++transversions; for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) { ++alleleStats[*a].transversions; } } if (isAmination(rstr, astr)) { ++aminations; for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) { ++alleleStats[*a].aminations; } } if (isDeamination(rstr, astr)) { ++deaminations; for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) { ++alleleStats[*a].deaminations; } } ++mismatchbases; ++mnpbases; } } } else if (va.ref.size() > va.alt.size()) { int diff = va.ref.size() - va.alt.size(); deletedbases += diff; ++totaldeletions; ++deletions[diff]; for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) { alleleStats[*a].deletedbases += diff; alleleStats[*a].deletions += 1; } } else { int diff = va.alt.size() - va.ref.size(); insertedbases += diff; ++totalinsertions; ++insertions[diff]; for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) { alleleStats[*a].insertedbases += diff; alleleStats[*a].insertions += 1; } } } } if (addTags || addType) { for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { string vartype; if (alleleStats[*a].insertions + alleleStats[*a].deletions == 0) { if (alleleStats[*a].mismatches == 1) { vartype = "snp"; } else if (alleleStats[*a].mismatches > 1) { vartype = "complex"; } else { vartype = "ref"; } } else if (alleleStats[*a].insertions + alleleStats[*a].deletions == 1) { if (alleleStats[*a].insertions == 1) { vartype = "ins"; } else { vartype = "del"; } } else { vartype = "complex"; } if (addTags) { var.info["mismatches"].push_back(convert(alleleStats[*a].mismatches)); var.info["insertions"].push_back(convert(alleleStats[*a].insertions)); var.info["deletions"].push_back(convert(alleleStats[*a].deletions)); var.info["transitions"].push_back(convert(alleleStats[*a].transitions)); var.info["transversions"].push_back(convert(alleleStats[*a].transversions)); var.info["deaminations"].push_back(convert(alleleStats[*a].deaminations)); var.info["aminations"].push_back(convert(alleleStats[*a].aminations)); } var.info["type"].push_back(vartype); } cout << var << endl; } // biallelic SNP case if (var.alt.size() == 1 && var.ref.size() == 1 && var.alt.front().size() == 1) { if (isTransition(var.ref, var.alt.front())) { biallelicSNPs.transitions++; } else { biallelicSNPs.transversions++; } biallelicSNPs.mismatches++; } } } while (regionItr != regions.end()); // find the maximum indel size int maxindel = 0; for (map<int, int>::iterator i = insertions.begin(); i != insertions.end(); ++i) { if (i->first > maxindel) { maxindel = i->first; } } for (map<int, int>::iterator i = deletions.begin(); i != deletions.end(); ++i) { if (i->first > maxindel) { maxindel = i->first; } } // and maximum mnp int maxmnp = 0; for (map<int, int>::iterator i = mnps.begin(); i != mnps.end(); ++i) { if (i->first > maxmnp) { maxmnp = i->first; } } // now print the results if (!addTags && !addType) { cout << "total variant sites:\t" << variantSites << endl << "of which " << biallelics << " (" << (double) biallelics / variantSites << ") are biallelic and " << multiallelics << " (" << (double) multiallelics / variantSites << ") are multiallelic" << endl << "total variant alleles:\t" << variantAlleles << endl << "unique variant alleles:\t" << uniqueVariantAlleles << endl << endl << "snps:\t" << snps << endl << "mnps:\t" << totalmnps << endl << "indels:\t" << totalinsertions + totaldeletions << endl << "complex:\t" << totalcomplex << endl << endl << "mismatches:\t" << mismatchbases << endl << endl << "ts/tv ratio:\t" << (double) transitions / (double) transversions << endl << "deamination ratio:\t" << (double) deaminations / aminations << endl << "biallelic snps:\t" << biallelicSNPs.mismatches << " @ " << (double) biallelicSNPs.transitions / (double) biallelicSNPs.transversions << endl; if (lengthFrequency) { cout << endl << "ins/del length frequency distribution" << endl << "length\tins\tdel\tins/del" << endl; for (int i = 1; i <= maxindel; ++i) { int ins = insertions[i]; int del = deletions[i]; cout << i << "\t" << (ins > 0 ? convert(ins) : "" ) << "\t" << (del > 0 ? convert(del) : "") << "\t" << (ins > 0 && del > 0 ? convert((double) ins / (double) del) : "") << endl; } } cout << endl << "insertion alleles / deletion alleles:\t" << (double) totalinsertions / (double) totaldeletions << endl << "inserted bases / deleted bases:\t" << (double) insertedbases / (double) deletedbases << endl << endl; if (lengthFrequency) { cout << "mnp length frequency distribution" << endl << "length\tcount" << endl; for (int i = 2; i <= maxmnp; ++i) { int mnp = mnps[i]; cout << i << "\t" << (mnp > 0 ? convert(mnp) : "") << endl; } } cout << "total bases in mnps:\t" << mnpbases << endl; /* cout << "complex event frequency distribution" << endl << "length\tcount" << endl; for (map<int, int>::iterator i = complexsubs.begin(); i != complexsubs.end(); ++i) { cout << i->first << "\t" << i->second << endl; } */ } return 0; }
int main(int argc, char** argv) { if (argc != 4) { cerr << "usage: " << argv[0] << " <annotation-tag> <vcf file> <vcf file>" << endl << "annotates genotypes in the first file with genotypes in the second" << endl << "adding the genotype as another flag to each sample filed in the first file." << endl << "annotation-tag is the name of the sample flag which is added to store the annotation." << endl << "also adds a 'has_variant' flag for sites where the second file has a variant." << endl; return 1; } string annotag = argv[1]; string filenameA = argv[2]; string filenameB = argv[3]; if (filenameA == filenameB) { cerr << "it won't help to annotate samples with their own genotypes!" << endl; return 1; } VariantCallFile variantFileA; if (filenameA == "-") { variantFileA.open(std::cin); } else { variantFileA.open(filenameA); } VariantCallFile variantFileB; if (filenameB == "-") { variantFileB.open(std::cin); } else { variantFileB.open(filenameB); } if (!variantFileA.is_open() || !variantFileB.is_open()) { return 1; } Variant varA(variantFileA); Variant varB(variantFileB); // while the first file doesn't match the second positionally, // step forward, annotating each genotype record with an empty genotype // when the two match, iterate through the genotypes from the first file // and get the genotypes reported in the second file variantFileA.getNextVariant(varA); variantFileB.getNextVariant(varB); string line = "##INFO=<ID=" + annotag + ".has_variant,Number=0,Type=Flag,Description=\"True if " + annotag + " has a called alternate among samples under comparison.\">"; variantFileA.addHeaderLine(line); line = "##FORMAT=<ID=" + annotag + ",Number=1,Type=String,Description=\"Genotype from " + annotag + ".\">"; variantFileA.addHeaderLine(line); cout << variantFileA.header << endl; do { // this is broken. to do it right, it'll be necessary to get reference ids from the fasta reference used to make the alignments... // if B is NOT done, and is less than A, read new B. if (!variantFileB.done() && (varB.sequenceName != varA.sequenceName || (varB.sequenceName == varA.sequenceName && varB.position < varA.position) || variantFileA.done()) ) { variantFileB.getNextVariant(varB); } // if A is not done- and A is less than B, read A. // should also read if variant B is done. if (!variantFileA.done() && (varA.sequenceName != varB.sequenceName || (varA.sequenceName == varB.sequenceName && varA.position < varB.position) || variantFileB.done()) ) { annotateWithBlankGenotypes(varA, annotag); cout << varA << endl; variantFileA.getNextVariant(varA); } vector<Variant> varsA; vector<Variant> varsB; bool hasMultipleAlts = false; long int thisPosition = 0; string thisSequenceName; if (varA.position == varB.position && varA.sequenceName == varB.sequenceName) { thisPosition = varA.position; thisSequenceName = varA.sequenceName; } while (!variantFileA.done() && !variantFileB.done() && thisPosition == varA.position && thisSequenceName == varA.sequenceName && varA.sequenceName == varB.sequenceName && varA.position == varB.position) { // accumulate all the alts at the current position varsA.push_back(varA); varsB.push_back(varB); if (varA.alt.size() > 1 || varB.alt.size() > 1) hasMultipleAlts = true; variantFileA.getNextVariant(varA); variantFileB.getNextVariant(varB); } // multiple lines per position if (!hasMultipleAlts && (varsA.size() > 1 || varsB.size() > 1)) { map<pair<string, string>, Variant> varsAParsed; map<pair<string, string>, Variant> varsBParsed; for (vector<Variant>::iterator v = varsA.begin(); v != varsA.end(); ++v) { varsAParsed[make_pair(v->ref, v->alt.front())] = *v; } for (vector<Variant>::iterator v = varsB.begin(); v != varsB.end(); ++v) { varsBParsed[make_pair(v->ref, v->alt.front())] = *v; } for (map<pair<string, string>, Variant>::iterator vs = varsAParsed.begin(); vs != varsAParsed.end(); ++vs) { Variant& varA = vs->second; annotateWithBlankGenotypes(varA, annotag); if (varsBParsed.find(make_pair(varA.ref, varA.alt.front())) != varsBParsed.end()) { Variant& varB = varsBParsed[make_pair(varA.ref, varA.alt.front())]; // TODO cleanup annotateWithGenotypes(varA, varB, annotag); varA.infoFlags[annotag + ".has_variant"] = true; } cout << varA << endl; } } else if (!varsA.empty() && !varsB.empty()) { // one line per multi-allelic Variant& varA = varsA.front(); annotateWithBlankGenotypes(varA, annotag); Variant& varB = varsB.front(); annotateWithGenotypes(varA, varB, annotag); // XXX TODO, and also allow for records with multiple alts // XXX assume that if the other file has a corresponding record, some kind of variation was detected at the same site varA.infoFlags[annotag + ".has_variant"] = true; cout << varA << endl; } else { for (vector<Variant>::iterator v = varsA.begin(); v != varsA.end(); ++v) { Variant& varA = *v; annotateWithBlankGenotypes(varA, annotag); cout << varA << endl; } } } while (!variantFileA.done() || !variantFileB.done()); return 0; }
int main(int argc, char** argv) { int c; string sampleField; string infoField; StatType statType = MEAN; if (argc == 1) printSummary(argv); while (true) { static struct option long_options[] = { /* These options set a flag. */ {"help", no_argument, 0, 'h'}, {"field", required_argument, 0, 'f'}, {"info", required_argument, 0, 'i'}, {"average", no_argument, 0, 'a'}, {"median", no_argument, 0, 'm'}, {"min", no_argument, 0, 'n'}, {"max", no_argument, 0, 'x'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; c = getopt_long (argc, argv, "hamnxf:i:", long_options, &option_index); /* Detect the end of the options. */ if (c == -1) break; switch (c) { case 0: /* If this option set a flag, do nothing else now. */ if (long_options[option_index].flag != 0) break; printf ("option %s", long_options[option_index].name); if (optarg) printf (" with arg %s", optarg); printf ("\n"); break; case 'f': sampleField = optarg; break; case 'i': infoField = optarg; break; case 'a': statType = MEAN; break; case 'm': statType = MEDIAN; break; case 'n': statType = MIN; break; case 'x': statType = MAX; break; case 'h': printSummary(argv); exit(0); case '?': /* getopt_long already printed an error message. */ printSummary(argv); exit(1); break; default: abort (); } } if (infoField.empty() || sampleField.empty()) { cerr << "Error: both a sample field and an info field are required." << endl; return 1; } VariantCallFile variantFile; string inputFilename; if (optind == argc - 1) { inputFilename = argv[optind]; variantFile.open(inputFilename); } else { variantFile.open(std::cin); } if (!variantFile.is_open()) { return 1; } string statTypeStr; switch (statType) { case MEAN: statTypeStr = "mean"; break; case MEDIAN: statTypeStr = "median"; break; case MIN: statTypeStr = "min"; break; case MAX: statTypeStr = "max"; break; default: cerr << "Error: failure to convert stat type to string" << endl; return 1; break; } variantFile.addHeaderLine("##INFO=<ID="+infoField+",Number=1,Type=Float,Description=\"Summary statistic generated by"+statTypeStr+" of per-sample values of "+sampleField+" \">"); cout << variantFile.header << endl; Variant var(variantFile); while (variantFile.getNextVariant(var)) { vector<double> vals; for (map<string, map<string, vector<string> > >::iterator s = var.samples.begin(); s != var.samples.end(); ++s) { map<string, vector<string> >& sample = s->second; if (sample.find(sampleField) != sample.end()) { double val; string& s = sample[sampleField].front(); if (sample[sampleField].size() > 1) { cerr << "Error: cannot handle sample fields with multiple values" << endl; return 1; } convert(s, val); vals.push_back(val); } } double result; switch (statType) { case MEAN: result = mean(vals); break; case MEDIAN: result = median(vals); break; case MIN: result = *min_element(vals.begin(), vals.end()); break; case MAX: result = *max_element(vals.begin(), vals.end()); break; default: cerr << "Error: unrecognized StatType" << endl; return 1; break; } var.info[infoField].clear(); var.info[infoField].push_back(convert(result)); cout << var << endl; } return 0; }