int main(int argc, char** argv) { globalOpts.threads = 1 ; globalOpts.af = 0.05; // zero based index for the target and background indivudals map<int, int> it, ib; const struct option longopts[] = { {"version" , 0, 0, 'v'}, {"help" , 0, 0, 'h'}, {"file" , 1, 0, 'f'}, {"target" , 1, 0, 't'}, {"region" , 1, 0, 'r'}, {"gen" , 1, 0, 'g'}, {"type" , 1, 0, 'y'}, {"threads" , 1, 0, 'x'}, {"af" , 1, 0, 'a'}, {"pos" , 1, 0, 'p'}, {0,0,0,0} }; int findex; int iarg=0; while(iarg != -1) { iarg = getopt_long(argc, argv, "a:x:g:y:r:d:t:b:f:p:hv", longopts, &findex); switch (iarg) { case 'p': { globalOpts.pos = atoi(optarg); break; } case 'a': { globalOpts.af = atof(optarg); break; } case 'x': { globalOpts.threads = atoi(optarg); break; } case 'g': { globalOpts.geneticMapFile = optarg; break; } case 'h': { printHelp(); break; } case 'v': { printVersion(); break; } case 'y': { globalOpts.type = optarg; break; } case 't': { loadIndices(it, optarg); cerr << "INFO: there are " << it.size() << " individuals in the target" << endl; cerr << "INFO: target ids: " << optarg << endl; break; } case 'f': { cerr << "INFO: file: " << optarg << endl; globalOpts.filename = optarg; break; } case 'r': { cerr << "INFO: set seqid region to : " << optarg << endl; globalOpts.region = optarg; break; default: break; } } } #if defined HAS_OPENMP omp_set_num_threads(globalOpts.threads); #endif map<string, int> okayGenotypeLikelihoods; okayGenotypeLikelihoods["PL"] = 1; okayGenotypeLikelihoods["GL"] = 1; okayGenotypeLikelihoods["GP"] = 1; okayGenotypeLikelihoods["GT"] = 1; // add an option for dumping // for(std::map<int, double>::iterator gm = geneticMap.begin(); gm != geneticMap.end(); gm++){ // cerr << "pos: " << gm->first << " cm: " << gm->second << endl; // } if(globalOpts.type.empty()){ cerr << "FATAL: failed to specify genotype likelihood format : PL or GL" << endl; printHelp(); exit(1); } if(okayGenotypeLikelihoods.find(globalOpts.type) == okayGenotypeLikelihoods.end()){ cerr << "FATAL: genotype likelihood is incorrectly formatted, only use: PL or GL" << endl; printHelp(); exit(1); } if(globalOpts.filename.empty()){ cerr << "FATAL: did not specify a file" << endl; printHelp(); exit(1); } if(it.size() < 2){ cerr << "FATAL: target option is required -- or -- less than two individuals in target\n"; printHelp(); exit(1); } // using vcflib; thanksErik VariantCallFile variantFile; variantFile.open(globalOpts.filename); if(globalOpts.region.empty()){ cerr << "FATAL: region required" << endl; exit(1); } if(! variantFile.setRegion(globalOpts.region)){ cerr <<"FATAL: unable to set region" << endl; exit(1); } if (!variantFile.is_open()) { exit(1); } Variant var( variantFile ); vector<int> target_h, background_h; int index = 0; int indexi = 0; vector<string> samples = variantFile.sampleNames; int nsamples = samples.size(); for(vector<string>::iterator samp = samples.begin(); samp != samples.end(); samp++){ string sampleName = (*samp); if(it.find(index) != it.end() ){ target_h.push_back(indexi); indexi++; } index++; } vector<long int> positions; vector<double> afs; string **haplotypes = new string*[target_h.size()]; for (int i = 0; i < target_h.size(); i++) { haplotypes[i] = new string[2]; } while (variantFile.getNextVariant(var)) { globalOpts.seqid = var.sequenceName; if(!var.isPhased()){ cerr << "FATAL: Found an unphased variant. All genotypes must be phased!" << endl; exit(1); } if(var.alleles.size() > 2){ continue; } vector < map< string, vector<string> > > target, background, total; int sindex = 0; for(int nsamp = 0; nsamp < nsamples; nsamp++){ map<string, vector<string> > sample = var.samples[ samples[nsamp]]; if(it.find(sindex) != it.end() ){ target.push_back(sample); } sindex += 1; } genotype * populationTarget ; if(globalOpts.type == "PL"){ populationTarget = new pl(); } if(globalOpts.type == "GL"){ populationTarget = new gl(); } if(globalOpts.type == "GP"){ populationTarget = new gp(); } if(globalOpts.type == "GT"){ populationTarget = new gt(); } populationTarget->loadPop(target, var.sequenceName, var.position); if(populationTarget->af <= globalOpts.af || populationTarget->nref < 2 || populationTarget->nalt < 2){ delete populationTarget; continue; } positions.push_back(var.position); afs.push_back(populationTarget->af); loadPhased(haplotypes, populationTarget, populationTarget->gts.size()); populationTarget = NULL; delete populationTarget; } if(!globalOpts.geneticMapFile.empty()){ cerr << "INFO: loading genetics map" << endl; loadGeneticMap(positions.front(), positions.back()); cerr << "INFO: finished loading genetics map" << endl; } calc(haplotypes, target_h.size(), afs, positions, target_h, background_h, globalOpts.seqid); clearHaplotypes(haplotypes, target_h.size()); exit(0); }
int main(int argc, char** argv) { int c; bool invert = false; bool logicalOr = false; bool filterSites = false; vector<string> infofilterStrs; vector<VariantFilter> infofilters; vector<string> genofilterStrs; vector<VariantFilter> genofilters; string tag = ""; string filterSpec; string alleleTag; vector<string> regions; if (argc == 1) printSummary(argv); while (true) { static struct option long_options[] = { /* These options set a flag. */ //{"verbose", no_argument, &verbose_flag, 1}, {"help", no_argument, 0, 'h'}, {"filter-sites", no_argument, 0, 's'}, {"info-filter", required_argument, 0, 'f'}, {"genotype-filter", required_argument, 0, 'g'}, {"tag", required_argument, 0, 't'}, {"allele-tag", required_argument, 0, 'a'}, {"invert", no_argument, 0, 'v'}, {"or", no_argument, 0, 'o'}, {"region", required_argument, 0, 'r'}, //{"length", no_argument, &printLength, true}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; c = getopt_long (argc, argv, "hvsof:g:t:r:a:", long_options, &option_index); /* Detect the end of the options. */ if (c == -1) break; switch (c) { case 0: /* If this option set a flag, do nothing else now. */ if (long_options[option_index].flag != 0) break; printf ("option %s", long_options[option_index].name); if (optarg) printf (" with arg %s", optarg); printf ("\n"); break; case 'f': filterSpec += " " + string(optarg); infofilterStrs.push_back(string(optarg)); break; case 's': filterSites = true; break; case 'a': alleleTag = optarg; break; case 'g': filterSpec += " genotypes filtered with: " + string(optarg); genofilterStrs.push_back(string(optarg)); break; case 't': tag = optarg; break; case 'h': printSummary(argv); exit(0); break; case 'v': invert = true; break; case 'o': logicalOr = true; break; case 'r': regions.push_back(optarg); break; case '?': /* getopt_long already printed an error message. */ printSummary(argv); exit(1); break; default: abort (); } } filterSpec = filterSpec.substr(1); // strip leading " " VariantCallFile variantFile; string inputFilename; if (optind == argc - 1) { inputFilename = argv[optind]; variantFile.open(inputFilename); } else { variantFile.open(std::cin); } if (!variantFile.is_open()) { return 1; } for (vector<string>::iterator f = infofilterStrs.begin(); f != infofilterStrs.end(); ++f) { infofilters.push_back(VariantFilter(*f, VariantFilter::RECORD, variantFile.infoTypes)); } for (vector<string>::iterator f = genofilterStrs.begin(); f != genofilterStrs.end(); ++f) { genofilters.push_back(VariantFilter(*f, VariantFilter::SAMPLE, variantFile.formatTypes)); } vector<string> headerlines = split(variantFile.header, "\n"); variantFile.header.clear(); for (vector<string>::iterator l = headerlines.begin(); l != headerlines.end(); ++l) { if (!filterSpec.empty() && (l->find("INFO") != string::npos || l + 1 == headerlines.end())) { variantFile.header += "##filter=\"" + filterSpec + "\"\n"; filterSpec.clear(); } variantFile.header += *l + ((l + 1 == headerlines.end()) ? "" : "\n"); } if (!alleleTag.empty()) { variantFile.addHeaderLine("##INFO=<ID="+ alleleTag +",Number=A,Type=String,Description=\"" + tag + " if this allele passes the filters, '.' if not, filters are: " + filterSpec + ".\">"); } cout << variantFile.header << endl; /* if (genofilters.empty() && tag.empty()) { variantFile.parseSamples = false; } */ Variant var(variantFile); vector<string>::iterator regionItr = regions.begin(); do { if (!inputFilename.empty() && !regions.empty()) { string regionStr = *regionItr++; variantFile.setRegion(regionStr); } while (variantFile.getNextVariant(var)) { if (!genofilters.empty()) { for (vector<VariantFilter>::iterator f = genofilters.begin(); f != genofilters.end(); ++f) { f->removeFilteredGenotypes(var); } } if (!infofilters.empty()) { if (filterSites) { bool passes = passesFilters(var, infofilters, logicalOr); if (invert) { passes = !passes; } if (passes) { if (!tag.empty()) { if (alleleTag.empty()) { var.addFilter(tag); } else { var.info[alleleTag].clear(); for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { var.info[alleleTag].push_back(tag); } } cout << var << endl; } else { cout << var << endl; } } else if (!tag.empty()) { cout << var << endl; } } else { // filter out alleles which pass // removes the failing alleles vector<string> failingAlts; vector<string> passingAlts; vector<bool> passes; for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { if (!passesFilters(var, infofilters, logicalOr, *a)) { failingAlts.push_back(*a); passes.push_back(false); } else { passingAlts.push_back(*a); passes.push_back(true); } } if (tag.empty()) { // if there is no specified tag, just remove the failing alts if (failingAlts.size() < var.alt.size()) { for (vector<string>::iterator a = failingAlts.begin(); a != failingAlts.end(); ++a) { var.removeAlt(*a); } cout << var << endl; } } else { // otherwise, apply the tag if (alleleTag.empty()) { if (!passingAlts.empty()) { var.addFilter(tag); } } else { var.info[alleleTag].clear(); for (vector<bool>::iterator p = passes.begin(); p != passes.end(); ++p) { if (*p) { var.info[alleleTag].push_back(tag); } else { var.info[alleleTag].push_back("."); } } } cout << var << endl; } } } else { if (genofilters.empty()) { cout << variantFile.line << endl; } else { cout << var << endl; } } } } while (regionItr != regions.end()); return 0; }
int main(int argc, char** argv) { vector<string> regions; int c; while (true) { static struct option long_options[] = { /* These options set a flag. */ //{"verbose", no_argument, &verbose_flag, 1}, {"help", no_argument, 0, 'h'}, {"region", required_argument, 0, 'r'}, //{"length", no_argument, &printLength, true}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; c = getopt_long (argc, argv, "hr:", long_options, &option_index); /* Detect the end of the options. */ if (c == -1) break; switch (c) { case 0: /* If this option set a flag, do nothing else now. */ if (long_options[option_index].flag != 0) break; printf ("option %s", long_options[option_index].name); if (optarg) printf (" with arg %s", optarg); printf ("\n"); break; case 'h': printSummary(argv); exit(0); break; case 'r': regions.push_back(optarg); break; default: abort (); } } VariantCallFile variantFile; string inputFilename; if (optind == argc - 1) { inputFilename = argv[optind]; variantFile.open(inputFilename); } else { variantFile.open(std::cin); } if (!variantFile.is_open()) { return 1; } Variant var(variantFile); vector<string>::iterator regionItr = regions.begin(); int variantAlleles = 0; int variantSites = 0; int snps = 0; int transitions = 0; int transversions = 0; int totalinsertions = 0; int totaldeletions = 0; int insertedbases = 0; int deletedbases = 0; int totalmnps = 0; int totalcomplex = 0; map<int, int> insertions; map<int, int> deletions; map<int, int> mnps; map<int, int> complexsubs; do { if (!inputFilename.empty() && !regions.empty()) { string regionStr = *regionItr++; variantFile.setRegion(regionStr); } while (variantFile.getNextVariant(var)) { ++variantSites; map<string, vector<VariantAllele> > alternates = var.parsedAlternates(); for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { ++variantAlleles; string& alternate = *a; if (var.ref.size() == alternate.size()) { if (var.ref.size() == 1) { ++snps; if (isTransition(var.ref, alternate)) { ++transitions; } else { ++transversions; } } else { ++totalmnps; if (alternates[alternate].size() > 1) { } else { VariantAllele& va = alternates[alternate].front(); ++mnps[va.alt.size()]; // not entirely correct } } } else if (var.ref.size() > alternate.size()) { int diff = var.ref.size() - alternate.size(); deletedbases += diff; if (alternates[alternate].size() > 1) { ++totalcomplex; ++complexsubs[-diff]; } else { ++totaldeletions; ++deletions[diff]; } } else { int diff = alternate.size() - var.ref.size(); insertedbases += diff; if (alternates[alternate].size() > 1) { ++totalcomplex; ++complexsubs[diff]; } else { ++totalinsertions; ++insertions[diff]; } } } } } while (regionItr != regions.end()); // find the maximum indel size int maxindel = 0; for (map<int, int>::iterator i = insertions.begin(); i != insertions.end(); ++i) { if (i->first > maxindel) { maxindel = i->first; } } for (map<int, int>::iterator i = deletions.begin(); i != deletions.end(); ++i) { if (i->first > maxindel) { maxindel = i->first; } } // and maximum mnp int maxmnp = 0; for (map<int, int>::iterator i = mnps.begin(); i != mnps.end(); ++i) { if (i->first > maxmnp) { maxmnp = i->first; } } // now print the results cout << "total variant sites:\t" << variantSites << endl << "total variant alleles:\t" << variantAlleles << endl << endl << "snps:\t" << snps << endl << "indels:\t" << totalinsertions + totaldeletions << endl << "mnps:\t" << totalmnps << endl << "complex:\t" << totalcomplex << endl << endl << "ts/tv ratio:\t" << (double) transitions / (double) transversions << endl << endl << "ins/del length frequency distribution" << endl << "length\tins\tdel\tins/del" << endl; for (int i = 1; i <= maxindel; ++i) { int ins = insertions[i]; int del = deletions[i]; cout << i << "\t" << (ins > 0 ? convert(ins) : "" ) << "\t" << (del > 0 ? convert(del) : "") << "\t" << (ins > 0 && del > 0 ? convert((double) ins / (double) del) : "") << endl; } cout << endl << "insertion alleles / deletion alleles:\t" << (double) totalinsertions / (double) totaldeletions << endl << "inserted bases / deleted bases:\t" << (double) insertedbases / (double) deletedbases << endl << endl << "mnp length frequency distribution" << endl << "length\tcount" << endl; for (int i = 2; i <= maxmnp; ++i) { int mnp = mnps[i]; cout << i << "\t" << (mnp > 0 ? convert(mnp) : "") << endl; } cout << endl; cout << "complex event frequency distribution" << endl << "length\tcount" << endl; for (map<int, int>::iterator i = complexsubs.begin(); i != complexsubs.end(); ++i) { cout << i->first << "\t" << i->second << endl; } return 0; }
int main(int argc, char** argv) { // set the random seed for MCMC srand((unsigned)time(NULL)); // the filename string filename = "NA"; // set region to scaffold string region = "NA"; // using vcflib; thanks to Erik Garrison VariantCallFile variantFile; // zero based index for the target and background indivudals map<int, int> it, ib; // deltaaf is the difference of allele frequency we bother to look at // ancestral state is set to zero by default string mut = "1"; int counts = 0; // phased int phased = 0; const struct option longopts[] = { {"version" , 0, 0, 'v'}, {"help" , 0, 0, 'h'}, {"file" , 1, 0, 'f'}, {"target" , 1, 0, 't'}, {"background", 1, 0, 'b'}, {"deltaaf" , 1, 0, 'd'}, {"region" , 1, 0, 'r'}, {"mutation" , 1, 0, 'm'}, {"phased" , 1, 0, 'p'}, {0,0,0,0} }; int findex; int iarg=0; while(iarg != -1) { iarg = getopt_long(argc, argv, "p:m:r:d:t:b:f:hv", longopts, &findex); switch (iarg) { case 'h': cerr << endl << endl; cerr << "INFO: help" << endl; cerr << "INFO: description:" << endl; cerr << " gl-XPEHH estimates haplotype decay between the target and background populations. SNVs are integrated " << endl; cerr << " until EHH in the target and background is less than 0.05. The score is the itegrated EHH (target) / integrated EHH (background). " << endl; cerr << " gl-XPEHH does NOT integrate over genetic distance, as genetic maps are not availible for most non-model organisms. " << endl; cerr << " gl-XPEHH phases genotypes, imuputes missing genotypes, and changes poor quality genotypes. Phasing is done in a sliding window " << endl; cerr << " with a stochastic search, therefore, every time gl-XPEHH is run it will generate slightly different results. " << endl; cerr << "Output : 4 columns : " << endl; cerr << " 1. seqid " << endl; cerr << " 2. position " << endl; cerr << " 3. xp-ehh " << endl; cerr << " 4. iHS " << endl << endl; cerr << "INFO: gl-XPEHH --target 0,1,2,3,4,5,6,7 --background 11,12,13,16,17,19,22 --file my.vcf --deltaaf 0.1 --ancestral 0 " << endl; cerr << endl; cerr << "INFO: required: r,region -- a genomice range to calculate gl-XPEHH on in the format : \"seqid:start-end]\" or \"seqid\" " << endl; cerr << "INFO: required: t,target -- a zero base comma seperated list of target individuals corrisponding to VCF columns " << endl; cerr << "INFO: required: b,background -- a zero base comma seperated list of background individuals corrisponding to VCF columns " << endl; cerr << "INFO: required: f,file a -- proper formatted VCF. the FORMAT field MUST contain \"PL\" if option phased == 0 " << endl; cerr << "INFO: optional: m,mutation -- which state is derived in vcf [0,1] default is 1 " << endl; cerr << "INFO: optional: p,phased -- phasing flag [0,1] 0 = phase vcf, 1 = vcf is already phased " << endl; cerr << endl; cerr << "INFO: version 1.0.1 ; date: April 2014 ; author: Zev Kronenberg; email : [email protected] " << endl; cerr << endl << endl; return 0; case 'v': cerr << endl << endl; cerr << "INFO: version 1.0.1 ; date: April 2014 ; author: Zev Kronenberg; email : [email protected] " << endl; return 0; case 'p': phased = atoi(optarg); cerr << "INFO: setting phase to: " << phased << endl; break; case 'm': mut = optarg; cerr << "INFO: derived state set to " << mut << endl; break; case 't': loadIndices(it, optarg); cerr << "INFO: there are " << it.size() << " individuals in the target" << endl; cerr << "INFO: target ids: " << optarg << endl; break; case 'b': loadIndices(ib, optarg); cerr << "INFO: there are " << ib.size() << " individuals in the background" << endl; cerr << "INFO: background ids: " << optarg << endl; break; case 'f': cerr << "INFO: file: " << optarg << endl; filename = optarg; break; case 'r': cerr << "INFO: set seqid region to : " << optarg << endl; region = optarg; break; default: break; } } if(filename == "NA"){ cerr << "FATAL: did not specify a file" << endl; cerr << "INFO: please use gl-XPEHH --help" << endl; return(1); } variantFile.open(filename); if(region == "NA"){ cerr << "FATAL: did not specify a region" << endl; cerr << "INFO: please use gl-XPEHH --help" << endl; } if(region != "NA"){ variantFile.setRegion(region); } if (!variantFile.is_open()) { return 1; } Variant var(variantFile); vector<string> samples = variantFile.sampleNames; vector<int> target_h, background_h; int index, indexi = 0; cerr << "INFO: there are " << samples.size() << " individuals in the VCF" << endl; if(samples.size() == 0){ cerr << "FATAL: too few samples or no VCF header" << endl; cerr << "INFO: please use gl-XPEHH --help" << endl; return(1); } for(vector<string>::iterator samp = samples.begin(); samp != samples.end(); samp++){ if(it.find(index) != it.end() ){ target_h.push_back(indexi); indexi++; } if(ib.find(index) != ib.end()){ background_h.push_back(indexi); indexi++; } index++; } list< pop > tdat, bdat, zdat; vector<long int> positions; string haplotypes [it.size() + ib.size()][2]; string seqid; while (variantFile.getNextVariant(var)) { map<string, map<string, vector<string> > >::iterator s = var.samples.begin(); map<string, map<string, vector<string> > >::iterator sEnd = var.samples.end(); // biallelic sites naturally if(var.alt.size() > 1){ continue; } vector < map< string, vector<string> > > target, background, total; int sindex = 0; for (; s != sEnd; s++) { map<string, vector<string> >& sample = s->second; if(it.find(sindex) != it.end() ){ target.push_back(sample); total.push_back(sample); } if(ib.find(sindex) != ib.end()){ background.push_back(sample); total.push_back(sample); } sindex += 1; } seqid = var.sequenceName; pop popt, popb, popz; initPop(popt); initPop(popb); initPop(popz); loadPop(target, popt, var.sequenceName, var.position, phased ); loadPop(background, popb, var.sequenceName, var.position, phased ); loadPop(total, popz, var.sequenceName, var.position, phased ); if(popt.af == -1 || popb.af == -1){ continue; } if(popz.af > 0.95 || popz.af < 0.05){ continue; } if(popt.af == 0 && popb.af == 1){ continue; } if(popt.af == 1 && popb.af == 0){ continue; } tdat.push_back(popt); bdat.push_back(popb); zdat.push_back(popz); positions.push_back(var.position); counts += 1; if(counts >= 1000){ cerr << "INFO: processed " << haplotypes[0][0].size() << " SNPs; current location : " << var.position << endl; counts = 0; } while(zdat.size() >= 15 && !zdat.empty()){ if(phased == 0){ localPhase(haplotypes, zdat, (it.size() + ib.size())); } else{ loadPhased(haplotypes, zdat, (it.size() + ib.size())); } while(!zdat.empty()){ zdat.pop_front(); } } } if(phased == 0){ localPhase(haplotypes, zdat, (it.size() + ib.size())); } else{ loadPhased(haplotypes, zdat, (it.size() + ib.size())); } while(!zdat.empty()){ zdat.pop_front(); } cerr << "INFO: phasing done" << endl; calc(haplotypes, (it.size() + ib.size()), positions, target_h, background_h, mut, seqid); cerr << "INFO: gl-XPEHH finished" << endl; return 0; }
int main(int argc, char** argv) { vector<string> regions; bool addTags = false; bool addType = false; bool lengthFrequency = true; // constants for SmithWaterman algorithm float matchScore = 10.0f; float mismatchScore = -9.0f; float gapOpenPenalty = 15.0f; float gapExtendPenalty = 6.66f; bool useReferenceAlignment = false; int c; while (true) { static struct option long_options[] = { /* These options set a flag. */ //{"verbose", no_argument, &verbose_flag, 1}, {"help", no_argument, 0, 'h'}, {"region", required_argument, 0, 'r'}, {"add-info", no_argument, 0, 'a'}, {"add-type", no_argument, 0, 't'}, {"no-length-frequency", no_argument, 0, 'l'}, {"match-score", required_argument, 0, 'm'}, {"mismatch-score", required_argument, 0, 'x'}, {"gap-open-penalty", required_argument, 0, 'o'}, {"gap-extend-penalty", required_argument, 0, 'e'}, //{"length", no_argument, &printLength, true}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; c = getopt_long (argc, argv, "hlatr:m:x:o:e:", long_options, &option_index); /* Detect the end of the options. */ if (c == -1) break; switch (c) { case 0: /* If this option set a flag, do nothing else now. */ if (long_options[option_index].flag != 0) break; printf ("option %s", long_options[option_index].name); if (optarg) printf (" with arg %s", optarg); printf ("\n"); break; case 'h': printSummary(argv); exit(0); break; case 'r': regions.push_back(optarg); break; case 'l': lengthFrequency = false; break; case 'a': addTags = true; break; case 't': addType = true; break; case 'm': matchScore = atof(optarg); break; case 'x': mismatchScore = atof(optarg); break; case 'o': gapOpenPenalty = atof(optarg); break; case 'e': gapExtendPenalty = atof(optarg); break; default: abort (); } } VariantCallFile variantFile; string inputFilename; if (optind == argc - 1) { inputFilename = argv[optind]; variantFile.open(inputFilename); } else { variantFile.open(std::cin); } if (!variantFile.is_open()) { return 1; } if (addType && !addTags) { variantFile.addHeaderLine("##INFO=<ID=type,Number=A,Type=String,Description=\"The type of the allele, either snp, ins, del, complex, or ref.\">"); variantFile.addHeaderLine("##INFO=<ID=cigar,Number=A,Type=String,Description=\"The CIGAR-style representation of the alternate allele as aligned to the reference\">"); cout << variantFile.header << endl; } if (addTags) { variantFile.addHeaderLine("##INFO=<ID=transitions,Number=A,Type=Integer,Description=\"Total number of transitions in the alternate allele\">"); variantFile.addHeaderLine("##INFO=<ID=transversions,Number=A,Type=Integer,Description=\"Total number of transversions in the alternate allele\">"); variantFile.addHeaderLine("##INFO=<ID=deaminations,Number=A,Type=Integer,Description=\"Total number of deaminations in the alternate allele\">"); variantFile.addHeaderLine("##INFO=<ID=aminations,Number=A,Type=Integer,Description=\"Total number of aminations in the alternate allele\">"); variantFile.addHeaderLine("##INFO=<ID=mismatches,Number=A,Type=Integer,Description=\"Total number of mismatches in the alternate allele\">"); variantFile.addHeaderLine("##INFO=<ID=insertions,Number=A,Type=Integer,Description=\"Total number of inserted bases in the alternate allele\">"); variantFile.addHeaderLine("##INFO=<ID=deletions,Number=A,Type=Integer,Description=\"Total number of deleted bases in the alternate allele\">"); variantFile.addHeaderLine("##INFO=<ID=cigar,Number=A,Type=String,Description=\"The CIGAR-style representation of the alternate allele as aligned to the reference\">"); variantFile.addHeaderLine("##INFO=<ID=type,Number=A,Type=String,Description=\"The type of the allele, either snp, ins, del, complex, or ref.\">"); variantFile.addHeaderLine("##INFO=<ID=reflen,Number=1,Type=Integer,Description=\"The length of the reference allele\">"); variantFile.addHeaderLine("##INFO=<ID=altlen,Number=A,Type=Integer,Description=\"The length of the alternate allele\">"); cout << variantFile.header << endl; } Variant var(variantFile); vector<string>::iterator regionItr = regions.begin(); int variantAlleles = 0; int uniqueVariantAlleles = 0; int variantSites = 0; int snps = 0; int transitions = 0; int transversions = 0; int deaminations = 0; int aminations = 0; int totalinsertions = 0; int totaldeletions = 0; int insertedbases = 0; int deletedbases = 0; int totalmnps = 0; int totalcomplex = 0; int mismatchbases = 0; int mnpbases = 0; int biallelics = 0; int multiallelics = 0; map<int, int> insertions; map<int, int> deletions; map<int, int> mnps; map<int, int> complexsubs; bool includePreviousBaseForIndels = false; bool useMNPs = true; bool useEntropy = false; AlleleStats biallelicSNPs; // todo, add biallelic snp dialog to output and ts/tv for snps and mnps do { if (!inputFilename.empty() && !regions.empty()) { string regionStr = *regionItr++; variantFile.setRegion(regionStr); } while (variantFile.getNextVariant(var)) { ++variantSites; if (var.alt.size() > 1) { ++multiallelics; } else { ++biallelics; } map<string, vector<VariantAllele> > alternates = var.parsedAlternates(includePreviousBaseForIndels, useMNPs, useEntropy, matchScore, mismatchScore, gapOpenPenalty, gapExtendPenalty); map<VariantAllele, vector<string> > uniqueVariants; vector<string> cigars; for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { string& alternate = *a; if (addTags) var.info["altlen"].push_back(convert(alternate.size())); vector<VariantAllele>& vav = alternates[alternate]; if (vav.size() > 1) { // check that there are actually multiple non-reference alleles int nonRefAlleles = 0; for (vector<VariantAllele>::iterator z = vav.begin(); z != vav.end(); ++z) { if (z->ref != z->alt) ++nonRefAlleles; } if (nonRefAlleles > 1) ++totalcomplex; } for (vector<VariantAllele>::iterator v = vav.begin(); v != vav.end(); ++v) { uniqueVariants[*v].push_back(alternate); } if (addTags || addType) { string cigar; pair<int, string> element; for (vector<VariantAllele>::iterator v = vav.begin(); v != vav.end(); ++v) { VariantAllele& va = *v; if (va.ref != va.alt) { if (element.second == "M") { cigar += convert(element.first) + element.second; element.second = ""; element.first = 0; } if (va.ref.size() == va.alt.size()) { cigar += convert(va.ref.size()) + "X"; } else if (va.ref.size() > va.alt.size()) { cigar += convert(va.ref.size() - va.alt.size()) + "D"; } else { cigar += convert(va.alt.size() - va.ref.size()) + "I"; } } else { if (element.second == "M") { element.first += va.ref.size(); } else { element = make_pair(va.ref.size(), "M"); } } } if (element.second == "M") { cigar += convert(element.first) + element.second; } element.second = ""; element.first = 0; cigars.push_back(cigar); } } if (addTags) { var.info["cigar"] = cigars; var.info["reflen"].push_back(convert(var.ref.size())); } else if (addType) { var.info["cigar"] = cigars; } variantAlleles += var.alt.size(); map<string, AlleleStats> alleleStats; for (map<VariantAllele, vector<string> >::iterator v = uniqueVariants.begin(); v != uniqueVariants.end(); ++v) { const VariantAllele& va = v->first; vector<string>& alternates = v->second; if (!(addTags || addType)) { // don't add any tag information if we're not going to output it alternates.clear(); } if (va.ref != va.alt) { ++uniqueVariantAlleles; if (va.ref.size() == va.alt.size()) { if (va.ref.size() == 1) { ++snps; ++mismatchbases; for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) { ++alleleStats[*a].mismatches; } if (isTransition(va.ref, va.alt)) { ++transitions; for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) { ++alleleStats[*a].transitions; } } else { ++transversions; for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) { ++alleleStats[*a].transversions; } } if (isAmination(va.ref, va.alt)) { ++aminations; for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) { ++alleleStats[*a].aminations; } } if (isDeamination(va.ref, va.alt)) { ++deaminations; for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) { ++alleleStats[*a].deaminations; } } } else { ++totalmnps; ++mnps[va.alt.size()]; // not entirely correct for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) { alleleStats[*a].mismatches += va.alt.size(); } string::const_iterator r = va.ref.begin(); for (string::const_iterator a = va.alt.begin(); a != va.alt.end(); ++a, ++r) { string rstr = string(1, *r); string astr = string(1, *a); if (rstr == astr) { continue; } if (isTransition(rstr, astr)) { ++transitions; for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) { ++alleleStats[*a].transitions; } } else { ++transversions; for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) { ++alleleStats[*a].transversions; } } if (isAmination(rstr, astr)) { ++aminations; for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) { ++alleleStats[*a].aminations; } } if (isDeamination(rstr, astr)) { ++deaminations; for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) { ++alleleStats[*a].deaminations; } } ++mismatchbases; ++mnpbases; } } } else if (va.ref.size() > va.alt.size()) { int diff = va.ref.size() - va.alt.size(); deletedbases += diff; ++totaldeletions; ++deletions[diff]; for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) { alleleStats[*a].deletedbases += diff; alleleStats[*a].deletions += 1; } } else { int diff = va.alt.size() - va.ref.size(); insertedbases += diff; ++totalinsertions; ++insertions[diff]; for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) { alleleStats[*a].insertedbases += diff; alleleStats[*a].insertions += 1; } } } } if (addTags || addType) { for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { string vartype; if (alleleStats[*a].insertions + alleleStats[*a].deletions == 0) { if (alleleStats[*a].mismatches == 1) { vartype = "snp"; } else if (alleleStats[*a].mismatches > 1) { vartype = "complex"; } else { vartype = "ref"; } } else if (alleleStats[*a].insertions + alleleStats[*a].deletions == 1) { if (alleleStats[*a].insertions == 1) { vartype = "ins"; } else { vartype = "del"; } } else { vartype = "complex"; } if (addTags) { var.info["mismatches"].push_back(convert(alleleStats[*a].mismatches)); var.info["insertions"].push_back(convert(alleleStats[*a].insertions)); var.info["deletions"].push_back(convert(alleleStats[*a].deletions)); var.info["transitions"].push_back(convert(alleleStats[*a].transitions)); var.info["transversions"].push_back(convert(alleleStats[*a].transversions)); var.info["deaminations"].push_back(convert(alleleStats[*a].deaminations)); var.info["aminations"].push_back(convert(alleleStats[*a].aminations)); } var.info["type"].push_back(vartype); } cout << var << endl; } // biallelic SNP case if (var.alt.size() == 1 && var.ref.size() == 1 && var.alt.front().size() == 1) { if (isTransition(var.ref, var.alt.front())) { biallelicSNPs.transitions++; } else { biallelicSNPs.transversions++; } biallelicSNPs.mismatches++; } } } while (regionItr != regions.end()); // find the maximum indel size int maxindel = 0; for (map<int, int>::iterator i = insertions.begin(); i != insertions.end(); ++i) { if (i->first > maxindel) { maxindel = i->first; } } for (map<int, int>::iterator i = deletions.begin(); i != deletions.end(); ++i) { if (i->first > maxindel) { maxindel = i->first; } } // and maximum mnp int maxmnp = 0; for (map<int, int>::iterator i = mnps.begin(); i != mnps.end(); ++i) { if (i->first > maxmnp) { maxmnp = i->first; } } // now print the results if (!addTags && !addType) { cout << "total variant sites:\t" << variantSites << endl << "of which " << biallelics << " (" << (double) biallelics / variantSites << ") are biallelic and " << multiallelics << " (" << (double) multiallelics / variantSites << ") are multiallelic" << endl << "total variant alleles:\t" << variantAlleles << endl << "unique variant alleles:\t" << uniqueVariantAlleles << endl << endl << "snps:\t" << snps << endl << "mnps:\t" << totalmnps << endl << "indels:\t" << totalinsertions + totaldeletions << endl << "complex:\t" << totalcomplex << endl << endl << "mismatches:\t" << mismatchbases << endl << endl << "ts/tv ratio:\t" << (double) transitions / (double) transversions << endl << "deamination ratio:\t" << (double) deaminations / aminations << endl << "biallelic snps:\t" << biallelicSNPs.mismatches << " @ " << (double) biallelicSNPs.transitions / (double) biallelicSNPs.transversions << endl; if (lengthFrequency) { cout << endl << "ins/del length frequency distribution" << endl << "length\tins\tdel\tins/del" << endl; for (int i = 1; i <= maxindel; ++i) { int ins = insertions[i]; int del = deletions[i]; cout << i << "\t" << (ins > 0 ? convert(ins) : "" ) << "\t" << (del > 0 ? convert(del) : "") << "\t" << (ins > 0 && del > 0 ? convert((double) ins / (double) del) : "") << endl; } } cout << endl << "insertion alleles / deletion alleles:\t" << (double) totalinsertions / (double) totaldeletions << endl << "inserted bases / deleted bases:\t" << (double) insertedbases / (double) deletedbases << endl << endl; if (lengthFrequency) { cout << "mnp length frequency distribution" << endl << "length\tcount" << endl; for (int i = 2; i <= maxmnp; ++i) { int mnp = mnps[i]; cout << i << "\t" << (mnp > 0 ? convert(mnp) : "") << endl; } } cout << "total bases in mnps:\t" << mnpbases << endl; /* cout << "complex event frequency distribution" << endl << "length\tcount" << endl; for (map<int, int>::iterator i = complexsubs.begin(); i != complexsubs.end(); ++i) { cout << i->first << "\t" << i->second << endl; } */ } return 0; }
int main(int argc, char** argv) { bool snp = false; // set the random seed for MCMC srand((unsigned)time(NULL)); // the filename string filename; // open standardout // set region to scaffold string region = "NA"; // using vcflib; thanks to Erik Garrison VariantCallFile variantFile; // zero based index for the target and background indivudals map<int, int> it, ib; // genotype likelihood format string type = "NA"; // are we polarizing the counts relative to the ancestral allele? bool use_ancestral_state = false; set<char> allowed_ancestral_bases = { 'A', 'T', 'C', 'G' }; const struct option longopts[] = { {"version" , 0, 0, 'v'}, {"help" , 0, 0, 'h'}, {"file" , 1, 0, 'f'}, {"target" , 1, 0, 't'}, {"region" , 1, 0, 'r'}, {"type" , 1, 0, 'y'}, {"snp" , 0, 0, 's'}, {"ancestral" , 0, 0, 'a'}, {0,0,0,0} }; int index; int iarg=0; while(iarg != -1) { iarg = getopt_long(argc, argv, "y:r:d:t:b:f:chvsa", longopts, &index); switch (iarg) { case 'a': { use_ancestral_state = true; break; } case 's': { snp = true; break; } case 'h': { printHelp(); return 0; } case 'v': { printVersion(); return 0; } case 't': { loadIndices(it, optarg); cerr << "INFO: there are " << it.size() << " individuals in the target" << endl; cerr << "INFO: target ids: " << optarg << endl; break; } case 'b': { loadIndices(ib, optarg); cerr << "INFO: there are " << ib.size() << " individuals in the background" << endl; cerr << "INFO: background ids: " << optarg << endl; break; } case 'f': { cerr << "INFO: file: " << optarg << endl; filename = optarg; break; } case 'r': { cerr << "INFO: set seqid region to : " << optarg << endl; region = optarg; break; } case 'y': { type = optarg; cerr << "INFO: set genotype likelihood to: " << type << endl; break; } default: break; } } if(filename.empty()){ cerr << "FATAL: failed to specify a file" << endl; printHelp(); } bool is_open; if (filename == "-") { is_open=variantFile.open(std::cin); } else { is_open=variantFile.open(filename); } if (!is_open) { cerr << "FATAL: could not open file for reading" << endl; printHelp(); } if(region != "NA"){ if(! variantFile.setRegion(region)){ cerr <<"FATAL: unable to set region" << endl; return 1; } } if (!variantFile.is_open()) { cerr << "FATAL: could not open VCF for reading" << endl; printHelp(); return 1; } map<string, int> okayGenotypeLikelihoods; okayGenotypeLikelihoods["PL"] = 1; okayGenotypeLikelihoods["GL"] = 1; okayGenotypeLikelihoods["GP"] = 1; okayGenotypeLikelihoods["GT"] = 1; if(type == "NA"){ cerr << "FATAL: failed to specify genotype likelihood format : PL or GL" << endl; printHelp(); return 1; } if(okayGenotypeLikelihoods.find(type) == okayGenotypeLikelihoods.end()){ cerr << "FATAL: genotype likelihood is incorrectly formatted, only use: PL or GL" << endl; printHelp(); return 1; } Variant var(variantFile); vector<string> samples = variantFile.sampleNames; int nsamples = samples.size(); vector<indv *> countData; vector<string > countDataSampleName; for ( map<int ,int>::iterator x=it.begin(); x!=it.end(); ++x) { countDataSampleName.push_back(samples[x->first] ); } for(int i = 0; i < it.size(); i++){ indv * dip = new indv; dip->nhet = 0; dip->nhom = 0; dip->nalt = 0; dip->nocall = 0; countData.push_back(dip); } while (variantFile.getNextVariant(var)) { // biallelic sites naturally if(var.alt.size() > 1){ continue; } if(snp){ bool hit =false; for(vector<string>::iterator it = var.alleles.begin(); it != var.alleles.end(); it++){ if((*it).size() > 1){ hit = true; } } if(hit){ continue; } } // decide if we can polarize the site if we are using the ancestral allele bool ref_is_ancestral_allele = true; if (use_ancestral_state) { // we need the ancestral allele to decide what to do at this site if (var.info.find("AA") == var.info.end()) continue; string ancestral_allele = var.info["AA"].front(); // if we do not have a polarized site with only allowed bases in the ancestral allele, skip it bool allowed = true; for (string::iterator c = ancestral_allele.begin(); c != ancestral_allele.end(); ++c) { if (!allowed_ancestral_bases.count(*c)) { allowed = false; break; } } if (!allowed) continue; ref_is_ancestral_allele = (ancestral_allele == var.ref); } vector < map< string, vector<string> > > target, background, total; int index = 0; for(int nsamp = 0; nsamp < nsamples; nsamp++){ if(it.find(index) != it.end() ){ const map<string, vector<string> >& sample = var.samples[ samples[nsamp]]; target.push_back(sample); } index += 1; } genotype * populationTarget ; if(type == "PL"){ populationTarget = new pl(); } if(type == "GL"){ populationTarget = new gl(); } if(type == "GP"){ populationTarget = new gp(); } if(type == "GT"){ populationTarget = new gt(); } populationTarget->loadPop(target, var.sequenceName, var.position); for(int i = 0; i < populationTarget->genoIndex.size() ; i++){ if(populationTarget->genoIndex[i] == -1){ countData[i]->nocall += 1; } else if (populationTarget->genoIndex[i] == 0) { if (!use_ancestral_state || ref_is_ancestral_allele) { countData[i]->nhom += 1; } else { countData[i]->nalt += 1; } } else if (populationTarget->genoIndex[i] == 1){ countData[i]->nhet += 1; } else if (populationTarget->genoIndex[i] == 2) { if (!use_ancestral_state || ref_is_ancestral_allele) { countData[i]->nalt += 1; } else { countData[i]->nhom += 1; } } else{ std::cerr << "FATAL: unkown genotype index" << std::endl; cerr << populationTarget->genoIndex[i] << endl; cerr << var << endl; exit(1); } } delete populationTarget; } if (!use_ancestral_state) { std::cout << "#sample-id\tn-nocall\tn-hom-ref\tn-het\tn-hom-alt" << std::endl; } else { std::cout << "#sample-id\tn-nocall\tn-hom-ancestral\tn-het\tn-hom-derived" << std::endl; } for(int i = 0; i < countData.size(); i++){ std::cout << countDataSampleName[i] << "\t" << countData[i]->nocall << "\t" << countData[i]->nhom << "\t" << countData[i]->nhet << "\t" << countData[i]->nalt << std::endl; } return 0; }
int main(int argc, char** argv) { // set the random seed for MCMC srand((unsigned)time(NULL)); // the filename string filename = "NA"; // set region to scaffold string region = "NA"; // using vcflib; thanks to Erik Garrison VariantCallFile variantFile; // zero based index for the target and background indivudals map<int, int> it, ib; // genotype likelihood format string type = "NA"; const struct option longopts[] = { {"version" , 0, 0, 'v'}, {"help" , 0, 0, 'h'}, {"file" , 1, 0, 'f'}, {"target" , 1, 0, 't'}, {"region" , 1, 0, 'r'}, {"type" , 1, 0, 'y'}, {0,0,0,0} }; int index; int iarg=0; while(iarg != -1) { iarg = getopt_long(argc, argv, "y:r:d:t:b:f:chv", longopts, &index); switch (iarg) { case 'h': printHelp(); return 0; case 'v': printVersion(); return 0; case 't': loadIndices(it, optarg); cerr << "INFO: there are " << it.size() << " individuals in the target" << endl; cerr << "INFO: target ids: " << optarg << endl; break; case 'b': loadIndices(ib, optarg); cerr << "INFO: there are " << ib.size() << " individuals in the background" << endl; cerr << "INFO: background ids: " << optarg << endl; break; case 'f': cerr << "INFO: file: " << optarg << endl; filename = optarg; break; case 'r': cerr << "INFO: set seqid region to : " << optarg << endl; region = optarg; break; case 'y': type = optarg; cerr << "INFO: set genotype likelihood to: " << type << endl; break; default: break; } } if(filename == "NA"){ cerr << "FATAL: failed to specify a file" << endl; printHelp(); } if(!variantFile.open(filename)){ cerr << "FATAL: could not open file for reading" << endl; printHelp(); } if(region != "NA"){ if(! variantFile.setRegion(region)){ cerr <<"FATAL: unable to set region" << endl; return 1; } } if (!variantFile.is_open()) { cerr << "FATAL: could not open VCF for reading" << endl; printHelp(); return 1; } map<string, int> okayGenotypeLikelihoods; okayGenotypeLikelihoods["PL"] = 1; okayGenotypeLikelihoods["GL"] = 1; okayGenotypeLikelihoods["GP"] = 1; okayGenotypeLikelihoods["GT"] = 1; if(type == "NA"){ cerr << "FATAL: failed to specify genotype likelihood format : PL or GL" << endl; printHelp(); return 1; } if(okayGenotypeLikelihoods.find(type) == okayGenotypeLikelihoods.end()){ cerr << "FATAL: genotype likelihood is incorrectly formatted, only use: PL or GL" << endl; printHelp(); return 1; } Variant var(variantFile); vector<string> samples = variantFile.sampleNames; int nsamples = samples.size(); while (variantFile.getNextVariant(var)) { // biallelic sites naturally if(var.alt.size() > 1){ continue; } vector < map< string, vector<string> > > target, background, total; int index = 0; for(int nsamp = 0; nsamp < nsamples; nsamp++){ map<string, vector<string> > sample = var.samples[ samples[nsamp]]; if(sample["GT"].front() != "./."){ if(it.find(index) != it.end() ){ target.push_back(sample); } } index += 1; } genotype * populationTarget ; genotype * populationBackground ; if(type == "PL"){ populationTarget = new pl(); } if(type == "GL"){ populationTarget = new gl(); } if(type == "GP"){ populationTarget = new gp(); } if(type == "GT"){ populationTarget = new gt(); } populationTarget->loadPop(target, var.sequenceName, var.position); //cerr << " 3. target allele frequency " << endl; //cerr << " 4. expected heterozygosity " << endl; //cerr << " 5. observed heterozygosity " << endl; //cerr << " 6. number of hets " << endl; //cerr << " 7. number of homozygous ref " << endl; //cerr << " 8. number of homozygous alt " << endl; //cerr << " 9. target Fis " << endl; if(populationTarget->af == -1){ delete populationTarget; continue; } double ehet = 2*(populationTarget->af * (1 - populationTarget->af)); cout << var.sequenceName << "\t" << var.position << "\t" << populationTarget->af << "\t" << ehet << "\t" << populationTarget->hfrq << "\t" << populationTarget->nhet << "\t" << populationTarget->nhomr << "\t" << populationTarget->nhoma << "\t" << populationTarget->fis << endl; delete populationTarget; } return 0; }