int main(int argc, char** argv){ time_t currentTime = time(0); fprintf(stderr, "Analysis started at: %s", ctime(¤tTime)); //////////////////////////////////////////////// BEGIN_PARAMETER_LIST(pl) ADD_PARAMETER_GROUP(pl, "Input/Output") ADD_STRING_PARAMETER(pl, inVcf, "--inVcf", "input VCF File") ADD_STRING_PARAMETER(pl, snp, "--snp", "input dbSNP File (.rod)") ADD_STRING_PARAMETER(pl, hapmap, "--hapmap", "input HapMap File (.bim)") ADD_PARAMETER_GROUP(pl, "Site Filter") ADD_STRING_PARAMETER(pl, rangeList, "--rangeList", "Specify some ranges to use, please use chr:begin-end format.") ADD_STRING_PARAMETER(pl, rangeFile, "--rangeFile", "Specify the file containing ranges, please use chr:begin-end format.") END_PARAMETER_LIST(pl) ; pl.Read(argc, argv); pl.Status(); if (FLAG_REMAIN_ARG.size() > 0){ fprintf(stderr, "Unparsed arguments: "); for (unsigned int i = 0; i < FLAG_REMAIN_ARG.size(); i++){ fprintf(stderr, " %s", FLAG_REMAIN_ARG[i].c_str()); } fprintf(stderr, "\n"); abort(); } REQUIRE_STRING_PARAMETER(FLAG_inVcf, "Please provide input file using: --inVcf"); const char defaultDbSnp[] = "/net/fantasia/home/zhanxw/amd/data/umake-resources/dbSNP/dbsnp_129_b37.rod.map"; if (FLAG_snp == "") { FLAG_snp = defaultDbSnp; fprintf(stderr, "Use default dbsnp: [ %s ]\n", defaultDbSnp); } SiteSet snpSet; snpSet.loadRodFile(FLAG_snp); fprintf(stderr, "%zu dbSNP sites loaded.\n", snpSet.getTotalSite()); const char defaultHM3[] = "/net/fantasia/home/zhanxw/amd/data/umake-resources/HapMap3/hapmap3_r3_b37_fwd.consensus.qc.poly.bim"; if (FLAG_hapmap == "") { FLAG_hapmap = defaultHM3; fprintf(stderr, "Use default HapMap: [ %s ]\n", defaultHM3); } SiteSet hmSet; hmSet.loadBimFile(FLAG_hapmap); fprintf(stderr, "%zu Hapmap sites loaded.\n", hmSet.getTotalSite()); const char* fn = FLAG_inVcf.c_str(); LineReader lr(fn); // // set range filters here // // e.g. // // vin.setRangeList("1:69500-69600"); // vin.setRangeList(FLAG_rangeList.c_str()); // vin.setRangeFile(FLAG_rangeFile.c_str()); std::map<std::string, Variant> freq; std::string chrom; int pos; // std::string filt; std::string anno; char ref, alt; bool inDbSnp; bool inHapmap; int lineNo = 0; std::vector<std::string> fd; while(lr.readLineBySep(&fd, " \t")){ lineNo ++; if (fd[0][0] == '#') continue; // skip header chrom = fd[0]; // ref is on column 0 (0-based) pos = atoi(fd[1]); // ref is on column 1 (0-based) ref = fd[3][0]; // ref is on column 3 (0-based) alt = fd[4][0]; // ref is on column 4 (0-based) // filt = fd[6]; // filt is on column 6 (0-based) anno = extractAnno(fd[7]); // info is on column 7 (0-based), we will extract ANNO= inDbSnp = snpSet.isIncluded(chrom.c_str(), pos); inHapmap = hmSet.isIncluded(chrom.c_str(), pos); Variant& v = freq[anno]; v.total++; if ( isTs(ref, alt) ) { v.ts ++; if (inDbSnp) { v.tsInDbSnp ++; v.dbSnp ++; } } else if (isTv(ref, alt)) { v.tv ++; if (inDbSnp) { v.tvInDbSnp ++; v.dbSnp ++; } }; if (inHapmap) v.hapmap ++; if (lineNo % 10000 == 0) { fprintf(stderr, "\rProcessed %d lines...\r", lineNo); } }; fprintf(stdout, "Total %d VCF records have been read successfully\n", lineNo); ////////////////////////////////////////////////////////////////////// std::string title = "Summarize per annotation type"; int pad = (170 - title.size() ) /2 ; std::string outTitle = std::string(pad, '-') + title + std::string(pad, '-'); puts(outTitle.c_str()); printf("%40s\t%10s\t%10s\t%10s\t%10s\t%10s\t%10s\t%10s\t%10s\n", "Filter", "#SNPs", "#dbSNP", "%dbSNP", "Known Ts/Tv", "Novel Ts/Tv", "Overall", "%TotalHM3", "%HMCalled"); std::map<std::string, Variant> indvFreq; Variant total; for (std::map<std::string, Variant>::iterator i = freq.begin() ; i != freq.end(); ++i ){ i->second.print(i->first, hmSet); total += i->second; }; total.print("TOTAL", hmSet); currentTime = time(0); fprintf(stderr, "Analysis end at: %s", ctime(¤tTime)); return 0; };
int main(int argc, char** argv) { time_t currentTime = time(0); fprintf(stderr, "Analysis started at: %s", ctime(¤tTime)); PARSE_PARAMETER(argc, argv); PARAMETER_STATUS(); if (FLAG_REMAIN_ARG.size() > 0) { fprintf(stderr, "Unparsed arguments: "); for (unsigned int i = 0; i < FLAG_REMAIN_ARG.size(); i++) { fprintf(stderr, " %s", FLAG_REMAIN_ARG[i].c_str()); } fprintf(stderr, "\n"); abort(); } REQUIRE_STRING_PARAMETER(FLAG_inVcf, "Please provide input file using: --inVcf"); const char defaultDbSnp[] = "/net/fantasia/home/zhanxw/amd/data/umake-resources/dbSNP/" "dbsnp_129_b37.rod.map"; if (FLAG_snp == "") { FLAG_snp = defaultDbSnp; fprintf(stderr, "Use default dbsnp: [ %s ]\n", defaultDbSnp); } SiteSet snpSet; snpSet.loadRodFile(FLAG_snp); fprintf(stderr, "%zu dbSNP sites loaded.\n", snpSet.getTotalSite()); const char defaultHM3[] = "/net/fantasia/home/zhanxw/amd/data/umake-resources/HapMap3/" "hapmap3_r3_b37_fwd.consensus.qc.poly.bim"; if (FLAG_hapmap == "") { FLAG_hapmap = defaultHM3; fprintf(stderr, "Use default HapMap: [ %s ]\n", defaultHM3); } SiteSet hmSet; hmSet.loadBimFile(FLAG_hapmap); fprintf(stderr, "%zu Hapmap sites loaded.\n", hmSet.getTotalSite()); const char* fn = FLAG_inVcf.c_str(); LineReader lr(fn); // // set range filters here // // e.g. // // vin.setRangeList("1:69500-69600"); // vin.setRangeList(FLAG_rangeList.c_str()); // vin.setRangeFile(FLAG_rangeFile.c_str()); std::map<std::string, Variant> freq; std::string chrom; int pos; std::string filt; char ref, alt; bool inDbSnp; bool inHapmap; int lineNo = 0; std::vector<std::string> fd; while (lr.readLineBySep(&fd, " \t")) { lineNo++; if (fd[0][0] == '#') continue; // skip header chrom = fd[0]; // ref is on column 0 (0-based) pos = atoi(fd[1]); // ref is on column 1 (0-based) ref = fd[3][0]; // ref is on column 3 (0-based) alt = fd[4][0]; // ref is on column 4 (0-based) filt = fd[6]; // filt is on column 6 (0-based) inDbSnp = snpSet.isIncluded(chrom.c_str(), pos); inHapmap = hmSet.isIncluded(chrom.c_str(), pos); Variant& v = freq[filt]; v.total++; if (isTs(ref, alt)) { v.ts++; if (inDbSnp) { v.tsInDbSnp++; v.dbSnp++; } } else if (isTv(ref, alt)) { v.tv++; if (inDbSnp) { v.tvInDbSnp++; v.dbSnp++; } }; if (inHapmap) v.hapmap++; }; fprintf(stdout, "Total %d VCF records have converted successfully\n", lineNo); ////////////////////////////////////////////////////////////////////// std::string title = "Summarize per combined filter"; int pad = (170 - title.size()) / 2; std::string outTitle = std::string(pad, '-') + title + std::string(pad, '-'); puts(outTitle.c_str()); printf("%40s\t%10s\t%10s\t%10s\t%10s\t%10s\t%10s\t%10s\t%10s\n", "Filter", "#SNPs", "#dbSNP", "%dbSNP", "Known Ts/Tv", "Novel Ts/Tv", "Overall", "%TotalHM3", "%HMCalled"); std::map<std::string, Variant> indvFreq; Variant pass; Variant fail; Variant total; std::vector<std::string> filters; // individual filter for (std::map<std::string, Variant>::iterator i = freq.begin(); i != freq.end(); ++i) { const std::string& filt = i->first; const Variant& v = i->second; v.print(filt, hmSet); // calculate indvFreq, pass, fail and total stringTokenize(filt, ';', &filters); for (unsigned int j = 0; j < filters.size(); j++) { const std::string& filt = filters[j]; indvFreq[filt] += v; } if (filt == "PASS") pass += v; else fail += v; total += v; }; ////////////////////////////////////////////////////////////////////// title = "Summarize per individual filter"; pad = (170 - title.size()) / 2; outTitle = std::string(pad, '-') + title + std::string(pad, '-'); puts(outTitle.c_str()); printf("%40s\t%10s\t%10s\t%10s\t%10s\t%10s\t%10s\t%10s\t%10s\n", "Filter", "#SNPs", "#dbSNP", "%dbSNP", "Known Ts/Tv", "Novel Ts/Tv", "Overall", "%TotalHM3", "%HMCalled"); for (std::map<std::string, Variant>::iterator i = indvFreq.begin(); i != indvFreq.end(); ++i) { const std::string& filt = i->first; const Variant& v = i->second; v.print(filt, hmSet); } ////////////////////////////////////////////////////////////////////// title = "Summarize per pass/fail filter"; pad = (170 - title.size()) / 2; outTitle = std::string(pad, '-') + title + std::string(pad, '-'); puts(outTitle.c_str()); printf("%40s\t%10s\t%10s\t%10s\t%10s\t%10s\t%10s\t%10s\t%10s\n", "Filter", "#SNPs", "#dbSNP", "%dbSNP", "Known Ts/Tv", "Novel Ts/Tv", "Overall", "%TotalHM3", "%HMCalled"); pass.print("PASS", hmSet); fail.print("FAIL", hmSet); total.print("TOTAL", hmSet); currentTime = time(0); fprintf(stderr, "Analysis end at: %s", ctime(¤tTime)); return 0; };
int main(int argc, char** argv) { time_t currentTime = time(0); fprintf(stderr, "Analysis started at: %s", ctime(¤tTime)); PARSE_PARAMETER(argc, argv); PARAMETER_STATUS(); if (FLAG_REMAIN_ARG.size() > 0) { fprintf(stderr, "Unparsed arguments: "); for (unsigned int i = 0; i < FLAG_REMAIN_ARG.size(); i++) { fprintf(stderr, " %s", FLAG_REMAIN_ARG[i].c_str()); } fprintf(stderr, "\n"); abort(); } REQUIRE_STRING_PARAMETER(FLAG_inVcf, "Please provide input file using: --inVcf"); const char defaultDbSnp[] = "/net/fantasia/home/zhanxw/amd/data/umake-resources/dbSNP/" "dbsnp_129_b37.rod.map"; if (FLAG_snp == "") { FLAG_snp = defaultDbSnp; fprintf(stderr, "Use default dbsnp: [ %s ]\n", defaultDbSnp); } SiteSet snpSet; snpSet.loadRodFile(FLAG_snp); fprintf(stderr, "%zu dbSNP sites loaded.\n", snpSet.getTotalSite()); const char defaultHM3[] = "/net/fantasia/home/zhanxw/amd/data/umake-resources/HapMap3/" "hapmap3_r3_b37_fwd.consensus.qc.poly.bim"; if (FLAG_hapmap == "") { FLAG_hapmap = defaultHM3; fprintf(stderr, "Use default HapMap: [ %s ]\n", defaultHM3); } SiteSet hmSet; hmSet.loadBimFile(FLAG_hapmap); fprintf(stderr, "%zu Hapmap sites loaded.\n", hmSet.getTotalSite()); const char* fn = FLAG_inVcf.c_str(); LineReader lr(fn); // // set range filters here // // e.g. // // vin.setRangeList("1:69500-69600"); // vin.setRangeList(FLAG_rangeList.c_str()); // vin.setRangeFile(FLAG_rangeFile.c_str()); std::map<std::string, Variant> freq; std::string chrom; int pos; // std::string filt; // std::string anno; std::string numVariant; char ref, alt; bool inDbSnp; bool inHapmap; int lineNo = 0; std::vector<std::string> fd; while (lr.readLineBySep(&fd, " \t")) { lineNo++; if (fd[0][0] == '#') continue; // skip header chrom = fd[0]; // ref is on column 0 (0-based) pos = atoi(fd[1]); // ref is on column 1 (0-based) ref = fd[3][0]; // ref is on column 3 (0-based) alt = fd[4][0]; // ref is on column 4 (0-based) // filt = fd[6]; // filt is on column 6 (0-based) // anno = extractAnno(fd[7]); // info is on column 7 (0-based), we will // extract ANNO= // obtain number of variants if (fd.size() <= 9) { // first 9 columns are not individuals numVariant = toString(0); } else { int numVar = 0; for (size_t i = 9; i < fd.size(); ++i) { int varCount = countVariant(fd[i]); if (varCount > 0) numVar += varCount; } numVariant = toString(numVar); } inDbSnp = snpSet.isIncluded(chrom.c_str(), pos); inHapmap = hmSet.isIncluded(chrom.c_str(), pos); Variant& v = freq[numVariant]; v.total++; if (isTs(ref, alt)) { v.ts++; if (inDbSnp) { v.tsInDbSnp++; v.dbSnp++; } } else if (isTv(ref, alt)) { v.tv++; if (inDbSnp) { v.tvInDbSnp++; v.dbSnp++; } }; if (inHapmap) v.hapmap++; if (lineNo % 10000 == 0) { fprintf(stderr, "\rProcessed %d lines...\r", lineNo); } }; fprintf(stdout, "Total %d VCF records have been read successfully\n", lineNo); ////////////////////////////////////////////////////////////////////// std::string title = "Summarize per annotation type"; int pad = (170 - title.size()) / 2; std::string outTitle = std::string(pad, '-') + title + std::string(pad, '-'); puts(outTitle.c_str()); printf("%40s\t%10s\t%10s\t%10s\t%10s\t%10s\t%10s\t%10s\t%10s\n", "Filter", "#SNPs", "#dbSNP", "%dbSNP", "Known Ts/Tv", "Novel Ts/Tv", "Overall", "%TotalHM3", "%HMCalled"); std::map<std::string, Variant> indvFreq; Variant total; // to sort variants by its integer order, we use a temporary map std::map<int, Variant> tmp; for (std::map<std::string, Variant>::iterator i = freq.begin(); i != freq.end(); ++i) { tmp[atoi(i->first)] = i->second; }; for (std::map<int, Variant>::iterator i = tmp.begin(); i != tmp.end(); ++i) { i->second.print(toString(i->first), hmSet); total += i->second; }; total.print("TOTAL", hmSet); currentTime = time(0); fprintf(stderr, "Analysis end at: %s", ctime(¤tTime)); return 0; };