int loadGeneFile(const char* fn, const char* gene, OrderedMap<std::string, RangeList>* geneMap) { std::set<std::string> geneSet; makeSet(gene, ',', &geneSet); OrderedMap<std::string, RangeList>& m = *geneMap; LineReader lr(fn); int lineNo = 0; std::vector<std::string> fd; while (lr.readLineBySep(&fd, "\t ")) { ++lineNo; if (fd.size() < 6) { logger->error( "Skip %d line (short of columns) in gene file [ %s ], is gene file " "format correct?", lineNo, fn); break; } std::string& geneName = fd[0]; if (geneSet.size() && geneSet.find(geneName) == geneSet.end()) continue; std::string chr = chopChr(fd[2]); int beg = atoi(fd[4]); int end = atoi(fd[5]); m[geneName].addRange(chr.c_str(), beg, end); } return m.size(); }
void RangeList::filterGeneName(const char* inclusionGeneFileName, const char* geneTableFileName){ // require user input gene list file if (strlen(geneTableFileName) == 0 && strlen(inclusionGeneFileName) != 0) { REprintf("Please provide gene list file (e.g. refFlat) until we are able to process gene\n"); return; //exit(1); } // if not specify any gene, return whole range. if (strlen(inclusionGeneFileName) == 0) { return; } // store which gene do we want if specified std::set< std::string > inclusionSet; LineReader lr(inclusionGeneFileName); std::string gene; while(lr.readLine(&gene)) { inclusionSet.insert(gene); } std::vector<std::string> fields; std::string chr; std::string geneNameTbl; LineReader geneTable(geneTableFileName); while (geneTable.readLineBySep(&fields, "\t ")) { geneNameTbl = fields[0]; if (inclusionSet.find(geneNameTbl) != inclusionSet.end()){ // store gene range chr = chopChr(fields[2].c_str()); this->rangeCollection.addRange(chr, atoi(fields[4].c_str()), // start atoi(fields[5].c_str())); // end } } if (this->rangeCollection.size() == 0){ Rprintf("We cannot find given gene in your geneListFile, so all sites will be outputed\n"); } }
int main(int argc, char** argv) { time_t currentTime = time(0); fprintf(stderr, "Analysis started at: %s", ctime(¤tTime)); PARSE_PARAMETER(argc, argv); PARAMETER_STATUS(); if (FLAG_REMAIN_ARG.size() > 0) { fprintf(stderr, "Unparsed arguments: "); for (unsigned int i = 0; i < FLAG_REMAIN_ARG.size(); i++) { fprintf(stderr, " %s", FLAG_REMAIN_ARG[i].c_str()); } fprintf(stderr, "\n"); abort(); } REQUIRE_STRING_PARAMETER(FLAG_inVcf, "Please provide input file using: --inVcf"); const char* fn = FLAG_inVcf.c_str(); VCFInputFile vin(fn); // set range filters here // e.g. // vin.setRangeList("1:69500-69600"); vin.setRangeList(FLAG_rangeList.c_str()); vin.setRangeFile(FLAG_rangeFile.c_str()); // set people filters here if (FLAG_peopleIncludeID.size() || FLAG_peopleIncludeFile.size()) { vin.excludeAllPeople(); vin.includePeople(FLAG_peopleIncludeID.c_str()); vin.includePeopleFromFile(FLAG_peopleIncludeFile.c_str()); } vin.excludePeople(FLAG_peopleExcludeID.c_str()); vin.excludePeopleFromFile(FLAG_peopleExcludeFile.c_str()); // let's write it out. if (FLAG_updateId != "") { int ret = vin.updateId(FLAG_updateId.c_str()); fprintf(stdout, "%d samples have updated id.\n", ret); } // load gene ranges std::map<std::string, std::string> geneRange; if (FLAG_geneName.size()) { if (FLAG_geneFile.size() == 0) { fprintf(stderr, "Have to provide --geneFile to extract by gene.\n"); abort(); } LineReader lr(FLAG_geneFile); std::vector<std::string> fd; while (lr.readLineBySep(&fd, "\t ")) { if (FLAG_geneName != fd[0]) continue; fd[2] = chopChr(fd[2]); // chop "chr1" to "1" if (geneRange.find(fd[0]) == geneRange.end()) { geneRange[fd[0]] = fd[2] + ":" + fd[4] + "-" + fd[5]; } else { geneRange[fd[0]] += "," + fd[2] + ":" + fd[4] + "-" + fd[5]; } }; } std::string range; for (std::map<std::string, std::string>::iterator it = geneRange.begin(); it != geneRange.end(); it++) { if (range.size() > 0) { range += ","; } range += it->second; }; fprintf(stderr, "range = %s\n", range.c_str()); vin.setRangeList(range.c_str()); Regex regex; if (FLAG_annoType.size()) { regex.readPattern(FLAG_annoType); } // print header std::vector<std::string> names; vin.getVCFHeader()->getPeopleName(&names); printf("CHROM\tPOS"); for (unsigned int i = 0; i < names.size(); i++) { printf("\t%s", names[i].c_str()); } printf("\n"); // real working part int nonVariantSite = 0; while (vin.readRecord()) { VCFRecord& r = vin.getVCFRecord(); VCFPeople& people = r.getPeople(); VCFIndividual* indv; if (FLAG_variantOnly) { bool hasVariant = false; int geno; int GTidx = r.getFormatIndex("GT"); for (size_t i = 0; i < people.size(); i++) { indv = people[i]; geno = indv->justGet(GTidx).getGenotype(); if (geno != 0 && geno != MISSING_GENOTYPE) hasVariant = true; } if (!hasVariant) { nonVariantSite++; continue; } } if (FLAG_annoType.size()) { bool isMissing = false; const char* tag = r.getInfoTag("ANNO", &isMissing).toStr(); if (isMissing) continue; // fprintf(stdout, "ANNO = %s", tag); bool match = regex.match(tag); // fprintf(stdout, " %s \t", match ? "match": "noMatch"); // fprintf(stdout, " %s \n", exists ? "exists": "missing"); if (!match) { continue; } } fprintf(stdout, "%s\t%s", r.getChrom(), r.getPosStr()); for (size_t i = 0; i < people.size(); i++) { indv = people[i]; fprintf(stdout, "\t%d", indv->justGet(0).getGenotype()); } fprintf(stdout, "\n"); }; currentTime = time(0); fprintf(stderr, "Analysis ends at: %s", ctime(¤tTime)); return 0; };