Пример #1
0
/**
 * Read a whitespace/tab separated gene file and collect the ranges of the
 * requested genes into @p geneMap.
 *
 * Columns used from every line (0-based): 0 = gene name, 2 = chromosome,
 * 4 = range begin, 5 = range end. Lines with fewer than 6 columns are
 * reported through the logger and skipped; reading then continues with the
 * next line.
 *
 * @param fn      path of the gene file.
 * @param gene    gene names handed to makeSet() with ',' as separator
 *                (presumably a comma-separated list -- confirm against
 *                makeSet). When the resulting set is empty, every gene in the
 *                file is loaded.
 * @param geneMap output map, gene name -> ranges; entries are added to
 *                whatever the map already holds.
 * @return the value of geneMap->size() after loading.
 */
int loadGeneFile(const char* fn, const char* gene,
                 OrderedMap<std::string, RangeList>* geneMap) {
  std::set<std::string> geneSet;
  makeSet(gene, ',', &geneSet);

  OrderedMap<std::string, RangeList>& m = *geneMap;
  LineReader lr(fn);
  int lineNo = 0;
  std::vector<std::string> fd;
  while (lr.readLineBySep(&fd, "\t ")) {
    ++lineNo;
    if (fd.size() < 6) {
      logger->error(
          "Skip %d line (short of columns) in gene file [ %s ], is gene file "
          "format correct?",
          lineNo, fn);
      // Skip only the malformed line, as the message promises; stopping here
      // would silently drop every gene listed after it.
      continue;
    }

    std::string& geneName = fd[0];
    // An empty gene set means "no filter": keep every gene.
    if (geneSet.size() && geneSet.find(geneName) == geneSet.end()) continue;

    // The same gene name may occur on several lines; all of its ranges are
    // accumulated under one map entry.
    std::string chr = chopChr(fd[2]);
    int beg = atoi(fd[4]);
    int end = atoi(fd[5]);
    m[geneName].addRange(chr.c_str(), beg, end);
  }
  return m.size();
}
Пример #2
0
/**
 * Restrict this range list to the genes named in an inclusion file.
 *
 * Every line of @p inclusionGeneFileName is taken as one gene name. The gene
 * table is then scanned and, for each line whose first column is one of those
 * names, the range (column 2 = chromosome, column 4 = start, column 5 = end,
 * 0-based) is added to rangeCollection.
 *
 * @param inclusionGeneFileName file with one gene name per line; an empty
 *                              string means "no gene filter" and the function
 *                              returns without touching the ranges.
 * @param geneTableFileName     gene table (e.g. refFlat); required whenever
 *                              an inclusion file is given, otherwise a
 *                              message is printed and the function returns.
 */
void RangeList::filterGeneName(const char* inclusionGeneFileName, const char* geneTableFileName){
  // require user input gene list file
  if (strlen(geneTableFileName) == 0 && strlen(inclusionGeneFileName) != 0) {
    REprintf("Please provide gene list file (e.g. refFlat) until we are able to process gene\n");
    return;
    //exit(1);
  }

  // if not specify any gene, return whole range.
  if (strlen(inclusionGeneFileName) == 0) {
    return;
  }

  // store which gene do we want if specified
  std::set< std::string > inclusionSet;
  LineReader lr(inclusionGeneFileName);
  std::string gene;
  while(lr.readLine(&gene)) {
    inclusionSet.insert(gene);
  }

  std::vector<std::string> fields;
  std::string chr;
  std::string geneNameTbl;

  LineReader geneTable(geneTableFileName);
  while (geneTable.readLineBySep(&fields, "\t ")) {
    // Columns 0, 2, 4 and 5 are read below; a blank or truncated line would
    // otherwise be indexed out of bounds.
    if (fields.size() < 6) {
      continue;
    }
    geneNameTbl = fields[0];
    if (inclusionSet.find(geneNameTbl) != inclusionSet.end()){ // store gene range
      chr = chopChr(fields[2].c_str());
      this->rangeCollection.addRange(chr,
                                     atoi(fields[4].c_str()),   // start
                                     atoi(fields[5].c_str()));   // end
    }
  }
  // Nothing collected (checked on the whole collection, so this only fires
  // when it is completely empty): tell the user no gene filter is in effect.
  if (this->rangeCollection.size() == 0){
    Rprintf("We cannot find given gene in your geneListFile, so all sites will be outputed\n");
  }
}
Пример #3
0
/**
 * Command-line entry point: dump per-sample genotypes of a VCF file as a
 * tab-separated table on stdout.
 *
 * Steps:
 *   1. parse parameters; abort on any unparsed argument,
 *   2. open --inVcf and apply range and people filters,
 *   3. optionally update sample ids (--updateId),
 *   4. optionally restrict to the ranges of --geneName found in --geneFile,
 *   5. print a "CHROM POS <sample...>" header, then one line per record,
 *      optionally keeping only variant sites and/or sites whose ANNO info tag
 *      matches the --annoType pattern.
 *
 * Progress messages (start/end time, selected range) go to stderr.
 *
 * @return 0 on normal completion; abort() is called on usage errors.
 */
int main(int argc, char** argv) {
  time_t currentTime = time(0);
  fprintf(stderr, "Analysis started at: %s", ctime(&currentTime));

  PARSE_PARAMETER(argc, argv);
  PARAMETER_STATUS();

  if (FLAG_REMAIN_ARG.size() > 0) {
    fprintf(stderr, "Unparsed arguments: ");
    for (unsigned int i = 0; i < FLAG_REMAIN_ARG.size(); i++) {
      fprintf(stderr, " %s", FLAG_REMAIN_ARG[i].c_str());
    }
    fprintf(stderr, "\n");
    abort();
  }

  REQUIRE_STRING_PARAMETER(FLAG_inVcf,
                           "Please provide input file using: --inVcf");

  const char* fn = FLAG_inVcf.c_str();
  VCFInputFile vin(fn);

  // set range filters here
  // e.g.
  // vin.setRangeList("1:69500-69600");
  vin.setRangeList(FLAG_rangeList.c_str());
  vin.setRangeFile(FLAG_rangeFile.c_str());

  // set people filters here
  // An include list switches to whitelist mode: drop everyone first, then
  // re-add the requested samples.
  if (FLAG_peopleIncludeID.size() || FLAG_peopleIncludeFile.size()) {
    vin.excludeAllPeople();
    vin.includePeople(FLAG_peopleIncludeID.c_str());
    vin.includePeopleFromFile(FLAG_peopleIncludeFile.c_str());
  }
  vin.excludePeople(FLAG_peopleExcludeID.c_str());
  vin.excludePeopleFromFile(FLAG_peopleExcludeFile.c_str());

  // let's write it out.
  if (FLAG_updateId != "") {
    int ret = vin.updateId(FLAG_updateId.c_str());
    fprintf(stdout, "%d samples have updated id.\n", ret);
  }

  // load gene ranges
  std::map<std::string, std::string> geneRange;
  if (FLAG_geneName.size()) {
    if (FLAG_geneFile.size() == 0) {
      fprintf(stderr, "Have to provide --geneFile to extract by gene.\n");
      abort();
    }
    LineReader lr(FLAG_geneFile);
    std::vector<std::string> fd;
    while (lr.readLineBySep(&fd, "\t ")) {
      // Columns 0, 2, 4 and 5 are read below; skip blank or truncated lines
      // instead of indexing past the end of the vector.
      if (fd.size() < 6) continue;
      if (FLAG_geneName != fd[0]) continue;
      const std::string chrom = chopChr(fd[2]);  // chop "chr1" to "1"
      const std::string region = chrom + ":" + fd[4] + "-" + fd[5];
      // Several lines may describe the same gene; join their regions with
      // commas. A freshly created entry is empty and gets no leading comma.
      std::string& joined = geneRange[fd[0]];
      if (!joined.empty()) {
        joined += ",";
      }
      joined += region;
    }
  }
  std::string range;
  for (std::map<std::string, std::string>::iterator it = geneRange.begin();
       it != geneRange.end(); ++it) {
    if (range.size() > 0) {
      range += ",";
    }
    range += it->second;
  }
  fprintf(stderr, "range = %s\n", range.c_str());
  vin.setRangeList(range.c_str());

  Regex regex;
  if (FLAG_annoType.size()) {
    regex.readPattern(FLAG_annoType);
  }

  // print header
  std::vector<std::string> names;
  vin.getVCFHeader()->getPeopleName(&names);
  printf("CHROM\tPOS");
  for (unsigned int i = 0; i < names.size(); i++) {
    printf("\t%s", names[i].c_str());
  }
  printf("\n");

  // real working part
  int nonVariantSite = 0;
  while (vin.readRecord()) {
    VCFRecord& r = vin.getVCFRecord();
    VCFPeople& people = r.getPeople();
    VCFIndividual* indv;
    if (FLAG_variantOnly) {
      bool hasVariant = false;
      int geno;
      // NOTE(review): the result of getFormatIndex is used unchecked; confirm
      // how justGet behaves when the record has no "GT" format field.
      int GTidx = r.getFormatIndex("GT");
      for (size_t i = 0; i < people.size(); i++) {
        indv = people[i];
        geno = indv->justGet(GTidx).getGenotype();
        if (geno != 0 && geno != MISSING_GENOTYPE) {
          hasVariant = true;
          break;  // one carrier is enough, no need to scan the rest
        }
      }
      if (!hasVariant) {
        nonVariantSite++;
        continue;
      }
    }

    if (FLAG_annoType.size()) {
      bool isMissing = false;
      const char* tag = r.getInfoTag("ANNO", &isMissing).toStr();
      if (isMissing) continue;
      // fprintf(stdout, "ANNO = %s", tag);
      bool match = regex.match(tag);
      // fprintf(stdout, " %s \t", match ? "match": "noMatch");
      // fprintf(stdout, " %s \n", exists ? "exists": "missing");
      if (!match) {
        continue;
      }
    }

    fprintf(stdout, "%s\t%s", r.getChrom(), r.getPosStr());

    // NOTE(review): genotypes are read from format field 0 here, while the
    // variant filter above looks "GT" up by name -- presumably GT is always
    // the first format field; confirm.
    for (size_t i = 0; i < people.size(); i++) {
      indv = people[i];
      fprintf(stdout, "\t%d", indv->justGet(0).getGenotype());
    }
    fprintf(stdout, "\n");
  }

  currentTime = time(0);
  fprintf(stderr, "Analysis ends at: %s", ctime(&currentTime));

  return 0;
}