Exemplo n.º 1
0
// Entry point: print summary information about a BGEN file.
//
// Parses the command line, prints usage when the help flag is set, rejects
// left-over arguments, opens the file given by --inBgen, optionally loads the
// sample file held in FLAG_inSample, and finally calls printInfo().
//
// Returns 0 on success (and after printing help). Exits with status 1 when
// unparsed arguments remain or when loading the sample file fails.
int main(int argc, char* argv[]) {
  PARSE_PARAMETER(argc, argv);

  if (FLAG_help) {
    PARAMETER_HELP();
    return 0;
  }

  PARAMETER_STATUS();
  if (FLAG_REMAIN_ARG.size() > 0) {
    fprintf(stderr, "Unparsed arguments: ");
    for (unsigned int i = 0; i < FLAG_REMAIN_ARG.size(); i++) {
      fprintf(stderr, " %s", FLAG_REMAIN_ARG[i].c_str());
    }
    // Terminate the list so the shell prompt does not end up glued to it.
    fprintf(stderr, "\n");
    exit(1);
  }

  // NOTE(review): presumably stops the program with this message when the
  // flag is empty -- the macro is defined outside this block.
  REQUIRE_STRING_PARAMETER(FLAG_inBgen,
                           "Please provide input file using: --inBgen");

  BGenFile read(FLAG_inBgen);
  if (!FLAG_inSample.empty()) {
    // A non-zero return value from loadSampleFile() is treated as failure.
    if (read.loadSampleFile(FLAG_inSample)) {
      fprintf(stderr, "ERROR: failed to load sample file [ %s ]!\n",
              FLAG_inSample.c_str());
      exit(1);
    }
  }
  read.printInfo();
  return 0;
}
Exemplo n.º 2
0
// Entry point: copy the records of --inVcf whose (chrom, pos) is contained in
// a site list to --outVcf.
//
// The site list is loaded from --site with loadRodFile(); when --site is not
// given, a hard-coded default dbSNP path is used. --inverse keeps the records
// that are NOT in the list instead, and --snpOnly additionally drops records
// whose REF or ALT is not exactly one character or whose ALT is ".".
//
// Returns 0 on completion; calls abort() when unparsed arguments remain.
int main(int argc, char** argv){
  time_t currentTime = time(0);
  fprintf(stderr, "Analysis started at: %s", ctime(&currentTime));

  ////////////////////////////////////////////////
  BEGIN_PARAMETER_LIST(pl)
      ADD_PARAMETER_GROUP(pl, "Input/Output")
      ADD_STRING_PARAMETER(pl, inVcf, "--inVcf", "input VCF File")
      ADD_STRING_PARAMETER(pl, outVcf, "--outVcf", "output VCF File")
      ADD_PARAMETER_GROUP(pl, "Site Filter")
      ADD_STRING_PARAMETER(pl, site, "--site", "input site file (.rod file: 0-based position)")
      ADD_BOOL_PARAMETER(pl, inverse, "--inverse", "Inverse site")
      ADD_STRING_PARAMETER(pl, rangeList, "--rangeList", "Specify some ranges to use, please use chr:begin-end format.")
      ADD_STRING_PARAMETER(pl, rangeFile, "--rangeFile", "Specify the file containing ranges, please use chr:begin-end format.")
      ADD_BOOL_PARAMETER(pl, snpOnly, "--snpOnly", "Specify only extract SNP site")
      END_PARAMETER_LIST(pl)
      ;

  pl.Read(argc, argv);
  pl.Status();

  if (FLAG_REMAIN_ARG.size() > 0){
    fprintf(stderr, "Unparsed arguments: ");
    for (unsigned int i = 0; i < FLAG_REMAIN_ARG.size(); i++){
      fprintf(stderr, " %s", FLAG_REMAIN_ARG[i].c_str());
    }
    fprintf(stderr, "\n");
    abort();
  }

  REQUIRE_STRING_PARAMETER(FLAG_inVcf, "Please provide input file using: --inVcf");
  REQUIRE_STRING_PARAMETER(FLAG_outVcf, "Please provide output file using: --outVcf");

  // Fall back to the built-in dbSNP site list when --site was not supplied.
  const char defaultDbSnp[] = "/net/fantasia/home/zhanxw/amd/data/umake-resources/dbSNP/dbsnp_129_b37.rod.map";
  if (FLAG_site == "") {
    FLAG_site = defaultDbSnp;
    fprintf(stderr, "Use default dbsnp: [ %s ]\n", defaultDbSnp);
  }
  SiteSet snpSet;
  snpSet.loadRodFile(FLAG_site);
  fprintf(stderr, "%zu dbSNP sites loaded.\n", snpSet.getTotalSite());

  const char* fn = FLAG_inVcf.c_str();
  VCFInputFile vin(fn);

  // The writer stays NULL (and nothing is written) when no output name is set.
  VCFOutputFile* vout = NULL;
  if (FLAG_outVcf.size() > 0) {
    vout = new VCFOutputFile(FLAG_outVcf.c_str());
  }
  if (vout) vout->writeHeader(vin.getVCFHeader());

  // set range filters here
  // e.g.
  // vin.setRangeList("1:69500-69600");
  vin.setRangeList(FLAG_rangeList.c_str());
  vin.setRangeFile(FLAG_rangeFile.c_str());

  int lineNo = 0;   // records read from the input
  int lineOut = 0;  // records that passed every filter
  while (vin.readRecord()){
    lineNo ++;
    VCFRecord& r = vin.getVCFRecord();

    // Keep sites inside the list, or outside it when --inverse is set.
    const bool inSiteSet = snpSet.isIncluded(r.getChrom(), r.getPos());
    const bool keep = FLAG_inverse ? !inSiteSet : inSiteSet;
    if (!keep) continue;

    if (FLAG_snpOnly) {
      if ( strlen(r.getRef()) != 1) continue;
      if ( strlen(r.getAlt()) != 1) continue;
      if ( r.getAlt()[0] == '.') continue;  // deletion e.g. A -> .
    }
    if (vout) vout->writeRecord(& r);
    lineOut ++;
  }

  delete vout;

  fprintf(stdout, "Total %d VCF records have converted successfully\n", lineNo);
  fprintf(stdout, "Total %d VCF records have outputted successfully\n", lineOut);

  currentTime = time(0);
  fprintf(stderr, "Analysis end at: %s", ctime(&currentTime));
  return 0;
}
Exemplo n.º 3
0
// Entry point: genotype concordance between one reference VCF (-s) and every
// VCF named in the remaining command-line arguments.
//
// Only samples that appear both in the reference file and in at least one of
// the comparison files are included when reading the reference. The same
// range/site filters are applied to the reference and to every comparison
// file. One tab-separated report is printed to stdout; the column names are
// written once before the first comparison file is processed.
//
// Returns 0 on completion.
int main(int argc, char** argv) {
  time_t currentTime = time(0);
  fprintf(stderr, "Analysis started at: %s", ctime(&currentTime));

  PARSE_PARAMETER(argc, argv);
  PARAMETER_STATUS();

  REQUIRE_STRING_PARAMETER(FLAG_s, "Please provide input file using: -s");

  // load reference
  const char* fn = FLAG_s.c_str();
  VCFInputFile vin(fn);
  StringArray refPeople;
  vin.getVCFHeader()->getPeopleName(&refPeople);

  // Open every comparison file and pool all of their sample names.
  VCFInputFile** compareVcfs = new VCFInputFile*[FLAG_REMAIN_ARG.size()];
  StringArray comparePeopleNames;
  for (unsigned int i = 0; i < FLAG_REMAIN_ARG.size(); i++) {
    compareVcfs[i] = new VCFInputFile(FLAG_REMAIN_ARG[i]);
    StringArray names;
    compareVcfs[i]->getVCFHeader()->getPeopleName(&names);
    for (unsigned int j = 0; j < names.size(); j++) {
      comparePeopleNames.push_back(names[j]);
    }
  }
  StringArray unionPeopleNames;
  StringArray commonNames;
  set_union(comparePeopleNames, &unionPeopleNames);
  set_intersection(refPeople, unionPeopleNames, &commonNames);

  vin.excludeAllPeople();
  vin.includePeople(commonNames);
  fprintf(stderr, "Total %d samples are included.\n", (int)commonNames.size());

  // set range filters here
  // e.g.
  // vin.setRangeList("1:69500-69600");
  vin.setRangeList(FLAG_rangeList.c_str());
  vin.setRangeFile(FLAG_rangeFile.c_str());
  vin.setSiteFile(FLAG_siteFile.c_str());
  AllConcordanceType data;
  loadGenotype(vin, &data, Value::REFERENCE);

  printf(
      "File\t"
      "PeopleId\t"
      "Overlap\t"
      "Std_total\t"
      "Input_total\t"
      "Std_only\t"
      "Input_only\t"
      "nonRefConcord_overlap\t"
      "Std_variants\t"
      "Input_variants\t"
      "Std_variants_in_Input\t"
      "Ptg_Std_variants_in_Input\t"
      "HomR/HomR\tHomR/Het\tHomR/HomA\t"
      "Het/HomR\tHet/Het\tHet/HomA\t"
      "HomA/HomR\tHomA/Het\tHomA/HomA\n");

  for (unsigned int i = 0; i < FLAG_REMAIN_ARG.size(); i++) {
    fprintf(stderr, "Process %s ... \n", FLAG_REMAIN_ARG[i].c_str());
    // Apply exactly the filters that were applied to the reference file.
    // The range *file* must come from FLAG_rangeFile; passing FLAG_rangeList
    // here made the comparison files use a different site filter.
    compareVcfs[i]->setRangeList(FLAG_rangeList.c_str());
    compareVcfs[i]->setRangeFile(FLAG_rangeFile.c_str());
    compareVcfs[i]->setSiteFile(FLAG_siteFile.c_str());
    loadGenotype(*compareVcfs[i], &data, Value::COMPARISON);

    StringArray names;
    compareVcfs[i]->getVCFHeader()->getPeopleName(&names);
    printComparision(*compareVcfs[i], data, names);

    // Reset the shared data before the next comparison file is loaded.
    clearGenotype(&data);
  }

  for (unsigned int i = 0; i < FLAG_REMAIN_ARG.size(); i++) {
    delete compareVcfs[i];
  }
  delete[] compareVcfs;

  currentTime = time(0);
  fprintf(stderr, "Analysis end at: %s", ctime(&currentTime));
  return 0;
}
Exemplo n.º 4
0
int main(int argc, char** argv) {
  time_t currentTime = time(0);
  fprintf(stderr, "Analysis started at: %s", ctime(&currentTime));

  PARSE_PARAMETER(argc, argv);
  PARAMETER_STATUS();

  if (FLAG_REMAIN_ARG.size() > 0) {
    fprintf(stderr, "Unparsed arguments: ");
    for (unsigned int i = 0; i < FLAG_REMAIN_ARG.size(); i++) {
      fprintf(stderr, " %s", FLAG_REMAIN_ARG[i].c_str());
    }
    fprintf(stderr, "\n");
    abort();
  }

  REQUIRE_STRING_PARAMETER(FLAG_inVcf,
                           "Please provide input file using: --inVcf");

  const char defaultDbSnp[] =
      "/net/fantasia/home/zhanxw/amd/data/umake-resources/dbSNP/"
      "dbsnp_129_b37.rod.map";
  if (FLAG_snp == "") {
    FLAG_snp = defaultDbSnp;
    fprintf(stderr, "Use default dbsnp: [ %s ]\n", defaultDbSnp);
  }
  SiteSet snpSet;
  snpSet.loadRodFile(FLAG_snp);
  fprintf(stderr, "%zu dbSNP sites loaded.\n", snpSet.getTotalSite());

  const char defaultHM3[] =
      "/net/fantasia/home/zhanxw/amd/data/umake-resources/HapMap3/"
      "hapmap3_r3_b37_fwd.consensus.qc.poly.bim";
  if (FLAG_hapmap == "") {
    FLAG_hapmap = defaultHM3;
    fprintf(stderr, "Use default HapMap: [ %s ]\n", defaultHM3);
  }
  SiteSet hmSet;
  hmSet.loadBimFile(FLAG_hapmap);
  fprintf(stderr, "%zu Hapmap sites loaded.\n", hmSet.getTotalSite());

  const char* fn = FLAG_inVcf.c_str();
  LineReader lr(fn);

  // // set range filters here
  // // e.g.
  // // vin.setRangeList("1:69500-69600");
  // vin.setRangeList(FLAG_rangeList.c_str());
  // vin.setRangeFile(FLAG_rangeFile.c_str());

  std::map<std::string, Variant> freq;
  std::string chrom;
  int pos;
  std::string filt;
  char ref, alt;
  bool inDbSnp;
  bool inHapmap;
  int lineNo = 0;
  std::vector<std::string> fd;
  while (lr.readLineBySep(&fd, " \t")) {
    lineNo++;
    if (fd[0][0] == '#') continue;  // skip header
    chrom = fd[0];                  // ref is on column 0 (0-based)
    pos = atoi(fd[1]);              // ref is on column 1 (0-based)
    ref = fd[3][0];                 // ref is on column 3 (0-based)
    alt = fd[4][0];                 // ref is on column 4 (0-based)
    filt = fd[6];                   // filt is on column 6 (0-based)
    inDbSnp = snpSet.isIncluded(chrom.c_str(), pos);
    inHapmap = hmSet.isIncluded(chrom.c_str(), pos);

    Variant& v = freq[filt];
    v.total++;
    if (isTs(ref, alt)) {
      v.ts++;
      if (inDbSnp) {
        v.tsInDbSnp++;
        v.dbSnp++;
      }
    } else if (isTv(ref, alt)) {
      v.tv++;
      if (inDbSnp) {
        v.tvInDbSnp++;
        v.dbSnp++;
      }
    };
    if (inHapmap) v.hapmap++;
  };
  fprintf(stdout, "Total %d VCF records have converted successfully\n", lineNo);

  //////////////////////////////////////////////////////////////////////
  std::string title = "Summarize per combined filter";
  int pad = (170 - title.size()) / 2;
  std::string outTitle = std::string(pad, '-') + title + std::string(pad, '-');
  puts(outTitle.c_str());
  printf("%40s\t%10s\t%10s\t%10s\t%10s\t%10s\t%10s\t%10s\t%10s\n", "Filter",
         "#SNPs", "#dbSNP", "%dbSNP", "Known Ts/Tv", "Novel Ts/Tv", "Overall",
         "%TotalHM3", "%HMCalled");
  std::map<std::string, Variant> indvFreq;
  Variant pass;
  Variant fail;
  Variant total;
  std::vector<std::string> filters;  // individual filter
  for (std::map<std::string, Variant>::iterator i = freq.begin();
       i != freq.end(); ++i) {
    const std::string& filt = i->first;
    const Variant& v = i->second;
    v.print(filt, hmSet);

    // calculate indvFreq, pass, fail and total
    stringTokenize(filt, ';', &filters);
    for (unsigned int j = 0; j < filters.size(); j++) {
      const std::string& filt = filters[j];
      indvFreq[filt] += v;
    }
    if (filt == "PASS")
      pass += v;
    else
      fail += v;
    total += v;
  };
  //////////////////////////////////////////////////////////////////////
  title = "Summarize per individual filter";
  pad = (170 - title.size()) / 2;
  outTitle = std::string(pad, '-') + title + std::string(pad, '-');
  puts(outTitle.c_str());
  printf("%40s\t%10s\t%10s\t%10s\t%10s\t%10s\t%10s\t%10s\t%10s\n", "Filter",
         "#SNPs", "#dbSNP", "%dbSNP", "Known Ts/Tv", "Novel Ts/Tv", "Overall",
         "%TotalHM3", "%HMCalled");
  for (std::map<std::string, Variant>::iterator i = indvFreq.begin();
       i != indvFreq.end(); ++i) {
    const std::string& filt = i->first;
    const Variant& v = i->second;
    v.print(filt, hmSet);
  }
  //////////////////////////////////////////////////////////////////////
  title = "Summarize per pass/fail filter";
  pad = (170 - title.size()) / 2;
  outTitle = std::string(pad, '-') + title + std::string(pad, '-');
  puts(outTitle.c_str());
  printf("%40s\t%10s\t%10s\t%10s\t%10s\t%10s\t%10s\t%10s\t%10s\n", "Filter",
         "#SNPs", "#dbSNP", "%dbSNP", "Known Ts/Tv", "Novel Ts/Tv", "Overall",
         "%TotalHM3", "%HMCalled");

  pass.print("PASS", hmSet);
  fail.print("FAIL", hmSet);
  total.print("TOTAL", hmSet);

  currentTime = time(0);
  fprintf(stderr, "Analysis end at: %s", ctime(&currentTime));
  return 0;
};
Exemplo n.º 5
0
int main(int argc, char* argv[]) {
  time_t currentTime = time(0);
  fprintf(stderr, "Analysis started at: %s", ctime(&currentTime));

  PARSE_PARAMETER(argc, argv);
  PARAMETER_STATUS();

  if (FLAG_REMAIN_ARG.size() > 0) {
    fprintf(stderr, "Unparsed arguments: ");
    for (unsigned int i = 0; i < FLAG_REMAIN_ARG.size(); i++) {
      fprintf(stderr, " %s", FLAG_REMAIN_ARG[i].c_str());
    }
    fprintf(stderr, "\n");
    abort();
  }

  REQUIRE_STRING_PARAMETER(FLAG_inPlink,
                           "Please provide input file using: --inPlink");

  PlinkInputFile* pin = new PlinkInputFile(FLAG_inPlink.c_str());
  FILE* fout = fopen(FLAG_outVcf.c_str(), "wt");
  FILE* flog = fopen((FLAG_outVcf + ".log").c_str(), "wt");

  int numPeople = pin->getNumIndv();
  int numMarker = pin->getNumMarker();
  fprintf(stderr, "Loaded %d individuals and %d markers\n", numPeople,
          numMarker);
  fprintf(flog, "Loaded %d individuals and %d markers\n", numPeople, numMarker);

  fprintf(fout, "##fileformat=VCFv4.0\n");
  fprintf(fout, "##filedate=\n");
  fprintf(fout, "##source=plink2vcf\n");
  fprintf(fout, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT");

  // writer header
  for (int p = 0; p < numPeople; p++) {
    fprintf(fout, "\t%s", pin->indv[p].c_str());
  };
  fprintf(fout, "\n");

  // load reference allele
  std::map<std::string, char> reference;
  int ret = laodReference(FLAG_reference, &reference);
  if (ret) {
    fprintf(stderr, "Loaded %d referenced bases.\n", ret);
    fprintf(flog, "Loaded %d referenced bases.\n", ret);
  };
  // write content
  SimpleMatrix mat;
  ret = pin->readIntoMatrix(&mat);

  //   const char ref = 'N';
  //   const char alt = 'N';
  bool switchRefAlt;
  int switchSite = 0;
  int needFlip = 0;
  for (int m = 0; m < numMarker; m++) {
    switchRefAlt = false;
    if (reference.size() > 0 && reference.count(pin->snp[m]) > 0) {
      char refGiven = reference[pin->snp[m]];

      if (pin->ref[m][0] == refGiven) {
        switchRefAlt = false;
      } else {
        if (pin->alt[m][0] == refGiven) {
          switchRefAlt = true;
          ++switchSite;
          fprintf(flog, "Marker [ %s ] switched ref and alt.\n",
                  pin->snp[m].c_str());
        } else {
          ++needFlip;
          fprintf(flog, "Marker [ %s ] need flipping.\n", pin->snp[m].c_str());
        };
      }
    }
    if (switchRefAlt == false) {
      fprintf(fout, "%s\t", pin->chrom[m].c_str());  // CHROM
      fprintf(fout, "%d\t", pin->pos[m]);            // POS
      fprintf(fout, "%s\t", pin->snp[m].c_str());    // ID
      fprintf(fout, "%c\t", pin->ref[m][0]);         // REF
      fprintf(fout, "%c\t", pin->alt[m][0]);         // ALT
      fprintf(fout, ".\t");                          // QUAL
      fprintf(fout, ".\t");                          // FILTER
      fprintf(fout, ".\t");                          // INFO
      fprintf(fout, "GT\t");                         // FORMAT
      for (int p = 0; p < numPeople; p++) {
        if (p) fputc('\t', fout);

        int geno = mat[p][m];
        switch (geno) {
          case 0:
            fputs("0/0", fout);
            break;
          case 1:
            fputs("0/1", fout);
            break;
          case 2:
            fputs("1/1", fout);
            break;
          default:
            fputs("./.", fout);
            break;
        }
      }
      fprintf(fout, "\n");
    } else {
      fprintf(fout, "%s\t", pin->chrom[m].c_str());  // CHROM
      fprintf(fout, "%d\t", pin->pos[m]);            // POS
      fprintf(fout, "%s\t", pin->snp[m].c_str());    // ID
      fprintf(fout, "%c\t", pin->alt[m][0]);         // REF
      fprintf(fout, "%c\t", pin->ref[m][0]);         // ALT
      fprintf(fout, ".\t");                          // QUAL
      fprintf(fout, ".\t");                          // FILTER
      fprintf(fout, ".\t");                          // INFO
      fprintf(fout, "GT\t");                         // FORMAT
      for (int p = 0; p < numPeople; p++) {
        if (p) fputc('\t', fout);

        int geno = mat[p][m];
        switch (geno) {
          case 0:
            fputs("1/1", fout);
            break;
          case 1:
            fputs("0/1", fout);
            break;
          case 2:
            fputs("0/0", fout);
            break;
          default:
            fputs("./.", fout);
            break;
        }
      }
      fprintf(fout, "\n");
    }
  }

  fclose(fout);
  fclose(flog);

  if (switchSite) {
    fprintf(stderr, "%d SNPs switched ref and alt, see log file.\n",
            switchSite);
  };
  if (needFlip) {
    fprintf(stderr, "%d SNPs need flipping, see log file.\n", needFlip);
  };
  return 0;
}
Exemplo n.º 6
0
// Entry point: convert a VCF file (--inVcf) into Merlin-style text files.
//
// Four files are written, all named after the --outMerlin prefix:
//   <prefix>.map  CHROMOSOME / MARKER / POSITION, one line per record
//   <prefix>.dat  one "M <marker>" line per record
//   <prefix>.ped  FID IID 0 0 0 followed by one genotype column per marker
//   <prefix>.pid  one person ID per line, in PED order
// Records whose ID is "." are named "<chrom>:<pos>" in the map and dat files.
// Genotype codes 0/1/2 become 0/0, 0/1, 1/1; any other code becomes x/x.
//
// Returns 0 on success and 1 when an output file cannot be opened; calls
// abort() on unparsed arguments or when a record has no GT field.
int main(int argc, char** argv){
    time_t currentTime = time(0);
    fprintf(stderr, "Analysis started at: %s", ctime(&currentTime));

    ////////////////////////////////////////////////
    BEGIN_PARAMETER_LIST(pl)
        ADD_PARAMETER_GROUP(pl, "Input/Output")
        ADD_STRING_PARAMETER(pl, inVcf, "--inVcf", "input VCF File")
        ADD_STRING_PARAMETER(pl, outMerlin, "--outMerlin", "output prefix")
        ADD_PARAMETER_GROUP(pl, "People Filter")
        ADD_STRING_PARAMETER(pl, peopleIncludeID, "--peopleIncludeID", "give IDs of people that will be included in study")
        ADD_STRING_PARAMETER(pl, peopleIncludeFile, "--peopleIncludeFile", "from given file, set IDs of people that will be included in study")
        ADD_STRING_PARAMETER(pl, peopleExcludeID, "--peopleExcludeID", "give IDs of people that will be excluded from study")
        ADD_STRING_PARAMETER(pl, peopleExcludeFile, "--peopleExcludeFile", "from given file, set IDs of people that will be excluded from study")
        ADD_PARAMETER_GROUP(pl, "Site Filter")
        ADD_STRING_PARAMETER(pl, rangeList, "--rangeList", "Specify some ranges to use, please use chr:begin-end format.")
        ADD_STRING_PARAMETER(pl, rangeFile, "--rangeFile", "Specify the file containing ranges, please use chr:begin-end format.")
        END_PARAMETER_LIST(pl)
        ;

    pl.Read(argc, argv);
    pl.Status();

    if (FLAG_REMAIN_ARG.size() > 0){
        fprintf(stderr, "Unparsed arguments: ");
        for (unsigned int i = 0; i < FLAG_REMAIN_ARG.size(); i++){
            fprintf(stderr, " %s", FLAG_REMAIN_ARG[i].c_str());
        }
        fprintf(stderr, "\n");
        abort();
    }

    REQUIRE_STRING_PARAMETER(FLAG_inVcf, "Please provide input file using: --inVcf");

    const char* fn = FLAG_inVcf.c_str();
    VCFInputFile vin(fn);

    // set range filters here
    // e.g.
    // vin.setRangeList("1:69500-69600");
    vin.setRangeList(FLAG_rangeList.c_str());
    vin.setRangeFile(FLAG_rangeFile.c_str());

    // set people filters here
    if (FLAG_peopleIncludeID.size() || FLAG_peopleIncludeFile.size()) {
        vin.excludeAllPeople();
        vin.includePeople(FLAG_peopleIncludeID.c_str());
        vin.includePeopleFromFile(FLAG_peopleIncludeFile.c_str());
    }
    vin.excludePeople(FLAG_peopleExcludeID.c_str());
    vin.excludePeopleFromFile(FLAG_peopleExcludeFile.c_str());

    // let's write it out.
    // fMap: CHROMOSOME   MARKER          POSITION
    // fDat: A some_disease
    //       T some_trait
    //       M some_marker
    //       M another_marker
    // fPed: first 5 column: FID, IID, PID, MID, SEX; then follow Dat file
    // fPid: Person ID file, (extra for Merlin), including all people ID as
    //       they are in PED file.
    FILE* fMap = fopen( (FLAG_outMerlin + ".map").c_str(), "wt");
    FILE* fDat = fopen( (FLAG_outMerlin + ".dat").c_str(), "wt");
    FILE* fPed = fopen( (FLAG_outMerlin + ".ped").c_str(), "wt");
    FILE* fPid = fopen( (FLAG_outMerlin + ".pid").c_str(), "wt");
    // Checked explicitly: an assert() disappears when NDEBUG is defined.
    if (!(fMap && fDat && fPed && fPid)) {
        fprintf(stderr, "Cannot open output files with prefix [ %s ] for writing!\n",
                FLAG_outMerlin.c_str());
        if (fMap) fclose(fMap);
        if (fDat) fclose(fDat);
        if (fPed) fclose(fPed);
        if (fPid) fclose(fPid);
        return 1;
    }

    std::string marker;
    std::vector<std::string> allMarker;
    Matrix geno;  // marker x people
    fputs("CHROMOSOME\tMARKER\tPOSITION\n", fMap);

    while (vin.readRecord()){
        VCFRecord& r = vin.getVCFRecord();
        VCFPeople& people = r.getPeople();
        VCFIndividual* indv;
        // write map file
        marker = r.getID();
        if ( marker == "." ) {
            fprintf(fMap, "%s\t%s:%d\t%d\n", r.getChrom(), r.getChrom(), r.getPos(), r.getPos());
            fprintf(fDat, "M\t%s:%d\n", r.getChrom(), r.getPos());
        } else {
            fprintf(fMap, "%s\t%s\t%d\n", r.getChrom(), marker.c_str(), r.getPos());
            fprintf(fDat, "M\t%s\n", marker.c_str());
        }
        allMarker.push_back(marker);

        // Grow the matrix by one row for the record just read.
        // NOTE(review): relies on Dimension() keeping the rows already
        // filled -- confirm against the Matrix implementation.
        geno.Dimension(allMarker.size(), people.size());

        // e.g.: get TAG from INFO field
        // fprintf(stderr, "%s\n", r.getInfoTag("ANNO"));

        const int m = allMarker.size() - 1;
        // The GT index is the same for every person of a record, so look it
        // up once per record instead of once per person.
        const int GTidx = r.getFormatIndex("GT");
        // e.g.: Loop each (selected) people in the same order as in the VCF
        for (int i = 0; i < people.size(); i++) {
            if (GTidx < 0) {
                fprintf(stderr, "Cannot find GT field!\n");
                abort();
            }
            indv = people[i];
            geno[m][i] = (*indv)[GTidx].getGenotype();
        }
    }
    VCFHeader* h = vin.getVCFHeader();
    std::vector< std::string> peopleId;
    // NOTE(review): assumes getPeopleName() yields the same people, in the
    // same order, as r.getPeople() above -- confirm when people filters are
    // active.
    h->getPeopleName(&peopleId);

    // dump PED and PID file
    const int numPeople = (int)peopleId.size();
    const int numMarker = (int)allMarker.size();
    for (int p = 0; p < numPeople; p++){
        fprintf(fPed, "%s\t%s\t0\t0\t0", peopleId[p].c_str(), peopleId[p].c_str());
        for (int m = 0; m < numMarker; m++){
            int g = (int)geno[m][p];
            switch (g){
            case 0:
                fputs("\t0/0", fPed);
                break;
            case 1:
                fputs("\t0/1", fPed);
                break;
            case 2:
                fputs("\t1/1", fPed);
                break;
            default:
                // Needs the same leading tab as the other cases; without it
                // this genotype is fused onto the previous column.
                fputs("\tx/x", fPed);
                break;
            }
        }
        fputs("\n", fPed);

        fprintf(fPid, "%s\n", peopleId[p].c_str());
    }

    fclose(fMap);
    fclose(fDat);
    fclose(fPed);
    fclose(fPid);
    return 0;
}
Exemplo n.º 7
0
// Entry point: summarise the records of --inVcf grouped by annotation.
//
// Each data line is split on spaces/tabs; the grouping key is whatever
// extractAnno() returns for the INFO column (column 7, 0-based). Per group a
// Variant tallies: total, ts/tv (as classified by isTs()/isTv() on the first
// character of REF and ALT), membership in the dbSNP site set (--snp) and in
// the HapMap site set (--hapmap). One table plus a TOTAL row is printed to
// stdout.
//
// Returns 0 on completion; calls abort() when unparsed arguments remain.
int main(int argc, char** argv){
  time_t currentTime = time(0);
  fprintf(stderr, "Analysis started at: %s", ctime(&currentTime));

  ////////////////////////////////////////////////
  BEGIN_PARAMETER_LIST(pl)
      ADD_PARAMETER_GROUP(pl, "Input/Output")
      ADD_STRING_PARAMETER(pl, inVcf, "--inVcf", "input VCF File")
      ADD_STRING_PARAMETER(pl, snp, "--snp", "input dbSNP File (.rod)")
      ADD_STRING_PARAMETER(pl, hapmap, "--hapmap", "input HapMap File (.bim)")
      ADD_PARAMETER_GROUP(pl, "Site Filter")
      ADD_STRING_PARAMETER(pl, rangeList, "--rangeList", "Specify some ranges to use, please use chr:begin-end format.")
      ADD_STRING_PARAMETER(pl, rangeFile, "--rangeFile", "Specify the file containing ranges, please use chr:begin-end format.")
      END_PARAMETER_LIST(pl)
      ;

  pl.Read(argc, argv);
  pl.Status();

  if (FLAG_REMAIN_ARG.size() > 0){
    fprintf(stderr, "Unparsed arguments: ");
    for (unsigned int i = 0; i < FLAG_REMAIN_ARG.size(); i++){
      fprintf(stderr, " %s", FLAG_REMAIN_ARG[i].c_str());
    }
    fprintf(stderr, "\n");
    abort();
  }

  REQUIRE_STRING_PARAMETER(FLAG_inVcf, "Please provide input file using: --inVcf");

  // Fall back to the built-in dbSNP site list when --snp was not supplied.
  const char defaultDbSnp[] = "/net/fantasia/home/zhanxw/amd/data/umake-resources/dbSNP/dbsnp_129_b37.rod.map";
  if (FLAG_snp == "") {
    FLAG_snp = defaultDbSnp;
    fprintf(stderr, "Use default dbsnp: [ %s ]\n", defaultDbSnp);
  }
  SiteSet snpSet;
  snpSet.loadRodFile(FLAG_snp);
  fprintf(stderr, "%zu dbSNP sites loaded.\n", snpSet.getTotalSite());

  // Fall back to the built-in HapMap site list when --hapmap was not supplied.
  const char defaultHM3[] =  "/net/fantasia/home/zhanxw/amd/data/umake-resources/HapMap3/hapmap3_r3_b37_fwd.consensus.qc.poly.bim";
  if (FLAG_hapmap == "") {
    FLAG_hapmap = defaultHM3;
    fprintf(stderr, "Use default HapMap: [ %s ]\n", defaultHM3);
  }
  SiteSet hmSet;
  hmSet.loadBimFile(FLAG_hapmap);
  fprintf(stderr, "%zu Hapmap sites loaded.\n", hmSet.getTotalSite());

  const char* fn = FLAG_inVcf.c_str();
  LineReader lr(fn);

  // // set range filters here
  // // e.g.
  // // vin.setRangeList("1:69500-69600");
  // vin.setRangeList(FLAG_rangeList.c_str());
  // vin.setRangeFile(FLAG_rangeFile.c_str());

  std::map<std::string, Variant> freq;  // annotation -> counts
  std::string anno;
  int lineNo = 0;
  std::vector<std::string> fd;
  while(lr.readLineBySep(&fd, " \t")){
    lineNo ++;
    if (fd.empty() || fd[0].empty()) continue; // blank line
    if (fd[0][0] == '#') continue; // skip header
    if (fd.size() < 8) {
      // Columns 0..7 are read below; indexing a shorter line would be an
      // out-of-bounds access.
      fprintf(stderr, "Skip line %d: expected at least 8 columns, found %d\n",
              lineNo, (int)fd.size());
      continue;
    }
    const std::string& chrom = fd[0]; // chrom is on column 0 (0-based)
    const int pos = atoi(fd[1]); // pos is on column 1 (0-based)
    const char ref = fd[3][0]; // ref is on column 3 (0-based)
    const char alt = fd[4][0]; // alt is on column 4 (0-based)
    anno = extractAnno(fd[7]); // info is on column 7 (0-based), we will extract ANNO=
    const bool inDbSnp = snpSet.isIncluded(chrom.c_str(), pos);
    const bool inHapmap = hmSet.isIncluded(chrom.c_str(), pos);

    Variant& v = freq[anno];
    v.total++;
    if ( isTs(ref, alt) ) {
      v.ts ++;
      if (inDbSnp) {
        v.tsInDbSnp ++;
        v.dbSnp ++;
      }
    } else if (isTv(ref, alt)) {
      v.tv ++;
      if (inDbSnp) {
        v.tvInDbSnp ++;
        v.dbSnp ++;
      }
    }
    if (inHapmap)
      v.hapmap ++;

    // Progress indicator, rewritten in place every 10000 lines.
    if (lineNo % 10000 == 0) {
      fprintf(stderr, "\rProcessed %d lines...\r", lineNo);
    }
  }
  fprintf(stdout, "Total %d VCF records have been read successfully\n", lineNo);

  //////////////////////////////////////////////////////////////////////
  std::string title = "Summarize per annotation type";
  int pad = (170 - title.size() ) /2 ;
  std::string outTitle = std::string(pad, '-') + title + std::string(pad, '-');
  puts(outTitle.c_str());
  printf("%40s\t%10s\t%10s\t%10s\t%10s\t%10s\t%10s\t%10s\t%10s\n",
         "Filter", "#SNPs", "#dbSNP", "%dbSNP", "Known Ts/Tv", "Novel Ts/Tv", "Overall", "%TotalHM3", "%HMCalled");
  Variant total;
  for (std::map<std::string, Variant>::iterator i = freq.begin() ; i != freq.end(); ++i ){
    i->second.print(i->first, hmSet);
    total += i->second;
  }
  total.print("TOTAL", hmSet);

  currentTime = time(0);
  fprintf(stderr, "Analysis end at: %s", ctime(&currentTime));
  return 0;
}
Exemplo n.º 8
0
int main(int argc, char** argv) {
  time_t currentTime = time(0);
  fprintf(stderr, "Analysis started at: %s", ctime(&currentTime));

  PARSE_PARAMETER(argc, argv);
  PARAMETER_STATUS();

  if (FLAG_REMAIN_ARG.size() > 0) {
    fprintf(stderr, "Unparsed arguments: ");
    for (unsigned int i = 0; i < FLAG_REMAIN_ARG.size(); i++) {
      fprintf(stderr, " %s", FLAG_REMAIN_ARG[i].c_str());
    }
    fprintf(stderr, "\n");
    abort();
  }

  REQUIRE_STRING_PARAMETER(FLAG_inVcf,
                           "Please provide input file using: --inVcf");

  const char defaultDbSnp[] =
      "/net/fantasia/home/zhanxw/amd/data/umake-resources/dbSNP/"
      "dbsnp_129_b37.rod.map";
  if (FLAG_snp == "") {
    FLAG_snp = defaultDbSnp;
    fprintf(stderr, "Use default dbsnp: [ %s ]\n", defaultDbSnp);
  }
  SiteSet snpSet;
  snpSet.loadRodFile(FLAG_snp);
  fprintf(stderr, "%zu dbSNP sites loaded.\n", snpSet.getTotalSite());

  const char defaultHM3[] =
      "/net/fantasia/home/zhanxw/amd/data/umake-resources/HapMap3/"
      "hapmap3_r3_b37_fwd.consensus.qc.poly.bim";
  if (FLAG_hapmap == "") {
    FLAG_hapmap = defaultHM3;
    fprintf(stderr, "Use default HapMap: [ %s ]\n", defaultHM3);
  }
  SiteSet hmSet;
  hmSet.loadBimFile(FLAG_hapmap);
  fprintf(stderr, "%zu Hapmap sites loaded.\n", hmSet.getTotalSite());

  const char* fn = FLAG_inVcf.c_str();
  LineReader lr(fn);

  // // set range filters here
  // // e.g.
  // // vin.setRangeList("1:69500-69600");
  // vin.setRangeList(FLAG_rangeList.c_str());
  // vin.setRangeFile(FLAG_rangeFile.c_str());

  std::map<std::string, Variant> freq;
  std::string chrom;
  int pos;
  // std::string filt;
  // std::string anno;
  std::string numVariant;
  char ref, alt;
  bool inDbSnp;
  bool inHapmap;
  int lineNo = 0;
  std::vector<std::string> fd;
  while (lr.readLineBySep(&fd, " \t")) {
    lineNo++;
    if (fd[0][0] == '#') continue;  // skip header
    chrom = fd[0];                  // ref is on column 0 (0-based)
    pos = atoi(fd[1]);              // ref is on column 1 (0-based)
    ref = fd[3][0];                 // ref is on column 3 (0-based)
    alt = fd[4][0];                 // ref is on column 4 (0-based)
    // filt = fd[6]; // filt is on column 6 (0-based)
    // anno = extractAnno(fd[7]); // info is on column 7 (0-based), we will
    // extract ANNO=

    // obtain number of variants
    if (fd.size() <= 9) {  // first 9 columns are not individuals
      numVariant = toString(0);
    } else {
      int numVar = 0;
      for (size_t i = 9; i < fd.size(); ++i) {
        int varCount = countVariant(fd[i]);
        if (varCount > 0) numVar += varCount;
      }
      numVariant = toString(numVar);
    }

    inDbSnp = snpSet.isIncluded(chrom.c_str(), pos);
    inHapmap = hmSet.isIncluded(chrom.c_str(), pos);

    Variant& v = freq[numVariant];
    v.total++;
    if (isTs(ref, alt)) {
      v.ts++;
      if (inDbSnp) {
        v.tsInDbSnp++;
        v.dbSnp++;
      }
    } else if (isTv(ref, alt)) {
      v.tv++;
      if (inDbSnp) {
        v.tvInDbSnp++;
        v.dbSnp++;
      }
    };
    if (inHapmap) v.hapmap++;

    if (lineNo % 10000 == 0) {
      fprintf(stderr, "\rProcessed %d lines...\r", lineNo);
    }
  };
  fprintf(stdout, "Total %d VCF records have been read successfully\n", lineNo);

  //////////////////////////////////////////////////////////////////////
  std::string title = "Summarize per annotation type";
  int pad = (170 - title.size()) / 2;
  std::string outTitle = std::string(pad, '-') + title + std::string(pad, '-');
  puts(outTitle.c_str());
  printf("%40s\t%10s\t%10s\t%10s\t%10s\t%10s\t%10s\t%10s\t%10s\n", "Filter",
         "#SNPs", "#dbSNP", "%dbSNP", "Known Ts/Tv", "Novel Ts/Tv", "Overall",
         "%TotalHM3", "%HMCalled");
  std::map<std::string, Variant> indvFreq;
  Variant total;

  // to sort variants by its integer order, we use a temporary map
  std::map<int, Variant> tmp;
  for (std::map<std::string, Variant>::iterator i = freq.begin();
       i != freq.end(); ++i) {
    tmp[atoi(i->first)] = i->second;
  };
  for (std::map<int, Variant>::iterator i = tmp.begin(); i != tmp.end(); ++i) {
    i->second.print(toString(i->first), hmSet);
    total += i->second;
  };
  total.print("TOTAL", hmSet);

  currentTime = time(0);
  fprintf(stderr, "Analysis end at: %s", ctime(&currentTime));
  return 0;
};
Exemplo n.º 9
0
/**
 * Entry point: dump a genotype table from the VCF file given by --inVcf.
 *
 * Processing order:
 *   1. Parse the command line; abort if any argument was left unparsed.
 *   2. Open the VCF and apply the range, people include/exclude and
 *      ID-update options.
 *   3. If FLAG_geneName is set, look the gene up in the gene file and add its
 *      "chr:begin-end" ranges to the range filter.
 *   4. Print a header line "CHROM POS <sample names...>" to stdout.
 *   5. For every record that passes the optional variant-only filter and the
 *      optional ANNO regular-expression filter, print chromosome, position
 *      and one genotype per sample (tab separated) to stdout.
 *
 * Start/end timestamps and diagnostics go to stderr.
 *
 * @param argc number of command-line arguments
 * @param argv command-line arguments
 * @return 0 when the whole file has been processed; the process abort()s on
 *         unparsed arguments or when a gene name is given without a gene file
 */
int main(int argc, char** argv) {
  time_t currentTime = time(0);
  fprintf(stderr, "Analysis started at: %s", ctime(&currentTime));

  PARSE_PARAMETER(argc, argv);
  PARAMETER_STATUS();

  // Refuse to run when the parser left arguments it did not understand.
  if (FLAG_REMAIN_ARG.size() > 0) {
    fprintf(stderr, "Unparsed arguments: ");
    for (unsigned int i = 0; i < FLAG_REMAIN_ARG.size(); i++) {
      fprintf(stderr, " %s", FLAG_REMAIN_ARG[i].c_str());
    }
    fprintf(stderr, "\n");
    abort();
  }

  REQUIRE_STRING_PARAMETER(FLAG_inVcf,
                           "Please provide input file using: --inVcf");

  const char* fn = FLAG_inVcf.c_str();
  VCFInputFile vin(fn);

  // set range filters here
  // e.g.
  // vin.setRangeList("1:69500-69600");
  vin.setRangeList(FLAG_rangeList.c_str());
  vin.setRangeFile(FLAG_rangeFile.c_str());

  // set people filters here
  // When an include list is given, start from nobody and add people back.
  if (FLAG_peopleIncludeID.size() || FLAG_peopleIncludeFile.size()) {
    vin.excludeAllPeople();
    vin.includePeople(FLAG_peopleIncludeID.c_str());
    vin.includePeopleFromFile(FLAG_peopleIncludeFile.c_str());
  }
  vin.excludePeople(FLAG_peopleExcludeID.c_str());
  vin.excludePeopleFromFile(FLAG_peopleExcludeFile.c_str());

  // optionally rename samples
  if (FLAG_updateId != "") {
    int ret = vin.updateId(FLAG_updateId.c_str());
    fprintf(stdout, "%d samples have updated id.\n", ret);
  }

  // load gene ranges: gene name -> comma separated "chr:begin-end" list
  std::map<std::string, std::string> geneRange;
  if (FLAG_geneName.size()) {
    if (FLAG_geneFile.size() == 0) {
      fprintf(stderr, "Have to provide --geneFile to extract by gene.\n");
      abort();
    }
    LineReader lr(FLAG_geneFile);
    std::vector<std::string> fd;
    while (lr.readLineBySep(&fd, "\t ")) {
      // Guard against a line that produced no fields before touching fd[0].
      if (fd.empty()) continue;
      if (FLAG_geneName != fd[0]) continue;
      // Fields 2 (chromosome), 4 and 5 (range begin/end) are read below, so a
      // matching line needs at least 6 fields; indexing a shorter vector
      // would be undefined behaviour.
      if (fd.size() < 6) {
        fprintf(stderr,
                "Skip malformed line for gene [ %s ] in gene file [ %s ]: "
                "expected at least 6 columns.\n",
                fd[0].c_str(), FLAG_geneFile.c_str());
        continue;
      }
      fd[2] = chopChr(fd[2]);  // chop "chr1" to "1"
      if (geneRange.find(fd[0]) == geneRange.end()) {
        geneRange[fd[0]] = fd[2] + ":" + fd[4] + "-" + fd[5];
      } else {
        geneRange[fd[0]] += "," + fd[2] + ":" + fd[4] + "-" + fd[5];
      }
    }
  }

  // Join the per-gene range strings into one comma separated range list.
  std::string range;
  for (std::map<std::string, std::string>::iterator it = geneRange.begin();
       it != geneRange.end(); ++it) {
    if (range.size() > 0) {
      range += ",";
    }
    range += it->second;
  }
  fprintf(stderr, "range = %s\n", range.c_str());
  vin.setRangeList(range.c_str());

  // The pattern is only loaded when an annotation filter was requested.
  Regex regex;
  if (FLAG_annoType.size()) {
    regex.readPattern(FLAG_annoType);
  }

  // print header
  std::vector<std::string> names;
  vin.getVCFHeader()->getPeopleName(&names);
  printf("CHROM\tPOS");
  for (unsigned int i = 0; i < names.size(); i++) {
    printf("\t%s", names[i].c_str());
  }
  printf("\n");

  // real working part
  // Number of sites skipped by the variant-only filter; it is counted here
  // but not reported anywhere in this function.
  int nonVariantSite = 0;
  while (vin.readRecord()) {
    VCFRecord& r = vin.getVCFRecord();
    VCFPeople& people = r.getPeople();
    VCFIndividual* indv;
    if (FLAG_variantOnly) {
      // Keep the site only if at least one sample has a genotype that is
      // neither 0 nor missing.
      bool hasVariant = false;
      int geno;
      int GTidx = r.getFormatIndex("GT");
      for (size_t i = 0; i < people.size(); i++) {
        indv = people[i];
        geno = indv->justGet(GTidx).getGenotype();
        if (geno != 0 && geno != MISSING_GENOTYPE) {
          hasVariant = true;
          break;  // one variant genotype is enough; no need to scan the rest
        }
      }
      if (!hasVariant) {
        nonVariantSite++;
        continue;
      }
    }

    if (FLAG_annoType.size()) {
      // Skip sites without an ANNO tag or whose ANNO does not match.
      bool isMissing = false;
      const char* tag = r.getInfoTag("ANNO", &isMissing).toStr();
      if (isMissing) continue;
      // fprintf(stdout, "ANNO = %s", tag);
      bool match = regex.match(tag);
      // fprintf(stdout, " %s \t", match ? "match": "noMatch");
      // fprintf(stdout, " %s \n", exists ? "exists": "missing");
      if (!match) {
        continue;
      }
    }

    fprintf(stdout, "%s\t%s", r.getChrom(), r.getPosStr());

    // NOTE(review): this reads FORMAT field 0, presumably relying on GT being
    // the first field, while the variant-only filter above looks the GT index
    // up explicitly -- confirm the two are meant to agree.
    for (size_t i = 0; i < people.size(); i++) {
      indv = people[i];
      fprintf(stdout, "\t%d", indv->justGet(0).getGenotype());
    }
    fprintf(stdout, "\n");
  }

  currentTime = time(0);
  fprintf(stderr, "Analysis ends at: %s", ctime(&currentTime));

  return 0;
}
Exemplo n.º 10
0
int main(int argc, char** argv) {
  ////////////////////////////////////////////////
  BEGIN_PARAMETER_LIST(pl)
  ADD_PARAMETER_GROUP(pl, "Basic Input/Output")
  ADD_STRING_PARAMETER(pl, inVcf, "--inVcf", "Input VCF File")
  ADD_STRING_PARAMETER(pl, outPrefix, "--out", "Output prefix")
  ADD_BOOL_PARAMETER(pl, outputRaw, "--outputRaw",
                     "Output genotypes, phenotype, covariates(if any) and "
                     "collapsed genotype to tabular files")

  ADD_PARAMETER_GROUP(pl, "Specify Covariate")
  ADD_STRING_PARAMETER(pl, cov, "--covar", "Specify covariate file")
  ADD_STRING_PARAMETER(
      pl, covName, "--covar-name",
      "Specify the column name in covariate file to be included in analysis")
  ADD_BOOL_PARAMETER(pl, sex, "--sex",
                     "Include sex (5th column in the PED file) as a covariate")

  ADD_PARAMETER_GROUP(pl, "Specify Phenotype")
  ADD_STRING_PARAMETER(pl, pheno, "--pheno", "Specify phenotype file")
  ADD_BOOL_PARAMETER(pl, inverseNormal, "--inverseNormal",
                     "Transform phenotype like normal distribution")
  ADD_BOOL_PARAMETER(
      pl, useResidualAsPhenotype, "--useResidualAsPhenotype",
      "Fit covariate ~ phenotype, use residual to replace phenotype")
  ADD_STRING_PARAMETER(pl, mpheno, "--mpheno",
                       "Specify which phenotype column to read (default: 1)")
  ADD_STRING_PARAMETER(pl, phenoName, "--pheno-name",
                       "Specify which phenotype column to read by header")
  ADD_BOOL_PARAMETER(pl, qtl, "--qtl", "Treat phenotype as quantitative trait")
  ADD_STRING_PARAMETER(
      pl, multiplePheno, "--multiplePheno",
      "Specify aa template file for analyses of more than one phenotype")

  ADD_PARAMETER_GROUP(pl, "Specify Genotype")
  ADD_STRING_PARAMETER(pl, dosageTag, "--dosage",
                       "Specify which dosage tag to use. (e.g. EC or DS)")

  ADD_PARAMETER_GROUP(pl, "Chromosome X Options")
  ADD_STRING_PARAMETER(pl, xLabel, "--xLabel",
                       "Specify X chromosome label (default: 23|X)")
  ADD_STRING_PARAMETER(pl, xParRegion, "--xParRegion",
                       "Specify PAR region (default: hg19), can be build "
                       "number e.g. hg38, b37; or specify region, e.g. "
                       "'60001-2699520,154931044-155260560'")

  ADD_PARAMETER_GROUP(pl, "People Filter")
  ADD_STRING_PARAMETER(pl, peopleIncludeID, "--peopleIncludeID",
                       "List IDs of people that will be included in study")
  ADD_STRING_PARAMETER(
      pl, peopleIncludeFile, "--peopleIncludeFile",
      "From given file, set IDs of people that will be included in study")
  ADD_STRING_PARAMETER(pl, peopleExcludeID, "--peopleExcludeID",
                       "List IDs of people that will be included in study")
  ADD_STRING_PARAMETER(
      pl, peopleExcludeFile, "--peopleExcludeFile",
      "From given file, set IDs of people that will be included in study")

  ADD_PARAMETER_GROUP(pl, "Site Filter")
  ADD_STRING_PARAMETER(
      pl, rangeList, "--rangeList",
      "Specify some ranges to use, please use chr:begin-end format.")
  ADD_STRING_PARAMETER(
      pl, rangeFile, "--rangeFile",
      "Specify the file containing ranges, please use chr:begin-end format.")
  ADD_STRING_PARAMETER(pl, siteFile, "--siteFile",
                       "Specify the file containing sites to include, please "
                       "use \"chr pos\" format.")
  ADD_INT_PARAMETER(
      pl, siteDepthMin, "--siteDepthMin",
      "Specify minimum depth(inclusive) to be included in analysis")
  ADD_INT_PARAMETER(
      pl, siteDepthMax, "--siteDepthMax",
      "Specify maximum depth(inclusive) to be included in analysis")
  ADD_INT_PARAMETER(pl, siteMACMin, "--siteMACMin",
                    "Specify minimum Minor Allele Count(inclusive) to be "
                    "included in analysis")
  ADD_STRING_PARAMETER(pl, annoType, "--annoType",
                       "Specify annotation type that is followed by ANNO= in "
                       "the VCF INFO field, regular expression is allowed ")

  ADD_PARAMETER_GROUP(pl, "Genotype Filter")
  ADD_INT_PARAMETER(
      pl, indvDepthMin, "--indvDepthMin",
      "Specify minimum depth(inclusive) of a sample to be included in analysis")
  ADD_INT_PARAMETER(
      pl, indvDepthMax, "--indvDepthMax",
      "Specify maximum depth(inclusive) of a sample to be included in analysis")
  ADD_INT_PARAMETER(
      pl, indvQualMin, "--indvQualMin",
      "Specify minimum depth(inclusive) of a sample to be included in analysis")

  ADD_PARAMETER_GROUP(pl, "Association Model")
  ADD_STRING_PARAMETER(pl, modelSingle, "--single",
                       "Single variant tests, choose from: score, wald, exact, "
                       "famScore, famLrt, famGrammarGamma, firth")
  ADD_STRING_PARAMETER(pl, modelBurden, "--burden",
                       "Burden tests, choose from: cmc, zeggini, mb, exactCMC, "
                       "rarecover, cmat, cmcWald")
  ADD_STRING_PARAMETER(pl, modelVT, "--vt",
                       "Variable threshold tests, choose from: price, analytic")
  ADD_STRING_PARAMETER(
      pl, modelKernel, "--kernel",
      "Kernal-based tests, choose from: SKAT, KBAC, FamSKAT, SKATO")
  ADD_STRING_PARAMETER(pl, modelMeta, "--meta",
                       "Meta-analysis related functions to generate summary "
                       "statistics, choose from: score, cov, dominant, "
                       "recessive")

  ADD_PARAMETER_GROUP(pl, "Family-based Models")
  ADD_STRING_PARAMETER(pl, kinship, "--kinship",
                       "Specify a kinship file for autosomal analysis, use "
                       "vcf2kinship to generate")
  ADD_STRING_PARAMETER(pl, xHemiKinship, "--xHemiKinship",
                       "Provide kinship for the chromosome X hemizygote region")
  ADD_STRING_PARAMETER(pl, kinshipEigen, "--kinshipEigen",
                       "Specify eigen decomposition results of a kinship file "
                       "for autosomal analysis")
  ADD_STRING_PARAMETER(
      pl, xHemiKinshipEigen, "--xHemiKinshipEigen",
      "Specify eigen decomposition results of a kinship file for X analysis")

  ADD_PARAMETER_GROUP(pl, "Grouping Unit ")
  ADD_STRING_PARAMETER(pl, geneFile, "--geneFile",
                       "Specify a gene file (for burden tests)")
  ADD_STRING_PARAMETER(pl, gene, "--gene", "Specify which genes to test")
  ADD_STRING_PARAMETER(pl, setList, "--setList",
                       "Specify a list to test (for burden tests)")
  ADD_STRING_PARAMETER(pl, setFile, "--setFile",
                       "Specify a list file (for burden tests, first 2 "
                       "columns: setName chr:beg-end)")
  ADD_STRING_PARAMETER(pl, set, "--set",
                       "Specify which set to test (1st column)")

  ADD_PARAMETER_GROUP(pl, "Frequency Cutoff")
  /*ADD_BOOL_PARAMETER(pl, freqFromFile, "--freqFromFile", "Obtain frequency
   * from external file")*/
  // ADD_BOOL_PARAMETER(pl, freqFromControl, "--freqFromControl", "Calculate
  // frequency from case samples")
  ADD_DOUBLE_PARAMETER(
      pl, freqUpper, "--freqUpper",
      "Specify upper minor allele frequency bound to be included in analysis")
  ADD_DOUBLE_PARAMETER(
      pl, freqLower, "--freqLower",
      "Specify lower minor allele frequency bound to be included in analysis")

  ADD_PARAMETER_GROUP(pl, "Missing Data")
  ADD_STRING_PARAMETER(
      pl, impute, "--impute",
      "Impute missing genotype (default:mean):  mean, hwe, and drop")
  ADD_BOOL_PARAMETER(
      pl, imputePheno, "--imputePheno",
      "Impute phenotype to mean of those have genotypes but no phenotypes")
  ADD_BOOL_PARAMETER(pl, imputeCov, "--imputeCov",
                     "Impute each covariate to its mean, instead of drop "
                     "samples with missing covariates")

  ADD_PARAMETER_GROUP(pl, "Conditional Analysis")
  ADD_STRING_PARAMETER(pl, condition, "--condition",
                       "Specify markers to be conditions (specify range)")

  ADD_PARAMETER_GROUP(pl, "Auxiliary Functions")
  ADD_BOOL_PARAMETER(pl, noweb, "--noweb", "Skip checking new version")
  ADD_BOOL_PARAMETER(pl, help, "--help", "Print detailed help message")
  END_PARAMETER_LIST(pl);

  pl.Read(argc, argv);

  if (FLAG_help) {
    pl.Help();
    return 0;
  }

  welcome();
  pl.Status();
  if (FLAG_REMAIN_ARG.size() > 0) {
    fprintf(stderr, "Unparsed arguments: ");
    for (unsigned int i = 0; i < FLAG_REMAIN_ARG.size(); i++) {
      fprintf(stderr, " %s", FLAG_REMAIN_ARG[i].c_str());
    }
    exit(1);
  }

  if (!FLAG_outPrefix.size()) FLAG_outPrefix = "rvtest";

  REQUIRE_STRING_PARAMETER(FLAG_inVcf,
                           "Please provide input file using: --inVcf");

  // check new version
  if (!FLAG_noweb) {
    VersionChecker ver;
    if (ver.retrieveRemoteVersion("http://zhanxw.com/rvtests/version") < 0) {
      fprintf(stderr,
              "Retrieve remote version failed, use '--noweb' to skip.\n");
    } else {
      ver.setLocalVersion(VERSION);
      if (ver.isRemoteVersionNewer()) {
        fprintf(stderr, "New version of rvtests is available:");
        ver.printRemoteContent();
      }
    }
  }

  // start logging
  Logger _logger((FLAG_outPrefix + ".log").c_str());
  logger = &_logger;
  logger->info("Program version: %s", VERSION);
  logger->infoToFile("Git Version: %s", GIT_VERSION);
  logger->infoToFile("Parameters BEGIN");
  pl.WriteToFile(logger->getHandle());
  logger->infoToFile("Parameters END");
  logger->sync();

  // start analysis
  time_t startTime = time(0);
  logger->info("Analysis started at: %s", currentTime().c_str());

  GenotypeExtractor ge(FLAG_inVcf);

  // set range filters here
  ge.setRangeList(FLAG_rangeList.c_str());
  ge.setRangeFile(FLAG_rangeFile.c_str());

  // set people filters here
  if (FLAG_peopleIncludeID.size() || FLAG_peopleIncludeFile.size()) {
    ge.excludeAllPeople();
    ge.includePeople(FLAG_peopleIncludeID.c_str());
    ge.includePeopleFromFile(FLAG_peopleIncludeFile.c_str());
  }
  ge.excludePeople(FLAG_peopleExcludeID.c_str());
  ge.excludePeopleFromFile(FLAG_peopleExcludeFile.c_str());

  if (FLAG_siteDepthMin > 0) {
    ge.setSiteDepthMin(FLAG_siteDepthMin);
    logger->info("Set site depth minimum to %d", FLAG_siteDepthMin);
  }
  if (FLAG_siteDepthMax > 0) {
    ge.setSiteDepthMax(FLAG_siteDepthMax);
    logger->info("Set site depth maximum to %d", FLAG_siteDepthMax);
  }
  if (FLAG_siteMACMin > 0) {
    ge.setSiteMACMin(FLAG_siteMACMin);
    logger->info("Set site minimum MAC to %d", FLAG_siteDepthMin);
  }
  if (FLAG_annoType != "") {
    ge.setAnnoType(FLAG_annoType.c_str());
    logger->info("Set annotype type filter to %s", FLAG_annoType.c_str());
  }

  std::vector<std::string> vcfSampleNames;
  ge.getPeopleName(&vcfSampleNames);
  logger->info("Loaded [ %zu ] samples from VCF files", vcfSampleNames.size());

  DataLoader dataLoader;
  dataLoader.setPhenotypeImputation(FLAG_imputePheno);
  dataLoader.setCovariateImputation(FLAG_imputeCov);

  if (FLAG_multiplePheno.empty()) {
    dataLoader.loadPhenotype(FLAG_pheno, FLAG_mpheno, FLAG_phenoName);

    // // load phenotypes
    // std::map<std::string, double> phenotype;
    // if (FLAG_pheno.empty()) {
    //   logger->error("Cannot do association when phenotype is missing!");
    //   return -1;
    // }

    // // check if alternative phenotype columns are used
    // if (!FLAG_mpheno.empty() && !FLAG_phenoName.empty()) {
    //   logger->error("Please specify either --mpheno or --pheno-name");
    //   return -1;
    // }
    // if (!FLAG_mpheno.empty()) {
    //   int col = atoi(FLAG_mpheno);
    //   int ret = loadPedPhenotypeByColumn(FLAG_pheno.c_str(), &phenotype,
    //   col);
    //   if (ret < 0) {
    //     logger->error("Loading phenotype failed!");
    //     return -1;
    //   }
    // } else if (!FLAG_phenoName.empty()) {
    //   int ret = loadPedPhenotypeByHeader(FLAG_pheno.c_str(), &phenotype,
    //                                      FLAG_phenoName.c_str());
    //   if (ret < 0) {
    //     logger->error("Loading phenotype failed!");
    //     return -1;
    //   }
    // } else {
    //   int col = 1;  // default use the first phenotype
    //   int ret = loadPedPhenotypeByColumn(FLAG_pheno.c_str(), &phenotype,
    //   col);
    //   if (ret < 0) {
    //     logger->error("Loading phenotype failed!");
    //     return -1;
    //   }
    // }
    // logger->info("Loaded [ %zu ] sample pheontypes.", phenotype.size());

    // rearrange phenotypes
    // drop samples from phenotype or vcf
    matchPhenotypeAndVCF("missing phenotype", &dataLoader, &ge);

    // // phenotype names (vcf sample names) arranged in the same order as in
    // VCF
    // std::vector<std::string> phenotypeNameInOrder;
    // std::vector<double>
    //     phenotypeInOrder;  // phenotype arranged in the same order as in VCF
    // rearrange(phenotype, vcfSampleNames, &vcfSampleToDrop,
    // &phenotypeNameInOrder,
    //           &phenotypeInOrder, FLAG_imputePheno);
    // if (vcfSampleToDrop.size()) {
    //   // exclude this sample from parsing VCF
    //   ge.excludePeople(vcfSampleToDrop);
    //   // output dropped samples
    //   for (size_t i = 0; i < vcfSampleToDrop.size(); ++i) {
    //     if (i == 0)
    //       logger->warn(
    //           "Total [ %zu ] samples are dropped from VCF file due to missing
    //           "
    //           "phenotype",
    //           vcfSampleToDrop.size());
    //     if (i >= 10) {
    //       logger->warn(
    //           "Skip outputting additional [ %d ] samples with missing "
    //           "phenotypes.",
    //           ((int)vcfSampleToDrop.size() - 10));
    //       break;
    //     }
    //     logger->warn("Drop sample [ %s ] from VCF file due to missing
    //     phenotype",
    //                  (vcfSampleToDrop)[i].c_str());
    //   }
    //   // logger->warn("Drop %zu sample from VCF file since we don't have
    //   their
    //   // phenotypes", vcfSampleToDrop.size());
    // }
    // if (phenotypeInOrder.size() != phenotype.size()) {
    //   logger->warn(
    //       "Drop [ %d ] samples from phenotype file due to missing genotypes
    //       from "
    //       "VCF files",
    //       (int)(phenotype.size() - phenotypeInOrder.size()));
    //   // We may output these samples by comparing keys of phenotype and
    //   // phenotypeNameInOrder
    // }
    dataLoader.loadCovariate(FLAG_cov, FLAG_covName);
    matchCovariateAndVCF("missing covariate", &dataLoader, &ge);

    // // load covariate
    // Matrix covariate;
    // HandleMissingCov handleMissingCov = COVARIATE_DROP;
    // if (FLAG_imputeCov) {
    //   handleMissingCov = COVARIATE_IMPUTE;
    // }
    // if (FLAG_cov.empty() && !FLAG_covName.empty()) {
    //   logger->info("Use phenotype file as covariate file [ %s ]",
    //                FLAG_pheno.c_str());
    //   FLAG_cov = FLAG_pheno;
    // }
    // if (!FLAG_cov.empty()) {
    //   logger->info("Begin to read covariate file.");
    //   std::vector<std::string> columnNamesInCovariate;
    //   std::set<std::string> sampleToDropInCovariate;
    //   int ret = loadCovariate(FLAG_cov.c_str(), phenotypeNameInOrder,
    //                           FLAG_covName.c_str(), handleMissingCov,
    //                           &covariate,
    //                           &columnNamesInCovariate,
    //                           &sampleToDropInCovariate);
    //   if (ret < 0) {
    //     logger->error("Load covariate file failed !");
    //     exit(1);
    //   }

    //   // drop phenotype samples
    //   if (!sampleToDropInCovariate.empty()) {
    //     int idx = 0;
    //     int n = phenotypeNameInOrder.size();
    //     for (int i = 0; i < n; ++i) {
    //       if (sampleToDropInCovariate.count(phenotypeNameInOrder[i]) !=
    //           0) {  // need to drop
    //         continue;
    //       }
    //       phenotypeNameInOrder[idx] = phenotypeNameInOrder[i];
    //       phenotypeInOrder[idx] = phenotypeInOrder[i];
    //       idx++;
    //     }
    //     phenotypeNameInOrder.resize(idx);
    //     phenotypeInOrder.resize(idx);
    //     logger->warn(
    //         "[ %zu ] sample phenotypes are dropped due to lacking
    //         covariates.",
    //         sampleToDropInCovariate.size());
    //   }
    //   // drop vcf samples;
    //   for (std::set<std::string>::const_iterator iter =
    //            sampleToDropInCovariate.begin();
    //        iter != sampleToDropInCovariate.end(); ++iter) {
    //     ge.excludePeople(iter->c_str());
    //   }
    // }
  } else {
    dataLoader.loadMultiplePhenotype(FLAG_multiplePheno, FLAG_pheno, FLAG_cov);

    matchPhenotypeAndVCF("missing phenotype", &dataLoader, &ge);
    matchCovariateAndVCF("missing covariate", &dataLoader, &ge);
  }

  dataLoader.loadSex();
  if (FLAG_sex) {
    dataLoader.useSexAsCovariate();
    matchCovariateAndVCF("missing sex", &dataLoader, &ge);
  }
  // // load sex
  // std::vector<int> sex;
  // if (loadSex(FLAG_pheno, phenotypeNameInOrder, &sex)) {
  //   logger->error("Cannot load sex of samples from phenotype file");
  //   exit(1);
  // }

  // if (FLAG_sex) {            // append sex in covariate
  //   std::vector<int> index;  // mark missing samples
  //   int numMissing = findMissingSex(sex, &index);
  //   logger->info("Futher exclude %d samples with missing sex", numMissing);
  //   removeByIndex(index, &sex);
  //   excludeSamplesByIndex(index, &ge, &phenotypeNameInOrder,
  //   &phenotypeInOrder,
  //                         &covariate);
  //   appendToMatrix("Sex", sex, &covariate);
  // }

  if (!FLAG_condition.empty()) {
    dataLoader.loadMarkerAsCovariate(FLAG_inVcf, FLAG_condition);
    matchCovariateAndVCF("missing in conditioned marker(s)", &dataLoader, &ge);
  }
  // // load conditional markers
  // if (!FLAG_condition.empty()) {
  //   Matrix geno;
  //   std::vector<std::string> rowLabel;
  //   if (loadMarkerFromVCF(FLAG_inVcf, FLAG_condition, &rowLabel, &geno) < 0)
  //   {
  //     logger->error("Load conditional markers [ %s ] from [ %s ] failed.",
  //                   FLAG_condition.c_str(), FLAG_inVcf.c_str());
  //     exit(1);
  //   }
  //   if (appendGenotype(&covariate, phenotypeNameInOrder, geno, rowLabel) < 0)
  //   {
  //     logger->error(
  //         "Failed to combine conditional markers [ %s ] from [ %s ] failed.",
  //         FLAG_condition.c_str(), FLAG_inVcf.c_str());
  //     exit(1);
  //   }
  // }

  dataLoader.checkConstantCovariate();
  // // check if some covariates are constant for all samples
  // // e.g. user may include covariate "1" in addition to intercept
  // //      in such case, we will give a fatal error
  // for (int i = 0; i < covariate.cols; ++i) {
  //   std::set<double> s;
  //   s.clear();
  //   for (int j = 0; j < covariate.rows; ++j) {
  //     s.insert(covariate[j][i]);
  //   }
  //   if (s.size() == 1) {
  //     logger->error(
  //         "Covariate [ %s ] equals [ %g ] for all samples, cannot fit "
  //         "model...\n",
  //         covariate.GetColumnLabel(i), *s.begin());
  //     exit(1);
  //   }
  // }

  g_SummaryHeader = new SummaryHeader;
  g_SummaryHeader->recordCovariate(dataLoader.getCovariate());

  // record raw phenotype
  g_SummaryHeader->recordPhenotype("Trait",
                                   dataLoader.getPhenotype().extractCol(0));

  // adjust phenotype
  // bool binaryPhenotype;
  if (FLAG_qtl) {
    // binaryPhenotype = false;
    dataLoader.setTraitType(DataLoader::PHENOTYPE_QTL);
    logger->info("-- Force quantitative trait mode -- ");
  } else {
    if (dataLoader.detectPhenotypeType() == DataLoader::PHENOTYPE_BINARY) {
      logger->warn("-- Enabling binary phenotype mode -- ");
      dataLoader.setTraitType(DataLoader::PHENOTYPE_BINARY);

    } else {
      dataLoader.setTraitType(DataLoader::PHENOTYPE_QTL);
    }
    // binaryPhenotype = isBinaryPhenotype(phenotypeInOrder);
    // if (binaryPhenotype) {
    //   logger->warn("-- Enabling binary phenotype mode -- ");
    //   convertBinaryPhenotype(&phenotypeInOrder);
    // }
  }

  if (FLAG_useResidualAsPhenotype) {
    dataLoader.useResidualAsPhenotype();
    g_SummaryHeader->recordEstimation(dataLoader.getEstimation());
  }
  // // use residual as phenotype
  // if (FLAG_useResidualAsPhenotype) {
  //   if (binaryPhenotype) {
  //     logger->warn(
  //         "WARNING: Skip transforming binary phenotype, although you want to
  //         "
  //         "use residual as phenotype!");
  //   } else {
  //     if (covariate.cols > 0) {
  //       LinearRegression lr;
  //       Vector pheno;
  //       Matrix covAndInt;
  //       copy(phenotypeInOrder, &pheno);
  //       copyCovariateAndIntercept(covariate.rows, covariate, &covAndInt);
  //       if (!lr.FitLinearModel(covAndInt, pheno)) {
  //         logger->error(
  //             "Cannot fit model: [ phenotype ~ 1 + covariates ], now use the
  //             "
  //             "original phenotype");
  //       } else {
  //         const int n = lr.GetResiduals().Length();
  //         for (int i = 0; i < n; ++i) {
  //           phenotypeInOrder[i] = lr.GetResiduals()[i];
  //         }
  //         covariate.Dimension(0, 0);
  //         logger->info(
  //             "DONE: Fit model [ phenotype ~ 1 + covariates ] and model "
  //             "residuals will be used as responses.");
  //       }
  //     } else {  // no covaraites
  //       centerVector(&phenotypeInOrder);
  //       logger->info("DONE: Use residual as phenotype by centerng it");
  //     }
  //   }
  // }

  if (FLAG_inverseNormal) {
    dataLoader.inverseNormalizePhenotype();
    g_SummaryHeader->setInverseNormalize(FLAG_inverseNormal);
  }
  // // phenotype transformation
  // if (FLAG_inverseNormal) {
  //   if (binaryPhenotype) {
  //     logger->warn(
  //         "WARNING: Skip transforming binary phenotype, although you required
  //         "
  //         "inverse normalization!");
  //   } else {
  //     logger->info("Now applying inverse normalize transformation.");
  //     inverseNormalizeLikeMerlin(&phenotypeInOrder);
  //     g_SummaryHeader->setInverseNormalize(FLAG_inverseNormal);
  //     logger->info("DONE: inverse normal transformation finished.");
  //   }
  // }

  g_SummaryHeader->recordPhenotype("AnalyzedTrait",
                                   dataLoader.getPhenotype().extractCol(0));

  if (dataLoader.getPhenotype().nrow() == 0) {
    logger->fatal("There are 0 samples with valid phenotypes, quitting...");
    exit(1);
  }
  // if (phenotypeInOrder.empty()) {
  //   logger->fatal("There are 0 samples with valid phenotypes, quitting...");
  //   exit(1);
  // }

  logger->info("Analysis begins with [ %d ] samples...",
               dataLoader.getPhenotype().nrow());
  //////////////////////////////////////////////////////////////////////////////
  // prepare each model
  bool singleVariantMode = FLAG_modelSingle.size() || FLAG_modelMeta.size();
  bool groupVariantMode = (FLAG_modelBurden.size() || FLAG_modelVT.size() ||
                           FLAG_modelKernel.size());
  if (singleVariantMode && groupVariantMode) {
    logger->error("Cannot support both single variant and region based tests");
    exit(1);
  }

  ModelManager modelManager(FLAG_outPrefix);
  // set up models in qtl/binary modes
  if (dataLoader.isBinaryPhenotype()) {
    modelManager.setBinaryOutcome();
    matchPhenotypeAndVCF("missing phenotype (not case/control)", &dataLoader,
                         &ge);
  } else {
    modelManager.setQuantitativeOutcome();
  }
  // create models
  modelManager.create("single", FLAG_modelSingle);
  modelManager.create("burden", FLAG_modelBurden);
  modelManager.create("vt", FLAG_modelVT);
  modelManager.create("kernel", FLAG_modelKernel);
  modelManager.create("meta", FLAG_modelMeta);
  if (FLAG_outputRaw) {
    modelManager.create("outputRaw", "dump");
  }

  const std::vector<ModelFitter*>& model = modelManager.getModel();
  const std::vector<FileWriter*>& fOuts = modelManager.getResultFile();
  const size_t numModel = model.size();

  // TODO: optimize this by avoiding data copying
  Matrix phenotypeMatrix;
  Matrix covariate;
  toMatrix(dataLoader.getPhenotype(), &phenotypeMatrix);
  toMatrix(dataLoader.getCovariate(), &covariate);

  // determine VCF file reading pattern
  // current support:
  // * line by line ( including range selection)
  // * gene by gene
  // * range by range
  std::string rangeMode = "Single";
  if (FLAG_geneFile.size() && (FLAG_setFile.size() || FLAG_setList.size())) {
    logger->error("Cannot specify both gene file and set file.");
    exit(1);
  }

  if (!FLAG_gene.empty() && FLAG_geneFile.empty()) {
    logger->error("Please provide gene file for gene bases analysis.");
    exit(1);
  }
  OrderedMap<std::string, RangeList> geneRange;
  if (FLAG_geneFile.size()) {
    rangeMode = "Gene";
    int ret =
        loadGeneFile(FLAG_geneFile.c_str(), FLAG_gene.c_str(), &geneRange);
    if (ret < 0 || geneRange.size() == 0) {
      logger->error("Error loading gene file or gene list is empty!");
      return -1;
    } else {
      logger->info("Loaded [ %zu ] genes.", geneRange.size());
    }
  }

  if (!FLAG_set.empty() && FLAG_setFile.empty()) {
    logger->error("Please provide set file for set bases analysis.");
    exit(1);
  }
  if (FLAG_setFile.size()) {
    rangeMode = "Range";
    int ret = loadRangeFile(FLAG_setFile.c_str(), FLAG_set.c_str(), &geneRange);
    if (ret < 0 || geneRange.size() == 0) {
      logger->error("Error loading set file or set list is empty!");
      return -1;
    } else {
      logger->info("Loaded [ %zu ] set to tests.", geneRange.size());
    }
  }
  if (FLAG_setList.size()) {
    rangeMode = "Range";
    int ret = appendListToRange(FLAG_setList, &geneRange);
    if (ret < 0) {
      logger->error("Error loading set list or set list is empty!");
      return -1;
    }
  }

  DataConsolidator dc;
  dc.setSex(&dataLoader.getSex());
  dc.setFormula(&dataLoader.getFormula());
  dc.setGenotypeCounter(ge.getGenotypeCounter());

  // load kinship if needed by family models
  if (modelManager.hasFamilyModel() ||
      (!FLAG_modelMeta.empty() && !FLAG_kinship.empty())) {
    logger->info("Family-based model specified. Loading kinship file...");

    // process auto kinship
    if (dc.setKinshipSample(dataLoader.getPhenotype().getRowName()) ||
        dc.setKinshipFile(DataConsolidator::KINSHIP_AUTO, FLAG_kinship) ||
        dc.setKinshipEigenFile(DataConsolidator::KINSHIP_AUTO,
                               FLAG_kinshipEigen) ||
        dc.loadKinship(DataConsolidator::KINSHIP_AUTO)) {
      logger->error(
          "Failed to load autosomal kinship (you may use vcf2kinship to "
          "generate one).");
      exit(1);
    }

    if (dc.setKinshipFile(DataConsolidator::KINSHIP_X, FLAG_xHemiKinship) ||
        dc.setKinshipEigenFile(DataConsolidator::KINSHIP_X,
                               FLAG_xHemiKinshipEigen) ||
        dc.loadKinship(DataConsolidator::KINSHIP_X)) {
      logger->warn(
          "Autosomal kinship loaded, but no hemizygote region kinship "
          "provided, some sex chromosome tests will be skipped.");
      // keep the program going
    }
  } else if (!FLAG_kinship.empty() && FLAG_modelMeta.empty()) {
    logger->info(
        "Family-based model not specified. Options related to kinship will be "
        "ignored here.");
  }

  // set imputation method
  if (FLAG_impute.empty()) {
    logger->info("Impute missing genotype to mean (by default)");
    dc.setStrategy(DataConsolidator::IMPUTE_MEAN);
  } else if (FLAG_impute == "mean") {
    logger->info("Impute missing genotype to mean");
    dc.setStrategy(DataConsolidator::IMPUTE_MEAN);
  } else if (FLAG_impute == "hwe") {
    logger->info("Impute missing genotype by HWE");
    dc.setStrategy(DataConsolidator::IMPUTE_HWE);
  } else if (FLAG_impute == "drop") {
    logger->info("Drop missing genotypes");
    dc.setStrategy(DataConsolidator::DROP);
  }
  dc.setPhenotypeName(dataLoader.getPhenotype().getRowName());

  // set up par region
  ParRegion parRegion(FLAG_xLabel, FLAG_xParRegion);
  dc.setParRegion(&parRegion);

  // genotype will be extracted and stored
  Matrix& genotype = dc.getOriginalGenotype();
  if (FLAG_freqUpper > 0) {
    ge.setSiteFreqMax(FLAG_freqUpper);
    logger->info("Set upper minor allele frequency limit to %g",
                 FLAG_freqUpper);
  }
  if (FLAG_freqLower > 0) {
    ge.setSiteFreqMin(FLAG_freqLower);
    logger->info("Set lower minor allele frequency limit to %g",
                 FLAG_freqLower);
  }

  // handle sex chromosome
  ge.setParRegion(&parRegion);
  ge.setSex(&dataLoader.getSex());

  // use dosage instead of GT
  if (!FLAG_dosageTag.empty()) {
    ge.setDosageTag(FLAG_dosageTag);
    logger->info("Use dosage genotype from VCF flag %s.",
                 FLAG_dosageTag.c_str());
  }

  // genotype QC options
  if (FLAG_indvDepthMin > 0) {
    ge.setGDmin(FLAG_indvDepthMin);
    logger->info("Minimum GD set to %d (or marked as missing genotype).",
                 FLAG_indvDepthMin);
  }
  if (FLAG_indvDepthMax > 0) {
    ge.setGDmax(FLAG_indvDepthMax);
    logger->info("Maximum GD set to %d (or marked as missing genotype).",
                 FLAG_indvDepthMax);
  }
  if (FLAG_indvQualMin > 0) {
    ge.setGQmin(FLAG_indvQualMin);
    logger->info("Minimum GQ set to %d (or marked as missing genotype).",
                 FLAG_indvQualMin);
  }

  dc.preRegressionCheck(phenotypeMatrix, covariate);

  logger->info("Analysis started");
  Result& buf = dc.getResult();

  // we have three modes:
  // * single variant reading, single variant test
  // * range variant reading, single variant test
  // * range variant reading, group variant test
  if (rangeMode == "Single" && singleVariantMode) {  // use line by line mode
    buf.addHeader("CHROM");
    buf.addHeader("POS");
    buf.addHeader("REF");
    buf.addHeader("ALT");
    buf.addHeader("N_INFORMATIVE");

    // output headers
    for (size_t m = 0; m < model.size(); m++) {
      model[m]->writeHeader(fOuts[m], buf);
    }

    int variantProcessed = 0;
    while (true) {
      buf.clearValue();
      int ret = ge.extractSingleGenotype(&genotype, &buf);

      if (ret == GenotypeExtractor::FILE_END) {  // reach file end
        break;
      }
      if (ret == GenotypeExtractor::FAIL_FILTER) {
        continue;
      }
      if (ret != GenotypeExtractor::SUCCEED) {
        logger->error("Extract genotype failed at site: %s:%s!",
                      buf["CHROM"].c_str(), buf["POS"].c_str());
        continue;
      }
      if (genotype.cols == 0) {
        logger->warn("Extract [ %s:%s ] has 0 variants, skipping",
                     buf["CHROM"].c_str(), buf["POS"].c_str());
        continue;
      }

      ++variantProcessed;
      dc.consolidate(phenotypeMatrix, covariate, genotype);

      buf.updateValue("N_INFORMATIVE", toString(genotype.rows));

      // fit each model
      for (size_t m = 0; m != numModel; m++) {
        model[m]->reset();
        model[m]->fit(&dc);
        model[m]->writeOutput(fOuts[m], buf);
      }
    }
    logger->info("Analyzed [ %d ] variants", variantProcessed);
  } else if (rangeMode != "Single" &&
             singleVariantMode) {  // read by gene/range model, single variant
    // test
    buf.addHeader(rangeMode);
    buf.addHeader("CHROM");
    buf.addHeader("POS");
    buf.addHeader("REF");
    buf.addHeader("ALT");
    buf.addHeader("N_INFORMATIVE");

    // output headers
    for (size_t m = 0; m < numModel; m++) {
      model[m]->writeHeader(fOuts[m], buf);
    }
    std::string geneName;
    RangeList rangeList;
    int variantProcessed = 0;
    for (size_t i = 0; i < geneRange.size(); ++i) {
      geneRange.at(i, &geneName, &rangeList);
      ge.setRange(rangeList);

      while (true) {
        buf.clearValue();
        int ret = ge.extractSingleGenotype(&genotype, &buf);
        if (ret == GenotypeExtractor::FILE_END) {  // reach end of this region
          break;
        }
        if (ret == GenotypeExtractor::FAIL_FILTER) {
          continue;
        }
        if (ret != GenotypeExtractor::SUCCEED) {
          logger->error("Extract genotype failed for gene %s!",
                        geneName.c_str());
          continue;
        }
        if (genotype.cols == 0) {
          logger->warn("Gene %s has 0 variants, skipping", geneName.c_str());
          continue;
        }

        ++variantProcessed;
        dc.consolidate(phenotypeMatrix, covariate, genotype);

        buf.updateValue(rangeMode, geneName);
        buf.updateValue("N_INFORMATIVE", genotype.rows);

        // #pragma omp parallel for
        for (size_t m = 0; m != numModel; m++) {
          model[m]->reset();
          model[m]->fit(&dc);
          model[m]->writeOutput(fOuts[m], buf);
        }
      }
    }
    logger->info("Analyzed [ %d ] variants from [ %d ] genes/regions",
                 variantProcessed, (int)geneRange.size());
  } else if (rangeMode != "Single" &&
             groupVariantMode) {  // read by gene/range mode, group variant
                                  // test
    buf.addHeader(rangeMode);
    buf.addHeader("RANGE");
    buf.addHeader("N_INFORMATIVE");
    buf.addHeader("NumVar");
    buf.addHeader("NumPolyVar");

    // output headers
    for (size_t m = 0; m < numModel; m++) {
      model[m]->writeHeader(fOuts[m], buf);
    }
    std::string geneName;
    RangeList rangeList;
    int variantProcessed = 0;
    ge.enableAutoMerge();
    for (size_t i = 0; i < geneRange.size(); ++i) {
      geneRange.at(i, &geneName, &rangeList);
      ge.setRange(rangeList);

      buf.clearValue();
      int ret = ge.extractMultipleGenotype(&genotype);

      if (ret != GenotypeExtractor::SUCCEED) {
        logger->error("Extract genotype failed for gene %s!", geneName.c_str());
        continue;
      }
      if (genotype.cols == 0) {
        logger->info("Gene %s has 0 variants, skipping", geneName.c_str());
        continue;
      }

      variantProcessed += genotype.cols;  // genotype is people by marker
      dc.consolidate(phenotypeMatrix, covariate, genotype);

      buf.updateValue(rangeMode, geneName);
      buf.updateValue("RANGE", rangeList.toString());
      buf.updateValue("N_INFORMATIVE", genotype.rows);
      buf.updateValue("NumVar", genotype.cols);
      buf.updateValue("NumPolyVar",
                      dc.getFlippedToMinorPolymorphicGenotype().cols);

      // #ifdef _OPENMP
      // #pragma omp parallel for
      // #endif
      for (size_t m = 0; m != numModel; m++) {
        model[m]->reset();
        model[m]->fit(&dc);
        model[m]->writeOutput(fOuts[m], buf);
      }
    }
    logger->info("Analyzed [ %d ] variants from [ %d ] genes/regions",
                 variantProcessed, (int)geneRange.size());
  } else {
    logger->error(
        "Unsupported reading mode and test modes! (need more parameters?)");
    exit(1);
  }

  // Resource cleaning up
  modelManager.close();
  delete g_SummaryHeader;

  time_t endTime = time(0);
  logger->info("Analysis ends at: %s", currentTime().c_str());
  int elapsedSecond = (int)(endTime - startTime);
  logger->info("Analysis took %d seconds", elapsedSecond);

  return 0;
}