Пример #1
0
// Demo: reads "noindex.vcf.gz" restricted to range "1:0" and prints, for each
// record, "chrom:pos" followed by every selected person's GT field as text.
// A record without a GT format field is reported once on stderr and printed
// with an empty genotype list.
int main() {
    VCFInputFile vin("noindex.vcf.gz");
    vin.setRangeList("1:0");
    int lineNo = 0;
    while (vin.readRecord()) {
        lineNo++;
        VCFRecord& r = vin.getVCFRecord();
        VCFPeople& people = r.getPeople();

        printf("%s:%d\t", r.getChrom(), r.getPos());

        // e.g.: get TAG from INFO field
        // fprintf(stderr, "%s\n", r.getInfoTag("ANNO"));

        // The FORMAT layout is a property of the record, so the GT index is
        // looked up once here rather than once per person.
        const int GTidx = r.getFormatIndex("GT");
        if (GTidx < 0) {
            fprintf(stderr, "Cannot find GT field!\n");
        } else {
            // Loop over each (selected) person in the same order as in the VCF
            // and read the field at the GT index instead of assuming field 0.
            for (size_t i = 0; i < people.size(); i++) {
                printf("%s ", people[i]->justGet(GTidx).toStr());
            }
        }
        printf("\n");
    }
    fprintf(stdout, "Total %d VCF records have converted successfully\n", lineNo);

    return 0;
}
Пример #2
0
int loadMarkerFromVCF(const std::string& fileName, const std::string& marker,
                      std::vector<std::string>* rowLabel, Matrix* genotype) {
  if (!rowLabel || !genotype) {
    // invalid parameter
    return -1;
  }
  Matrix& m = *genotype;
  int col = 0;

  VCFInputFile vin(fileName);
  vin.setRangeList(marker);

  while (vin.readRecord()) {
    VCFRecord& r = vin.getVCFRecord();
    VCFPeople& people = r.getPeople();
    VCFIndividual* indv;

    m.Dimension(people.size(), col + 1);

    int GTidx = r.getFormatIndex("GT");
    for (int i = 0; i < (int)people.size(); i++) {
      indv = people[i];
      // get GT index. if you are sure the index will not change,
      // call this function only once!
      if (GTidx >= 0) {
        // printf("%s ", indv->justGet(0).toStr());  // [0] meaning the first
        // field of each individual
        m[i][col] = indv->justGet(GTidx).getGenotype();
      } else {
        logger->error("Cannot find GT field!");
        return -1;
      }
    }
    if (col == 0) {
      // set-up names
      rowLabel->resize(people.size());
      for (size_t i = 0; i < people.size(); ++i) {
        (*rowLabel)[i] = people[i]->getName();
      }
    }
    std::string colLabel = r.getChrom();
    colLabel += ":";
    colLabel += r.getPosStr();
    m.SetColumnLabel(col, colLabel.c_str());
    ++col;
  }

  return 0;
}
Пример #3
0
// Reads every remaining record from `vin` and stores each person's genotype
// call in (*input)[person name]["chrom:pos"][idx].
//
// A call is stored as Value::MISSING when either allele equals
// MISSING_GENOTYPE, otherwise as HOMREF / HET / HOMALT when both alleles are
// 0 or 1. Any other allele combination leaves the container untouched.
// Records without a GT format field are counted but contribute no calls.
//
// Returns the number of records read; the same number is reported on stderr.
int loadGenotype(VCFInputFile& vin, AllConcordanceType* input, int idx) {
  AllConcordanceType& calls = *input;
  std::string site;
  int recordCount = 0;
  while (vin.readRecord()) {
    ++recordCount;
    VCFRecord& record = vin.getVCFRecord();
    site = record.getChrom();
    site += ":";
    site += record.getPosStr();

    VCFPeople& samples = record.getPeople();
    const int gtIndex = record.getFormatIndex("GT");
    if (gtIndex < 0) continue;

    for (size_t s = 0; s < samples.size(); ++s) {
      VCFIndividual* person = samples[s];
      const VCFValue& call = person->justGet(gtIndex);
      const int first = call.getAllele1();
      const int second = call.getAllele2();

      if (first == MISSING_GENOTYPE || second == MISSING_GENOTYPE) {
        calls[person->getName()][site][idx] = Value::MISSING;
        continue;
      }

      const bool firstIsRefOrAlt = (first == 0 || first == 1);
      const bool secondIsRefOrAlt = (second == 0 || second == 1);
      if (!(firstIsRefOrAlt && secondIsRefOrAlt)) {
        continue;  // other allele codes are not recorded
      }

      // Both alleles are 0 or 1 here, so their sum identifies the call.
      switch (first + second) {
        case 0:
          calls[person->getName()][site][idx] = Value::HOMREF;
          break;
        case 1:
          calls[person->getName()][site][idx] = Value::HET;
          break;
        default:
          calls[person->getName()][site][idx] = Value::HOMALT;
          break;
      }
    }
  }
  fprintf(stderr, "Total %d VCF records have read successfully\n", recordCount);
  return recordCount;
}
Пример #4
0
// Writes one VCF record to the PLINK output: one entry through writeBIM() and
// one row of genotypes to the BED stream, accumulated four genotypes per byte
// through setGenotype().
//
// r: record to convert; it is dereferenced unconditionally.
//
// Returns 0 once the record has been written. Returns -1 without writing
// anything when isMultiAllelic() flags REF or ALT, or when the record has no
// GT format field.
int PlinkOutputFile::writeRecord(VCFRecord* r) {
  if (isMultiAllelic(r->getRef()) || isMultiAllelic(r->getAlt())) {
    fprintf(stdout, "%s:%d Skip with ref = [ %s ] and alt= [ %s ]\n", __FILE__,
            __LINE__, r->getRef(), r->getAlt());
    return -1;
  }

  // Resolve GT before anything is emitted: a record without GT has no
  // genotypes to encode, and bailing out here keeps the BIM and BED outputs
  // at the same number of records.
  const int GTidx = r->getFormatIndex("GT");
  if (GTidx < 0) {
    fprintf(stdout, "%s:%d Skip %s:%d without GT field\n", __FILE__, __LINE__,
            r->getChrom(), r->getPos());
    return -1;
  }

  // write BIM
  this->writeBIM(r->getChrom(), r->getID(), 0, r->getPos(), r->getRef(),
                 r->getAlt());

  // write BED
  VCFPeople& people = r->getPeople();
  unsigned char c = 0;  // byte under construction, restarted at 0 after a flush
  VCFIndividual* indv;
  int offset;
  for (unsigned int i = 0; i < people.size(); i++) {
    indv = people[i];
    offset = i & (4 - 1);  // slot of this genotype inside the current byte
    if (indv->justGet(GTidx).isHaploid()) {
      int a1 = indv->justGet(GTidx).getAllele1();
      if (a1 == 0)
        setGenotype(&c, offset, HOM_REF);
      else if (a1 == 1)
        setGenotype(&c, offset, HET);
      else
        setGenotype(&c, offset, MISSING);
    } else {
      int a1 = indv->justGet(GTidx).getAllele1();
      int a2 = indv->justGet(GTidx).getAllele2();
      if (a1 == 0) {
        if (a2 == 0) {
          // hom ref: the slot is left at the zero it was initialised with
        } else if (a2 == 1) {
          setGenotype(&c, offset, HET);  // het
        } else {
          setGenotype(&c, offset, MISSING);  // missing
        }
      } else if (a1 == 1) {
        if (a2 == 0) {
          setGenotype(&c, offset, HET);  // het
        } else if (a2 == 1) {
          setGenotype(&c, offset, HOM_ALT);  // hom alt
        } else {
          setGenotype(&c, offset, MISSING);  // missing
        }
      } else {
        // NOTE: Plink does not support tri-allelic
        // so have to set genotype as missing.
        setGenotype(&c, offset, MISSING);  // missing
      }
    }
    if (offset == 3) {  // 3: 4 - 1, so every 4 genotype we will flush
      fwrite(&c, sizeof(char), 1, this->fpBed);
      c = 0;
    }
  }
  if (people.size() % 4 != 0) {  // a partially filled byte is still pending
    fwrite(&c, sizeof(char), 1, this->fpBed);
  }
  return 0;
}
Пример #5
0
// Reads every remaining record from this->vin and returns the genotypes (or
// dosages, when dosageTag is set) as a people-by-marker matrix in *g, with
// columns labelled "chrom:pos".
//
// Per record, the value of each person comes from the dosage tag or from GT.
// In regions flagged by parRegion->isHemiRegion() the value depends on
// (*sex)[i]; values failing checkGD()/checkGQ() become MISSING_GENOTYPE.
// Records whose MAF falls outside [freqMin, freqMax] (each bound active only
// when > 0) are dropped. this->hemiRegion and this->counter receive one entry
// per kept record.
//
// Returns SUCCEED, or -1 when a record with at least one person lacks the
// requested format field.
int GenotypeExtractor::extractMultipleGenotype(Matrix* g) {
  static Matrix m;  // make it static to reduce memory allocation
  int row = 0;
  std::vector<std::string> colNames;
  std::string name;
  // Both per-record vectors are rebuilt from scratch so that they stay
  // index-aligned with the rows kept by this call.
  this->hemiRegion.clear();
  this->counter.clear();
  GenotypeCounter genoCounter;

  const bool useDosage = (!this->dosageTag.empty());
  const char* genoTag = useDosage ? this->dosageTag.c_str() : "GT";

  while (this->vin->readRecord()) {
    VCFRecord& r = this->vin->getVCFRecord();
    VCFPeople& people = r.getPeople();
    VCFIndividual* indv;

    // `row` is only advanced for records that pass the filters, so a rejected
    // record's row is reused by the next one.
    m.Dimension(row + 1, people.size());
    genoCounter.reset();

    const int genoIdx = r.getFormatIndex(genoTag);
    const int GDidx = r.getFormatIndex("GD");
    const int GQidx = r.getFormatIndex("GQ");
    assert(this->parRegion);
    const bool isHemi = this->parRegion->isHemiRegion(r.getChrom(), r.getPos());

    // Loop over each (selected) person in the same order as in the VCF
    const int numPeople = (int)people.size();
    for (int i = 0; i < numPeople; i++) {
      if (genoIdx < 0) {
        logger->error("Cannot find %s field!", genoTag);
        return -1;
      }
      indv = people[i];

      if (useDosage) {
        if (isHemi && (*sex)[i] == PLINK_MALE) {
          // for male hemi region, imputed dosage is usually between 0 and 1
          // need to multiply by 2.0
          m[row][i] = indv->justGet(genoIdx).toDouble() * 2.0;
        } else {
          // Everyone else keeps the dosage as is. Non-male samples in a hemi
          // region must be assigned too, otherwise the cell keeps whatever an
          // earlier (possibly filtered-out) record left there.
          m[row][i] = indv->justGet(genoIdx).toDouble();
        }
      } else if (!isHemi) {
        m[row][i] = indv->justGet(genoIdx).getGenotype();
      } else if ((*sex)[i] == PLINK_MALE) {
        m[row][i] = indv->justGet(genoIdx).getMaleNonParGenotype02();
      } else if ((*sex)[i] == PLINK_FEMALE) {
        m[row][i] = indv->justGet(genoIdx).getGenotype();
      } else {
        m[row][i] = MISSING_GENOTYPE;
      }

      if (!checkGD(indv, GDidx) || !checkGQ(indv, GQidx)) {
        m[row][i] = MISSING_GENOTYPE;
      }
      genoCounter.add(m[row][i]);
    }

    // check frequency cutoffs (a bound of 0 or less disables that side)
    const double maf = genoCounter.getMAF();
    if (this->freqMin > 0. && this->freqMin > maf) continue;
    if (this->freqMax > 0. && this->freqMax < maf) continue;

    // store genotype results
    name = r.getChrom();
    name += ":";
    name += r.getPosStr();
    colNames.push_back(name);
    ++row;

    this->hemiRegion.push_back(isHemi);
    this->counter.push_back(genoCounter);
  }  // end while (this->vin->readRecord())

  // delete rows (ugly code here, as we may allocate extra row in previous
  // loop)
  m.Dimension(row, m.cols);

  // now transpose (marker by people -> people by marker)
  g->Transpose(m);
  for (int i = 0; i < row; ++i) {
    g->SetColumnLabel(i, colNames[i].c_str());
  }
  return SUCCEED;
}  // end GenotypeExtractor
Пример #6
0
// Reads the next record from this->vin and returns its genotypes (or dosages,
// when dosageTag is set) as a people-by-1 matrix in *g, labelled "chrom:pos".
// CHROM, POS, REF and ALT of the record are stored in *b.
//
// In regions flagged by parRegion->isHemiRegion() hard-call genotypes depend
// on (*sex)[i]; values failing checkGD()/checkGQ() become MISSING_GENOTYPE.
// counter[0] holds the counts of this record only.
//
// Returns FILE_END when no record is left, ERROR when the requested format
// field is absent (and the record has at least one person), FAIL_FILTER when
// the MAF falls outside [freqMin, freqMax] (each bound active only when > 0),
// and SUCCEED otherwise.
int GenotypeExtractor::extractSingleGenotype(Matrix* g, Result* b) {
  Matrix& genotype = *g;
  Result& buf = *b;

  if (!this->vin->readRecord()) return FILE_END;

  VCFRecord& r = this->vin->getVCFRecord();
  VCFPeople& people = r.getPeople();
  VCFIndividual* indv;

  buf.updateValue("CHROM", r.getChrom());
  buf.updateValue("POS", r.getPosStr());
  buf.updateValue("REF", r.getRef());
  buf.updateValue("ALT", r.getAlt());

  genotype.Dimension(people.size(), 1);
  // resize(1) keeps an existing first element, so the counts of the previous
  // record must be dropped explicitly before this record is tallied.
  counter.resize(1);
  counter[0].reset();

  const bool useDosage = (!this->dosageTag.empty());
  const char* genoTag = useDosage ? this->dosageTag.c_str() : "GT";
  const int genoIdx = r.getFormatIndex(genoTag);
  const int GDidx = r.getFormatIndex("GD");
  const int GQidx = r.getFormatIndex("GQ");

  // parRegion is dereferenced right below, so check it here.
  assert(this->parRegion);
  const bool isHemi = this->parRegion->isHemiRegion(r.getChrom(), r.getPos());

  // Loop over each (selected) person in the same order as in the VCF
  const int numPeople = (int)people.size();
  for (int i = 0; i < numPeople; i++) {
    indv = people[i];

    if (genoIdx < 0) {
      std::string s;
      indv->toStr(&s);
      logger->error(
          "Cannot find [ %s ] field when read individual information [ %s ]!",
          genoTag, s.c_str());
      return ERROR;
    }

    if (useDosage) {
      genotype[i][0] = indv->justGet(genoIdx).toDouble();
    } else if (!isHemi) {
      genotype[i][0] = indv->justGet(genoIdx).getGenotype();
    } else if ((*sex)[i] == PLINK_MALE) {
      genotype[i][0] = indv->justGet(genoIdx).getMaleNonParGenotype02();
    } else if ((*sex)[i] == PLINK_FEMALE) {
      genotype[i][0] = indv->justGet(genoIdx).getGenotype();
    } else {
      genotype[i][0] = MISSING_GENOTYPE;
    }

    if (!checkGD(indv, GDidx) || !checkGQ(indv, GQidx)) {
      genotype[i][0] = MISSING_GENOTYPE;
    }
    counter[0].add(genotype[i][0]);
  }

  // check frequency cutoffs (a bound of 0 or less disables that side)
  const double maf = counter[0].getMAF();
  if (this->freqMin > 0. && this->freqMin > maf) return FAIL_FILTER;
  if (this->freqMax > 0. && this->freqMax < maf) return FAIL_FILTER;

  std::string label = r.getChrom();
  label += ':';
  label += r.getPosStr();
  genotype.SetColumnLabel(0, label.c_str());

  this->hemiRegion.resize(1);
  this->hemiRegion[0] = isHemi;
  return SUCCEED;
}  // end extractSingleGenotype()
Пример #7
0
// Command-line tool: prints a tab-separated genotype table (CHROM, POS, then
// one column per person) for the records of --inVcf that survive the range,
// people, gene, variant-only and annotation filters given on the command line.
// Progress and diagnostics go to stderr; records without a GT format field
// are skipped with a warning.
int main(int argc, char** argv) {
  time_t currentTime = time(0);
  fprintf(stderr, "Analysis started at: %s", ctime(&currentTime));

  PARSE_PARAMETER(argc, argv);
  PARAMETER_STATUS();

  if (FLAG_REMAIN_ARG.size() > 0) {
    fprintf(stderr, "Unparsed arguments: ");
    for (unsigned int i = 0; i < FLAG_REMAIN_ARG.size(); i++) {
      fprintf(stderr, " %s", FLAG_REMAIN_ARG[i].c_str());
    }
    fprintf(stderr, "\n");
    abort();
  }

  REQUIRE_STRING_PARAMETER(FLAG_inVcf,
                           "Please provide input file using: --inVcf");

  const char* fn = FLAG_inVcf.c_str();
  VCFInputFile vin(fn);

  // set range filters here
  // e.g.
  // vin.setRangeList("1:69500-69600");
  vin.setRangeList(FLAG_rangeList.c_str());
  vin.setRangeFile(FLAG_rangeFile.c_str());

  // set people filters here
  if (FLAG_peopleIncludeID.size() || FLAG_peopleIncludeFile.size()) {
    vin.excludeAllPeople();
    vin.includePeople(FLAG_peopleIncludeID.c_str());
    vin.includePeopleFromFile(FLAG_peopleIncludeFile.c_str());
  }
  vin.excludePeople(FLAG_peopleExcludeID.c_str());
  vin.excludePeopleFromFile(FLAG_peopleExcludeFile.c_str());

  // let's write it out.
  if (FLAG_updateId != "") {
    int ret = vin.updateId(FLAG_updateId.c_str());
    fprintf(stdout, "%d samples have updated id.\n", ret);
  }

  // load gene ranges
  std::map<std::string, std::string> geneRange;
  if (FLAG_geneName.size()) {
    if (FLAG_geneFile.size() == 0) {
      fprintf(stderr, "Have to provide --geneFile to extract by gene.\n");
      abort();
    }
    LineReader lr(FLAG_geneFile);
    std::vector<std::string> fd;
    while (lr.readLineBySep(&fd, "\t ")) {
      // Columns 0, 2, 4 and 5 are read below; skip lines that are too short
      // (blank or truncated) instead of indexing past the end of `fd`.
      if (fd.size() < 6) continue;
      if (FLAG_geneName != fd[0]) continue;
      fd[2] = chopChr(fd[2]);  // chop "chr1" to "1"
      if (geneRange.find(fd[0]) == geneRange.end()) {
        geneRange[fd[0]] = fd[2] + ":" + fd[4] + "-" + fd[5];
      } else {
        geneRange[fd[0]] += "," + fd[2] + ":" + fd[4] + "-" + fd[5];
      }
    }
  }
  std::string range;
  for (std::map<std::string, std::string>::iterator it = geneRange.begin();
       it != geneRange.end(); ++it) {
    if (range.size() > 0) {
      range += ",";
    }
    range += it->second;
  }
  fprintf(stderr, "range = %s\n", range.c_str());
  vin.setRangeList(range.c_str());

  Regex regex;
  if (FLAG_annoType.size()) {
    regex.readPattern(FLAG_annoType);
  }

  // print header
  std::vector<std::string> names;
  vin.getVCFHeader()->getPeopleName(&names);
  printf("CHROM\tPOS");
  for (unsigned int i = 0; i < names.size(); i++) {
    printf("\t%s", names[i].c_str());
  }
  printf("\n");

  // real working part
  int nonVariantSite = 0;
  while (vin.readRecord()) {
    VCFRecord& r = vin.getVCFRecord();
    VCFPeople& people = r.getPeople();

    // Both the variant-only scan and the output below read the GT field, so
    // resolve its index once and skip records that do not carry it.
    const int GTidx = r.getFormatIndex("GT");
    if (GTidx < 0) {
      fprintf(stderr, "Cannot find GT field at %s:%s, record skipped\n",
              r.getChrom(), r.getPosStr());
      continue;
    }

    if (FLAG_variantOnly) {
      bool hasVariant = false;
      for (size_t i = 0; i < people.size(); i++) {
        const int geno = people[i]->justGet(GTidx).getGenotype();
        if (geno != 0 && geno != MISSING_GENOTYPE) {
          hasVariant = true;
          break;  // one non-reference call is enough
        }
      }
      if (!hasVariant) {
        nonVariantSite++;
        continue;
      }
    }

    if (FLAG_annoType.size()) {
      bool isMissing = false;
      const char* tag = r.getInfoTag("ANNO", &isMissing).toStr();
      if (isMissing) continue;
      if (!regex.match(tag)) {
        continue;
      }
    }

    fprintf(stdout, "%s\t%s", r.getChrom(), r.getPosStr());

    for (size_t i = 0; i < people.size(); i++) {
      // read the field at the GT index rather than assuming it is field 0
      fprintf(stdout, "\t%d", people[i]->justGet(GTidx).getGenotype());
    }
    fprintf(stdout, "\n");
  }

  currentTime = time(0);
  fprintf(stderr, "Analysis ends at: %s", ctime(&currentTime));

  return 0;
}