bool VcfRecord::read(IFILE filePtr, bool siteOnly, VcfRecordDiscardRules& discardRules, VcfSubsetSamples* sampleSubset) { // Clear out any previously set values. reset(); if(filePtr == NULL) { myStatus.setStatus(StatGenStatus::FAIL_ORDER, "Error reading VCF record before opening the file."); return(false); } if(ifeof(filePtr)) { // End of file, just return false. return(false); } // Read the chromosome. if(!readTilTab(filePtr, myChrom)) { if(myChrom.empty()) { // EOF. return(false); } // Not an empty line. myStatus.setStatus(StatGenStatus::FAIL_PARSE, "Error reading VCF Record CHROM."); return(false); } // Read the 1-based Position std::string strPos; if(!readTilTab(filePtr, strPos)) { myStatus.setStatus(StatGenStatus::FAIL_PARSE, "Error reading VCF Record POS."); return(false); } else { // Read the position, so convert to an integer. my1BasedPosNum = atoi(strPos.c_str()); } // Read the ID. if(!readTilTab(filePtr, myID)) { myStatus.setStatus(StatGenStatus::FAIL_PARSE, "Error reading VCF Record ID."); return(false); } if(discardRules.discardForID(myID)) { // Do not keep this id, so consume the rest of the record and // return the next record. filePtr->discardLine(); return(read(filePtr, siteOnly, discardRules, sampleSubset)); } // Read the Ref. if(!readTilTab(filePtr, myRef)) { myStatus.setStatus(StatGenStatus::FAIL_PARSE, "Error reading VCF Record REF."); return(false); } // Read the Alt. myAltArray.clear(); if(!readTilTab(filePtr, myAlt)) { myStatus.setStatus(StatGenStatus::FAIL_PARSE, "Error reading VCF Record ALT."); return(false); } // Read the Qual. if(!readTilTab(filePtr, myQual)) { myStatus.setStatus(StatGenStatus::FAIL_PARSE, "Error reading VCF Record QUAL."); return(false); } else { if(myQual != ".") { // Read the quality, so convert to an integer. myQualNum = atof(myQual.c_str()); } else { myQualNum = -1; } } // Read the Filter. if(!myFilter.read(filePtr)) { myStatus.setStatus(StatGenStatus::FAIL_PARSE, "Error reading VCF Record FILTER."); return(false); } // Read the Info (could be the last word in the line or file). if(!myInfo.read(filePtr)) { // Found the end of the line after the info field, so return true, // successfully read the record. return(true); } if(siteOnly) { // Do not store genotypes, so just consume the rest of the line. filePtr->readTilChar("\n"); } else { // Not yet at the end of the line, so read the genotype fields // (format & samples) try { myGenotype.read(filePtr, sampleSubset); } catch(std::exception& e) { myDummyString = "Failed parsing the Genotype Fields of " + myChrom + ":" + std::to_string((long long int)my1BasedPosNum) + " (chr:pos) - " + e.what(); myStatus.setStatus(StatGenStatus::FAIL_PARSE, myDummyString.c_str()); return(false); } } // Found the end of the line, return true since all required fields // were read. return(true); }
bool VcfGenotypeSample::read(IFILE filePtr, VcfGenotypeFormat& format) { static const char* GT_DELIM = "\n\t:|/."; static const int END_GT = 2; // Ends at index 2 or less static const int PHASED_CHAR_POS = 3; static const int UNPHASED_CHAR_POS = 4; static const int MISSING_GT_POS = 5; // Clear out any previously set values. reset(); myFormatPtr = &format; int gtIndex = format.getGTIndex(); // Read the subfields. SUBFIELD_READ_STATUS readStatus = MORE_SUBFIELDS; std::string* nextType = NULL; int subFieldIndex = 0; while(readStatus == MORE_SUBFIELDS) { // Get the field to write into. if(format.storeIndex(subFieldIndex)) { nextType = &(myGenotypeSubFields.getNextEmpty()); // Check if this is the GT field. if(subFieldIndex == gtIndex) { // There is a GT field, so set that all GT fields are there. // if any are missing it will be turned back to false. myHasAllGenotypeAlleles = true; // This is the GT field, so parse manually looking to see if it // is phased and store the genotypes. int stopChar = END_GT + 1; // Read until a new subfield is found. while(stopChar > END_GT) { // TODO have an option to autoparse the genotypes? // todo - store the previous nextType len in order to // do string conversion to ints... stopChar = filePtr->readTilChar(GT_DELIM, *nextType); if(stopChar == PHASED_CHAR_POS) { nextType->push_back('|'); myPhased = true; } else if(stopChar == UNPHASED_CHAR_POS) { nextType->push_back('/'); myUnphased = true; } else if(stopChar == MISSING_GT_POS) { nextType->push_back('.'); myHasAllGenotypeAlleles = false; } } // Check if this is the END_GT signal. readStatus = getReadStatus(stopChar); } else { // more subfields to read. readStatus = readGenotypeSubField(filePtr, nextType); } } else { readStatus = readGenotypeSubField(filePtr, NULL); } ++subFieldIndex; } // subFieldIndex contains the number of fields in this sample. if(subFieldIndex > format.getOrigNumFields()) { throw(std::runtime_error("VCF Number of Fields in a Sample does not match the Format.")); } else if(subFieldIndex < format.getOrigNumFields()) { // If there are no fields for this sample, enter the missing value. if(myGenotypeSubFields.size() == 0) { myGenotypeSubFields.getNextEmpty() = MISSING_FIELD; } } // Return true if there is a tab - it is just END_OF_FIELD. return(readStatus == END_OF_FIELD); }