bool glfHandler::NextSection() { if (isStub) { endOfSection = true; data.recordType = 0; maxPosition = 1999999999; position = maxPosition + 1; return true; } while (!endOfSection && !ifeof(handle)) NextEntry(); endOfSection = false; int labelLength = 0; currentSection++; position = 0; if (ifread(handle, &labelLength, sizeof(int)) == sizeof(int)) { ifread(handle, label.LockBuffer(labelLength+1), labelLength * sizeof(char)); label.UnlockBuffer(); maxPosition = 0; ifread(handle, &maxPosition, sizeof(int)); return ((maxPosition > 0) && !ifeof(handle)); } return false; }
// Read the refSection from the specified file. Assumes the file is in // the correct position for reading the refSection. bool GlfRefSection::read(IFILE filePtr) { // Read the reference sequence name length int numRead = 0; int32_t refNameLen = 0; int byteLen = sizeof(int32_t); numRead = ifread(filePtr, &refNameLen, byteLen); if(numRead != byteLen) { // If no bytes were read and it is the end of the file, then return // false, but do not throw an exception. This is not an error, just // the end of the file. if((numRead == 0) && ifeof(filePtr)) { return(false); } String errorMsg = "Failed to read the length of the reference sequence name ("; errorMsg += byteLen; errorMsg += " bytes). Only read "; errorMsg += numRead; errorMsg += " bytes."; std::string errorString = errorMsg.c_str(); throw(GlfException(GlfStatus::FAIL_IO, errorString)); return(false); } // Read the refSection from the file. numRead = myRefName.readFromFile(filePtr, refNameLen); if(numRead != refNameLen) { String errorMsg = "Failed to read the reference sequence name ("; errorMsg += refNameLen; errorMsg += " bytes). Only read "; errorMsg += numRead; errorMsg += " bytes."; std::string errorString = errorMsg.c_str(); throw(GlfException(GlfStatus::FAIL_IO, errorString)); return(false); } // Read the ref length. byteLen = sizeof(uint32_t); numRead = ifread(filePtr, &myRefLen, byteLen); if(numRead != byteLen) { String errorMsg = "Failed to read the reference sequence length ("; errorMsg += byteLen; errorMsg += " bytes). Only read "; errorMsg += numRead; errorMsg += " bytes."; std::string errorString = errorMsg.c_str(); throw(GlfException(GlfStatus::FAIL_IO, errorString)); return(false); } // Successfully read, return success. return(true); }
// Loads a column of numbers from a text file into `a`, one value per line.
//
// `a` is zeroed first and grown as needed; it is never shrunk.  Empty and
// whitespace-only lines are skipped, and values are stored densely in the
// order they appear (a skipped line does not leave a gap).  Values are
// parsed with atol(), so any fractional part is dropped.
//
// Returns 0 on success, or -1 as soon as a line with more than one column
// is found (a warning and the offending line are printed).
int loadVector(Vector& a, String& fileName)
{
    a.Zero();

    IFILE ifile(fileName.c_str(), "r");
    String line;
    StringArray array;
    int lineNo = 0;     // physical line number, used for diagnostics only
    int numValues = 0;  // number of values stored so far

    while (!ifeof(ifile))
    {
        line.ReadLine(ifile);
        lineNo++;
        if (line.Length() == 0)
            continue;

        array.Clear();
        array.AddTokens(line);

        // A line holding only separators has no value to store.
        if (array.Length() == 0)
            continue;

        if (array.Length() > 1)
        {
            fprintf(stderr, "Warning: column size at line %d!\n", lineNo);
            array.Print();
            line.Write(stdout);
            return -1;
        }

        // Index by the number of values, not the line number: indexing by
        // line number wrote past the end whenever a blank line came first.
        numValues++;
        if (a.dim < numValues)
        {
            a.GrowTo(numValues);
        }
        a[numValues - 1] = atol(array[0]);
    }

    return 0;
}
// Loads a whitespace-separated table of numbers from a text file into `a`,
// one matrix row per non-empty line.
//
// `a` is zeroed first and grown as needed.  The column count is fixed by
// `a.cols` if it is already non-zero, otherwise by the first data line.
// Empty lines are skipped and rows are stored densely in file order.
// Values are parsed with atol(), so any fractional part is dropped.
//
// Returns 0 on success, or -1 as soon as a line with a different number of
// columns is found (a message and the offending line are printed).
int loadMatrix(Matrix& a, String& fileName)
{
    a.Zero();

    IFILE ifile(fileName.c_str(), "r");
    String line;
    StringArray array;
    int lineNo = 0;   // physical line number, used for diagnostics only
    int numRows = 0;  // number of rows stored so far

    while (!ifeof(ifile))
    {
        line.ReadLine(ifile);
        lineNo++;
        if (line.Length() == 0)
            continue;

        array.Clear();
        array.AddTokens(line);

        if (a.cols != 0 && a.cols != array.Length())
        {
            fprintf(stderr, "Wrong column size at line %d!\n", lineNo);
            array.Print();
            line.Write(stdout);
            return -1;
        }
        a.GrowTo(a.rows, array.Length());

        // Index by the number of rows, not the line number: indexing by
        // line number wrote past the end whenever a blank line came first.
        numRows++;
        if (a.rows < numRows)
        {
            a.GrowTo(numRows, a.cols);
        }

        for (int i = 0; i < array.Length(); i++)
        {
            a[numRows - 1][i] = atol(array[i]);
        }
    }

    return 0;
}
// copied from Matthew Flickenger/Snyder void VcfFile::parseMeta() { do { if ( readLine() <= 0 ) break; if ( ifeof(iFile) || !isMetaLine() ) break; parseMetaLine(); } while (1); verifyMetaLines(); }
// Adds every line of `f` to the hash, trimmed, reading until ifeof(f)
// reports the end of the file.
void StringHash::ReadLinesFromFile(IFILE & f)
{
    String currentLine;

    for ( ; !ifeof(f); )
    {
        currentLine.ReadLine(f);
        Add(currentLine.Trim());
    }
}
// Returns whether or not the end of the file has been reached. // return: int - true = EOF; false = not eof. bool SamFile::IsEOF() { if (myFilePtr != NULL) { // File Pointer is set, so return if eof. return(ifeof(myFilePtr)); } // File pointer is not set, so return true, eof. return true; }
// Returns whether or not the end of the file has been reached. // return: int - true = EOF; false = not eof. bool VcfFileReader::isEOF() { if (myFilePtr != NULL) { // File Pointer is set, so return if eof. return(ifeof(myFilePtr)); } // File pointer is not set, so return true, eof. return true; }
void GCContent::LoadRegions(String & regionsFile, GenomeSequence &genome, bool invertRegion) { if(regionsFile.Length()==0) return; if(genome.sequenceLength()==0) error("No reference genome loaded!\n"); IFILE fhRegions; fhRegions = ifopen(regionsFile.c_str(),"r"); if(fhRegions==NULL) error("Open regions file %s failed!\n", regionsFile.c_str()); regionIndicator.resize(genome.sequenceLength()); StringArray tokens; String buffer; int len; fprintf(stderr, "Loading region list..."); while (!ifeof(fhRegions)){ buffer.ReadLine(fhRegions); if (buffer.IsEmpty() || buffer[0] == '#') continue; tokens.AddTokens(buffer, WHITESPACE); if(tokens.Length() < 3) continue; genomeIndex_t startGenomeIndex = 0; int chromosomeIndex = tokens[1].AsInteger(); // use chromosome name (token[0]) and position (token[1]) to query genome index. startGenomeIndex = genome.getGenomePosition(tokens[0].c_str(), chromosomeIndex); if(startGenomeIndex >= regionIndicator.size() ) { //fprintf(stderr, "WARNING: region list section %s position %u is not found in the reference and skipped...\n", tokens[0].c_str(), chromosomeIndex); continue; } len = tokens[2].AsInteger() - tokens[1].AsInteger() + 1; for(uint32_t i=startGenomeIndex; i<startGenomeIndex+len; i++) regionIndicator[i] = true; tokens.Clear(); buffer.Clear(); } if (invertRegion) { fprintf(stderr, " invert region..."); for (uint32_t i = 0; i < regionIndicator.size(); i++) { regionIndicator[i] = !regionIndicator[i]; } } ifclose(fhRegions); fprintf(stderr, "DONE!\n"); }
// Check to see if the file is at the end of the file. bool FastQFile::isEof() { // Check to see if the file is open. if((myFile != NULL) && (ifeof(myFile))) { // At EOF. return true; } // Not at EOF. return false; }
int main() { String fn = "/home/zhanxw/compareMapSoft/index/mapreads/chr1.fa"; IFILE file = ifopen(fn.c_str(), "r"); int totalChar = 0; String line; int freq[256] = {0}; while (!ifeof(file)){ line.ReadLine(file); totalChar += line.Length(); for (int i = 0; i < line.Length(); i++) freq[(unsigned int) line[i]]++; } printf("A frequency: %d (%f)\n", freq[(int)'A'], (float)freq[(int)'A']/totalChar); printf("T frequency: %d (%f)\n", freq[(int)'T'], (float)freq[(int)'T']/totalChar); printf("G frequency: %d (%f)\n", freq[(int)'G'], (float)freq[(int)'G']/totalChar); printf("C frequency: %d (%f)\n", freq[(int)'C'], (float)freq[(int)'C']/totalChar); }
// Appends characters from the file to `line` up to (not including) the
// next newline.  `line` is not cleared first.
//
// Returns 0 when a newline ended the read, -1 when EOF was hit first
// (including when the file was already at EOF on entry).
int InputFile::readLine(std::string& line)
{
    while(!ifeof())
    {
        const int nextChar = ifgetc();
        switch(nextChar)
        {
            case EOF:
                return(-1);
            case '\n':
                return(0);
            default:
                line += nextChar;
                break;
        }
    }

    // Reached when ifeof() is already true before any character is read.
    return(-1);
}
// Appends characters from the file to `field` up to (not including) the
// next tab or newline.  `field` is not cleared first.
//
// Returns 1 when a tab ended the field, 0 when a newline ended it, and -1
// when EOF was hit first (including when already at EOF on entry).
int InputFile::readTilTab(std::string& field)
{
    while(!ifeof())
    {
        const int nextChar = ifgetc();
        switch(nextChar)
        {
            case EOF:
                return(-1);
            case '\n':
                return(0);
            case '\t':
                return(1);
            default:
                field += nextChar;
                break;
        }
    }

    return(-1);
}
// Selects the next section of the input to process.
//
// Without a region list this returns true exactly once (tracked by a
// function-local static, so it is shared by all calls) and false on every
// later call.
//
// With a region list, lines are read until a well-formed one is found:
// tab-separated <name> <start> <end>, with start < end unless end is -1.
// The read section of samIn is then set to that region and true is
// returned.  Malformed lines are reported on stderr and skipped.  Returns
// false once the region list is exhausted.
bool Stats::getNextSection(SamFile &samIn)
{
    static bool alreadyRead = false;

    if(myRegionList == NULL)
    {
        // No region list: report "more to read" on the first call only.
        const bool firstCall = !alreadyRead;
        alreadyRead = true;
        return(firstCall);
    }

    // There is a region list, so look for the next usable line in it.
    myStartPos = 0;
    myEndPos = 0;

    while(!ifeof(myRegionList))
    {
        myRegBuffer.Clear();
        myRegBuffer.ReadLine(myRegionList);
        if(myRegBuffer.IsEmpty())
        {
            // Nothing read, so continue to the next line.
            continue;
        }

        // A line was read, so parse it.
        myRegColumn.ReplaceColumns(myRegBuffer, '\t');
        if(myRegColumn.Length() < 3)
        {
            // Incorrectly formatted line.
            std::cerr << "Improperly formatted reg line: "
                      << myRegBuffer
                      << "; Skipping to the next line.\n";
            continue;
        }

        if(!myRegColumn[1].AsInteger(myStartPos))
        {
            // The start position (2nd column) is not an integer.
            std::cerr << "Improperly formatted region line, start position "
                      << "(2nd column) is not an integer: "
                      << myRegColumn[1]
                      << "; Skipping to the next line.\n";
            continue;
        }

        if(!myRegColumn[2].AsInteger(myEndPos))
        {
            // The end position (3rd column) is not an integer.
            std::cerr << "Improperly formatted region line, end position "
                      << "(3rd column) is not an integer: "
                      << myRegColumn[2]
                      << "; Skipping to the next line.\n";
            continue;
        }

        if((myStartPos >= myEndPos) && (myEndPos != -1))
        {
            // The start position is >= the end position
            std::cerr << "Improperly formatted region line, the start position "
                      << "is >= end position: "
                      << myRegColumn[1]
                      << " >= "
                      << myRegColumn[2]
                      << "; Skipping to the next line.\n";
            continue;
        }

        // Found a valid region.
        samIn.SetReadSection(myRegColumn[0].c_str(), myStartPos, myEndPos);
        return(true);
    }

    // Ran out of region lines without finding another section.
    return(false);
}
// Reads and validates a single fastq sequence from myFile. FastQStatus::Status FastQFile::readFastQSequence() { // First verify that a file is open, if not, return failure. if(!isOpen()) { std::string message = "ERROR: Trying to read a fastq file but no file is open."; logMessage(message.c_str()); return(FastQStatus::FASTQ_ORDER_ERROR); } // Reset variables for each sequence. resetForEachSequence(); bool valid = true; // No sequence was read. if(isTimeToQuit()) { return(FastQStatus::FASTQ_NO_SEQUENCE_ERROR); } // The first line is the sequence identifier, so validate that. valid = validateSequenceIdentifierLine(); if(myFileProblem) { return(FastQStatus::FASTQ_READ_ERROR); } // If we are at the end of the file, check to see if it is a partial // sequence or just an empty line at the end. if(ifeof(myFile)) { // If the sequence identifier line was empty and we are at the // end of the file, there is nothing more to validate. if(mySequenceIdLine.Length() != 0) { // There was a sequence identifier line, so this is an incomplete // sequence. myErrorString = "Incomplete Sequence.\n"; reportErrorOnLine(); valid = false; } if(valid) { // Return failure - no sequences were left to read. At the end // of the file. It wasn't invalid and it wasn't really an error. return(FastQStatus::FASTQ_NO_SEQUENCE_ERROR); } else { return(FastQStatus::FASTQ_INVALID); } } // If enough errors, quit before reading any more. if(isTimeToQuit()) { // Means there was an error, so mark it as invalid. return(FastQStatus::FASTQ_INVALID); } // Validate the Raw Sequence Line(s) and the "+" line. valid &= validateRawSequenceAndPlusLines(); if(myFileProblem) { return(FastQStatus::FASTQ_READ_ERROR); } // If enough errors, quit before reading any more. if(isTimeToQuit()) { return(FastQStatus::FASTQ_INVALID); } // If it is the end of a file, it is missing the quality string. if(ifeof(myFile)) { // There was a sequence identifier line, so this is an incomplete // sequence. 
myErrorString = "Incomplete Sequence, missing Quality String."; reportErrorOnLine(); valid = false; return(FastQStatus::FASTQ_INVALID); } // All that is left is to validate the quality string line(s). valid &= validateQualityStringLines(); if(myFileProblem) { return(FastQStatus::FASTQ_READ_ERROR); } if(valid) { return(FastQStatus::FASTQ_SUCCESS); } return(FastQStatus::FASTQ_INVALID); }
// Reads and validates the sequence identifier line of a fastq sequence. bool FastQFile::validateSequenceIdentifierLine() { // Read the first line of the sequence. int readStatus = mySequenceIdLine.ReadLine(myFile); // Check to see if the read was successful. if(readStatus <= 0) { // If EOF, not an error. if(ifeof(myFile)) { return true; } myFileProblem = true; myErrorString = "Failure trying to read sequence identifier line"; reportErrorOnLine(); return false; } // If the line is 0 length and it is the end of the file, just // return since this is the eof - no error. if((mySequenceIdLine.Length() == 0) && (ifeof(myFile))) { // Not an error, just a new line at the end of the file. return true; } // Increment the line number. myLineNum++; // Verify that the line has at least 2 characters: '@' and at least // one character for the sequence identifier. if(mySequenceIdLine.Length() < 2) { // Error. Sequence Identifier line not long enough. myErrorString = "The sequence identifier line was too short."; reportErrorOnLine(); return false; } // The sequence identifier line must start wtih a '@' if(mySequenceIdLine[0] != '@') { // Error - sequence identifier line does not begin with an '@'. myErrorString = "First line of a sequence does not begin with @"; reportErrorOnLine(); return false; } // Valid Sequence Identifier Line. // The sequence identifier ends at the first space or at the end of the // line if there is no space. // Use fast find since this is a case insensitive search. // Start at 1 since we know that 0 is '@' int endSequenceIdentifier = mySequenceIdLine.FastFindChar(' ', 1); // Check if a " " was found. if(endSequenceIdentifier == -1) { // Did not find a ' ', so the identifier is the rest of the line. // It starts at 1 since @ is at offset 0. mySequenceIdentifier = (mySequenceIdLine.SubStr(1)).c_str(); } else { // Found a ' ', so the identifier ends just before that. 
// The sequence identifier must be at least 1 character long, // therefore the endSequenceIdentifier must be greater than 1. if(endSequenceIdentifier <= 1) { myErrorString = "No Sequence Identifier specified before the comment."; reportErrorOnLine(); return false; } mySequenceIdentifier = (mySequenceIdLine.SubStr(1, endSequenceIdentifier - 1)).c_str(); } // If myInterleaved, validate matches the previous seqID. if(myInterleaved && (myPrevSeqID != "")) { // Valid if the sequence identifiers are identical or if // the only difference is a trailing 1 or 2. if(myPrevSeqID.compare(mySequenceIdentifier) != 0) { // Compare all but the last characters, then check the last characters for 1 or 2. if((myPrevSeqID.compare(0, myPrevSeqID.length()-1, mySequenceIdentifier.c_str(), mySequenceIdentifier.Length()-1) != 0) || (((myPrevSeqID[myPrevSeqID.length()-1] != '1') || (mySequenceIdentifier[mySequenceIdentifier.Length()-1] != '2')) && (myPrevSeqID[myPrevSeqID.length()-1] != mySequenceIdentifier[mySequenceIdentifier.Length()-1]))) { myErrorString = "Interleaved: consecutive reads do not have matching sequence identifiers: "; myErrorString += mySequenceIdentifier.c_str(); myErrorString += " and "; myErrorString += myPrevSeqID.c_str(); reportErrorOnLine(); myPrevSeqID.clear(); return(false); } } myPrevSeqID.clear(); } else { if(myInterleaved) { myPrevSeqID = mySequenceIdentifier.c_str(); } // Check if sequence identifier should be validated for uniqueness if it is // not the 2nd in an interleaved pair. if(myCheckSeqID) { // Check to see if the sequenceIdentifier is a repeat by adding // it to the set and seeing if it already existed. std::pair<std::map<std::string, unsigned int>::iterator,bool> insertResult; insertResult = myIdentifierMap.insert(std::make_pair(mySequenceIdentifier.c_str(), myLineNum)); if(insertResult.second == false) { // Sequence Identifier is a repeat. 
myErrorString = "Repeated Sequence Identifier: "; myErrorString += mySequenceIdentifier.c_str(); myErrorString += " at Lines "; myErrorString += insertResult.first->second; myErrorString += " and "; myErrorString += myLineNum; reportErrorOnLine(); return(false); } } } // Valid, return true. return(true); }
// Reads and validates the raw sequence line(s) and the plus line. Both are // included in one method since it is unknown when the raw sequence line // ends until you find the plus line that divides it from the quality // string. Since this method will read the plus line to know when the // raw sequence ends, it also validates that line. bool FastQFile::validateRawSequenceAndPlusLines() { // Read the raw sequence. int readStatus = myRawSequence.ReadLine(myFile); myLineNum++; if(readStatus <= 0) { myFileProblem = true; myErrorString = "Failure trying to read sequence line"; reportErrorOnLine(); return false; } // Offset into the raw sequence to be validated. int offset = 0; // Validate the raw sequence. bool valid = validateRawSequence(offset); // Increment the offset for what was just read. offset = myRawSequence.Length(); // The next line is either a continuation of the raw sequence or it starts // with a '+' // Keep validating til the '+' line or the end of file is found. bool stillRawLine = true; while(stillRawLine && !ifeof(myFile)) { // If enough errors, quit before reading any more. if(isTimeToQuit()) { return(false); } // Read the next line. // Read into the plus line, but if it isn't a plus line, then // it will be copied into the raw sequence line. readStatus = myPlusLine.ReadLine(myFile); myLineNum++; if(readStatus <= 0) { myFileProblem = true; myErrorString = "Failure trying to read sequence/plus line"; reportErrorOnLine(); return false; } // Check if the next line is blank if(myPlusLine.Length() == 0) { // The next line is blank. Assume it is part of the raw sequence and // report an error since there are no valid characters on the line. myErrorString = "Looking for continuation of Raw Sequence or '+' instead found a blank line, assuming it was part of Raw Sequence."; reportErrorOnLine(); } // Check for the plus line. else if(myPlusLine[0] == '+') { // This is the + line. 
valid &= validateSequencePlus(); stillRawLine = false; } else { // Not a plus line, so assume this is a continuation of the Raw // Sequence. // Copy from the plus line to the raw sequence line. myRawSequence += myPlusLine; myPlusLine.SetLength(0); valid &= validateRawSequence(offset); // Increment the offset. offset = myRawSequence.Length(); } } // If enough errors, quit before reading any more. if(isTimeToQuit()) { return(false); } // Now that the entire raw sequence has been obtained, check its length // against the minimum allowed length. if(myRawSequence.Length() < myMinReadLength) { // Raw sequence is not long enough - error. myErrorString = "Raw Sequence is shorter than the min read length: "; myErrorString += myRawSequence.Length(); myErrorString += " < "; myErrorString += myMinReadLength; reportErrorOnLine(); valid = false; } // If enough errors, quit before reading any more. if(isTimeToQuit()) { return(false); } // if the flag still indicates it is processing the raw sequence that means // we reached the end of the file without a '+' line. So report that error. if(stillRawLine) { myErrorString = "Reached the end of the file without a '+' line."; reportErrorOnLine(); valid = false; } return(valid); }
// Reads and validates the quality string line(s). bool FastQFile::validateQualityStringLines() { // Read the quality string. int readStatus = myQualityString.ReadLine(myFile); myLineNum++; if(readStatus <= 0) { myFileProblem = true; myErrorString = "Failure trying to read quality line"; reportErrorOnLine(); return false; } // track the offset into the quality string to validate. int offset = 0; // Validate this line of the quality string. bool valid = validateQualityString(offset); offset = myQualityString.Length(); // Keep reading quality string lines until the length of the // raw sequence has been hit or the end of the file is reached. while((myQualityString.Length() < myRawSequence.Length()) && (!ifeof(myFile))) { // If enough errors, quit before reading any more. if(isTimeToQuit()) { return(false); } // Read another line of the quality string. readStatus = myTempPartialQuality.ReadLine(myFile); myLineNum++; if(readStatus <= 0) { myFileProblem = true; myErrorString = "Failure trying to read quality line"; reportErrorOnLine(); return false; } myQualityString += myTempPartialQuality; myTempPartialQuality.Clear(); // Validate this line of the quality string. valid &= validateQualityString(offset); offset = myQualityString.Length(); } // If enough errors, quit before reading any more. if(isTimeToQuit()) { return(false); } // Validate that the quality string length is the same as the // raw sequence length. if(myQualityString.Length() != myRawSequence.Length()) { myErrorString = "Quality string length ("; myErrorString += myQualityString.Length(); myErrorString += ") does not equal raw sequence length ("; myErrorString += myRawSequence.Length(); myErrorString += ")"; reportErrorOnLine(); valid = false; } return(valid); }
bool WriteRegion::getNextSection() { bool anotherSection = false; // If refName is set, use that. if(myRefName.Length() != 0) { // Use Reference Name for the next section. anotherSection = true; mySamIn.SetReadSection(myRefName.c_str(), myStart, myEnd, !myWithinReg); // Already processed this section, so clear the reference name // so it will not be used again. myRefName.Clear(); myStart = UNSPECIFIED_INT; myEnd = UNSPECIFIED_INT; } else if(myRefID != UNSET_REF) { // Use Reference ID for the next section. anotherSection = true; mySamIn.SetReadSection(myRefID, myStart, myEnd, !myWithinReg); // Already processed this section, so clear the reference id // so it will not be used again. myRefID = UNSET_REF; myStart = UNSPECIFIED_INT; myEnd = UNSPECIFIED_INT; } else if(myBedFile != NULL) { // There is a bed file, so read the next line. while(!anotherSection) { myBedBuffer.Clear(); myBedBuffer.ReadLine(myBedFile); if(ifeof(myBedFile) && myBedBuffer.IsEmpty()) { // End of the file, so break. break; } // Not the end of the file, so parse the line. myBedColumn.ReplaceColumns(myBedBuffer, '\t'); if(myBedColumn.Length() != 3) { // Incorrectly formatted line. std::cerr << "Improperly formatted bed line: " << myBedBuffer << "; Skipping to the next line.\n"; } else { // Check the reference name. if(myPrevRefName != myBedColumn[0]) { // New reference name (chromosome), so clear the previous // start/end. myPrevStart = UNSPECIFIED_INT; myPrevEnd = UNSPECIFIED_INT; myPrevRefName = myBedColumn[0]; // Get the reference ID for the reference name. myBedRefID = mySamHeader.getReferenceID(myPrevRefName); // Check to see if the reference ID is found. if(myBedRefID == SamReferenceInfo::NO_REF_ID) { // The specified Reference ID is not in the file, // so check to see if it has chr. // Check to see if it is the same except for 'chr' appended. 
if((myPrevRefName[0] == 'c') && (myPrevRefName[1] == 'h') && (myPrevRefName[2] == 'r')) { // It starts with chr, so look up with out the chr myBedRefID = mySamHeader.getReferenceID(myPrevRefName.c_str() + 3); } } } else { // Not a new reference name. // Store the previous positions before overwriting them. myPrevStart = myStart; if(myPrevEnd < myEnd) { // The last section ends later than the previous one, // So update the previous latest end. myPrevEnd = myEnd; } } // If the refID is still NO_REF_ID, just continue to the next bed line. if(myBedRefID == SamReferenceInfo::NO_REF_ID) { continue; } // Correct number of columns, check the columns. if(!myBedColumn[1].AsInteger(myStart)) { // The start position (2nd column) is not an integer. std::cerr << "Improperly formatted bed line, start position (2nd column) is not an integer: " << myBedColumn[1] << "; Skipping to the next line.\n"; } else if(!myBedColumn[2].AsInteger(myEnd)) { // The end position (3rd column) is not an integer. std::cerr << "Improperly formatted bed line, end position (3rd column) is not an integer: " << myBedColumn[2] << "; Skipping to the next line.\n"; } else if(myStart >= myEnd) { // The start position is >= the end std::cerr << "Improperly formatted bed line, the start position is >= end position: " << myBedColumn[1] << " >= " << myBedColumn[2] << "; Skipping to the next line.\n"; } else if(myPrevStart > myStart) { // Same reference name, but the position goes backwards. // This is against the assumption that the bed is sorted. std::cerr << "Improperly formatted bed, the start position is < the previous start (bed is assumed to be sorted): " << myStart << " < " << myPrevStart << "; Skipping to the next line.\n"; } else { anotherSection = true; mySamIn.SetReadSection(myBedRefID, myStart, myEnd, !myWithinReg); } } } } else { // If we have no bed file, then we only have another section // if we have not already written a region. anotherSection = !myWroteReg; } return(anotherSection); }
bool VcfRecord::read(IFILE filePtr, bool siteOnly, VcfRecordDiscardRules& discardRules, VcfSubsetSamples* sampleSubset) { // Clear out any previously set values. reset(); if(filePtr == NULL) { myStatus.setStatus(StatGenStatus::FAIL_ORDER, "Error reading VCF record before opening the file."); return(false); } if(ifeof(filePtr)) { // End of file, just return false. return(false); } // Read the chromosome. if(!readTilTab(filePtr, myChrom)) { if(myChrom.empty()) { // EOF. return(false); } // Not an empty line. myStatus.setStatus(StatGenStatus::FAIL_PARSE, "Error reading VCF Record CHROM."); return(false); } // Read the 1-based Position std::string strPos; if(!readTilTab(filePtr, strPos)) { myStatus.setStatus(StatGenStatus::FAIL_PARSE, "Error reading VCF Record POS."); return(false); } else { // Read the position, so convert to an integer. my1BasedPosNum = atoi(strPos.c_str()); } // Read the ID. if(!readTilTab(filePtr, myID)) { myStatus.setStatus(StatGenStatus::FAIL_PARSE, "Error reading VCF Record ID."); return(false); } if(discardRules.discardForID(myID)) { // Do not keep this id, so consume the rest of the record and // return the next record. filePtr->discardLine(); return(read(filePtr, siteOnly, discardRules, sampleSubset)); } // Read the Ref. if(!readTilTab(filePtr, myRef)) { myStatus.setStatus(StatGenStatus::FAIL_PARSE, "Error reading VCF Record REF."); return(false); } // Read the Alt. myAltArray.clear(); if(!readTilTab(filePtr, myAlt)) { myStatus.setStatus(StatGenStatus::FAIL_PARSE, "Error reading VCF Record ALT."); return(false); } // Read the Qual. if(!readTilTab(filePtr, myQual)) { myStatus.setStatus(StatGenStatus::FAIL_PARSE, "Error reading VCF Record QUAL."); return(false); } else { if(myQual != ".") { // Read the quality, so convert to an integer. myQualNum = atof(myQual.c_str()); } else { myQualNum = -1; } } // Read the Filter. 
if(!myFilter.read(filePtr)) { myStatus.setStatus(StatGenStatus::FAIL_PARSE, "Error reading VCF Record FILTER."); return(false); } // Read the Info (could be the last word in the line or file). if(!myInfo.read(filePtr)) { // Found the end of the line after the info field, so return true, // successfully read the record. return(true); } if(siteOnly) { // Do not store genotypes, so just consume the rest of the line. filePtr->readTilChar("\n"); } else { // Not yet at the end of the line, so read the genotype fields // (format & samples) try { myGenotype.read(filePtr, sampleSubset); } catch(std::exception& e) { myDummyString = "Failed parsing the Genotype Fields of " + myChrom + ":" + std::to_string((long long int)my1BasedPosNum) + " (chr:pos) - " + e.what(); myStatus.setStatus(StatGenStatus::FAIL_PARSE, myDummyString.c_str()); return(false); } } // Found the end of the line, return true since all required fields // were read. return(true); }
int Stats::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; String indexFile = ""; bool basic = false; bool noeof = false; bool params = false; bool qual = false; bool phred = false; int maxNumReads = -1; bool unmapped = false; String pBaseQC = ""; String cBaseQC = ""; String regionList = ""; int excludeFlags = 0; int requiredFlags = 0; bool withinRegion = false; int minMapQual = 0; String dbsnp = ""; PosList *dbsnpListPtr = NULL; bool baseSum = false; int bufferSize = PileupHelper::DEFAULT_WINDOW_SIZE; ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inFile) LONG_PARAMETER_GROUP("Types of Statistics") LONG_PARAMETER("basic", &basic) LONG_PARAMETER("qual", &qual) LONG_PARAMETER("phred", &phred) LONG_STRINGPARAMETER("pBaseQC", &pBaseQC) LONG_STRINGPARAMETER("cBaseQC", &cBaseQC) LONG_PARAMETER_GROUP("Optional Parameters") LONG_INTPARAMETER("maxNumReads", &maxNumReads) LONG_PARAMETER("unmapped", &unmapped) LONG_STRINGPARAMETER("bamIndex", &indexFile) LONG_STRINGPARAMETER("regionList", ®ionList) LONG_INTPARAMETER("excludeFlags", &excludeFlags) LONG_INTPARAMETER("requiredFlags", &requiredFlags) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("params", ¶ms) LONG_PARAMETER_GROUP("Optional phred/qual Only Parameters") LONG_PARAMETER("withinRegion", &withinRegion) LONG_PARAMETER_GROUP("Optional BaseQC Only Parameters") LONG_PARAMETER("baseSum", &baseSum) LONG_INTPARAMETER("bufferSize", &bufferSize) LONG_INTPARAMETER("minMapQual", &minMapQual) LONG_STRINGPARAMETER("dbsnp", &dbsnp) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); inputParameters.Read(argc-1, &(argv[1])); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. 
BgzfFileType::setRequireEofBlock(false); } // Check to see if the in file was specified, if not, report an error. if(inFile == "") { usage(); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--in is a mandatory argument for stats, " << "but was not specified" << std::endl; return(-1); } // Use the index file if unmapped or regionList is not empty. bool useIndex = (unmapped|| (!regionList.IsEmpty())); // IndexFile is required, so check to see if it has been set. if(useIndex && (indexFile == "")) { // In file was not specified, so set it to the in file // + ".bai" indexFile = inFile + ".bai"; } //////////////////////////////////////// // Setup in case pileup is used. Pileup<PileupElementBaseQCStats> pileup(bufferSize); // Initialize start/end positions. myStartPos = 0; myEndPos = -1; // Open the output qc file if applicable. IFILE baseQCPtr = NULL; if(!pBaseQC.IsEmpty() && !cBaseQC.IsEmpty()) { usage(); inputParameters.Status(); // Cannot specify both types of baseQC. std::cerr << "Cannot specify both --pBaseQC & --cBaseQC." << std::endl; return(-1); } else if(!pBaseQC.IsEmpty()) { baseQCPtr = ifopen(pBaseQC, "w"); PileupElementBaseQCStats::setPercentStats(true); } else if(!cBaseQC.IsEmpty()) { baseQCPtr = ifopen(cBaseQC, "w"); PileupElementBaseQCStats::setPercentStats(false); } if(baseQCPtr != NULL) { PileupElementBaseQCStats::setOutputFile(baseQCPtr); PileupElementBaseQCStats::printHeader(); } if((baseQCPtr != NULL) || baseSum) { PileupElementBaseQCStats::setMapQualFilter(minMapQual); PileupElementBaseQCStats::setBaseSum(baseSum); } if(params) { inputParameters.Status(); } // Open the file for reading. SamFile samIn; if(!samIn.OpenForRead(inFile)) { fprintf(stderr, "%s\n", samIn.GetStatusMessage()); return(samIn.GetStatus()); } samIn.SetReadFlags(requiredFlags, excludeFlags); // Set whether or not basic statistics should be generated. samIn.GenerateStatistics(basic); // Read the sam header. 
SamFileHeader samHeader; if(!samIn.ReadHeader(samHeader)) { fprintf(stderr, "%s\n", samIn.GetStatusMessage()); return(samIn.GetStatus()); } // Open the bam index file for reading if we are // doing unmapped reads (also set the read section). if(useIndex) { samIn.ReadBamIndex(indexFile); if(unmapped) { samIn.SetReadSection(-1); } if(!regionList.IsEmpty()) { myRegionList = ifopen(regionList, "r"); } } ////////////////////////// // Read dbsnp if specified and doing baseQC if(((baseQCPtr != NULL) || baseSum) && (!dbsnp.IsEmpty())) { // Read the dbsnp file. IFILE fdbSnp; fdbSnp = ifopen(dbsnp,"r"); // Determine how many entries. const SamReferenceInfo& refInfo = samHeader.getReferenceInfo(); int maxRefLen = 0; for(int i = 0; i < refInfo.getNumEntries(); i++) { int refLen = refInfo.getReferenceLength(i); if(refLen >= maxRefLen) { maxRefLen = refLen + 1; } } dbsnpListPtr = new PosList(refInfo.getNumEntries(),maxRefLen); if(fdbSnp==NULL) { std::cerr << "Open dbSNP file " << dbsnp.c_str() << " failed!\n"; } else if(dbsnpListPtr == NULL) { std::cerr << "Failed to init the memory allocation for the dbsnpList.\n"; } else { // Read the dbsnp file. StringArray tokens; String buffer; int position = 0; int refID = 0; // Loop til the end of the file. while (!ifeof(fdbSnp)) { // Read the next line. buffer.ReadLine(fdbSnp); // If it does not have at least 2 columns, // continue to the next line. if (buffer.IsEmpty() || buffer[0] == '#') continue; tokens.AddTokens(buffer); if(tokens.Length() < 2) continue; if(!tokens[1].AsInteger(position)) { std::cerr << "Improperly formatted region line, start position " << "(2nd column) is not an integer: " << tokens[1] << "; Skipping to the next line.\n"; continue; } // Look up the reference name. 
refID = samHeader.getReferenceID(tokens[0]); if(refID != SamReferenceInfo::NO_REF_ID) { // Reference id was found, so add it to the dbsnp dbsnpListPtr->addPosition(refID, position); } tokens.Clear(); buffer.Clear(); } } ifclose(fdbSnp); } // Read the sam records. SamRecord samRecord; int numReads = 0; ////////////////////// // Setup in case doing a quality count. // Quality histogram. const int MAX_QUAL = 126; const int START_QUAL = 33; uint64_t qualCount[MAX_QUAL+1]; for(int i = 0; i <= MAX_QUAL; i++) { qualCount[i] = 0; } const int START_PHRED = 0; const int PHRED_DIFF = START_QUAL - START_PHRED; const int MAX_PHRED = MAX_QUAL - PHRED_DIFF; uint64_t phredCount[MAX_PHRED+1]; for(int i = 0; i <= MAX_PHRED; i++) { phredCount[i] = 0; } int refPos = 0; Cigar* cigarPtr = NULL; char cigarChar = '?'; // Exclude clips from the qual/phred counts if unmapped reads are excluded. bool qualExcludeClips = excludeFlags & SamFlag::UNMAPPED; ////////////////////////////////// // When not reading by sections, getNextSection returns true // the first time, then false the next time. while(getNextSection(samIn)) { // Keep reading records from the file until SamFile::ReadRecord // indicates to stop (returns false). while(((maxNumReads < 0) || (numReads < maxNumReads)) && samIn.ReadRecord(samHeader, samRecord)) { // Another record was read, so increment the number of reads. ++numReads; // See if the quality histogram should be genereated. if(qual || phred) { // Get the quality. const char* qual = samRecord.getQuality(); // Check for no quality ('*'). if((qual[0] == '*') && (qual[1] == 0)) { // This record does not have a quality string, so no // quality processing is necessary. } else { int index = 0; cigarPtr = samRecord.getCigarInfo(); cigarChar = '?'; refPos = samRecord.get0BasedPosition(); if(!qualExcludeClips && (cigarPtr != NULL)) { // Offset the reference position by any soft clips // by subtracting the queryIndex of this start position. 
// refPos is now the start position of the clips. refPos -= cigarPtr->getQueryIndex(0); } while(qual[index] != 0) { // Skip this quality if it is clipped and we are skipping clips. if(cigarPtr != NULL) { cigarChar = cigarPtr->getCigarCharOpFromQueryIndex(index); } if(qualExcludeClips && Cigar::isClip(cigarChar)) { // Skip a clipped quality. ++index; // Increment the position. continue; } if(withinRegion && (myEndPos != -1) && (refPos >= myEndPos)) { // We have hit the end of the region, stop processing this // quality string. break; } if(withinRegion && (refPos < myStartPos)) { // This position is not in the target. ++index; // Update the position if this is found in the reference or a clip. if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar)) { ++refPos; } continue; } // Check for valid quality. if((qual[index] < START_QUAL) || (qual[index] > MAX_QUAL)) { if(qual) { std::cerr << "Invalid Quality found: " << qual[index] << ". Must be between " << START_QUAL << " and " << MAX_QUAL << ".\n"; } if(phred) { std::cerr << "Invalid Phred Quality found: " << qual[index] - PHRED_DIFF << ". Must be between " << START_QUAL << " and " << MAX_QUAL << ".\n"; } // Skip an invalid quality. ++index; // Update the position if this is found in the reference or a clip. if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar)) { ++refPos; } continue; } // Increment the count for this quality. ++(qualCount[(int)(qual[index])]); ++(phredCount[(int)(qual[index]) - PHRED_DIFF]); // Update the position if this is found in the reference or a clip. if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar)) { ++refPos; } ++index; } } } // Check the next thing to do for the read. if((baseQCPtr != NULL) || baseSum) { // Pileup the bases for this read. pileup.processAlignmentRegion(samRecord, myStartPos, myEndPos, dbsnpListPtr); } } // Done with a section, move on to the next one. // New section, so flush the pileup. 
pileup.flushPileup(); } // Flush the rest of the pileup. if((baseQCPtr != NULL) || baseSum) { // Pileup the bases. pileup.processAlignmentRegion(samRecord, myStartPos, myEndPos, dbsnpListPtr); PileupElementBaseQCStats::printSummary(); ifclose(baseQCPtr); } std::cerr << "Number of records read = " << samIn.GetCurrentRecordCount() << std::endl; if(basic) { std::cerr << std::endl; samIn.PrintStatistics(); } // Print the quality stats. if(qual) { std::cerr << std::endl; std::cerr << "Quality\tCount\n"; for(int i = START_QUAL; i <= MAX_QUAL; i++) { std::cerr << i << "\t" << qualCount[i] << std::endl; } } // Print the phred quality stats. if(phred) { std::cerr << std::endl; std::cerr << "Phred\tCount\n"; for(int i = START_PHRED; i <= MAX_PHRED; i++) { std::cerr << i << "\t" << phredCount[i] << std::endl; } } SamStatus::Status status = samIn.GetStatus(); if(status == SamStatus::NO_MORE_RECS) { // A status of NO_MORE_RECS means that all reads were successful. status = SamStatus::SUCCESS; } return(status); }
// Load the gene mapping file named by mapFile and fill the per-gene arrays
// (genename, chr, start_pos, end_pos) plus the per-chromosome boundary
// hashes (ChrStartHash, ChrEndHash).
//
// path: consulted only when the default map file cannot be opened; if it
//       contains "bin", the text preceding that match is used as the base
//       directory for a "/data/refFlat_hg19.txt" lookup.
//
// Each usable line must have at least 6 tab-separated columns: column 1 is
// the gene name, column 3 the chromosome, columns 5 and 6 the start and end
// positions. Multiple records for the same gene on the same chromosome are
// merged into one [min start, max end] range. Records whose chromosome
// label contains "Un" are ignored.
//
// If no candidate file can be opened, error() is called with the last path
// that was tried.
void GroupFromAnnotation::GetGeneMap(String path)
{
    IFILE genemap;
    genemap = ifopen(mapFile, "r");
    if (genemap == NULL)
    {
        // Fallback locations are only tried for the built-in default path;
        // a user-supplied path that fails is reported as-is.
        if (mapFile == "../data/refFlat_hg19.txt")
        {
            // 1) Default location, gzipped.
            mapFile += ".gz";
            genemap = ifopen(mapFile, "r");
            if (genemap == NULL)
            {
                // 2) Location derived from the "bin" component of path.
                int loc = path.Find("bin");
                if (loc != -1)
                {
                    mapFile = path.Left(loc - 1);
                    mapFile += "/data/refFlat_hg19.txt";
                }
                else
                {
                    // Reset to the default path. The previous code appended
                    // here, which produced "<default>.gz<default>" and put
                    // that nonsense path into the error message.
                    mapFile = "../data/refFlat_hg19.txt";
                }
                genemap = ifopen(mapFile, "r");
            }
            if (genemap == NULL)
            {
                // 3) Gzipped variant of whichever path was tried last.
                mapFile += ".gz";
                genemap = ifopen(mapFile, "r");
            }
            if (genemap == NULL)
                error("Cannot open gene mapping file %s.\n", mapFile.c_str());
        }
        else
            error("Cannot open gene mapping file %s.\n", mapFile.c_str());
    }

    // Maps gene name -> index into genename/chr/start_pos/end_pos.
    StringIntHash GeneLocHash;
    int gene_idx = 0;

    while (!ifeof(genemap))
    {
        String buffer;
        buffer.ReadLine(genemap);

        StringArray record;
        record.AddTokens(buffer, "\t");

        // Blank or truncated lines (e.g. a trailing empty line at EOF) do not
        // have the columns indexed below; skip them instead of reading
        // past the end of the token array.
        if (record.Length() < 6)
            continue;

        // Drop a leading 3-character prefix when its third character is
        // 'r'/'R' (i.e. "chr"/"CHR"), then keep only the part of the label
        // before the first of "_,;.".
        if (record[2].Length() > 2 && (record[2][2] == 'r' || record[2][2] == 'R'))
            record[2] = record[2].SubStr(3);
        StringArray gene_chr;
        gene_chr.AddTokens(record[2], "_,;.");

        // Skip records without a usable chromosome label or on an "Un"
        // chromosome. This check must happen BEFORE the gene is registered
        // in GeneLocHash: registering first and then skipping left the gene
        // pointing at an index that the next new gene would also receive.
        if (gene_chr.Length() == 0 || gene_chr[0].Find("Un") != -1)
            continue;

        const int recordStart = record[4].AsInteger();
        const int recordEnd = record[5].AsInteger();

        const int loc = GeneLocHash.Integer(record[0]);
        if (loc == -1)
        {
            // First usable record for this gene: append a new entry.
            GeneLocHash.SetInteger(record[0], gene_idx);
            chr.Push(gene_chr[0]);
            start_pos.Push(recordStart);
            end_pos.Push(recordEnd);
            genename.Push(record[0]);
            gene_idx++;
        }
        else
        {
            // Gene already known: ignore records that place it on a
            // different chromosome than the first record did.
            if (chr[loc] != gene_chr[0])
                continue;

            // Widen the stored range so it covers this record as well.
            if (recordStart < start_pos[loc])
                start_pos[loc] = recordStart;
            if (recordEnd > end_pos[loc])
                end_pos[loc] = recordEnd;
        }
    }
    ifclose(genemap);

    // Presumably builds an ordering of the gene entries by chromosome label
    // in chr_idx -- confirm against the index class.
    chr_idx.Index(chr);

    // Nothing was loaded, so there are no chromosome boundaries to record.
    if (chr.Length() == 0)
        return;

    // Walk the genes in chr_idx order; each time the chromosome label
    // changes, record where the new chromosome starts and where the
    // previous one ended (positions are ranks within chr_idx).
    // NOTE(review): the first chromosome never gets a ChrStartHash entry and
    // the last one never gets a ChrEndHash entry -- confirm that callers
    // expect this before changing it.
    String chr_ = chr[chr_idx[0]];
    for (int i = 1; i < chr.Length(); i++)
    {
        if (chr[chr_idx[i]] != chr_)
        {
            ChrStartHash.SetInteger(chr[chr_idx[i]], i);
            ChrEndHash.SetInteger(chr_, i - 1);
            chr_ = chr[chr_idx[i]];
        }
    }
}