//
// Point the reader at a new byte range of the SAM file and position it on the first line that
// begins inside that range.
//
//  startingOffset          file offset at which this range begins; must not precede the header.
//  amountOfFileToProcess   number of bytes in the range.
//
// init() must have been called first so that headerSize is known.
//
    void
SAMReader::reinit(
    _int64 startingOffset,
    _int64 amountOfFileToProcess)
{
    _ASSERT(-1 != headerSize && startingOffset >= headerSize);  // Must call init() before reinit()

    //
    // We cannot tell whether the range begins exactly at the start of a read without seeing the
    // newline that ended the previous one.  So back up one byte: if that byte is the newline,
    // the search below consumes just that byte and we land on the first read of our range.
    //
    if (startingOffset > headerSize) {
        --startingOffset;
        ++amountOfFileToProcess;
    }

    data->reinit(startingOffset, amountOfFileToProcess);

    char *chunk;
    _int64 chunkBytes;
    if (!data->getData(&chunk, &chunkBytes) || startingOffset == headerSize) {
        //
        // Either there is nothing to read, or we sit right after the header, which is already
        // the beginning of a line.
        //
        return;
    }

    //
    // Discard everything up to and including the first newline.
    //
    char *terminator = strnchr(chunk, '\n', chunkBytes);
    if (NULL != terminator) {
        data->advance((unsigned)(terminator - chunk + 1));  // +1 skips over the newline.
    }
}
bool SAMReader::parseLine(char *line, char *endOfBuffer, char *result[], size_t *linelength, size_t fieldLengths[]) { *linelength = 0; char *next = line; char *endOfLine = strnchr(line,'\n',endOfBuffer-line); if (NULL == endOfLine) { return false; } // // Skip over any leading spaces and tabs // while (next < endOfLine && (*next == ' ' || *next == '\t')) { next++; } for (unsigned i = 0; i < nSAMFields; i++) { if (NULL == next || next >= endOfLine) { if (i == OPT) { // no optional fields result[OPT] = NULL; break; } else { // // Too few fields. // return false; } } result[i] = next; if (i == OPT) { // OPT field is actually all fields until end of line fieldLengths[OPT] = endOfLine - next; break; } next = skipToBeyondNextRunOfSpacesAndTabs(next,endOfLine,&fieldLengths[i]); } *linelength = endOfLine - line + 1; // +1 skips over the \n return true; }
bool SAMReader::getNextRead( Read *read, AlignmentResult *alignmentResult, unsigned *genomeLocation, Direction *direction, unsigned *mapQ, unsigned *flag, bool ignoreEndOfRange, const char **cigar) { unsigned local_flag; if (NULL == flag) { flag = &local_flag; } do { char* buffer; _int64 bytes; if (! data->getData(&buffer, &bytes)) { data->nextBatch(); if (! data->getData(&buffer, &bytes)) { return false; } } char *newLine = strnchr(buffer, '\n', bytes); if (NULL == newLine) { // // There is no newline, so the line crosses the end of the buffer. // This should never happen since underlying reader manages overflow between chunks. // WriteErrorMessage("SAM file has too long a line, or doesn't end with a newline! Failing. fileOffset = %lld\n", data->getFileOffset()); soft_exit(1); } size_t lineLength; read->setReadGroup(context.defaultReadGroup); getReadFromLine(context.genome, buffer,buffer + bytes, read, alignmentResult, genomeLocation, direction, mapQ, &lineLength, flag, cigar, clipping); read->setBatch(data->getBatch()); data->advance((newLine + 1) - buffer); } while ((context.ignoreSecondaryAlignments && ((*flag) & SAM_SECONDARY)) || (context.ignoreSupplementaryAlignments && ((*flag) & SAM_SUPPLEMENTARY))); return true; }
bool SAMFormat::writeRead( const Genome * genome, LandauVishkinWithCigar * lv, char * buffer, size_t bufferSpace, size_t * spaceUsed, size_t qnameLen, Read * read, AlignmentResult result, int mapQuality, unsigned genomeLocation, Direction direction, bool hasMate, bool firstInPair, Read * mate, AlignmentResult mateResult, unsigned mateLocation, Direction mateDirection) const { const int MAX_READ = MAX_READ_LENGTH; const int cigarBufSize = MAX_READ * 2; char cigarBuf[cigarBufSize]; const int cigarBufWithClippingSize = MAX_READ * 2 + 32; char cigarBufWithClipping[cigarBufWithClippingSize]; int flags = 0; const char *contigName = "*"; int contigIndex = -1; unsigned positionInContig = 0; const char *cigar = "*"; const char *matecontigName = "*"; int mateContigIndex = -1; unsigned matePositionInContig = 0; _int64 templateLength = 0; char data[MAX_READ]; char quality[MAX_READ]; const char* clippedData; unsigned fullLength; unsigned clippedLength; unsigned basesClippedBefore; unsigned extraBasesClippedBefore; // Clipping added if we align before the beginning of a chromosome unsigned basesClippedAfter; unsigned extraBasesClippedAfter; // Clipping added if we align off the end of a chromosome int editDistance = -1; if (! 
createSAMLine(genome, lv, data, quality, MAX_READ, contigName, contigIndex, flags, positionInContig, mapQuality, matecontigName, mateContigIndex, matePositionInContig, templateLength, fullLength, clippedData, clippedLength, basesClippedBefore, basesClippedAfter, qnameLen, read, result, genomeLocation, direction, useM, hasMate, firstInPair, mate, mateResult, mateLocation, mateDirection, &extraBasesClippedBefore, &extraBasesClippedAfter)) { return false; } if (genomeLocation != InvalidGenomeLocation) { cigar = computeCigarString(genome, lv, cigarBuf, cigarBufSize, cigarBufWithClipping, cigarBufWithClippingSize, clippedData, clippedLength, basesClippedBefore, extraBasesClippedBefore, basesClippedAfter, extraBasesClippedAfter, read->getOriginalFrontHardClipping(), read->getOriginalBackHardClipping(), genomeLocation, direction, useM, &editDistance); } // Write the SAM entry, which requires the following fields: // // 1. QNAME: Query name of the read or the read pair // 2. FLAG: Bitwise flag (pairing, strand, mate strand, etc.) // 3. RNAME: Reference sequence name // 4. POS: 1-Based leftmost position of clipped alignment // 5. MAPQ: Mapping quality (Phred-scaled) // 6. CIGAR: Extended CIGAR string (operations: MIDNSHP) // 7. MRNM: Mate reference name (‘=’ if same as RNAME) // 8. MPOS: 1-based leftmost mate position // 9. ISIZE: Inferred insert size // 10. SEQQuery: Sequence on the same strand as the reference // 11. QUAL: Query quality (ASCII-33=Phred base quality) // // Some FASTQ files have spaces in their ID strings, which is illegal in SAM. Just truncate them at the space. 
// const char *firstSpace = strnchr(read->getId(),' ',qnameLen); if (NULL != firstSpace) { qnameLen = (unsigned)(firstSpace - read->getId()); } const int nmStringSize = 30;// Big enough that it won't buffer overflow regardless of the value of editDistance char nmString[nmStringSize]; snprintf(nmString, nmStringSize, "\tNM:i:%d",editDistance); unsigned auxLen; bool auxSAM; char* aux = read->getAuxiliaryData(&auxLen, &auxSAM); static bool warningPrinted = false; const char* readGroupSeparator = ""; const char* readGroupString = ""; if (aux != NULL && (! auxSAM)) { if (! warningPrinted) { fprintf(stderr, "warning: translating optional fields from BAM->SAM not yet implemented, optional fields will not be included in output\n"); warningPrinted = true; } if (read->getReadGroup() == READ_GROUP_FROM_AUX) { for (BAMAlignAux* bamAux = (BAMAlignAux*) aux; (char*) bamAux < aux + auxLen; bamAux = bamAux->next()) { if (bamAux->tag[0] == 'R' && bamAux->tag[1] == 'G' && bamAux->val_type == 'Z') { readGroupSeparator = "\tRG:Z:"; readGroupString = (char*) bamAux->value(); break; } } } aux = NULL; auxLen = 0; } if (read->getReadGroup() != NULL && read->getReadGroup() != READ_GROUP_FROM_AUX) { readGroupSeparator = "\tRG:Z:"; readGroupString = read->getReadGroup(); } int charsInString = snprintf(buffer, bufferSpace, "%.*s\t%d\t%s\t%u\t%d\t%s\t%s\t%u\t%lld\t%.*s\t%.*s%s%.*s%s%s\tPG:Z:SNAP%s\n", qnameLen, read->getId(), flags, contigName, positionInContig, mapQuality, cigar, matecontigName, matePositionInContig, templateLength, fullLength, data, fullLength, quality, aux != NULL ? "\t" : "", auxLen, aux != NULL ? aux : "", readGroupSeparator, readGroupString, nmString); if (charsInString > bufferSpace) { // // Out of buffer space. // return false; } else if (charsInString == bufferSpace) { buffer[bufferSpace-1] = '\n'; // overwrite trailing null with newline } if (NULL != spaceUsed) { *spaceUsed = charsInString; } return true; }
bool SAMFormat::writeHeader( const ReaderContext& context, char *header, size_t headerBufferSize, size_t *headerActualSize, bool sorted, int argc, const char **argv, const char *version, const char *rgLine) const { char *commandLine; size_t commandLineSize = 0; for (int i = 0; i < argc; i++) { commandLineSize += strlen(argv[i]) + 1; // +1 is either a space or the terminating null } commandLine = new char[commandLineSize]; commandLine[0] = '\0'; for (int i = 0; i < argc; i++) { strcat(commandLine,argv[i]); if (i != argc-1) { strcat(commandLine," "); } } size_t bytesConsumed = snprintf(header, headerBufferSize, "@HD\tVN:1.4\tSO:%s\n%s%s@PG\tID:SNAP\tPN:SNAP\tCL:%s\tVN:%s\n", sorted ? "coordinate" : "unsorted", context.header == NULL ? (rgLine == NULL ? "@RG\tID:FASTQ\tSM:sample" : rgLine) : "", context.header == NULL ? "\n" : "", commandLine,version); delete [] commandLine; commandLine = NULL; if (bytesConsumed >= headerBufferSize) { fprintf(stderr,"SAMWriter: header buffer too small\n"); return false; } if (context.header != NULL) { bool hasRG = false; for (const char* p = context.header; p < context.header + context.headerLength; ) { const char* newline = strnchr(p, '\n', (context.header + context.headerLength) - p); if (newline == NULL) { newline = context.header + context.headerLength; } _ASSERT(newline - p >= 3); // skip @HD lines, and also @SQ lines if header does not match index hasRG |= strncmp(p, "@RG", 3) == 0; if (strncmp(p, "@HD", 3) != 0 && (context.headerMatchesIndex || strncmp(p, "@SQ", 3) != 0) && strncmp(p, "@PG\tID:SNAP\t", 12) != 0) { if (bytesConsumed + (newline - p) + 1 >= headerBufferSize) { fprintf(stderr,"SAMWriter: header buffer too small\n"); return false; } memcpy(header + bytesConsumed, p, (newline - p)); * (header + bytesConsumed + (newline - p)) = '\n'; bytesConsumed += (newline - p) + 1; } p = newline + 1; } if (! hasRG) { int n = snprintf(header + bytesConsumed, headerBufferSize - bytesConsumed, "%s\n", rgLine == NULL ? 
"@RG\tID:FASTQ\tSM:sample" : rgLine); if (n > headerBufferSize - bytesConsumed) { fprintf(stderr, "SAMWriter: header buffer too small\n"); return false; } bytesConsumed += n; } } #ifndef SKIP_SQ_LINES if ((context.header == NULL || ! context.headerMatchesIndex) && context.genome != NULL) { // Write an @SQ line for each chromosome / contig in the genome const Genome::Contig *contigs = context.genome->getContigs(); int numContigs = context.genome->getNumContigs(); unsigned genomeLen = context.genome->getCountOfBases(); for (int i = 0; i < numContigs; i++) { unsigned start = contigs[i].beginningOffset; unsigned end = ((i + 1 < numContigs) ? contigs[i+1].beginningOffset : genomeLen) - context.genome->getChromosomePadding(); bytesConsumed += snprintf(header + bytesConsumed, headerBufferSize - bytesConsumed, "@SQ\tSN:%s\tLN:%u\n", contigs[i].name, end - start); if (bytesConsumed >= headerBufferSize) { fprintf(stderr,"SAMWriter: header buffer too small\n"); return false; } } } #endif // SKIP_SQ_LINES *headerActualSize = bytesConsumed; return true; }
bool SAMReader::parseHeader( const char *fileName, char *firstLine, char *endOfBuffer, const Genome *genome, _int64 *o_headerSize, bool *o_headerMatchesIndex) { char *nextLineToProcess = firstLine; *o_headerMatchesIndex = true; int numSQLines = 0; while (NULL != nextLineToProcess && nextLineToProcess < endOfBuffer && '@' == *nextLineToProcess) { if (!strncmp("@SQ",nextLineToProcess,3)) { // // These lines represent sequences in the reference genome, what are // called "contigs" in the Genome class. (Roughly, chromosomes or major // variants like some versions of the MHC genes on chr6; or more // particularly the things that come in different FASTA files from the // reference assembly). // // Verify that they actually match what's in our reference genome. // numSQLines++; if (nextLineToProcess + 3 >= endOfBuffer || ' ' != nextLineToProcess[3] && '\t' != nextLineToProcess[3]) { fprintf(stderr,"Malformed SAM file '%s' has @SQ without a following space or tab.\n",fileName); return false; } char *snStart = nextLineToProcess + 4; while (snStart < endOfBuffer && strncmp(snStart,"SN:",__min(3,endOfBuffer-snStart)) && *snStart != '\n' && *snStart != 0) { snStart++; } if (snStart >= endOfBuffer || *snStart == '\n' || *snStart == 0) { fprintf(stderr,"Malformed @SQ line doesn't have 'SN:' in file '%s'\n",fileName); return false; } const size_t contigNameBufferSize = 512; char contigName[contigNameBufferSize]; for (unsigned i = 0; i < contigNameBufferSize && snStart+3+i < endOfBuffer; i++) { if (snStart[3+i] == ' ' || snStart[3+i] == '\t' || snStart[3+i] == '\n' || snStart[3+i] == 0) { contigName[i] = '\0'; } else { contigName[i] = snStart[3+i]; } } contigName[contigNameBufferSize - 1] = '\0'; if (genome == NULL || !genome->getOffsetOfContig(contigName,NULL)) { *o_headerMatchesIndex = false; } } else if (!strncmp("@HD",nextLineToProcess,3) || !strncmp("@RG",nextLineToProcess,3) || !strncmp("@PG",nextLineToProcess,3) || !strncmp("@CO",nextLineToProcess,3)) { // // Ignore these 
lines. // } else { fprintf(stderr,"Unrecognized header line in SAM file.\n"); return false; } char * p = strnchr(nextLineToProcess,'\n',endOfBuffer-nextLineToProcess); if (p == NULL) { // no newline, look for null to truncate buffer p = (char*) memchr(nextLineToProcess, 0, endOfBuffer - nextLineToProcess); nextLineToProcess = p != NULL ? p + 1 : endOfBuffer; break; } nextLineToProcess = p + 1; } *o_headerMatchesIndex &= genome != NULL && numSQLines == genome->getNumContigs(); *o_headerSize = nextLineToProcess - firstLine; return true; }