Beispiel #1
0
    void
SAMReader::reinit(_int64 startingOffset, _int64 amountOfFileToProcess)
{
    _ASSERT(-1 != headerSize && startingOffset >= headerSize);  // Must call init() before reinit()
    //
    // There's no way to tell if we start at the very beginning of a read, we need to see the previous newline.
    // So, read one byte before our assigned read in case that was the terminating newline of the previous read.
    //
    if (startingOffset > headerSize) {
        startingOffset--;
        amountOfFileToProcess++;
    }
    data->reinit(startingOffset, amountOfFileToProcess);
    char* buffer;
    _int64 validBytes;
    if (!data->getData(&buffer, &validBytes)) {
        return;
    }
    if (startingOffset != headerSize) {
        char *firstNewline = strnchr(buffer,'\n',validBytes);
        if (NULL == firstNewline) {
            return;
        }

        data->advance((unsigned)(firstNewline - buffer + 1)); // +1 skips over the newline.
    }
}
Beispiel #2
0
    bool
SAMReader::parseLine(char *line, char *endOfBuffer, char *result[], size_t *linelength, size_t fieldLengths[])
{
    *linelength = 0;

    char *next = line;
    char *endOfLine = strnchr(line,'\n',endOfBuffer-line);
    if (NULL == endOfLine) {
        return false;
    }

    //
    // Skip over any leading spaces and tabs
    //
    while (next < endOfLine && (*next == ' ' || *next == '\t')) {
        next++;
    }

    for (unsigned i = 0; i < nSAMFields; i++) {
        if (NULL == next || next >= endOfLine) {
            if (i == OPT) {
                // no optional fields
                result[OPT] = NULL;
                break;
            } else {
                //
                // Too few fields.
                //
                return false;
            }
        }

        result[i] = next;
        if (i == OPT) {
            // OPT field is actually all fields until end of line
            fieldLengths[OPT] = endOfLine - next;
            break;
        }

        next = skipToBeyondNextRunOfSpacesAndTabs(next,endOfLine,&fieldLengths[i]);
    }

    *linelength =  endOfLine - line + 1;    // +1 skips over the \n
    return true;
}
Beispiel #3
0
    bool
SAMReader::getNextRead(
    Read *read,
    AlignmentResult *alignmentResult,
    unsigned *genomeLocation,
    Direction *direction,
    unsigned *mapQ, 
    unsigned *flag,
    bool ignoreEndOfRange,
    const char **cigar)
{
    unsigned local_flag;
    if (NULL == flag) {
        flag = &local_flag;
    }
    do {
        char* buffer;
        _int64 bytes;
        if (! data->getData(&buffer, &bytes)) {
            data->nextBatch();
            if (! data->getData(&buffer, &bytes)) {
                return false;
            }
        }
        char *newLine = strnchr(buffer, '\n', bytes);
        if (NULL == newLine) {
            //
            // There is no newline, so the line crosses the end of the buffer.
            // This should never happen since underlying reader manages overflow between chunks.
            //
            WriteErrorMessage("SAM file has too long a line, or doesn't end with a newline!  Failing.  fileOffset = %lld\n", data->getFileOffset());
            soft_exit(1);
        }

        size_t lineLength;
        read->setReadGroup(context.defaultReadGroup);
        getReadFromLine(context.genome, buffer,buffer + bytes, read, alignmentResult, genomeLocation, direction, mapQ, &lineLength, flag, cigar, clipping);
        read->setBatch(data->getBatch());
        data->advance((newLine + 1) - buffer);
    } while ((context.ignoreSecondaryAlignments && ((*flag) & SAM_SECONDARY)) ||
             (context.ignoreSupplementaryAlignments && ((*flag) & SAM_SUPPLEMENTARY)));

    return true;
}
Beispiel #4
0
    bool
SAMFormat::writeRead(
    const Genome * genome,
    LandauVishkinWithCigar * lv,
    char * buffer,
    size_t bufferSpace, 
    size_t * spaceUsed,
    size_t qnameLen,
    Read * read,
    AlignmentResult result, 
    int mapQuality,
    unsigned genomeLocation,
    Direction direction,
    bool hasMate,
    bool firstInPair,
    Read * mate, 
    AlignmentResult mateResult,
    unsigned mateLocation,
    Direction mateDirection) const
{
    const int MAX_READ = MAX_READ_LENGTH;
    const int cigarBufSize = MAX_READ * 2;
    char cigarBuf[cigarBufSize];

    const int cigarBufWithClippingSize = MAX_READ * 2 + 32;
    char cigarBufWithClipping[cigarBufWithClippingSize];

    int flags = 0;
    const char *contigName = "*";
    int contigIndex = -1;
    unsigned positionInContig = 0;
    const char *cigar = "*";
    const char *matecontigName = "*";
    int mateContigIndex = -1;
    unsigned matePositionInContig = 0;
    _int64 templateLength = 0;

    char data[MAX_READ];
    char quality[MAX_READ];

    const char* clippedData;
    unsigned fullLength;
    unsigned clippedLength;
    unsigned basesClippedBefore;
    unsigned extraBasesClippedBefore;   // Clipping added if we align before the beginning of a chromosome
    unsigned basesClippedAfter;
    unsigned extraBasesClippedAfter;    // Clipping added if we align off the end of a chromosome
    int editDistance = -1;

    if (! createSAMLine(genome, lv, data, quality, MAX_READ, contigName, contigIndex, 
        flags, positionInContig, mapQuality, matecontigName, mateContigIndex, matePositionInContig, templateLength,
        fullLength, clippedData, clippedLength, basesClippedBefore, basesClippedAfter,
        qnameLen, read, result, genomeLocation, direction, useM,
        hasMate, firstInPair, mate, mateResult, mateLocation, mateDirection, 
        &extraBasesClippedBefore, &extraBasesClippedAfter))
    {
        return false;
    }
    if (genomeLocation != InvalidGenomeLocation) {
        cigar = computeCigarString(genome, lv, cigarBuf, cigarBufSize, cigarBufWithClipping, cigarBufWithClippingSize, 
                                   clippedData, clippedLength, basesClippedBefore, extraBasesClippedBefore, basesClippedAfter, extraBasesClippedAfter, 
                                   read->getOriginalFrontHardClipping(), read->getOriginalBackHardClipping(), genomeLocation, direction, useM, &editDistance);
    }

    // Write the SAM entry, which requires the following fields:
    //
    // 1. QNAME: Query name of the read or the read pair
    // 2. FLAG: Bitwise flag (pairing, strand, mate strand, etc.)
    // 3. RNAME: Reference sequence name
    // 4. POS: 1-Based leftmost position of clipped alignment
    // 5. MAPQ: Mapping quality (Phred-scaled)
    // 6. CIGAR: Extended CIGAR string (operations: MIDNSHP)
    // 7. MRNM: Mate reference name (‘=’ if same as RNAME)
    // 8. MPOS: 1-based leftmost mate position
    // 9. ISIZE: Inferred insert size
    // 10. SEQQuery: Sequence on the same strand as the reference
    // 11. QUAL: Query quality (ASCII-33=Phred base quality)    

    //
    // Some FASTQ files have spaces in their ID strings, which is illegal in SAM.  Just truncate them at the space.
    //
    const char *firstSpace = strnchr(read->getId(),' ',qnameLen);
    if (NULL != firstSpace) {
        qnameLen = (unsigned)(firstSpace - read->getId());
    }

    const int nmStringSize = 30;// Big enough that it won't buffer overflow regardless of the value of editDistance
    char nmString[nmStringSize];  
    snprintf(nmString, nmStringSize, "\tNM:i:%d",editDistance);

    unsigned auxLen;
    bool auxSAM;
    char* aux = read->getAuxiliaryData(&auxLen, &auxSAM);
    static bool warningPrinted = false;
    const char* readGroupSeparator = "";
    const char* readGroupString = "";
    if (aux != NULL && (! auxSAM)) {
        if (! warningPrinted) {
            fprintf(stderr, "warning: translating optional fields from BAM->SAM not yet implemented, optional fields will not be included in output\n");
            warningPrinted = true;
        }
        if (read->getReadGroup() == READ_GROUP_FROM_AUX) {
            for (BAMAlignAux* bamAux = (BAMAlignAux*) aux; (char*) bamAux < aux + auxLen; bamAux = bamAux->next()) {
                if (bamAux->tag[0] == 'R' && bamAux->tag[1] == 'G' && bamAux->val_type == 'Z') {
                    readGroupSeparator = "\tRG:Z:";
                    readGroupString = (char*) bamAux->value();
                    break;
                }
            }
        }
        aux = NULL;
        auxLen = 0;
    }
    if (read->getReadGroup() != NULL && read->getReadGroup() != READ_GROUP_FROM_AUX) {
        readGroupSeparator = "\tRG:Z:";
        readGroupString = read->getReadGroup();
    }
    int charsInString = snprintf(buffer, bufferSpace, "%.*s\t%d\t%s\t%u\t%d\t%s\t%s\t%u\t%lld\t%.*s\t%.*s%s%.*s%s%s\tPG:Z:SNAP%s\n",
        qnameLen, read->getId(),
        flags,
        contigName,
        positionInContig,
        mapQuality,
        cigar,
        matecontigName,
        matePositionInContig,
        templateLength,
        fullLength, data,
        fullLength, quality,
        aux != NULL ? "\t" : "", auxLen, aux != NULL ? aux : "",
        readGroupSeparator, readGroupString,
        nmString);

    if (charsInString > bufferSpace) {
        //
        // Out of buffer space.
        //
        return false;
    } else if (charsInString == bufferSpace) {
      buffer[bufferSpace-1] = '\n'; // overwrite trailing null with newline
    }


    if (NULL != spaceUsed) {
        *spaceUsed = charsInString;
    }
    return true;
}
Beispiel #5
0
    bool
SAMFormat::writeHeader(
    const ReaderContext& context,
    char *header,
    size_t headerBufferSize,
    size_t *headerActualSize,
    bool sorted,
    int argc,
    const char **argv,
    const char *version,
    const char *rgLine)
    const
{
    char *commandLine;
	size_t commandLineSize = 0;
	for (int i = 0; i < argc; i++) {
		commandLineSize += strlen(argv[i]) + 1;	// +1 is either a space or the terminating null
	}
	commandLine = new char[commandLineSize];
	commandLine[0] = '\0';
	for (int i = 0; i < argc; i++) {
		strcat(commandLine,argv[i]);
		if (i != argc-1) {
			strcat(commandLine," ");
		}
	}

    size_t bytesConsumed = snprintf(header, headerBufferSize, "@HD\tVN:1.4\tSO:%s\n%s%s@PG\tID:SNAP\tPN:SNAP\tCL:%s\tVN:%s\n", 
		sorted ? "coordinate" : "unsorted",
        context.header == NULL ? (rgLine == NULL ? "@RG\tID:FASTQ\tSM:sample" : rgLine) : "",
        context.header == NULL ? "\n" : "",
        commandLine,version);

	delete [] commandLine;
	commandLine = NULL;
    if (bytesConsumed >= headerBufferSize) {
        fprintf(stderr,"SAMWriter: header buffer too small\n");
        return false;
    }

    if (context.header != NULL) {
		bool hasRG = false;
        for (const char* p = context.header; p < context.header + context.headerLength; ) {
            const char* newline = strnchr(p, '\n', (context.header + context.headerLength) - p);
            if (newline == NULL) {
                newline = context.header + context.headerLength;
            }
            _ASSERT(newline - p >= 3);
            // skip @HD lines, and also @SQ lines if header does not match index
			hasRG |= strncmp(p, "@RG", 3) == 0;
            if (strncmp(p, "@HD", 3) != 0 &&
                    (context.headerMatchesIndex || strncmp(p, "@SQ", 3) != 0) &&
                    strncmp(p, "@PG\tID:SNAP\t", 12) != 0) {
                if (bytesConsumed + (newline - p) + 1 >= headerBufferSize) {
                    fprintf(stderr,"SAMWriter: header buffer too small\n");
                    return false;
                }
                memcpy(header + bytesConsumed, p, (newline - p));
                * (header + bytesConsumed + (newline - p)) = '\n';
                bytesConsumed += (newline - p) + 1;
            }
            p = newline + 1;
        }
		if (! hasRG) {
			int n = snprintf(header + bytesConsumed, headerBufferSize - bytesConsumed, "%s\n",
				rgLine == NULL ? "@RG\tID:FASTQ\tSM:sample" : rgLine);
			if (n > headerBufferSize - bytesConsumed) {
				fprintf(stderr, "SAMWriter: header buffer too small\n");
                return false;
            }
			bytesConsumed += n;
		}
    }
#ifndef SKIP_SQ_LINES
    if ((context.header == NULL || ! context.headerMatchesIndex) && context.genome != NULL) {
        // Write an @SQ line for each chromosome / contig in the genome
        const Genome::Contig *contigs = context.genome->getContigs();
        int numContigs = context.genome->getNumContigs();
        unsigned genomeLen = context.genome->getCountOfBases();
        for (int i = 0; i < numContigs; i++) {
            unsigned start = contigs[i].beginningOffset;
            unsigned end = ((i + 1 < numContigs) ? contigs[i+1].beginningOffset : genomeLen) - context.genome->getChromosomePadding();
            bytesConsumed += snprintf(header + bytesConsumed, headerBufferSize - bytesConsumed, "@SQ\tSN:%s\tLN:%u\n", contigs[i].name, end - start);

            if (bytesConsumed >= headerBufferSize) {
                fprintf(stderr,"SAMWriter: header buffer too small\n");
                return false;
            }
        }
    }
#endif // SKIP_SQ_LINES

    *headerActualSize = bytesConsumed;
    return true;
}
Beispiel #6
0
    bool
SAMReader::parseHeader(
    const char *fileName, 
    char *firstLine, 
    char *endOfBuffer, 
    const Genome *genome, 
    _int64 *o_headerSize,
    bool *o_headerMatchesIndex)
{
    char *nextLineToProcess = firstLine;
    *o_headerMatchesIndex = true;
    int numSQLines = 0;
    while (NULL != nextLineToProcess && nextLineToProcess < endOfBuffer && '@' == *nextLineToProcess) {
        if (!strncmp("@SQ",nextLineToProcess,3)) {
            //
            // These lines represent sequences in the reference genome, what are
            // called "contigs" in the Genome class.  (Roughly, chromosomes or major
            // variants like some versions of the MHC genes on chr6; or more
            // particularly the things that come in different FASTA files from the
            // reference assembly).
            //
            // Verify that they actually match what's in our reference genome.
            //
            numSQLines++;
            if (nextLineToProcess + 3 >= endOfBuffer || ' ' != nextLineToProcess[3] && '\t' != nextLineToProcess[3]) {
                fprintf(stderr,"Malformed SAM file '%s' has @SQ without a following space or tab.\n",fileName);
                return false;
            }

            char *snStart = nextLineToProcess + 4;
            while (snStart < endOfBuffer && strncmp(snStart,"SN:",__min(3,endOfBuffer-snStart)) && *snStart != '\n' && *snStart != 0) {
                snStart++;
            }

            if (snStart >= endOfBuffer || *snStart == '\n' || *snStart == 0) {
                fprintf(stderr,"Malformed @SQ line doesn't have 'SN:' in file '%s'\n",fileName);
                return false;
            }

            const size_t contigNameBufferSize = 512;
            char contigName[contigNameBufferSize];
            for (unsigned i = 0; i < contigNameBufferSize && snStart+3+i < endOfBuffer; i++) {
                if (snStart[3+i] == ' ' || snStart[3+i] == '\t' || snStart[3+i] == '\n' || snStart[3+i] == 0) {
                    contigName[i] = '\0';
                } else {
                    contigName[i] = snStart[3+i];
                }
            }
            contigName[contigNameBufferSize - 1] = '\0';

            if (genome == NULL || !genome->getOffsetOfContig(contigName,NULL)) {
                *o_headerMatchesIndex = false;
            }
        } else if (!strncmp("@HD",nextLineToProcess,3) || !strncmp("@RG",nextLineToProcess,3) || !strncmp("@PG",nextLineToProcess,3) ||
            !strncmp("@CO",nextLineToProcess,3)) {
            //
            // Ignore these lines.
            //
        } else {
            fprintf(stderr,"Unrecognized header line in SAM file.\n");
            return false;
        }
		char * p = strnchr(nextLineToProcess,'\n',endOfBuffer-nextLineToProcess);
		if (p == NULL) {
            // no newline, look for null to truncate buffer
            p = (char*) memchr(nextLineToProcess, 0, endOfBuffer - nextLineToProcess);
            nextLineToProcess = p != NULL ? p + 1 : endOfBuffer;
            break;
		}
        nextLineToProcess = p + 1;
    }

    *o_headerMatchesIndex &= genome != NULL && numSQLines == genome->getNumContigs();
	*o_headerSize = nextLineToProcess - firstLine;
    return true;
}