Ejemplo n.º 1
0
//
// Worker thread body.  param is the ThreadContext that this thread accumulates its counts into.
//
// The thread repeatedly asks rangeSplitter for a range of the input SAM file, reads the alignments in that
// range, and for every aligned read:
//   - computes the edit distance between the read and the genome at the aligned location,
//   - parses the "correct" location out of the read ID (ChrName_OffsetA_OffsetB_?:<more stuff>),
//   - bumps the per-MAPQ (and per-MAPQ/edit distance) read and misalignment counters in the context, and
//   - for misaligned reads with MAPQ 69 or 70, prints a diagnostic line to stdout.
//
// When the last running thread finishes (nRunningThreads drops to 0) it signals allThreadsDone.
// Any malformed input terminates the process with exit(1).
//
void
WorkerThreadMain(void *param)
{
    ThreadContext *context = (ThreadContext *)param;

    _int64 rangeStart, rangeLength;

    //
    // NOTE(review): samReader is never released in this function.  Confirm who owns the object returned by
    // SAMReader::create before adding a delete here.
    //
    SAMReader *samReader = NULL;
    ReaderContext rcontext;
    rcontext.clipping = NoClipping;
    rcontext.genome = genome;
    rcontext.paired = false;
    rcontext.defaultReadGroup = "";
    while (rangeSplitter->getNextRange(&rangeStart, &rangeLength)) {
        //
        // Create the reader on the first range and just re-point it at each later range.
        //
        if (NULL == samReader) {
            samReader = SAMReader::create(DataSupplier::Default[true], inputFileName, rcontext, rangeStart, rangeLength);
        } else {
            ((ReadReader *)samReader)->reinit(rangeStart, rangeLength);
        }

        AlignmentResult alignmentResult;
        unsigned genomeLocation;
        Direction isRC;
        unsigned mapQ;
        unsigned flag;
        const char *cigar;
        Read read;
        LandauVishkinWithCigar lv;
        while (samReader->getNextRead(&read, &alignmentResult, &genomeLocation, &isRC, &mapQ, &flag, &cigar)) {

            //
            // mapQ is unsigned, so only the upper bound needs checking.  It is used as an array index below.
            //
            if (mapQ > MaxMAPQ) {
                fprintf(stderr,"Invalid MAPQ: %u\n",mapQ);
                exit(1);
            }

            if (0xffffffff == genomeLocation) {
                context->nUnaligned++;
            } else {
                if (flag & SAM_REVERSE_COMPLEMENT) {
                    read.becomeRC();
                }

                const Genome::Piece *piece = genome->getPieceAtLocation(genomeLocation);
                if (NULL == piece) {
                    fprintf(stderr,"couldn't find genome piece for offset %u\n",genomeLocation);
                    exit(1);
                }

                const unsigned noOffset = (unsigned)-1;     // Marks an offset that wasn't present in the read ID.
                unsigned offsetA = noOffset;
                unsigned offsetB = noOffset;

                //
                // Edit distance between the read and the genome where it was aligned.  A missing genome substring is
                // treated like a failed computation (-1), the same way the correct-location candidates are treated below.
                //
                const unsigned cigarBufLen = 1000;
                char cigarForAligned[cigarBufLen];
                cigarForAligned[0] = '*'; cigarForAligned[1] = '\0';    // So it's printable even if nothing gets computed.
                const char *alignedGenomeData = genome->getSubstring(genomeLocation, 1);
                int editDistance;
                if (NULL == alignedGenomeData) {
                    editDistance = -1;
                } else {
                    editDistance = lv.computeEditDistance(alignedGenomeData, read.getDataLength() + 20, read.getData(), read.getDataLength(), 30, cigarForAligned, cigarBufLen, false);
                }

                //
                // Clamp so that editDistance is always a usable index into the by-edit-distance tables.
                //
                if (editDistance == -1 || editDistance > MaxEditDistance) {
                    editDistance = MaxEditDistance;
                }

                //
                // Parse the read ID.  The format is ChrName_OffsetA_OffsetB_?:<more stuff>.  This would be simple to parse, except that
                // ChrName can include "_".  So, we parse it by looking for the first : and then working backward.
                //
                char idBuffer[10000];

                if (read.getIdLength() >= sizeof(idBuffer)) {
                    //
                    // Refuse to overrun the stack buffer on an oversized ID.
                    //
                    fprintf(stderr,"Read ID is too long (%u characters, limit is %u)\n", (unsigned)read.getIdLength(), (unsigned)sizeof(idBuffer) - 1);
                    exit(1);
                }

                memcpy(idBuffer,read.getId(),read.getIdLength());
                idBuffer[read.getIdLength()] = 0;

                const char *firstColon = strchr(idBuffer,':');
                bool badParse = true;
                size_t chrNameLen = 0;
                unsigned offsetOfCorrectChromosome = 0;

                //
                // NOTE(review): a read whose ID has no ':' (or fails the rest of this test) is skipped without being counted
                // and without an error, because the badParse check lives inside this block.  Confirm that's intended.
                //
                if (NULL != firstColon && firstColon > idBuffer + 3 && (*(firstColon-1) == '?' || isADigit(*(firstColon - 1)))) {
                    //
                    // We've parsed backwards to see that we have at least #: or ?: where '#' is a digit and ? is literal.  If it's
                    // a digit, then scan backwards through that number.
                    //
                    const char *underscoreBeforeFirstColon = firstColon - 2;
                    while (underscoreBeforeFirstColon > idBuffer && isADigit(*underscoreBeforeFirstColon)) {
                        underscoreBeforeFirstColon--;
                    }

                    //
                    // The "> idBuffer" test keeps us from looking at the character before the start of the buffer.
                    //
                    if (underscoreBeforeFirstColon > idBuffer && *underscoreBeforeFirstColon == '_' &&
                        (isADigit(*(underscoreBeforeFirstColon - 1)) || *(underscoreBeforeFirstColon - 1) == '_')) {

                        const char *beginningOfSecondNumber;
                        if (isADigit(*(underscoreBeforeFirstColon - 1))) {
                            //
                            // The second number ends right before the underscore we just found; scan back to its start.
                            //
                            beginningOfSecondNumber = underscoreBeforeFirstColon - 1;
                            while (beginningOfSecondNumber > idBuffer && isADigit(*beginningOfSecondNumber)) {
                                beginningOfSecondNumber--;
                            }
                            beginningOfSecondNumber++; // That loop actually moved us back one char before the beginning;
                        } else {
                            //
                            // There's only one number,  we have two consecutive underscores.
                            //
                            beginningOfSecondNumber = underscoreBeforeFirstColon;
                        }

                        if (beginningOfSecondNumber > idBuffer + 2 && *(beginningOfSecondNumber - 1) == '_' && isADigit(*(beginningOfSecondNumber - 2))) {
                            const char *beginningOfFirstNumber = beginningOfSecondNumber - 2;
                            while (beginningOfFirstNumber > idBuffer && isADigit(*beginningOfFirstNumber)) {
                                beginningOfFirstNumber--;
                            }
                            beginningOfFirstNumber++; // Again, we went one too far.

                            //
                            // offsetB is left at noOffset when the second number is missing (the two-underscore case).
                            //
                            if (*(beginningOfFirstNumber - 1) == '_' && 1 == sscanf(beginningOfFirstNumber,"%u",&offsetA) &&
                                ('_' == *beginningOfSecondNumber || 1 == sscanf(beginningOfSecondNumber,"%u", &offsetB))) {

                                //
                                // Everything before the underscore that precedes the first number is the chromosome name.
                                //
                                chrNameLen = (beginningOfFirstNumber - 1) - idBuffer;
                                char correctChromosomeName[1000];

                                if (chrNameLen >= sizeof(correctChromosomeName)) {
                                    //
                                    // Too long to copy safely; leave badParse set so that we fail below.
                                    //
                                    fprintf(stderr, "Chromosome name in read id is too long (%u characters)\n", (unsigned)chrNameLen);
                                } else {
                                    memcpy(correctChromosomeName, idBuffer, chrNameLen);
                                    correctChromosomeName[chrNameLen] = '\0';

                                    if (!genome->getOffsetOfPiece(correctChromosomeName, &offsetOfCorrectChromosome)) {
                                        fprintf(stderr, "Couldn't parse chromosome name '%s' from read id\n", correctChromosomeName);
                                    } else {
                                        badParse = false;
                                    }
                                }
                            }
                        }
                    }

                    if (badParse) {
                        fprintf(stderr,"Unable to parse read ID '%s', perhaps this isn't simulated data.  piecelen = %d, pieceName = '%s', piece offset = %u, genome offset = %u\n", idBuffer, (int)strlen(piece->name), piece->name, piece->beginningOffset, genomeLocation);
                        exit(1);
                    }

                    //
                    // Decide whether the aligner put the read where the ID says it came from: same chromosome and within
                    // 50 of either offset.
                    //
                    // NOTE(review): strncmp over chrNameLen characters is a prefix comparison, so a piece whose name merely
                    // starts with the ID's chromosome name also passes.  Verify whether an exact match is wanted.
                    //
                    bool matched;
                    if (noOffset == offsetA || noOffset == offsetB) {
                        matched = false;
                    } else if (strncmp(piece->name, idBuffer, __min(read.getIdLength(), chrNameLen))) {
                        matched = false;
                    } else {
                        matched = isWithin(offsetA, genomeLocation - piece->beginningOffset, 50) ||
                                  isWithin(offsetB, genomeLocation - piece->beginningOffset, 50);
                    }

                    context->countOfReads[mapQ]++;
                    context->countOfReadsByEditDistance[mapQ][editDistance]++;

                    if (!matched) {
                        context->countOfMisalignments[mapQ]++;
                        context->countOfMisalignmentsByEditDistance[mapQ][editDistance]++;

                        if (70 == mapQ || 69 == mapQ) {

                            //
                            // We don't know which offset is correct, because neither one matched.  Just take the one with the lower edit distance.
                            //
                            unsigned correctLocationA = offsetOfCorrectChromosome + offsetA;
                            unsigned correctLocationB = offsetOfCorrectChromosome + offsetB;

                            const char *dataA = genome->getSubstring(correctLocationA, 1);
                            const char *dataB = genome->getSubstring(correctLocationB, 1);
                            int distanceA, distanceB;
                            char cigarA[cigarBufLen];
                            char cigarB[cigarBufLen];

                            cigarA[0] = '*'; cigarA[1] = '\0';
                            cigarB[0] = '*'; cigarB[1] = '\0';

                            if (dataA == NULL) {
                                distanceA = -1;
                            } else {
                                distanceA = lv.computeEditDistance(dataA, read.getDataLength() + 20, read.getData(), read.getDataLength(), 30, cigarA, cigarBufLen, false);
                            }

                            if (dataB == NULL) {
                                distanceB = -1;
                            } else {
                                distanceB = lv.computeEditDistance(dataB, read.getDataLength() + 20, read.getData(), read.getDataLength(), 30, cigarB, cigarBufLen, false);
                            }

                            unsigned correctLocation;
                            const char *correctGenomeData;
                            char *cigarForCorrect;

                            //
                            // Take A if it has a usable distance no worse than B's, or if B has no usable distance at all.
                            //
                            if ((distanceA != -1 && distanceA <= distanceB) || distanceB == -1) {
                                correctGenomeData = dataA;
                                correctLocation = correctLocationA;
                                cigarForCorrect = cigarA;
                            } else {
                                correctGenomeData = dataB;
                                correctLocation = correctLocationB;
                                cigarForCorrect = cigarB;
                            }

                            //
                            // Either genome pointer can be NULL here; never hand a NULL to %s.
                            //
                            if (NULL == correctGenomeData) {
                                correctGenomeData = "";
                            }
                            const char *alignedDataToPrint = (NULL == alignedGenomeData) ? "" : alignedGenomeData;

                            printf("%s\t%d\t%s\t%u\t%d\t%s\t*\t*\t100\t%.*s\t%.*s\tAlignedGenomeLocation:%u\tCorrectGenomeLocation: %u\tCigarForCorrect: %s\tCorrectData: %.*s\tAlignedData: %.*s\n", 
                                idBuffer, flag, piece->name, genomeLocation - piece->beginningOffset, mapQ, cigarForAligned, read.getDataLength(), read.getData(), 
                                read.getDataLength(), read.getQuality(),  genomeLocation, correctLocation, cigarForCorrect, read.getDataLength(),
                                correctGenomeData, read.getDataLength(), alignedDataToPrint);
                        }
                    }
                }
            } // if it was mapped
        } // for each read from the sam reader
    }

    //
    // The last thread out wakes up whoever is waiting for all of the workers.
    //
    if (0 == InterlockedAdd64AndReturnNewValue(&nRunningThreads, -1)) {
        SignalSingleWaiterObject(&allThreadsDone);
    }
}
Ejemplo n.º 2
0
    //
    // Body of one single-end alignment worker thread.
    //
    // Gets a ReadSupplier from readSupplierGenerator and processes every read it yields:
    //   - if the extension's runIterationThread returns true, nothing more is done here;
    //   - if there is no index, every read is written out as NotFound (subject to the output filter);
    //   - otherwise each read is aligned with a BaseAligner, the results are filtered and written to readWriter
    //     (when there is one), and stats is updated.
    //
    // The aligner and the result buffer live in a BigAllocator that's created and destroyed here.
    //
    void
SingleAlignerContext::runIterationThread()
{
    PreventMachineHibernationWhileThisThreadIsAlive();

    ReadSupplier *supplier = readSupplierGenerator->generateNewReadSupplier();
    if (NULL == supplier) {
        //
        // No work for this thread to do.
        //
        return;
    }
    if (extension->runIterationThread(supplier, this)) {
        // NOTE(review): presumably "true" means the extension processed the reads itself -- confirm.
        delete supplier;
        return;
    }
    if (index == NULL) {
        // no alignment, just input/output
        Read *read;
        while (NULL != (read = supplier->getNextRead())) {
            stats->totalReads++;
            SingleAlignmentResult result;
            result.status = NotFound;
            result.direction = FORWARD;
            result.mapq = 0;
            result.score = 0;
            result.location = InvalidGenomeLocation;
            if (options->passFilter(read, NotFound, read->getDataLength() < minReadLength || read->countOfNs() > maxDist, false)) {
                stats->notFound++;
                if (NULL != readWriter) {
                    readWriter->writeReads(readerContext, read, &result, 1, true);
                }
            } else {
                stats->filtered++;
            }
        }
        delete supplier;
        return;
    }

    int maxReadSize = MAX_READ_LENGTH;

    //
    // Size the result buffer: slot 0 holds the primary alignment, the rest hold secondary alignments.
    //
    SingleAlignmentResult *alignmentResults = NULL;
    unsigned alignmentResultBufferCount;
    if (maxSecondaryAlignmentAdditionalEditDistance < 0) {
        alignmentResultBufferCount = 1; // For the primary alignment
    } else {
        alignmentResultBufferCount = BaseAligner::getMaxSecondaryResults(numSeedsFromCommandLine, seedCoverage, maxReadSize, maxHits, index->getSeedLength()) + 1; // +1 for the primary alignment
    }
    //
    // NOTE(review): alignmentResultBufferCount already includes the primary result, so the +1 here is one spare slot.
    // It's kept so that the allocation size is unchanged.
    //
    size_t alignmentResultBufferSize = sizeof(*alignmentResults) * (alignmentResultBufferCount + 1);

    BigAllocator *allocator = new BigAllocator(BaseAligner::getBigAllocatorReservation(index, true, maxHits, maxReadSize, index->getSeedLength(), numSeedsFromCommandLine, seedCoverage, maxSecondaryAlignmentsPerContig) 
        + alignmentResultBufferSize);

    BaseAligner *aligner = new (allocator) BaseAligner(
            index,
            maxHits,
            maxDist,
            maxReadSize,
            numSeedsFromCommandLine,
            seedCoverage,
            minWeightToCheck,
            extraSearchDepth,
            noUkkonen,
            noOrderedEvaluation,
            noTruncation,
            maxSecondaryAlignmentsPerContig,
            NULL,               // LV (no need to cache in the single aligner)
            NULL,               // reverse LV
            stats,
            allocator);

    alignmentResults = (SingleAlignmentResult *)allocator->allocate(alignmentResultBufferSize);

    allocator->checkCanaries();

    aligner->setExplorePopularSeeds(options->explorePopularSeeds);
    aligner->setStopOnFirstHit(options->stopOnFirstHit);

#ifdef  _MSC_VER
    if (options->useTimingBarrier) {
        //
        // The last thread to finish allocating releases the others; everyone else waits for it.
        //
        if (0 == InterlockedDecrementAndReturnNewValue(nThreadsAllocatingMemory)) {
            AllowEventWaitersToProceed(memoryAllocationCompleteBarrier);
        } else {
            WaitForEvent(memoryAllocationCompleteBarrier);
        }
    }
#endif  // _MSC_VER

    // Align the reads.
    Read *read;
    _uint64 lastReportTime = timeInMillis();
    _uint64 readsWhenLastReported = 0;

    while (NULL != (read = supplier->getNextRead())) {
        stats->totalReads++;

        //
        // Hadoop progress counter: checked every 10000 reads, emitted if more than 10000 ms have passed since the last one.
        //
        if (AlignerOptions::useHadoopErrorMessages && stats->totalReads % 10000 == 0 && timeInMillis() - lastReportTime > 10000) {
            fprintf(stderr,"reporter:counter:SNAP,readsAligned,%lu\n",stats->totalReads - readsWhenLastReported);
            readsWhenLastReported = stats->totalReads;
            lastReportTime = timeInMillis();
        }

        // Skip the read if it's shorter than minReadLength or has more than maxDist Ns.
        if (read->getDataLength() < minReadLength || read->countOfNs() > maxDist) {
            if (!options->passFilter(read, NotFound, true, false)) {
                stats->filtered++;
            } else {
                if (NULL != readWriter) {
                    //
                    // Write it as unaligned.  Every field set in the no-index path above is set here too, so the
                    // writer never sees an uninitialized score.
                    //
                    SingleAlignmentResult result;
                    result.status = NotFound;
                    result.location = InvalidGenomeLocation;
                    result.mapq = 0;
                    result.score = 0;
                    result.direction = FORWARD;
                    readWriter->writeReads(readerContext, read, &result, 1, true);
                }
                stats->uselessReads++;
            }
            continue;
        }

#if     TIME_HISTOGRAM
        _int64 startTime = timeInNanos();
#endif // TIME_HISTOGRAM

        int nSecondaryResults = 0;

#ifdef LONG_READS
        int oldMaxK = aligner->getMaxK();
        if (options->maxDistFraction > 0.0) {
            aligner->setMaxK(min(MAX_K, (int)(read->getDataLength() * options->maxDistFraction)));
        }
#endif

        //
        // The primary result goes in alignmentResults[0]; secondary results go in the slots after it.
        //
        aligner->AlignRead(read, alignmentResults, maxSecondaryAlignmentAdditionalEditDistance, alignmentResultBufferCount - 1, &nSecondaryResults, maxSecondaryAlignments, alignmentResults + 1);
#ifdef LONG_READS
        aligner->setMaxK(oldMaxK);
#endif

#if     TIME_HISTOGRAM
        _int64 runTime = timeInNanos() - startTime;
        int timeBucket = min(30, cheezyLogBase2(runTime));
        stats->countByTimeBucket[timeBucket]++;
        stats->nanosByTimeBucket[timeBucket] += runTime;
#endif // TIME_HISTOGRAM

        allocator->checkCanaries();

        bool containsPrimary = true;
        if (NULL != readWriter) {
            //
            // Remove any reads that don't pass the filter, then send the remainder down to the writer.
            //
            for (int i = 0; i <= nSecondaryResults; i++) {
                if (!options->passFilter(read, alignmentResults[i].status, false, i != 0 || !containsPrimary)) {
                    if (i == 0) {
                        containsPrimary = false;
                    }
                    //
                    // Copy the last result here.
                    //
                    alignmentResults[i] = alignmentResults[nSecondaryResults];
                    nSecondaryResults--;

                    //
                    // And back up so it gets checked.
                    //
                    i--;
                }
            } // For each result

            stats->extraAlignments += nSecondaryResults + (containsPrimary ? 0 : 1);    // If it doesn't contain the primary, then it's a secondary.
            readWriter->writeReads(readerContext, read, alignmentResults, nSecondaryResults + 1, containsPrimary);

        }

        //
        // Only a primary alignment that survived the filter counts toward the alignment stats.
        //
        if (containsPrimary) {
            updateStats(stats, read, alignmentResults[0].status, alignmentResults[0].score, alignmentResults[0].mapq);
        } else {
            stats->filtered++;
        }
    }

    aligner->~BaseAligner(); // This calls the destructor without calling operator delete, allocator owns the memory.

    delete supplier;    // Known to be non-NULL: we returned early above otherwise.

    delete allocator;   // This is what actually frees the memory.
}