void WorkerThreadMain(void *param) { ThreadContext *context = (ThreadContext *)param; _int64 rangeStart, rangeLength; SAMReader *samReader = NULL; ReaderContext rcontext; rcontext.clipping = NoClipping; rcontext.genome = genome; rcontext.paired = false; rcontext.defaultReadGroup = ""; while (rangeSplitter->getNextRange(&rangeStart, &rangeLength)) { if (NULL == samReader) { samReader = SAMReader::create(DataSupplier::Default[true], inputFileName, rcontext, rangeStart, rangeLength); } else { ((ReadReader *)samReader)->reinit(rangeStart, rangeLength); } AlignmentResult alignmentResult; unsigned genomeLocation; Direction isRC; unsigned mapQ; unsigned flag; const char *cigar; unsigned nextFileToWrite = 0; Read read; LandauVishkinWithCigar lv; while (samReader->getNextRead(&read, &alignmentResult, &genomeLocation, &isRC, &mapQ, &flag, &cigar)) { if (mapQ < 0 || mapQ > MaxMAPQ) { fprintf(stderr,"Invalid MAPQ: %d\n",mapQ); exit(1); } if (0xffffffff == genomeLocation) { context->nUnaligned++; } else { if (flag & SAM_REVERSE_COMPLEMENT) { read.becomeRC(); } const Genome::Piece *piece = genome->getPieceAtLocation(genomeLocation); if (NULL == piece) { fprintf(stderr,"couldn't find genome piece for offset %u\n",genomeLocation); exit(1); } unsigned offsetA, offsetB; bool matched; const unsigned cigarBufLen = 1000; char cigarForAligned[cigarBufLen]; const char *alignedGenomeData = genome->getSubstring(genomeLocation, 1); int editDistance = lv.computeEditDistance(alignedGenomeData, read.getDataLength() + 20, read.getData(), read.getDataLength(), 30, cigarForAligned, cigarBufLen, false); if (editDistance == -1 || editDistance > MaxEditDistance) { editDistance = MaxEditDistance; } // // Parse the read ID. The format is ChrName_OffsetA_OffsetB_?:<more stuff>. This would be simple to parse, except that // ChrName can include "_". So, we parse it by looking for the first : and then working backward. // char idBuffer[10000]; // Hopefully big enough. I'm not worried about malicious input data here. memcpy(idBuffer,read.getId(),read.getIdLength()); idBuffer[read.getIdLength()] = 0; const char *firstColon = strchr(idBuffer,':'); bool badParse = true; size_t chrNameLen; const char *beginningOfSecondNumber; const char *beginningOfFirstNumber; int stage = 0; unsigned offsetOfCorrectChromosome; if (NULL != firstColon && firstColon - 3 > idBuffer && (*(firstColon-1) == '?' || isADigit(*(firstColon - 1)))) { // // We've parsed backwards to see that we have at least #: or ?: where '#' is a digit and ? is literal. If it's // a digit, then scan backwards through that number. // const char *underscoreBeforeFirstColon = firstColon - 2; while (underscoreBeforeFirstColon > idBuffer && isADigit(*underscoreBeforeFirstColon)) { underscoreBeforeFirstColon--; } if (*underscoreBeforeFirstColon == '_' && (isADigit(*(underscoreBeforeFirstColon - 1)) || *(underscoreBeforeFirstColon - 1) == '_')) { stage = 1; if (isADigit(*(underscoreBeforeFirstColon - 1))) { beginningOfSecondNumber = firstColon - 3; while (beginningOfSecondNumber > idBuffer && isADigit(*beginningOfSecondNumber)) { beginningOfSecondNumber--; } beginningOfSecondNumber++; // That loop actually moved us back one char before the beginning; } else { // // There's only one number, we have two consecutive underscores. // beginningOfSecondNumber = underscoreBeforeFirstColon; } if (beginningOfSecondNumber - 2 > idBuffer && *(beginningOfSecondNumber - 1) == '_' && isADigit(*(beginningOfSecondNumber - 2))) { stage = 2; beginningOfFirstNumber = beginningOfSecondNumber - 2; while (beginningOfFirstNumber > idBuffer && isADigit(*beginningOfFirstNumber)) { beginningOfFirstNumber--; } beginningOfFirstNumber++; // Again, we went one too far. offsetA = -1; offsetB = -1; if (*(beginningOfFirstNumber - 1) == '_' && 1 == sscanf(beginningOfFirstNumber,"%u",&offsetA) && ('_' == *beginningOfSecondNumber || 1 == sscanf(beginningOfSecondNumber,"%u", &offsetB))) { stage = 3; chrNameLen = (beginningOfFirstNumber - 1) - idBuffer; char correctChromosomeName[1000]; memcpy(correctChromosomeName, idBuffer, chrNameLen); correctChromosomeName[chrNameLen] = '\0'; if (!genome->getOffsetOfPiece(correctChromosomeName, &offsetOfCorrectChromosome)) { fprintf(stderr, "Couldn't parse chromosome name '%s' from read id\n", correctChromosomeName); } else { badParse = false; } } } } if (badParse) { fprintf(stderr,"Unable to parse read ID '%s', perhaps this isn't simulated data. piecelen = %d, pieceName = '%s', piece offset = %u, genome offset = %u\n", idBuffer, strlen(piece->name), piece->name, piece->beginningOffset, genomeLocation); exit(1); } bool match0 = false; bool match1 = false; if (-1 == offsetA || -1 == offsetB) { matched = false; } else if(strncmp(piece->name, idBuffer, __min(read.getIdLength(), chrNameLen))) { matched = false; } else { if (isWithin(offsetA, genomeLocation - piece->beginningOffset, 50)) { matched = true; match0 = true; } else if (isWithin(offsetB, genomeLocation - piece->beginningOffset, 50)) { matched = true; match1 = true; } else { matched = false; if (flag & SAM_FIRST_SEGMENT) { match0 = true; } else { match1 = true; } } } context->countOfReads[mapQ]++; context->countOfReadsByEditDistance[mapQ][editDistance]++; if (!matched) { context->countOfMisalignments[mapQ]++; context->countOfMisalignmentsByEditDistance[mapQ][editDistance]++; if (70 == mapQ || 69 == mapQ) { // // We don't know which offset is correct, because neither one matched. Just take the one with the lower edit distance. // unsigned correctLocationA = offsetOfCorrectChromosome + offsetA; unsigned correctLocationB = offsetOfCorrectChromosome + offsetB; unsigned correctLocation = 0; const char *correctData = NULL; const char *dataA = genome->getSubstring(correctLocationA, 1); const char *dataB = genome->getSubstring(correctLocationB, 1); int distanceA, distanceB; char cigarA[cigarBufLen]; char cigarB[cigarBufLen]; cigarA[0] = '*'; cigarA[1] = '\0'; cigarB[0] = '*'; cigarB[1] = '\0'; if (dataA == NULL) { distanceA = -1; } else { distanceA = lv.computeEditDistance(dataA, read.getDataLength() + 20, read.getData(), read.getDataLength(), 30, cigarA, cigarBufLen, false); } if (dataB == NULL) { distanceB = -1; } else { distanceB = lv.computeEditDistance(dataB, read.getDataLength() + 20, read.getData(), read.getDataLength(), 30, cigarB, cigarBufLen, false); } const char *correctGenomeData; char *cigarForCorrect; if (distanceA != -1 && distanceA <= distanceB || distanceB == -1) { correctGenomeData = dataA; correctLocation = correctLocationA; cigarForCorrect = cigarA; } else { correctGenomeData = dataB; correctLocation = correctLocationB; cigarForCorrect = cigarB; } printf("%s\t%d\t%s\t%u\t%d\t%s\t*\t*\t100\t%.*s\t%.*s\tAlignedGenomeLocation:%u\tCorrectGenomeLocation: %u\tCigarForCorrect: %s\tCorrectData: %.*s\tAlignedData: %.*s\n", idBuffer, flag, piece->name, genomeLocation - piece->beginningOffset, mapQ, cigarForAligned, read.getDataLength(), read.getData(), read.getDataLength(), read.getQuality(), genomeLocation, correctLocation, cigarForCorrect, read.getDataLength(), correctGenomeData, read.getDataLength(), alignedGenomeData); } } } } // if it was mapped } // for each read from the sam reader } if (0 == InterlockedAdd64AndReturnNewValue(&nRunningThreads, -1)) { SignalSingleWaiterObject(&allThreadsDone); } }
void SingleAlignerContext::runIterationThread() { PreventMachineHibernationWhileThisThreadIsAlive(); ReadSupplier *supplier = readSupplierGenerator->generateNewReadSupplier(); if (NULL == supplier) { // // No work for this thread to do. // return; } if (extension->runIterationThread(supplier, this)) { delete supplier; return; } if (index == NULL) { // no alignment, just input/output Read *read; while (NULL != (read = supplier->getNextRead())) { stats->totalReads++; SingleAlignmentResult result; result.status = NotFound; result.direction = FORWARD; result.mapq = 0; result.score = 0; result.location = InvalidGenomeLocation; if (options->passFilter(read, NotFound, read->getDataLength() < minReadLength || read->countOfNs() > maxDist, false)) { stats->notFound++; if (NULL != readWriter) { readWriter->writeReads(readerContext, read, &result, 1, true); } } else { stats->filtered++; } } delete supplier; return; } int maxReadSize = MAX_READ_LENGTH; SingleAlignmentResult *alignmentResults = NULL; unsigned alignmentResultBufferCount; if (maxSecondaryAlignmentAdditionalEditDistance < 0) { alignmentResultBufferCount = 1; // For the primary alignment } else { alignmentResultBufferCount = BaseAligner::getMaxSecondaryResults(numSeedsFromCommandLine, seedCoverage, maxReadSize, maxHits, index->getSeedLength()) + 1; // +1 for the primary alignment } size_t alignmentResultBufferSize = sizeof(*alignmentResults) * (alignmentResultBufferCount + 1); // +1 is for primary result BigAllocator *allocator = new BigAllocator(BaseAligner::getBigAllocatorReservation(index, true, maxHits, maxReadSize, index->getSeedLength(), numSeedsFromCommandLine, seedCoverage, maxSecondaryAlignmentsPerContig) + alignmentResultBufferSize); BaseAligner *aligner = new (allocator) BaseAligner( index, maxHits, maxDist, maxReadSize, numSeedsFromCommandLine, seedCoverage, minWeightToCheck, extraSearchDepth, noUkkonen, noOrderedEvaluation, noTruncation, maxSecondaryAlignmentsPerContig, NULL, // LV (no need to cache in the single aligner) NULL, // reverse LV stats, allocator); alignmentResults = (SingleAlignmentResult *)allocator->allocate(alignmentResultBufferSize); allocator->checkCanaries(); aligner->setExplorePopularSeeds(options->explorePopularSeeds); aligner->setStopOnFirstHit(options->stopOnFirstHit); #ifdef _MSC_VER if (options->useTimingBarrier) { if (0 == InterlockedDecrementAndReturnNewValue(nThreadsAllocatingMemory)) { AllowEventWaitersToProceed(memoryAllocationCompleteBarrier); } else { WaitForEvent(memoryAllocationCompleteBarrier); } } #endif // _MSC_VER // Align the reads. Read *read; _uint64 lastReportTime = timeInMillis(); _uint64 readsWhenLastReported = 0; while (NULL != (read = supplier->getNextRead())) { stats->totalReads++; if (AlignerOptions::useHadoopErrorMessages && stats->totalReads % 10000 == 0 && timeInMillis() - lastReportTime > 10000) { fprintf(stderr,"reporter:counter:SNAP,readsAligned,%lu\n",stats->totalReads - readsWhenLastReported); readsWhenLastReported = stats->totalReads; lastReportTime = timeInMillis(); } // Skip the read if it has too many Ns or trailing 2 quality scores. if (read->getDataLength() < minReadLength || read->countOfNs() > maxDist) { if (!options->passFilter(read, NotFound, true, false)) { stats->filtered++; } else { if (NULL != readWriter) { SingleAlignmentResult result; result.status = NotFound; result.location = InvalidGenomeLocation; result.mapq = 0; result.direction = FORWARD; readWriter->writeReads(readerContext, read, &result, 1, true); } stats->uselessReads++; } continue; } #if TIME_HISTOGRAM _int64 startTime = timeInNanos(); #endif // TIME_HISTOGRAM int nSecondaryResults = 0; #ifdef LONG_READS int oldMaxK = aligner->getMaxK(); if (options->maxDistFraction > 0.0) { aligner->setMaxK(min(MAX_K, (int)(read->getDataLength() * options->maxDistFraction))); } #endif aligner->AlignRead(read, alignmentResults, maxSecondaryAlignmentAdditionalEditDistance, alignmentResultBufferCount - 1, &nSecondaryResults, maxSecondaryAlignments, alignmentResults + 1); #ifdef LONG_READS aligner->setMaxK(oldMaxK); #endif #if TIME_HISTOGRAM _int64 runTime = timeInNanos() - startTime; int timeBucket = min(30, cheezyLogBase2(runTime)); stats->countByTimeBucket[timeBucket]++; stats->nanosByTimeBucket[timeBucket] += runTime; #endif // TIME_HISTOGRAM allocator->checkCanaries(); bool containsPrimary = true; if (NULL != readWriter) { // // Remove any reads that don't pass the filter, then send the remainder down to the writer. // for (int i = 0; i <= nSecondaryResults; i++) { if (!options->passFilter(read, alignmentResults[i].status, false, i != 0 || !containsPrimary)) { if (i == 0) { containsPrimary = false; } // // Copy the last result here. // alignmentResults[i] = alignmentResults[nSecondaryResults]; nSecondaryResults--; // // And back up so it gets checked. // i--; } } // For each result stats->extraAlignments += nSecondaryResults + (containsPrimary ? 0 : 1); // If it doesn't contain the primary, then it's a secondary. readWriter->writeReads(readerContext, read, alignmentResults, nSecondaryResults + 1, containsPrimary); } if (containsPrimary) { updateStats(stats, read, alignmentResults[0].status, alignmentResults[0].score, alignmentResults[0].mapq); } else { stats->filtered++; } } aligner->~BaseAligner(); // This calls the destructor without calling operator delete, allocator owns the memory. if (supplier != NULL) { delete supplier; } delete allocator; // This is what actually frees the memory. }