// aligns the read archive void CMosaikAligner::AlignReadArchive(MosaikReadFormat::CReadReader& in, MosaikReadFormat::CAlignmentWriter& out, unsigned int* pRefBegin, unsigned int* pRefEnd, char** pBsRefSeqs) { // ============== // initialization // ============== // retrieve the concatenated reference sequence length /* vector<ReferenceSequence> referenceSequences; MosaikReadFormat::CReferenceSequenceReader refseq; refseq.Open(mSettings.ReferenceFilename); refseq.GetReferenceSequences(referenceSequences); mReferenceLength = refseq.GetReferenceSequenceLength(); const unsigned int numRefSeqs = refseq.GetNumReferenceSequences(); // retrieve the basespace reference filenames char** pBsRefSeqs = NULL; if(mFlags.EnableColorspace) { cout << "- loading basespace reference sequences... "; cout.flush(); MosaikReadFormat::CReferenceSequenceReader bsRefSeq; bsRefSeq.Open(mSettings.BasespaceReferenceFilename); if(!bsRefSeq.HasSameReferenceSequences(referenceSequences)) { printf("ERROR: The basespace and colorspace reference sequence archives do not seem to represent the same FASTA file.\n"); exit(1); } bsRefSeq.CopyReferenceSequences(pBsRefSeqs); bsRefSeq.Close(); cout << "finished." << endl; } // initialize our hash tables InitializeHashTables(CalculateHashTableSize(mReferenceLength, mSettings.HashSize)); // hash the concatenated reference sequence if(!mFlags.IsUsingJumpDB) HashReferenceSequence(refseq); cout << "- loading reference sequence... "; cout.flush(); refseq.LoadConcatenatedSequence(mReference); cout << "finished." << endl; refseq.Close(); // create our reference sequence LUTs unsigned int* pRefBegin = new unsigned int[numRefSeqs]; unsigned int* pRefEnd = new unsigned int[numRefSeqs]; for(unsigned int j = 0; j < numRefSeqs; j++) { pRefBegin[j] = referenceSequences[j].Begin; pRefEnd[j] = referenceSequences[j].End; } // set the hash positions threshold if(mFlags.IsUsingHashPositionThreshold && (mAlgorithm == CAlignmentThread::AlignerAlgorithm_ALL)) mpDNAHash->RandomizeAndTrimHashPositions(mSettings.HashPositionThreshold); // localize the read archive filenames string inputReadArchiveFilename = mSettings.InputReadArchiveFilename; string outputReadArchiveFilename = mSettings.OutputReadArchiveFilename; // define our read format reader and writer MosaikReadFormat::CReadReader in; in.Open(inputReadArchiveFilename); MosaikReadFormat::ReadGroup readGroup = in.GetReadGroup(); */ ReadStatus readStatus = in.GetStatus(); /* mSettings.SequencingTechnology = readGroup.SequencingTechnology; mSettings.MedianFragmentLength = readGroup.MedianFragmentLength; */ const bool isPairedEnd = (readStatus == RS_PAIRED_END_READ ? true : false); /* vector<MosaikReadFormat::ReadGroup> readGroups; readGroups.push_back(readGroup); // set the alignment status flags AlignmentStatus alignmentStatus = AS_UNSORTED_READ | readStatus; if(mMode == CAlignmentThread::AlignerMode_ALL) alignmentStatus |= AS_ALL_MODE; else alignmentStatus |= AS_UNIQUE_MODE; MosaikReadFormat::CAlignmentWriter out; out.Open(mSettings.OutputReadArchiveFilename.c_str(), referenceSequences, readGroups, alignmentStatus); */ // open the unaligned read report file FILE* unalignedStream = NULL; if(mFlags.IsReportingUnalignedReads) { if(fopen_s(&unalignedStream, mSettings.UnalignedReadReportFilename.c_str(), "wb") != 0) { cout << "ERROR: Unable to open the unaligned read FASTQ file for output." << endl; exit(1); } } // localize our read and reference counts. Initialize our statistical counters uint64_t numReadArchiveReads = in.GetNumReads(); uint64_t readCounter = 0; // initialize our threads pthread_t* activeThreads = new pthread_t[mSettings.NumThreads]; CAlignmentThread::ThreadData td; td.Algorithm = mAlgorithm; td.ReferenceLen = mReferenceLength; td.Filters = mFilters; td.SplitFilters = mSplitFilters; td.Flags = mFlags; td.Mode = mMode; td.pReference = mReference; td.pCounters = &mStatisticsCounters; td.pDnaHash = mpDNAHash; td.pIn = ∈ td.pOut = &out; td.pUnalignedStream = unalignedStream; td.pRefBegin = pRefBegin; td.pRefEnd = pRefEnd; td.Settings = mSettings; td.pReadCounter = &readCounter; td.IsPairedEnd = isPairedEnd; td.pBsRefSeqs = pBsRefSeqs; // unenable EnableColorspace flag for low-memory algorithm, deal with the SOLiD convertion when sorting the aligned archives //if ( mFlags.UseLowMemory ) // td.Flags.EnableColorspace = false; pthread_attr_t attr; pthread_attr_init(&attr); pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); pthread_mutex_init(&CAlignmentThread::mGetReadMutex, NULL); pthread_mutex_init(&CAlignmentThread::mReportUnalignedMate1Mutex, NULL); pthread_mutex_init(&CAlignmentThread::mReportUnalignedMate2Mutex, NULL); pthread_mutex_init(&CAlignmentThread::mSaveReadMutex, NULL); pthread_mutex_init(&CAlignmentThread::mStatisticsMutex, NULL); pthread_mutex_init(&CAbstractDnaHash::mJumpCacheMutex, NULL); pthread_mutex_init(&CAbstractDnaHash::mJumpKeyMutex, NULL); pthread_mutex_init(&CAbstractDnaHash::mJumpPositionMutex, NULL); // =========================== // start our alignment threads // =========================== // initialize our progress bar if ( !mFlags.UseLowMemory ) { CConsole::Heading(); cout << endl; } cout << "Aligning read library (" << numReadArchiveReads << "):" << endl; if ( !mFlags.UseLowMemory ) CConsole::Reset(); CProgressBar<uint64_t>::StartThread(&readCounter, 0, numReadArchiveReads, "reads"); // create our threads for(unsigned int i = 0; i < mSettings.NumThreads; i++) pthread_create(&activeThreads[i], &attr, CAlignmentThread::StartThread, (void*)&td); pthread_attr_destroy(&attr); CBenchmark alignmentBench; alignmentBench.Start(); // wait for the threads to complete void* status = NULL; for(unsigned int i = 0; i < mSettings.NumThreads; i++) pthread_join(activeThreads[i], &status); // wait for the progress bar to finish CProgressBar<uint64_t>::WaitThread(); alignmentBench.Stop(); // free up some memory //delete [] mReference; delete [] activeThreads; activeThreads = NULL; //if(pRefBegin) delete [] pRefBegin; //if(pRefEnd) delete [] pRefEnd; //if(pBsRefSeqs) { // for(unsigned int i = 0; i < numRefSeqs; ++i) delete [] pBsRefSeqs[i]; // delete [] pBsRefSeqs; //} // close open file streams //in.Close(); // solid references should be one-base longer after converting back to basespace //if(mFlags.EnableColorspace) out.AdjustSolidReferenceBases(); //out.Close(); if(mFlags.IsReportingUnalignedReads) fclose(unalignedStream); //if(mFlags.IsUsingJumpDB) mpDNAHash->FreeMemory(); // ==================== // print our statistics // ==================== /* const uint64_t totalMates = mStatisticsCounters.ShortMates + mStatisticsCounters.FailedHashMates + mStatisticsCounters.UniqueMates + mStatisticsCounters.NonUniqueMates + mStatisticsCounters.FilteredOutMates; const uint64_t totalAlignedMates = mStatisticsCounters.UniqueMates + mStatisticsCounters.NonUniqueMates; const uint64_t totalAlignedReads = mStatisticsCounters.AlignedReads; // print our alignment statistics (mates) if don't enable low-memory algorithm if ( !mFlags.UseLowMemory ) { printf("\n"); CConsole::Heading(); printf("Alignment statistics (mates):\n"); CConsole::Reset(); printf("===================================\n"); if(mStatisticsCounters.ShortMates > 0) printf("# too short: %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.ShortMates, (mStatisticsCounters.ShortMates / (double)totalMates) * 100.0); if(mStatisticsCounters.FailedHashMates > 0) printf("# failed hash: %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.FailedHashMates, (mStatisticsCounters.FailedHashMates / (double)totalMates) * 100.0); if(mStatisticsCounters.FilteredOutMates > 0) printf("# filtered out: %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.FilteredOutMates, (mStatisticsCounters.FilteredOutMates / (double)totalMates) * 100.0); if(mStatisticsCounters.UniqueMates > 0) printf("# unique: %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.UniqueMates, (mStatisticsCounters.UniqueMates / (double)totalMates) * 100.0); if(mStatisticsCounters.NonUniqueMates > 0) printf("# non-unique: %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.NonUniqueMates, (mStatisticsCounters.NonUniqueMates / (double)totalMates) * 100.0); printf("-----------------------------------\n"); printf("total: %9llu\n", (unsigned long long)totalMates); printf("total aligned: "); CConsole::Bold(); printf("%9llu", (unsigned long long)totalAlignedMates); CConsole::Reset(); printf(" ("); CConsole::Bold(); printf("%5.1f %%", (totalAlignedMates / (double)totalMates) * 100.0); CConsole::Reset(); printf(")\n"); // print our local alignment search statistics if(mFlags.UseLocalAlignmentSearch) { printf("\n"); CConsole::Heading(); printf("Local alignment search statistics:\n"); CConsole::Reset(); printf("===================================\n"); double rescuedAlignmentsPercent = mStatisticsCounters.AdditionalLocalMates / (double)totalMates * 100.0; printf("rescued mates: %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.AdditionalLocalMates, rescuedAlignmentsPercent); } // print our alignment statistics (reads) if(isPairedEnd) { printf("\n"); CConsole::Heading(); printf("Alignment statistics (reads):\n"); CConsole::Reset(); printf("============================================\n"); printf("# unaligned: %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.UnalignedReads, (mStatisticsCounters.UnalignedReads / (double)numReadArchiveReads) * 100.0); printf("# orphaned: %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.OrphanedReads, (mStatisticsCounters.OrphanedReads / (double)numReadArchiveReads) * 100.0); printf("# both mates unique: %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.BothUniqueReads, (mStatisticsCounters.BothUniqueReads / (double)numReadArchiveReads) * 100.0); printf("# one mate non-unique: %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.OneNonUniqueReads, (mStatisticsCounters.OneNonUniqueReads / (double)numReadArchiveReads) * 100.0); printf("# both mates non-unique: %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.BothNonUniqueReads, (mStatisticsCounters.BothNonUniqueReads / (double)numReadArchiveReads) * 100.0); printf("--------------------------------------------\n"); printf("total reads: "); CConsole::Bold(); printf("%9llu", (unsigned long long)numReadArchiveReads); CConsole::Reset(); printf("\n"); printf("total reads aligned: "); CConsole::Bold(); printf("%9llu", (unsigned long long)totalAlignedReads); CConsole::Reset(); printf(" ("); CConsole::Bold(); printf("%5.1f %%", (totalAlignedReads / (double)numReadArchiveReads) * 100.0); CConsole::Reset(); printf(")\n"); } // print our jump cache statistics if(mFlags.IsUsingJumpDB && (mSettings.NumCachedHashes > 0)) { printf("\n"); CConsole::Heading(); printf("Jump database cache statistics:\n"); CConsole::Reset(); printf("====================================\n"); uint64_t cacheHits = 0, cacheMisses = 0, cacheTotal = 0; CJumpDnaHash* pJump = (CJumpDnaHash*)mpDNAHash; pJump->GetCacheStatistics(cacheHits, cacheMisses); cacheTotal = cacheHits + cacheMisses; double cacheHitsPercent = cacheHits / (double)cacheTotal * 100.0; printf("cache hits: %10llu (%5.1f %%)\n", (unsigned long long)cacheHits, cacheHitsPercent); printf("cache misses: %10llu\n", (unsigned long long)cacheMisses); } printf("\n"); CConsole::Heading(); printf("Miscellaneous statistics:\n"); CConsole::Reset(); printf("==================================\n"); printf("aligned mate bp: %10llu\n", (unsigned long long)mStatisticsCounters.MateBasesAligned); printf("alignment candidates/s: %10.1f\n", mStatisticsCounters.AlignmentCandidates / alignmentBench.GetElapsedWallTime()); } */ }
// print our statistics void CMosaikAligner::PrintStatistics () { MosaikReadFormat::CReadReader in; string inputReadArchiveFilename = mSettings.InputReadArchiveFilename; in.Open(inputReadArchiveFilename); const uint64_t numReadArchiveReads = in.GetNumReads(); ReadStatus readStatus = in.GetStatus(); const bool isPairedEnd = (readStatus == RS_PAIRED_END_READ ? true : false); const uint64_t totalMates = isPairedEnd ? numReadArchiveReads * 2 : numReadArchiveReads; if ( mFlags.UseLowMemory ) { mStatisticsCounters.ShortMates = 0; mStatisticsCounters.FailedHashMates = 0; mStatisticsCounters.UnalignedReads = numReadArchiveReads - mStatisticsCounters.AlignedReads; mStatisticsCounters.FilteredOutMates = mStatisticsCounters.UnalignedReads; } const uint64_t totalAlignedMates = mStatisticsCounters.UniqueMates + mStatisticsCounters.NonUniqueMates; const uint64_t totalAlignedReads = mStatisticsCounters.AlignedReads; // print our alignment statistics (mates) if don't enable low-memory algorithm printf("\n"); CConsole::Heading(); printf("Alignment statistics (mates):\n"); CConsole::Reset(); printf("===================================\n"); if(mStatisticsCounters.ShortMates > 0) printf("# too short: %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.ShortMates, (mStatisticsCounters.ShortMates / (double)totalMates) * 100.0); if(mStatisticsCounters.FailedHashMates > 0) printf("# failed hash: %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.FailedHashMates, (mStatisticsCounters.FailedHashMates / (double)totalMates) * 100.0); if(mStatisticsCounters.FilteredOutMates > 0) printf("# filtered out: %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.FilteredOutMates, (mStatisticsCounters.FilteredOutMates / (double)totalMates) * 100.0); if(mStatisticsCounters.UniqueMates > 0) printf("# unique: %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.UniqueMates, (mStatisticsCounters.UniqueMates / (double)totalMates) * 100.0); if(mStatisticsCounters.NonUniqueMates > 0) printf("# non-unique: %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.NonUniqueMates, (mStatisticsCounters.NonUniqueMates / (double)totalMates) * 100.0); printf("-----------------------------------\n"); printf("total: %9llu\n", (unsigned long long)totalMates); printf("total aligned: "); CConsole::Bold(); printf("%9llu", (unsigned long long)totalAlignedMates); CConsole::Reset(); printf(" ("); CConsole::Bold(); printf("%5.1f %%", (totalAlignedMates / (double)totalMates) * 100.0); CConsole::Reset(); printf(")\n"); // print our local alignment search statistics // we don't print out local alignment information when the low-memory approach is enabled. if( !mFlags.UseLowMemory && mFlags.UseLocalAlignmentSearch ) { printf("\n"); CConsole::Heading(); printf("Local alignment search statistics:\n"); CConsole::Reset(); printf("===================================\n"); double rescuedAlignmentsPercent = mStatisticsCounters.AdditionalLocalMates / (double)totalMates * 100.0; printf("rescued mates: %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.AdditionalLocalMates, rescuedAlignmentsPercent); } // print our alignment statistics (reads) if(isPairedEnd) { printf("\n"); CConsole::Heading(); printf("Alignment statistics (reads):\n"); CConsole::Reset(); printf("============================================\n"); printf("# unaligned: %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.UnalignedReads, (mStatisticsCounters.UnalignedReads / (double)numReadArchiveReads) * 100.0); printf("# orphaned: %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.OrphanedReads, (mStatisticsCounters.OrphanedReads / (double)numReadArchiveReads) * 100.0); printf("# both mates unique: %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.BothUniqueReads, (mStatisticsCounters.BothUniqueReads / (double)numReadArchiveReads) * 100.0); printf("# one mate non-unique: %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.OneNonUniqueReads, (mStatisticsCounters.OneNonUniqueReads / (double)numReadArchiveReads) * 100.0); printf("# both mates non-unique: %9llu (%5.1f %%)\n", (unsigned long long)mStatisticsCounters.BothNonUniqueReads, (mStatisticsCounters.BothNonUniqueReads / (double)numReadArchiveReads) * 100.0); printf("--------------------------------------------\n"); printf("total reads: "); CConsole::Bold(); printf("%9llu", (unsigned long long)numReadArchiveReads); CConsole::Reset(); printf("\n"); printf("total reads aligned: "); CConsole::Bold(); printf("%9llu", (unsigned long long)totalAlignedReads); CConsole::Reset(); printf(" ("); CConsole::Bold(); printf("%5.1f %%", (totalAlignedReads / (double)numReadArchiveReads) * 100.0); CConsole::Reset(); printf(")\n"); } // print our jump cache statistics if( !mFlags.UseLowMemory && mFlags.IsUsingJumpDB && (mSettings.NumCachedHashes > 0)) { printf("\n"); CConsole::Heading(); printf("Jump database cache statistics:\n"); CConsole::Reset(); printf("====================================\n"); uint64_t cacheHits = 0, cacheMisses = 0, cacheTotal = 0; CJumpDnaHash* pJump = (CJumpDnaHash*)mpDNAHash; pJump->GetCacheStatistics(cacheHits, cacheMisses); cacheTotal = cacheHits + cacheMisses; double cacheHitsPercent = cacheHits / (double)cacheTotal * 100.0; printf("cache hits: %10llu (%5.1f %%)\n", (unsigned long long)cacheHits, cacheHitsPercent); printf("cache misses: %10llu\n", (unsigned long long)cacheMisses); } //if ( !mFlags.UseLowMemory ) { // printf("\n"); // CConsole::Heading(); printf("Miscellaneous statistics:\n"); CConsole::Reset(); // printf("==================================\n"); // printf("aligned mate bp: %10llu\n", (unsigned long long)mStatisticsCounters.MateBasesAligned); // printf("alignment candidates/s: %10.1f\n", mStatisticsCounters.AlignmentCandidates / alignmentBench.GetElapsedWallTime()); //} }
void CMosaikAligner::AlignReadArchiveLowMemory(void) { // ============== // initialization // ============== // retrieve the concatenated reference sequence length // vector<ReferenceSequence> referenceSequences; MosaikReadFormat::CReferenceSequenceReader refseq; refseq.Open(mSettings.ReferenceFilename); refseq.GetReferenceSequences(referenceSequences); mReferenceLength = refseq.GetReferenceSequenceLength(); const unsigned int numRefSeqs = refseq.GetNumReferenceSequences(); refseq.Close(); // retrieve the basespace reference filenames //char** pBsRefSeqs = NULL; if(mFlags.EnableColorspace) { MosaikReadFormat::CReferenceSequenceReader bsRefSeq; bsRefSeq.Open(mSettings.BasespaceReferenceFilename); if(!bsRefSeq.HasSameReferenceSequences(referenceSequences)) { printf("ERROR: The basespace and colorspace reference sequence archives do not seem to represent the same FASTA file.\n"); exit(1); } bsRefSeq.Close(); } // initialize our hash tables //InitializeHashTables(CalculateHashTableSize(mReferenceLength, mSettings.HashSize)); // hash the concatenated reference sequence //if(!mFlags.IsUsingJumpDB) { // InitializeHashTables(CalculateHashTableSize(mReferenceLength, mSettings.HashSize), 0, 0, 0); // HashReferenceSequence(refseq); //} //cout << "- loading reference sequence... "; //cout.flush(); //refseq.LoadConcatenatedSequence(mReference); //cout << "finished." << endl; // create our reference sequence LUTs //unsigned int* pRefBegin = new unsigned int[numRefSeqs]; //unsigned int* pRefEnd = new unsigned int[numRefSeqs]; // //for(unsigned int j = 0; j < numRefSeqs; j++) { // pRefBegin[j] = referenceSequences[j].Begin; // pRefEnd[j] = referenceSequences[j].End; //} string inputReadArchiveFilename = mSettings.InputReadArchiveFilename; if ( !mFlags.UseLowMemory ) { // prepare BS reference sequence for SOLiD data char** pBsRefSeqs = NULL; if(mFlags.EnableColorspace) { cout << "- loading basespace reference sequences... "; cout.flush(); MosaikReadFormat::CReferenceSequenceReader bsRefSeq; bsRefSeq.Open(mSettings.BasespaceReferenceFilename); bsRefSeq.CopyReferenceSequences(pBsRefSeqs); bsRefSeq.Close(); cout << "finished." << endl; } // prepare reference sequence refseq.Open(mSettings.ReferenceFilename); cout << "- loading reference sequence... "; cout.flush(); refseq.LoadConcatenatedSequence(mReference); cout << "finished." << endl; refseq.Close(); unsigned int* pRefBegin = new unsigned int[numRefSeqs]; unsigned int* pRefEnd = new unsigned int[numRefSeqs]; for(unsigned int j = 0; j < numRefSeqs; j++) { pRefBegin[j] = referenceSequences[j].Begin; pRefEnd[j] = referenceSequences[j].End; } // initialize our hash tables if(!mFlags.IsUsingJumpDB) { InitializeHashTables(CalculateHashTableSize(mReferenceLength, mSettings.HashSize), 0, 0, 0, mFlags.UseLowMemory, 0); HashReferenceSequence(refseq); } else { InitializeHashTables(CalculateHashTableSize(mReferenceLength, mSettings.HashSize), pRefBegin[0], pRefEnd[numRefSeqs - 1], 0, mFlags.UseLowMemory, 0); mpDNAHash->LoadKeysNPositions(); } // set the hash positions threshold if(mFlags.IsUsingHashPositionThreshold && (mAlgorithm == CAlignmentThread::AlignerAlgorithm_ALL)) mpDNAHash->RandomizeAndTrimHashPositions(mSettings.HashPositionThreshold); // localize the read archive filenames string outputReadArchiveFilename = mSettings.OutputReadArchiveFilename; // define our read format reader and writer MosaikReadFormat::CReadReader in; in.Open(inputReadArchiveFilename); MosaikReadFormat::ReadGroup readGroup = in.GetReadGroup(); ReadStatus readStatus = in.GetStatus(); mSettings.SequencingTechnology = readGroup.SequencingTechnology; mSettings.MedianFragmentLength = readGroup.MedianFragmentLength; vector<MosaikReadFormat::ReadGroup> readGroups; readGroups.push_back(readGroup); // set the alignment status flags AlignmentStatus alignmentStatus = AS_UNSORTED_READ | readStatus; if(mMode == CAlignmentThread::AlignerMode_ALL) alignmentStatus |= AS_ALL_MODE; else alignmentStatus |= AS_UNIQUE_MODE; MosaikReadFormat::CAlignmentWriter out; out.Open(mSettings.OutputReadArchiveFilename.c_str(), referenceSequences, readGroups, alignmentStatus, ALIGNER_SIGNATURE); AlignReadArchive(in, out, pRefBegin, pRefEnd, pBsRefSeqs); // close open file streams in.Close(); // solid references should be one-base longer after converting back to basespace if(mFlags.EnableColorspace) out.AdjustSolidReferenceBases(); out.Close(); // free memory if(mFlags.IsUsingJumpDB) mpDNAHash->FreeMemory(); if(pRefBegin) delete [] pRefBegin; if(pRefEnd) delete [] pRefEnd; if(mReference) delete [] mReference; if(pBsRefSeqs) { for(unsigned int i = 0; i < numRefSeqs; ++i) delete [] pBsRefSeqs[i]; delete [] pBsRefSeqs; } pRefBegin = NULL; pRefEnd = NULL; mReference = NULL; pBsRefSeqs = NULL; } else { // grouping reference and store information in referenceGroups vector // vector< pair <unsigned int, unsigned int> > referenceGroups; GroupReferences(); // get hash statistics for adjusting mhp for each reference group and reserve memory vector< unsigned int > nHashs; // the numbers of hash positions in each reference group vector< unsigned int > expectedMemories; // the numbers of hashs in each reference group uint64_t nTotalHash; GetHashStatistics( nHashs, expectedMemories, nTotalHash ); // align reads again per chromosome group for ( unsigned int i = 0; i < referenceGroups.size(); i++) { unsigned int startRef = referenceGroups[i].first; unsigned int endRef = referenceGroups[i].first + referenceGroups[i].second - 1; CConsole::Heading(); if ( referenceGroups[i].second > 1 ) cout << endl << "Aligning chromosome " << startRef + 1 << "-" << endRef + 1 << " (of " << numRefSeqs << "):" << endl; else cout << endl << "Aligning chromosome " << startRef + 1 << " (of " << numRefSeqs << "):" << endl; CConsole::Reset(); // initialize our hash tables // calculate expected memories for jump data unsigned int expectedMemory = nHashs[i] + expectedMemories[i]; // reserve 3% more memory for unexpected usage expectedMemory = expectedMemory * 1.03; InitializeHashTables(0, referenceSequences[startRef].Begin, referenceSequences[endRef].End, referenceSequences[startRef].Begin, mFlags.UseLowMemory, expectedMemory); // set the hash positions threshold if(mFlags.IsUsingHashPositionThreshold && (mAlgorithm == CAlignmentThread::AlignerAlgorithm_ALL)) { double ratio = nHashs[i] / (double)nTotalHash; unsigned int positionThreshold = ceil(ratio * (double)mSettings.HashPositionThreshold); //cout << positionThreshold << endl; mpDNAHash->RandomizeAndTrimHashPositions(positionThreshold); } // load jump data mpDNAHash->LoadKeysNPositions(); // set reference information unsigned int* pRefBegin = new unsigned int[referenceGroups[i].second]; unsigned int* pRefEnd = new unsigned int[referenceGroups[i].second]; for ( unsigned int j = 0; j < referenceGroups[i].second; j++ ){ pRefBegin[j] = referenceSequences[startRef+j].Begin - referenceSequences[startRef].Begin; pRefEnd[j] = referenceSequences[startRef+j].End - referenceSequences[startRef].Begin; } // prepare BS reference sequence for SOLiD data char** pBsRefSeqs = NULL; if(mFlags.EnableColorspace) { cout << "- loading basespace reference sequences... "; cout.flush(); MosaikReadFormat::CReferenceSequenceReader bsRefSeq; bsRefSeq.Open(mSettings.BasespaceReferenceFilename); bsRefSeq.CopyReferenceSequences(pBsRefSeqs, startRef, referenceGroups[i].second); bsRefSeq.Close(); cout << "finished." << endl; } // prepare reference sequence refseq.Open(mSettings.ReferenceFilename); cout << "- loading reference sequence... "; cout.flush(); //refseq.LoadConcatenatedSequence(mReference); refseq.LoadConcatenatedSequence(mReference, startRef, referenceGroups[i].second); refseq.Close(); // trim reference sequence //unsigned int chrLength = referenceSequences[endRef].End - referenceSequences[startRef].Begin + 1; //char* chrReference = new char[ chrLength + 1 ]; //char* mReferencePtr = mReference + referenceSequences[startRef].Begin; //memcpy( chrReference, mReferencePtr, chrLength); //chrReference[chrLength] = 0; //delete [] mReference; //mReference = chrReference; cout << "finished." << endl; // localize the read archive filenames // get a temporary file name string tempFilename; CFileUtilities::GetTempFilename(tempFilename); outputFilenames.push_back(tempFilename); // define our read format reader and writer MosaikReadFormat::CReadReader in; in.Open(inputReadArchiveFilename); MosaikReadFormat::ReadGroup readGroup = in.GetReadGroup(); ReadStatus readStatus = in.GetStatus(); mSettings.SequencingTechnology = readGroup.SequencingTechnology; mSettings.MedianFragmentLength = readGroup.MedianFragmentLength; vector<MosaikReadFormat::ReadGroup> readGroups; readGroups.push_back(readGroup); // set the alignment status flags AlignmentStatus alignmentStatus = AS_UNSORTED_READ | readStatus; if(mMode == CAlignmentThread::AlignerMode_ALL) alignmentStatus |= AS_ALL_MODE; else alignmentStatus |= AS_UNIQUE_MODE; // prepare a new vector for the current chromosome for opening out archive vector<ReferenceSequence> smallReferenceSequences; for ( unsigned int j = 0; j < referenceGroups[i].second; j++ ){ smallReferenceSequences.push_back(referenceSequences[startRef+j]); } MosaikReadFormat::CAlignmentWriter out; out.Open(tempFilename.c_str(), smallReferenceSequences, readGroups, alignmentStatus, ALIGNER_SIGNATURE); out.AdjustPartitionSize(20000/referenceGroups.size()); AlignReadArchive(in, out, pRefBegin, pRefEnd, pBsRefSeqs); // close open file streams in.Close(); // solid references should be one-base longer after converting back to basespace if(mFlags.EnableColorspace) out.AdjustSolidReferenceBases(); out.Close(); // free memory if(mFlags.IsUsingJumpDB) mpDNAHash->FreeMemory(); if(pRefBegin) delete [] pRefBegin; if(pRefEnd) delete [] pRefEnd; if(mReference) delete [] mReference; if(pBsRefSeqs) { for(unsigned int j = 0; j < referenceGroups[i].second; j++) delete [] pBsRefSeqs[j]; delete [] pBsRefSeqs; } pRefBegin = NULL; pRefEnd = NULL; mReference = NULL; pBsRefSeqs = NULL; } } if ( mFlags.UseLowMemory ) MergeArchives(); PrintStatistics(); }