void SearchUsingBacktracker::run() { Timer timer; bool compressIntermediateBwts = true; vector <BwtReaderBase *> inBwt( alphabetSize ); LetterCountEachPile countsPerPile, countsCumulative; RangeStoreExternal r; string bwtPrefix = searchParams_["input"]; string subset_ = ""; vector<string> kmerList; if ( searchParams_["one kmer string"].isSet() ) { kmerList.push_back( searchParams_["one kmer string"] ); } else { assert( searchParams_["kmers input file"].isSet() ); string filename = searchParams_["kmers input file"]; ifstream ifs( filename ); string line; // get 1st word of each line as kmer while ( getline( ifs, line ) ) { istringstream iss( line ); string kmer; iss >> kmer; assert( kmer.size() != 1 && "todo: 1-mer search" ); if ( kmer.size() >= 2 ) kmerList.push_back( kmer ); } } SequenceNumber originalIndex = 0; for ( auto kmer : kmerList ) { std::reverse( kmer.begin(), kmer.end() ); kmerList2.push_back( KmerSearchItem( kmer, 0, 0, originalIndex++ ) ); } std::sort( kmerList2.begin(), kmerList2.end() ); Logger_if( LOG_SHOW_IF_VERY_VERBOSE ) { Logger::out() << "kmerList2:" << endl; for ( auto kmerItem : kmerList2 ) Logger::out() << " " << kmerItem.kmer << endl; } for ( int i( 0 ); i < alphabetSize; i++ ) { stringstream fileNameSS; fileNameSS << bwtPrefix << "-B0" << i; string fileName = fileNameSS.str().c_str(); if ( compressIntermediateBwts == true ) { if ( searchParams_["use indexing"].isSet() ) { Logger_if( LOG_SHOW_IF_VERBOSE ) Logger::out() << "Using indexed BWT file" << endl; inBwt[i] = new BwtReaderRunLengthIndex( fileName ); } else { Logger_if( LOG_SHOW_IF_VERBOSE ) Logger::out() << "Using non-indexed BWT file" << endl; inBwt[i] = new BwtReaderRunLength( fileName ); } } else inBwt[i] = new BwtReaderASCII( fileName ); inBwt[i]->readAndCount( countsPerPile[i] ); } countsCumulative = countsPerPile; for ( int i( 1 ); i < alphabetSize; i++ ) countsCumulative[i] += countsCumulative[i - 1]; Logger_if( LOG_SHOW_IF_VERBOSE ) { cout << "countsPerPile:" << endl; 
countsPerPile.print(); cout << "countsCumulative:" << endl; countsCumulative.print(); } string currentWord_notUsed; int startPosInKmerList2 = 0; int endPosInKmerList2 = 0; for ( int i( 1 ); i < alphabetSize; ++i ) { for ( int j( 1 ); j < alphabetSize; ++j ) { startPosInKmerList2 = endPosInKmerList2; while ( endPosInKmerList2 < ( int )kmerList2.size() && kmerList2[endPosInKmerList2].kmer[0] == alphabet[i] && kmerList2[endPosInKmerList2].kmer[1] == alphabet[j] ) ++endPosInKmerList2; if ( countsPerPile[i].count_[j] != 0 ) { if ( startPosInKmerList2 != endPosInKmerList2 ) r.addRange( KmerSearchRange( currentWord_notUsed, countsCumulative[i - 1].count_[j], countsPerPile[i].count_[j], false , startPosInKmerList2 , endPosInKmerList2 ) , j, i, subset_, 1 ); } else { if ( startPosInKmerList2 != endPosInKmerList2 ) { cout << "Oh nooo, I can't find: " << endl; for ( int k = startPosInKmerList2; k < endPosInKmerList2; ++k ) { string rev = kmerList2[k].kmer; std::reverse( rev.begin(), rev.end() ); cout << " " << kmerList2[k].kmer << " (" << rev << ")" << endl; } } } } // ~for j } // ~for i LetterCount countsSoFar; LetterNumber currentPos; LetterNumber numRanges, numSingletonRanges; r.clear(); for ( int cycle( 1 ); ; ++cycle ) { Logger_if( LOG_LEVEL_NORMAL ) Logger::out() << "cycle: " << cycle << endl; Logger_if( LOG_SHOW_IF_VERBOSE ) { Logger::out() << " time now: " << timer.timeNow(); Logger::out() << " usage: " << timer << endl; } numRanges = 0; numSingletonRanges = 0; r.setCycleNum( cycle ); for ( int i( 1 ); i < alphabetSize; ++i ) { inBwt[i]->rewindFile(); currentPos = 0; countsSoFar.clear(); countsSoFar += countsCumulative[i - 1]; for ( int j( 1 ); j < alphabetSize; ++j ) { r.setPortion( i, j ); OneBwtBackTracker backTracker( inBwt[i], currentPos, r, countsSoFar, subset_, cycle + 1, false, // don't propagate breakpoints until $ sign true // skip-already-processed-intervals deactivated ); KmerSearchIntervalHandler intervalHandler; KmerSearchRange rangeObject; 
backTracker.process( i, currentWord_notUsed, intervalHandler, rangeObject ); numRanges += backTracker.numRanges_; numSingletonRanges += backTracker.numSingletonRanges_; } // ~for j //cerr << "Done i " << i <<endl; } // ~for i r.clear(); // return 0; // %%% Logger_if( LOG_SHOW_IF_VERBOSE ) { Logger::out() << "Finished cycle " << cycle << ": ranges=" << numRanges << " singletons=" << numSingletonRanges << endl; } if ( numRanges == 0 ) break; } // ~for c for ( int i = 0; i < alphabetSize; i++ ) delete inBwt[i]; // Output { ostream *outputStreamPtr = &std::cout; string outputFilename = searchParams_["output"]; ofstream ofs; if ( outputFilename != "-" ) { ofs.open( outputFilename ); if ( ofs.good() ) outputStreamPtr = &ofs; else cerr << "Warning: Couldn't open output file " << outputFilename << ". Sending output to stdout." << endl; } IntervalWriter writer( *outputStreamPtr ); for ( auto kmerItem : kmerList2 ) { IntervalRecord rec( kmerList[kmerItem.originalIndex], kmerItem.position, kmerItem.count ); writer.write( rec ); } } }
/// Scans the BWT index by backtracking and collects the errors reported by
/// BwtCorrectorIntervalHandler.
///
/// Steps, as visible in this function:
///  1. Open one BWT pile reader per alphabet letter (files "<indexPrefix_>-B0<i>")
///     and compute per-pile and cumulative letter counts.
///  2. Seed the range store with one ErrorCorrectionRange per 2-letter combination
///     that has a non-zero count.
///  3. Run at most readLength_ backtracking cycles, stopping early when a cycle
///     processes no range. Each cycle uses the minimum support returned by
///     getMinSupport( cycle ).
///
/// @return the ErrorStore that was handed to the interval handlers of every cycle.
ErrorStore BwtCorrector::findErrors()
{
    const bool propagateSequence = false;//( compareParams_ ? ( *compareParams_ )["propagate sequence"] : false );
    ErrorStore result;
    Timer timer;
    bool compressIntermediateBwts = true;

    vector <BwtReaderBase *> inBwt( alphabetSize );

    LetterCountEachPile countsPerPile, countsCumulative;

    RangeStoreExternal r;

    int numCycles( readLength_ );

    // --- 1. Open the BWT piles and count their letters
    for ( int i( 0 ); i < alphabetSize; i++ )
    {
        stringstream fileNameSS;
        fileNameSS << indexPrefix_ << "-B0" << i;
        string fileName = fileNameSS.str();
        if ( compressIntermediateBwts == true )
            inBwt[i] = new BwtReaderRunLengthIndex( fileName, correctorParams_->getStringValue( "use shm" ) );
        else
            inBwt[i] = new BwtReaderASCII( fileName );
        inBwt[i]->readAndCount( countsPerPile[i] );
    }

    // countsCumulative[i] = sum of the counts of piles 0..i
    countsCumulative = countsPerPile;
    for ( int i( 1 ); i < alphabetSize; i++ )
        countsCumulative[i] += countsCumulative[i - 1];

    countsCumulative.print();

    // --- 2. Seed one range per 2-letter combination present in the index
    // Alphabet index 0 is skipped in both loops (presumably the end-of-read marker - confirm).
    // currentWord stays "xx" unless propagateSequence is enabled.
    string currentWord = "xx";
    for ( int i( 1 ); i < alphabetSize; ++i )
    {
        if ( propagateSequence ) currentWord[1] = alphabet[i];
        for ( int j( 1 ); j < alphabetSize; ++j )
        {
            if ( propagateSequence ) currentWord[0] = alphabet[j];
            if ( countsPerPile[i].count_[j] != 0 )
                r.addRange( ErrorCorrectionRange( currentWord,
                                                  countsCumulative[i - 1].count_[j],
                                                  countsPerPile[i].count_[j],
                                                  false ),
                            j, i, subset_, 1 );
        } // ~for j
    } // ~for i

    // --- 3. Backtracking cycles
    LetterCount countsSoFar;
    LetterNumber currentPos;
    LetterNumber numRanges, numSingletonRanges;

    // NOTE(review): RangeStoreExternal::clear() is not visible here; it is called right
    // after seeding and after every cycle, so its position must be preserved.
    r.clear();
    for ( int c( 0 ); c < numCycles; ++c )
    {
        const int minimumSupport = getMinSupport( c );
        cout << "cycle: " << c << endl;
        Logger_if( LOG_SHOW_IF_VERBOSE )
        {
            Logger::out() << " time now: " << timer.timeNow();
            Logger::out() << " usage: " << timer << endl;
        }

        numRanges = 0;
        numSingletonRanges = 0;
        r.setCycleNum( c + 1 );

        for ( int i( 1 ); i < alphabetSize; ++i )
        {
            // Placeholder word of c + 3 'x' characters handed to the backtracker
            string thisWord( c + 3, 'x' );

            // Each pile is re-read from its start; countsSoFar restarts from the
            // cumulative counts of all preceding piles
            inBwt[i]->rewindFile();
            currentPos = 0;
            countsSoFar.clear();
            countsSoFar += countsCumulative[i - 1];

#ifdef PROB_NOT_NEEDED
            // Disabled code path, kept for reference. It previously referred to an
            // undeclared `inBwtA`, which would not compile if the macro were defined.
            BwtReaderRunLengthIndex *pRun;
            pRun = dynamic_cast<BwtReaderRunLengthIndex *>( inBwt[i] );
            if ( pRun != nullptr )
                pRun->initIndex( countsSoFar );
            else
                inBwt[i]->rewindFile();
#endif

            for ( int j( 1 ); j < alphabetSize; ++j )
            {
                r.setPortion( i, j );

                OneBwtBackTracker backTracker(
                    inBwt[i],
                    currentPos,
                    r,
                    countsSoFar,
                    subset_,
                    c + 2,
                    false, // doesn't propagate to end of reads
                    true,  // skip-already-processed-intervals deactivated
                    propagateSequence,
                    endPosFile_
                );

                BwtCorrectorIntervalHandler intervalHandler( result, minWitnessLength_, minimumSupport, c + 2 );
                ErrorCorrectionRange rangeObject;
                backTracker.process( i, thisWord, intervalHandler, rangeObject );

                numRanges += backTracker.numRanges_;
                numSingletonRanges += backTracker.numSingletonRanges_;
            } // ~for j
        } // ~for i

        r.clear();

        Logger_if( LOG_SHOW_IF_VERBOSE )
        {
            Logger::out() << "Finished cycle " << c << ": ranges=" << numRanges << " singletons=" << numSingletonRanges << endl;
        }

        // Stop as soon as a cycle has processed no range at all
        if ( numRanges == 0 ) break;
    } // ~for c

    for ( int i = 0; i < alphabetSize; i++ )
        delete inBwt[i];

    return result;
}