void SearchUsingBacktracker::run()
{
    Timer  timer;
    bool compressIntermediateBwts = true;

    vector <BwtReaderBase *> inBwt( alphabetSize );

    LetterCountEachPile countsPerPile, countsCumulative;

    RangeStoreExternal r;

    string bwtPrefix = searchParams_["input"];
    string subset_ = "";

    vector<string> kmerList;
    if ( searchParams_["one kmer string"].isSet() )
    {
        kmerList.push_back( searchParams_["one kmer string"] );
    }
    else
    {
        assert( searchParams_["kmers input file"].isSet() );
        string filename = searchParams_["kmers input file"];
        ifstream ifs( filename );
        string line;
        // get 1st word of each line as kmer
        while ( getline( ifs, line ) )
        {
            istringstream iss( line );
            string kmer;
            iss >> kmer;
            assert( kmer.size() != 1 && "todo: 1-mer search" );
            if ( kmer.size() >= 2 )
                kmerList.push_back( kmer );
        }
    }

    SequenceNumber originalIndex = 0;
    for ( auto kmer : kmerList )
    {
        std::reverse( kmer.begin(), kmer.end() );
        kmerList2.push_back( KmerSearchItem( kmer, 0, 0, originalIndex++ ) );
    }
    std::sort( kmerList2.begin(), kmerList2.end() );
    Logger_if( LOG_SHOW_IF_VERY_VERBOSE )
    {
        Logger::out() << "kmerList2:" << endl;
        for ( auto kmerItem : kmerList2 )
            Logger::out() << "  " << kmerItem.kmer << endl;
    }

    for ( int i( 0 ); i < alphabetSize; i++ )
    {
        stringstream fileNameSS;
        fileNameSS << bwtPrefix << "-B0" << i;
        string fileName = fileNameSS.str().c_str();
        if ( compressIntermediateBwts == true )
        {
            if ( searchParams_["use indexing"].isSet() )
            {
                Logger_if( LOG_SHOW_IF_VERBOSE ) Logger::out() << "Using indexed BWT file" << endl;
                inBwt[i] = new BwtReaderRunLengthIndex( fileName );
            }
            else
            {
                Logger_if( LOG_SHOW_IF_VERBOSE ) Logger::out() << "Using non-indexed BWT file" << endl;
                inBwt[i] = new BwtReaderRunLength( fileName );
            }
        }

        else
            inBwt[i] = new BwtReaderASCII( fileName );
        inBwt[i]->readAndCount( countsPerPile[i] );
    }

    countsCumulative = countsPerPile;

    for ( int i( 1 ); i < alphabetSize; i++ )
        countsCumulative[i] += countsCumulative[i - 1];

    Logger_if( LOG_SHOW_IF_VERBOSE )
    {
        cout << "countsPerPile:" << endl;
        countsPerPile.print();
        cout << "countsCumulative:" << endl;
        countsCumulative.print();
    }

    string currentWord_notUsed;
    int startPosInKmerList2 = 0;
    int endPosInKmerList2 = 0;

    for ( int i( 1 ); i < alphabetSize; ++i )
    {
        for ( int j( 1 ); j < alphabetSize; ++j )
        {
            startPosInKmerList2 = endPosInKmerList2;
            while ( endPosInKmerList2 < ( int )kmerList2.size() &&
                    kmerList2[endPosInKmerList2].kmer[0] == alphabet[i] &&
                    kmerList2[endPosInKmerList2].kmer[1] == alphabet[j] )
                ++endPosInKmerList2;

            if ( countsPerPile[i].count_[j] != 0 )
            {
                if ( startPosInKmerList2 != endPosInKmerList2 )
                    r.addRange(
                        KmerSearchRange(
                            currentWord_notUsed,
                            countsCumulative[i - 1].count_[j],
                            countsPerPile[i].count_[j],
                            false
                            , startPosInKmerList2
                            , endPosInKmerList2
                        )
                        , j,
                        i,
                        subset_,
                        1
                    );
            }
            else
            {
                if ( startPosInKmerList2 != endPosInKmerList2 )
                {
                    cout << "Oh nooo, I can't find: " << endl;
                    for ( int k = startPosInKmerList2; k < endPosInKmerList2; ++k )
                    {
                        string rev = kmerList2[k].kmer;
                        std::reverse( rev.begin(), rev.end() );
                        cout << "  " << kmerList2[k].kmer << " (" << rev << ")" << endl;
                    }
                }
            }
        } // ~for j
    } // ~for i

    LetterCount countsSoFar;
    LetterNumber currentPos;
    LetterNumber numRanges, numSingletonRanges;

    r.clear();
    for ( int cycle( 1 ); ; ++cycle )
    {
        Logger_if( LOG_LEVEL_NORMAL )
        Logger::out() << "cycle: " << cycle << endl;

        Logger_if( LOG_SHOW_IF_VERBOSE )
        {
            Logger::out() << "   time now: " << timer.timeNow();
            Logger::out() << "   usage: " << timer << endl;
        }

        numRanges = 0;
        numSingletonRanges = 0;
        r.setCycleNum( cycle );

        for ( int i( 1 ); i < alphabetSize; ++i )
        {
            inBwt[i]->rewindFile();
            currentPos = 0;
            countsSoFar.clear();
            countsSoFar += countsCumulative[i - 1];

            for ( int j( 1 ); j < alphabetSize; ++j )
            {
                r.setPortion( i, j );

                OneBwtBackTracker backTracker(
                    inBwt[i],
                    currentPos,
                    r,
                    countsSoFar,
                    subset_,
                    cycle + 1,
                    false, // don't propagate breakpoints until $ sign
                    true // skip-already-processed-intervals deactivated
                );

                KmerSearchIntervalHandler intervalHandler;
                KmerSearchRange rangeObject;
                backTracker.process( i, currentWord_notUsed, intervalHandler, rangeObject );

                numRanges += backTracker.numRanges_;
                numSingletonRanges += backTracker.numSingletonRanges_;

            } // ~for j
            //cerr << "Done i " << i <<endl;
        }     // ~for i
        r.clear();
        //    return 0; // %%%
        Logger_if( LOG_SHOW_IF_VERBOSE )
        {
            Logger::out() << "Finished cycle " << cycle << ": ranges=" << numRanges << " singletons=" << numSingletonRanges << endl;
        }

        if ( numRanges == 0 ) break;

    } // ~for c

    for ( int i = 0; i < alphabetSize; i++ )
        delete inBwt[i];

    // Output
    {
        ostream *outputStreamPtr = &std::cout;
        string outputFilename = searchParams_["output"];
        ofstream ofs;
        if ( outputFilename != "-" )
        {
            ofs.open( outputFilename );
            if ( ofs.good() )
                outputStreamPtr = &ofs;
            else
                cerr << "Warning: Couldn't open output file " << outputFilename << ". Sending output to stdout." << endl;
        }

        IntervalWriter writer( *outputStreamPtr );
        for ( auto kmerItem : kmerList2 )
        {
            IntervalRecord rec( kmerList[kmerItem.originalIndex], kmerItem.position, kmerItem.count );
            writer.write( rec );
        }
    }
}
// Example #2
/// Walks the BWT piles with a backtracker, cycle by cycle, and returns the
/// ErrorStore that is handed to BwtCorrectorIntervalHandler in every cycle.
///
/// At most readLength_ cycles are run; the loop stops earlier as soon as a
/// cycle reports zero ranges.
ErrorStore BwtCorrector::findErrors()
{
    // Hard-wired off; the original intent was to read it from
    // ( *compareParams_ )["propagate sequence"].
    const bool propagateSequence = false;
    ErrorStore foundErrors;

    Timer  timer;
    bool useRunLengthBwts = true;

    vector <BwtReaderBase *> pileReaders( alphabetSize );

    LetterCountEachPile pileCounts, cumulativeCounts;

    RangeStoreExternal rangeStore;

    int maxCycles( readLength_ );

    // Open a reader for each pile file "<indexPrefix_>-B0<pile>" and count its letters.
    for ( int pile = 0; pile < alphabetSize; ++pile )
    {
        stringstream nameBuilder;
        nameBuilder << indexPrefix_ << "-B0" << pile;
        string fileName( nameBuilder.str().c_str() );

        BwtReaderBase *reader;
        if ( useRunLengthBwts == true )
            reader = new BwtReaderRunLengthIndex( fileName, correctorParams_->getStringValue( "use shm" ) );
        else
            reader = new BwtReaderASCII( fileName );

        pileReaders[pile] = reader;
        reader->readAndCount( pileCounts[pile] );
    }

    // Running sum over the piles: cumulativeCounts[k] = pileCounts[0] + ... + pileCounts[k].
    cumulativeCounts = pileCounts;
    for ( int pile = 1; pile < alphabetSize; ++pile )
        cumulativeCounts[pile] += cumulativeCounts[pile - 1];

    cumulativeCounts.print();

    // Seed the store with one range per (pile, letter) pair that occurs at all.
    string seedWord = "xx";
    for ( int pile = 1; pile < alphabetSize; ++pile )
    {
        if ( propagateSequence )
            seedWord[1] = alphabet[pile];

        for ( int letter = 1; letter < alphabetSize; ++letter )
        {
            if ( propagateSequence )
                seedWord[0] = alphabet[letter];

            if ( pileCounts[pile].count_[letter] == 0 )
                continue;

            rangeStore.addRange(
                ErrorCorrectionRange(
                    seedWord,
                    cumulativeCounts[pile - 1].count_[letter],
                    pileCounts[pile].count_[letter],
                    false
                ),
                letter,
                pile,
                subset_,
                1
            );
        }
    }

    LetterCount countsSoFar;
    LetterNumber currentPos;
    LetterNumber numRanges, numSingletonRanges;

    rangeStore.clear();
    for ( int cycle = 0; cycle < maxCycles; ++cycle )
    {
        int minimumSupport = getMinSupport( cycle );

        cout << "cycle: " << cycle << endl;

        Logger_if( LOG_SHOW_IF_VERBOSE )
        {
            Logger::out() << "   time now: " << timer.timeNow();
            Logger::out() << "   usage: " << timer << endl;
        }

        numRanges = 0;
        numSingletonRanges = 0;
        rangeStore.setCycleNum( cycle + 1 );

        for ( int pile = 1; pile < alphabetSize; ++pile )
        {
            // Placeholder word of length cycle + 3, rebuilt for every pile.
            string thisWord( cycle + 3, 'x' );

            // Every pile is read again from its beginning in each cycle.
            pileReaders[pile]->rewindFile();
            currentPos = 0;
            countsSoFar.clear();
            countsSoFar += cumulativeCounts[pile - 1];
#ifdef PROB_NOT_NEEDED
            BwtReaderRunLengthIndex *pRun;

            pRun = dynamic_cast<BwtReaderRunLengthIndex *>( pileReaders[pile] );
            if ( pRun != NULL )
                pRun->initIndex( countsSoFar );
            else
                inBwtA[pile]->rewindFile();
#endif

            for ( int letter = 1; letter < alphabetSize; ++letter )
            {
                rangeStore.setPortion( pile, letter );

                OneBwtBackTracker backTracker(
                    pileReaders[pile],
                    currentPos,
                    rangeStore,
                    countsSoFar,
                    subset_,
                    cycle + 2,
                    false, // doesn't propagate to end of reads
                    true, // skip-already-processed-intervals deactivated
                    propagateSequence,
                    endPosFile_
                );

                BwtCorrectorIntervalHandler intervalHandler( foundErrors, minWitnessLength_, minimumSupport, cycle + 2 );
                ErrorCorrectionRange rangeObject;
                backTracker.process( pile, thisWord, intervalHandler, rangeObject );

                numRanges += backTracker.numRanges_;
                numSingletonRanges += backTracker.numSingletonRanges_;
            }
        }
        rangeStore.clear();

        Logger_if( LOG_SHOW_IF_VERBOSE )
        {
            Logger::out() << "Finished cycle " << cycle << ": ranges=" << numRanges << " singletons=" << numSingletonRanges << endl;
        }

        // Nothing was extended in this cycle, so later cycles have no work.
        if ( numRanges == 0 ) break;
    }

    // Release the readers allocated at the top of the function.
    for ( BwtReaderBase *reader : pileReaders )
        delete reader;

    return foundErrors;
}