示例#1
0
void eskoTest() {
    unsigned step = 5000;
    unsigned start = 30000;
    unsigned stop = 120000;
    unsigned topL = 300;
    unsigned minL = 5;

    for(unsigned cur = start; cur <= stop; cur += step) {
        takeSubstring("data/ecoli.seq", "data/tmpseq.seq", 0, cur);
        //takeSubstring("data/ecoli.seq", "data/tmpseq.seq", 278000, 293000);

        unsigned L = minL;
        unsigned R = topL;
        unsigned ans = 0U;

        while (L<=R) {
            unsigned mid = (L+R) / 2U;
            unsigned k = mid-1U;
            Genome genome;
            readGenome("data/tmpseq.seq", mid, genome);

            AhoCorasick * aho = new AhoCorasick(genome);

            aho->filterOverlaps(k);

            NFA_Automata nfa(*aho, genome, 0, genome.generatedReads.size()-1);

            delete aho;

            DFA_Automata dfa(nfa, genome, 1.0);
            // remove reads since we do not need them anymore
            genome.generatedReads.resize(0);
            genome.sequence.resize(0);

            bool uniqueOk = dfa.isCOAUnique();

            if (uniqueOk) {
                R = mid-1U;
                ans = mid;
            } else {
                L = mid+1U;
            }

        }

        std::string seq = readSequence("data/tmpseq.seq");

        std::cout << cur << " " << ans << " " << getLongestSingleRepeat(seq) << std::endl;



    }

}
示例#2
0
// Runs the whole pipeline on `sequenceFile`: generates reads via readGenome
// (with `patternLen` as its second argument), builds the Aho-Corasick
// structure, filters overlaps with `kFilter`, builds the NFA and then the DFA,
// and prints progress markers, whether the DFA's COA is unique and the number
// of loops reported by Statistics::getNumberOfLoops.
//
// kFilter: value forwarded unchanged to AhoCorasick::filterOverlaps.
//
// If readGenome produced no reads an error is written to std::cerr and the
// function returns, because the code below indexes the first read and
// computes `size() - 1`.
void run(size_t kFilter) {
    Genome genome;
    readGenome(sequenceFile, patternLen, genome);
    const size_t numReads = genome.generatedReads.size();
    if (numReads == 0) {
        std::cerr << "ERROR: no reads were generated; aborting run" << std::endl;
        return;
    }

    std::cout << numReads << " " <<
        (genome.generatedReads[0].second - genome.generatedReads[0].first) << " " << kFilter << std::endl;

    std::cout << "Longest repeat: " << getLongestSingleRepeat(genome.sequence) << std::endl;

    // coverage = (number of reads * patternLen) / sequence length
    double coverage = double(numReads * patternLen) / double(genome.sequence.length());
    std::cout << numReads * patternLen << std::endl;

    std::cout << "START AC" << std::endl;
    AhoCorasick * aho = new AhoCorasick(genome);
    std::cout << "END AC" << std::endl;

    std::cout << "START FILTER" << std::endl;
    aho->filterOverlaps(kFilter);
    std::cout << "END FILTER" << std::endl;

    std::cout << "START NFA" << std::endl;
    NFA_Automata nfa(*aho, genome, 0, genome.generatedReads.size()-1);
    std::cout << "END NFA" << std::endl;
    // aho corasick is not needed once we have the nfa FOR NOW
    delete aho;
    #ifdef DEBUG
        nfa.saveAsSVG("data/nfa.svg");
    #endif

    // create DFA from NFA
    std::cout << "START DFA" << std::endl;
    DFA_Automata dfa(nfa, genome, coverage);
    // remove reads since we do not need them anymore
    genome.generatedReads.resize(0);
    genome.sequence.resize(0);
    std::cout << "END DFA" << std::endl;

    std::cout << "UNIQUE ? " << dfa.isCOAUnique() << std::endl;
    size_t numLoops = Statistics::getNumberOfLoops(&dfa);
    std::cout << numLoops << std::endl;

    #ifdef DEBUG
        dfa.saveAsSVG("data/dfa.svg");
    #endif
}
示例#3
0
// Benchmarks the Aho-Corasick -> NFA -> DFA pipeline on the output of
// takePrefix("data/ecoli.seq", "data/test.seq", R) and writes a header line
// plus one TestResult row per test to "data/benchmark.txt". Progress markers
// go to std::cout.
//
// Each test is a pair (K, L): K is passed to AhoCorasick::filterOverlaps and
// L is passed to readGenome as its second argument (it is also multiplied by
// the number of reads to obtain TOTAL_LENGTH_OF_READS).
//
// If the output file cannot be opened the function reports the problem on
// std::cerr and returns without running anything, since no result could be
// stored. A test for which no reads were generated is skipped.
void runBenchmark() {
    typedef std::pair<size_t, size_t> TestEntry;

    std::ofstream out("data/benchmark.txt");
    if (!out) {
        std::cerr << "ERROR: cannot open data/benchmark.txt for writing" << std::endl;
        return;
    }

    // Header: every column name is left-aligned in a field of width tval.
    const int tval = 17;
    static const char * const columnNames[] = {
        "AC_SZ", "AC_TIME",
        "NOA_ST_SZ", "NOA_TR_SZ", "NOA_TIME",
        "DOA_ST_SZ", "DOA_TR_SZ", "DOA_PATH_LEN", "DOA_CNT_LOOPS",
        "DOA_CNT_BR_NO", "DOA_CNT_ACC_NO", "DOA_TIME",
        "REF_LEN", "NUM_READS", "LEN_READS", "SAMPLING"
    };
    for (const char * columnName : columnNames) {
        out << std::setw(tval) << std::left << columnName;
    }
    out << std::endl;

    // seq length
    size_t R = 300000;

    std::vector<TestEntry> Tests = {
        TestEntry(60, 180),
        TestEntry(120, 180),
    };

    // The input file depends only on R, so it is produced once for all tests.
    takePrefix("data/ecoli.seq", "data/test.seq", R);

    Timer timer;
    double dt = 0.0;
    size_t statesCnt = 0;
    size_t transitionsCnt = 0;

    for (size_t i = 0; i < Tests.size(); ++i) {
        // prepare and load test
        size_t K = Tests[i].first;
        size_t L = Tests[i].second;
        Genome genome;
        readGenome("data/test.seq", L, genome);

        if (genome.generatedReads.empty()) {
            // size() - 1 below would wrap around on an empty read set
            std::cerr << "ERROR: no reads were generated for test " << i
                      << "; skipping" << std::endl;
            continue;
        }

        double coverage = double(genome.generatedReads.size() * L) / double(R);

        TestResult tmpResult;

        // general stuff
        tmpResult.REFERENCE_LENGTH = R;
        tmpResult.NUMBER_OF_READS = genome.generatedReads.size();
        tmpResult.SAMPLING = SAMPLING_TYPE::REGULARLY_SPACED;
        tmpResult.TOTAL_LENGTH_OF_READS = genome.generatedReads.size() * L;

        // AHO: the timed section covers construction and overlap filtering
        std::cout << "START AHO" << std::endl;
        timer.start();
        AhoCorasick * aho = new AhoCorasick(genome);
        aho->filterOverlaps(K);
        timer.stop();
        dt = timer.getElapsedTimeInSeconds();
        Statistics::calcACStates(aho, statesCnt);
        tmpResult.AHO_SIZE = statesCnt;
        tmpResult.AHO_TIME = dt;
        std::cout << "END AHO" << std::endl;

        // NFA: the timed section covers construction only
        std::cout << "START NFA" << std::endl;
        timer.start();
        NFA_Automata * nfa = new NFA_Automata(*aho, genome, 0, genome.generatedReads.size()-1);
        timer.stop();
        dt = timer.getElapsedTimeInSeconds();
        std::cout << "END NFA" << std::endl;

        Statistics::calcNOAStatesAndTransitions(nfa, statesCnt, transitionsCnt);
        tmpResult.NOA_STATES_SIZE = statesCnt;
        tmpResult.NOA_TRANSITIONS_SIZE = transitionsCnt;
        tmpResult.NOA_TIME = dt;

        // clean not needed data
        delete aho;

        // DFA: the timed section covers construction only; dt is stored in
        // DOA_TIME further down, after the untimed post-processing steps
        std::cout << "START DFA" << std::endl;
        timer.start();
        DFA_Automata * dfa = new DFA_Automata(*nfa, genome, coverage);
        timer.stop();
        dt = timer.getElapsedTimeInSeconds();
        std::cout << "END DFA" << std::endl;

        // clean not needed data
        delete nfa;
        genome.generatedReads.clear();
        genome.sequence.clear();

        std::cout << "START STORING REV EDGES" << std::endl;
        dfa->storeReversedEdges();
        std::cout << "END STORING REV EDGES" << std::endl;

        std::cout << "START UPDATING UNITIG PATHS" << std::endl;
        dfa->updateUnitigPaths();
        std::cout << "STOP UPDATING UNITIG PATHS" << std::endl;

        Statistics::calcDOAStatesAndTransitions(dfa, statesCnt, transitionsCnt);
        tmpResult.DOA_STATES_SIZE = statesCnt;
        tmpResult.DOA_TRANSITIONS_SIZE = transitionsCnt;
        tmpResult.DOA_TIME = dt;
        tmpResult.DOA_NUMBER_BRANCHING_NODES = Statistics::getBranchingNodesCount(dfa);
        tmpResult.DOA_NUMBER_OF_LOOPS = Statistics::getNumberOfLoops(dfa);
        tmpResult.DOA_SHORTEST_PATH_LEN = Statistics::getShortestPathToAllAcceptStates(dfa);
        tmpResult.DOA_NUM_ACCEPT_STATES = dfa->m_acceptStates.size();

        // clean not needed data
        delete dfa;

        std::cout << "DONE" << std::endl;
        out << tmpResult << std::endl;
    }
    out.close();
}