void eskoTest() { unsigned step = 5000; unsigned start = 30000; unsigned stop = 120000; unsigned topL = 300; unsigned minL = 5; for(unsigned cur = start; cur <= stop; cur += step) { takeSubstring("data/ecoli.seq", "data/tmpseq.seq", 0, cur); //takeSubstring("data/ecoli.seq", "data/tmpseq.seq", 278000, 293000); unsigned L = minL; unsigned R = topL; unsigned ans = 0U; while (L<=R) { unsigned mid = (L+R) / 2U; unsigned k = mid-1U; Genome genome; readGenome("data/tmpseq.seq", mid, genome); AhoCorasick * aho = new AhoCorasick(genome); aho->filterOverlaps(k); NFA_Automata nfa(*aho, genome, 0, genome.generatedReads.size()-1); delete aho; DFA_Automata dfa(nfa, genome, 1.0); // remove reads since we do not need them anymore genome.generatedReads.resize(0); genome.sequence.resize(0); bool uniqueOk = dfa.isCOAUnique(); if (uniqueOk) { R = mid-1U; ans = mid; } else { L = mid+1U; } } std::string seq = readSequence("data/tmpseq.seq"); std::cout << cur << " " << ans << " " << getLongestSingleRepeat(seq) << std::endl; } }
void run(size_t kFilter) { Genome genome; readGenome(sequenceFile, patternLen, genome); size_t numReads = genome.generatedReads.size(); std::cout << genome.generatedReads.size() << " " << (genome.generatedReads[0].second - genome.generatedReads[0].first) << " " << kFilter << std::endl; std::cout << "Longest repeat: " << getLongestSingleRepeat(genome.sequence) << std::endl; double coverage = double(numReads * patternLen) / double(genome.sequence.length()); cout << numReads * patternLen << endl; std::cout << "START AC" << std::endl; AhoCorasick * aho = new AhoCorasick(genome); std::cout << "END AC" << std::endl; std::cout << "START FILTER" << std::endl; aho->filterOverlaps(kFilter); std::cout << "END FILTER" << std::endl; std::cout << "START NFA" << std::endl; NFA_Automata nfa(*aho, genome, 0, genome.generatedReads.size()-1); std::cout << "END NFA" << std::endl; // aho corasick is not needed once we have the nfa FOR NOW delete aho; #ifdef DEBUG nfa.saveAsSVG("data/nfa.svg"); #endif // create DFA from NFA std::cout << "START DFA" << std::endl; DFA_Automata dfa(nfa, genome, coverage); // remove reads since we do not need them anymore genome.generatedReads.resize(0); genome.sequence.resize(0); std::cout << "END DFA" << std::endl; std::cout << "UNIQUE ? " << dfa.isCOAUnique() << std::endl; size_t numLoops = Statistics::getNumberOfLoops(&dfa); std::cout << numLoops << std::endl; #ifdef DEBUG dfa.saveAsSVG("data/dfa.svg"); #endif }
void runBenchmark() { typedef std::pair<size_t, size_t> TestEntry; std::ofstream out("data/benchmark.txt"); int tval = 17; out << std::setw(tval) << std::left << "AC_SZ" << std::setw(tval) << std::left << "AC_TIME" << std::setw(tval) << std::left << "NOA_ST_SZ" << std::setw(tval) << std::left << "NOA_TR_SZ" << std::setw(tval) << std::left << "NOA_TIME" << std::setw(tval) << std::left << "DOA_ST_SZ" << std::setw(tval) << std::left << "DOA_TR_SZ" << std::setw(tval) << std::left << "DOA_PATH_LEN" << std::setw(tval) << std::left << "DOA_CNT_LOOPS" << std::setw(tval) << std::left << "DOA_CNT_BR_NO" << std::setw(tval) << std::left << "DOA_CNT_ACC_NO" << std::setw(tval) << std::left << "DOA_TIME" << std::setw(tval) << std::left << "REF_LEN" << std::setw(tval) << std::left << "NUM_READS" << std::setw(tval) << std::left << "LEN_READS" << std::setw(tval) << std::left << "SAMPLING" << std::endl; // seq length size_t R = 300000; std::vector<TestEntry> Tests = { TestEntry(60, 180), TestEntry(120, 180), }; std::vector<double> Results; Timer timer; double dt; size_t statesCnt; size_t transitionsCnt; for (size_t i = 0; i < Tests.size(); ++i) { // prepare and load test size_t K = Tests[i].first; size_t L = Tests[i].second; takePrefix("data/ecoli.seq", "data/test.seq", R); Genome genome; readGenome("data/test.seq", L, genome); double coverage = double(genome.generatedReads.size() * L) / double(R); TestResult tmpResult; // general stuff tmpResult.REFERENCE_LENGTH = R; tmpResult.NUMBER_OF_READS = genome.generatedReads.size(); tmpResult.SAMPLING = SAMPLING_TYPE::REGULARLY_SPACED; tmpResult.TOTAL_LENGTH_OF_READS = genome.generatedReads.size() * L; std::cout << "START AHO" << std::endl; // AHO timer.start(); AhoCorasick * aho = new AhoCorasick(genome); aho->filterOverlaps(K); timer.stop(); dt = timer.getElapsedTimeInSeconds(); Statistics::calcACStates(aho, statesCnt); tmpResult.AHO_SIZE = statesCnt; tmpResult.AHO_TIME = dt; std::cout << "END AHO" << std::endl; std::cout << "START NFA" << std::endl; timer.start(); NFA_Automata * nfa = new NFA_Automata(*aho, genome, 0, genome.generatedReads.size()-1); timer.stop(); dt = timer.getElapsedTimeInSeconds(); std::cout << "END NFA" << std::endl; // clean not needed data Statistics::calcNOAStatesAndTransitions(nfa, statesCnt, transitionsCnt); tmpResult.NOA_STATES_SIZE = statesCnt; tmpResult.NOA_TRANSITIONS_SIZE = transitionsCnt; tmpResult.NOA_TIME = dt; // clean not needed data delete aho; std::cout << "START DFA" << std::endl; timer.start(); DFA_Automata * dfa = new DFA_Automata(*nfa, genome, coverage); timer.stop(); dt = timer.getElapsedTimeInSeconds(); std::cout << "END DFA" << std::endl; // clean not needed data delete nfa; genome.generatedReads.clear(); genome.sequence.clear(); std::cout << "START STORING REV EDGES" << std::endl; dfa->storeReversedEdges(); std::cout << "END STORING REV EDGES" << std::endl; std::cout << "START UPDATING UNITIG PATHS" << std::endl; dfa->updateUnitigPaths(); std::cout << "STOP UPDATING UNITIG PATHS" << std::endl; Statistics::calcDOAStatesAndTransitions(dfa, statesCnt, transitionsCnt); tmpResult.DOA_STATES_SIZE = statesCnt; tmpResult.DOA_TRANSITIONS_SIZE = transitionsCnt; tmpResult.DOA_TIME = dt; tmpResult.DOA_NUMBER_BRANCHING_NODES = Statistics::getBranchingNodesCount(dfa); tmpResult.DOA_NUMBER_OF_LOOPS = Statistics::getNumberOfLoops(dfa); tmpResult.DOA_SHORTEST_PATH_LEN = Statistics::getShortestPathToAllAcceptStates(dfa); tmpResult.DOA_NUM_ACCEPT_STATES = dfa->m_acceptStates.size(); // clean not needed data delete dfa; std::cout << "DONE" << std::endl; out << tmpResult << std::endl; } out.close(); }