void eskoTest() { unsigned step = 5000; unsigned start = 30000; unsigned stop = 120000; unsigned topL = 300; unsigned minL = 5; for(unsigned cur = start; cur <= stop; cur += step) { takeSubstring("data/ecoli.seq", "data/tmpseq.seq", 0, cur); //takeSubstring("data/ecoli.seq", "data/tmpseq.seq", 278000, 293000); unsigned L = minL; unsigned R = topL; unsigned ans = 0U; while (L<=R) { unsigned mid = (L+R) / 2U; unsigned k = mid-1U; Genome genome; readGenome("data/tmpseq.seq", mid, genome); AhoCorasick * aho = new AhoCorasick(genome); aho->filterOverlaps(k); NFA_Automata nfa(*aho, genome, 0, genome.generatedReads.size()-1); delete aho; DFA_Automata dfa(nfa, genome, 1.0); // remove reads since we do not need them anymore genome.generatedReads.resize(0); genome.sequence.resize(0); bool uniqueOk = dfa.isCOAUnique(); if (uniqueOk) { R = mid-1U; ans = mid; } else { L = mid+1U; } } std::string seq = readSequence("data/tmpseq.seq"); std::cout << cur << " " << ans << " " << getLongestSingleRepeat(seq) << std::endl; } }
/**
 * Builds a KmerHash for k-mers of length K.
 *
 * Initialises the bit mask and the counters/containers, then runs the four
 * loading steps in this fixed order: readReads(readPath),
 * readGenome(exonBoundaryPath), buildKmerTable(genomePath), mergeKmerTable().
 *
 * @param K                 k-mer length; also determines the mask width.
 * @param genomePath        path handed to buildKmerTable().
 * @param exonBoundaryPath  path handed to readGenome().
 * @param readPath          path handed to readReads().
 */
KmerHash::KmerHash(int K, string genomePath, string exonBoundaryPath, string readPath) {
    this->K = K;
    // -1 is the initial value; NOTE(review): presumably replaced once reads
    // are loaded -- confirm in readReads().
    this->readLength = -1;

    // Set the low 2*K bits of the mask: each round shifts in two more 1-bits.
    mask = 0;
    for (int remaining = K; remaining > 0; --remaining) {
        mask <<= 2;
        mask += 3;
    }

    // Start from empty tables and zeroed counters.
    NW = 0;
    NG = 0;
    NE.clear();
    kmerCount.clear();
    kmerTable.clear();
    geneBoundary.clear();

    // Loading steps; the order is kept exactly as in the original code.
    readReads(readPath);
    readGenome(exonBoundaryPath);
    buildKmerTable(genomePath);
    mergeKmerTable();
}
int main(int argc, char *argv[]) { int readLength1, readLength2; std::string qualitiesFile1, qualitiesFile2, temp; std::string outFile, inFile, qualFile; int numberOfReads, insertSizeMean, insertSizeSD; int qualityAdjust; unsigned seed; double snpProb; time_t start, end; /** BOOST Program Options **/ po::options_description desc("Allowed options"); desc.add_options() ("help", "produce help message") ("n", po::value<int>(), "number of read pairs to generate (default 50000)") ("mean", po::value<int>(), "mean insert size (default 200)") ("sd", po::value<int>(), "standard deviation of insert sizes (default 60)") ("l1", po::value<int>(), "read length of first read") ("l2", po::value<int>(), "read length of second read") ("q1", po::value<string>(), "qualities to be sampled for first read") ("q2", po::value<string>(), "qualities to be sampled for second read") ("genome", po::value<string>(), "path to reference genome reads will be sampled from") ("output", po::value<string>(), "") ("temp", po::value<string>(), "") ("qualfile", po::value<string>(), "") ("seed", po::value<int>(), "") ("solexa", po::value<int>(), "") ("snp_prob", po::value<double>(), "") ; po::variables_map vm; po::store(po::parse_command_line(argc, argv, desc), vm); po::notify(vm); if (vm.count("help")) { cout << desc << "\n"; return EXIT_FAILURE; } if (vm.count("n")) { numberOfReads = vm["n"].as<int>(); } else { numberOfReads = 50000; } if (vm.count("mean")) { insertSizeMean = vm["mean"].as<int>(); } else { insertSizeMean = 400; } if (vm.count("sd")) { insertSizeSD = vm["sd"].as<int>(); } else { insertSizeSD = 60; } if (vm.count("l1")) { readLength1 = vm["l1"].as<int>(); } else { readLength1 = 50; } if (vm.count("l2")) { readLength2 = vm["l2"].as<int>(); } else { readLength2 = 50; } if (vm.count("q1")) { qualitiesFile1 = vm["q1"].as<string>(); } else { cerr << "Qualities files for read one not provided" << endl; return EXIT_FAILURE; } if (vm.count("q2")) { qualitiesFile2 = vm["q2"].as<string>(); } else { 
cerr << "Qualities files for read two not provided" << endl; return EXIT_FAILURE; } if (vm.count("genome")) { inFile = vm["genome"].as<string>(); } else { cerr << "No genome file provided" << endl; return EXIT_FAILURE; } if (vm.count("output")) { outFile = vm["output"].as<string>(); } else { cerr << "No output file specified" << endl; return EXIT_FAILURE; } if (vm.count("temp")) { temp = vm["temp"].as<string>(); } else { temp = "/tmp"; } if (vm.count("qualfile")) { qualFile = vm["qualfile"].as<string>(); } else { qualFile = ""; } if (vm.count("seed")) { seed = vm["seed"].as<int>(); } else { seed = static_cast<unsigned> (std::time(0)); } if (vm.count("solexa")) { qualityAdjust = 64; } else { qualityAdjust = 33; } if (vm.count("snp_prob")) { snpProb = vm["snp_prob"].as<double>(); } else { snpProb = (double) 1 / 1000; } /** END BOOST Program Options **/ // seed the random number generator // this is only used for the tmp file names, so don't use the seed srand( std::time(0) ); time(&start); std::string tmpQuals1 = createTmpQualitiesFile2(temp, qualitiesFile1, readLength1, qualFile); time(&end); printf("Elapsed time: %.0f seconds\n", difftime(end, start)); time(&start); std::string tmpQuals2 = createTmpQualitiesFile2(temp, qualitiesFile2, readLength2, qualFile); time(&end); printf("Elapsed time: %.0f seconds\n", difftime(end, start)); time(&start); Genome gen = readGenome(inFile); time(&end); printf("Elapsed time: %.0f seconds\n", difftime(end, start)); time(&start); Reads reads = gen.generateReads(numberOfReads, readLength1, readLength2, insertSizeMean, insertSizeSD, seed); time(&end); printf("Elapsed time: %.0f seconds\n", difftime(end, start)); time(&start); reads.generateQualities(tmpQuals1, tmpQuals2, seed); time(&end); printf("Elapsed time: %.0f seconds\n", difftime(end, start)); // reads.generateQualities(tmpQuals2, 2); time(&start); #if defined (_OPENMP) reads.applyQualities_MP(seed, qualityAdjust, snpProb); #else reads.applyQualities(seed, qualityAdjust, 
snpProb); #endif time(&end); printf("Elapsed time: %.0f seconds\n", difftime(end, start)); reads.writeFASTQ(outFile); remove(tmpQuals1.c_str()); remove(tmpQuals2.c_str()); return 0; }
void runBenchmark() { typedef std::pair<size_t, size_t> TestEntry; std::ofstream out("data/benchmark.txt"); int tval = 17; out << std::setw(tval) << std::left << "AC_SZ" << std::setw(tval) << std::left << "AC_TIME" << std::setw(tval) << std::left << "NOA_ST_SZ" << std::setw(tval) << std::left << "NOA_TR_SZ" << std::setw(tval) << std::left << "NOA_TIME" << std::setw(tval) << std::left << "DOA_ST_SZ" << std::setw(tval) << std::left << "DOA_TR_SZ" << std::setw(tval) << std::left << "DOA_PATH_LEN" << std::setw(tval) << std::left << "DOA_CNT_LOOPS" << std::setw(tval) << std::left << "DOA_CNT_BR_NO" << std::setw(tval) << std::left << "DOA_CNT_ACC_NO" << std::setw(tval) << std::left << "DOA_TIME" << std::setw(tval) << std::left << "REF_LEN" << std::setw(tval) << std::left << "NUM_READS" << std::setw(tval) << std::left << "LEN_READS" << std::setw(tval) << std::left << "SAMPLING" << std::endl; // seq length size_t R = 300000; std::vector<TestEntry> Tests = { TestEntry(60, 180), TestEntry(120, 180), }; std::vector<double> Results; Timer timer; double dt; size_t statesCnt; size_t transitionsCnt; for (size_t i = 0; i < Tests.size(); ++i) { // prepare and load test size_t K = Tests[i].first; size_t L = Tests[i].second; takePrefix("data/ecoli.seq", "data/test.seq", R); Genome genome; readGenome("data/test.seq", L, genome); double coverage = double(genome.generatedReads.size() * L) / double(R); TestResult tmpResult; // general stuff tmpResult.REFERENCE_LENGTH = R; tmpResult.NUMBER_OF_READS = genome.generatedReads.size(); tmpResult.SAMPLING = SAMPLING_TYPE::REGULARLY_SPACED; tmpResult.TOTAL_LENGTH_OF_READS = genome.generatedReads.size() * L; std::cout << "START AHO" << std::endl; // AHO timer.start(); AhoCorasick * aho = new AhoCorasick(genome); aho->filterOverlaps(K); timer.stop(); dt = timer.getElapsedTimeInSeconds(); Statistics::calcACStates(aho, statesCnt); tmpResult.AHO_SIZE = statesCnt; tmpResult.AHO_TIME = dt; std::cout << "END AHO" << std::endl; std::cout << "START NFA" << 
std::endl; timer.start(); NFA_Automata * nfa = new NFA_Automata(*aho, genome, 0, genome.generatedReads.size()-1); timer.stop(); dt = timer.getElapsedTimeInSeconds(); std::cout << "END NFA" << std::endl; // clean not needed data Statistics::calcNOAStatesAndTransitions(nfa, statesCnt, transitionsCnt); tmpResult.NOA_STATES_SIZE = statesCnt; tmpResult.NOA_TRANSITIONS_SIZE = transitionsCnt; tmpResult.NOA_TIME = dt; // clean not needed data delete aho; std::cout << "START DFA" << std::endl; timer.start(); DFA_Automata * dfa = new DFA_Automata(*nfa, genome, coverage); timer.stop(); dt = timer.getElapsedTimeInSeconds(); std::cout << "END DFA" << std::endl; // clean not needed data delete nfa; genome.generatedReads.clear(); genome.sequence.clear(); std::cout << "START STORING REV EDGES" << std::endl; dfa->storeReversedEdges(); std::cout << "END STORING REV EDGES" << std::endl; std::cout << "START UPDATING UNITIG PATHS" << std::endl; dfa->updateUnitigPaths(); std::cout << "STOP UPDATING UNITIG PATHS" << std::endl; Statistics::calcDOAStatesAndTransitions(dfa, statesCnt, transitionsCnt); tmpResult.DOA_STATES_SIZE = statesCnt; tmpResult.DOA_TRANSITIONS_SIZE = transitionsCnt; tmpResult.DOA_TIME = dt; tmpResult.DOA_NUMBER_BRANCHING_NODES = Statistics::getBranchingNodesCount(dfa); tmpResult.DOA_NUMBER_OF_LOOPS = Statistics::getNumberOfLoops(dfa); tmpResult.DOA_SHORTEST_PATH_LEN = Statistics::getShortestPathToAllAcceptStates(dfa); tmpResult.DOA_NUM_ACCEPT_STATES = dfa->m_acceptStates.size(); // clean not needed data delete dfa; std::cout << "DONE" << std::endl; out << tmpResult << std::endl; } out.close(); }