Example #1
0
/**
 * Experiment over growing substrings of data/ecoli.seq.
 *
 * For every `cur` from 30000 to 120000 (step 5000) the (0, cur) substring is
 * written to data/tmpseq.seq. A binary search over [5, 300] then narrows
 * towards the smallest value `mid` for which the DFA built from the reads
 * (overlaps filtered with k = mid - 1) reports isCOAUnique().
 *
 * One line is printed per substring:
 *   "<cur> <last passing mid, 0 if none passed> <getLongestSingleRepeat(seq)>"
 *
 * NOTE(review): `mid` is handed to readGenome() and is presumably the read
 * length; the binary search also presumes the uniqueness predicate is
 * monotone in it -- confirm against readGenome() / isCOAUnique().
 */
void eskoTest() {
    const unsigned step = 5000;
    const unsigned start = 30000;
    const unsigned stop = 120000;
    const unsigned topL = 300;
    const unsigned minL = 5;

    for (unsigned cur = start; cur <= stop; cur += step) {
        takeSubstring("data/ecoli.seq", "data/tmpseq.seq", 0, cur);
        //takeSubstring("data/ecoli.seq", "data/tmpseq.seq", 278000, 293000);

        unsigned L = minL;
        unsigned R = topL;
        unsigned ans = 0U;

        while (L <= R) {
            // Midpoint written so that the intermediate sum cannot overflow.
            const unsigned mid = L + (R - L) / 2U;
            const unsigned k = mid - 1U;

            Genome genome;
            readGenome("data/tmpseq.seq", mid, genome);

            // size() - 1 below is unsigned: with no reads it would wrap around
            // to a huge "last index", so bail out instead of building automata.
            if (genome.generatedReads.size() == 0) {
                std::cerr << "eskoTest: no reads loaded from data/tmpseq.seq (mid = "
                          << mid << ")" << std::endl;
                return;
            }

            AhoCorasick * aho = new AhoCorasick(genome);
            aho->filterOverlaps(k);

            NFA_Automata nfa(*aho, genome, 0, genome.generatedReads.size() - 1);

            // The Aho-Corasick structure is released before the DFA is built.
            delete aho;

            DFA_Automata dfa(nfa, genome, 1.0);
            // remove reads since we do not need them anymore
            genome.generatedReads.resize(0);
            genome.sequence.resize(0);

            if (dfa.isCOAUnique()) {
                // mid passes: remember it and look for a smaller passing value.
                ans = mid;
                R = mid - 1U;
            } else {
                L = mid + 1U;
            }
        }

        const std::string seq = readSequence("data/tmpseq.seq");

        std::cout << cur << " " << ans << " " << getLongestSingleRepeat(seq) << std::endl;
    }
}
Example #2
0
/**
 * Constructs the hash for k-mers of length K and immediately loads all inputs.
 *
 * After resetting the counters and containers, the constructor runs, in this
 * order: readReads(readPath), readGenome(exonBoundaryPath),
 * buildKmerTable(genomePath) and mergeKmerTable().
 *
 * @param K                 k-mer length; also determines how many bits of
 *                          `mask` are set (two per position).
 * @param genomePath        passed to buildKmerTable().
 * @param exonBoundaryPath  passed to readGenome().
 * @param readPath          passed to readReads().
 */
KmerHash::KmerHash(int K, string genomePath, string exonBoundaryPath, string readPath) {
    this->K = K;
    // NOTE(review): -1 presumably marks the read length as "not known yet" -- confirm.
    this->readLength = -1;

    // Start from zeroed counters and empty tables.
    NW = 0;
    NG = 0;
    NE.clear();
    kmerCount.clear();
    kmerTable.clear();
    geneBoundary.clear();

    // Append two set bits per k-mer position, K times in total.
    mask = 0;
    for (int remaining = K; remaining > 0; --remaining) {
        mask = (mask << 2) + 3;
    }

    // Load the inputs and build the tables.
    readReads(readPath);
    readGenome(exonBoundaryPath);
    buildKmerTable(genomePath);
    mergeKmerTable();
}
Example #3
0
int main(int argc, char *argv[]) {
    
    
    int readLength1, readLength2;
    std::string qualitiesFile1, qualitiesFile2, temp;
    std::string outFile, inFile, qualFile;
    int numberOfReads, insertSizeMean, insertSizeSD;
    int qualityAdjust;
    unsigned seed;
    double snpProb;
    
    time_t start, end;
    
    /** BOOST Program Options **/
    po::options_description desc("Allowed options");
    desc.add_options()
        ("help", "produce help message")
        ("n", po::value<int>(), "number of read pairs to generate (default 50000)")
        ("mean", po::value<int>(), "mean insert size (default 200)")
        ("sd", po::value<int>(), "standard deviation of insert sizes (default 60)")
        ("l1", po::value<int>(), "read length of first read")
        ("l2", po::value<int>(), "read length of second read")
        ("q1", po::value<string>(), "qualities to be sampled for first read")
        ("q2", po::value<string>(), "qualities to be sampled for second read")
        ("genome", po::value<string>(), "path to reference genome reads will be sampled from")
        ("output", po::value<string>(), "")
        ("temp", po::value<string>(), "")
        ("qualfile", po::value<string>(), "")
        ("seed", po::value<int>(), "")
        ("solexa", po::value<int>(), "")
        ("snp_prob", po::value<double>(), "")
    ;

    po::variables_map vm;
    po::store(po::parse_command_line(argc, argv, desc), vm);
    po::notify(vm);    

    if (vm.count("help")) {
        cout << desc << "\n";
        return EXIT_FAILURE;
    }
    
    if (vm.count("n")) {
        numberOfReads = vm["n"].as<int>();
    } else {
        numberOfReads = 50000;
    }
    
    if (vm.count("mean")) {
        insertSizeMean = vm["mean"].as<int>();
    } else {
        insertSizeMean = 400;
    }
    
    if (vm.count("sd")) {
        insertSizeSD = vm["sd"].as<int>();
    } else {
        insertSizeSD = 60;
    }
   
    if (vm.count("l1")) {
        readLength1 = vm["l1"].as<int>();
    } else {
        readLength1 = 50;
    }
    
    if (vm.count("l2")) {
        readLength2 = vm["l2"].as<int>();
    } else {
        readLength2 = 50;
    }
    
    if (vm.count("q1")) {
        qualitiesFile1 = vm["q1"].as<string>();
    } else {
        cerr << "Qualities files for read one not provided" << endl;
        return EXIT_FAILURE;
    }
    
    if (vm.count("q2")) {
        qualitiesFile2 = vm["q2"].as<string>();
    } else {
        cerr << "Qualities files for read two not provided" << endl;
        return EXIT_FAILURE;
    }
    
    if (vm.count("genome")) {
        inFile = vm["genome"].as<string>();
    } else {
        cerr << "No genome file provided" << endl;
        return EXIT_FAILURE;
    }
    
    if (vm.count("output")) {
        outFile = vm["output"].as<string>();
    } else {
        cerr << "No output file specified" << endl;
        return EXIT_FAILURE;
    }
    
    if (vm.count("temp")) {
        temp = vm["temp"].as<string>();
    } else {
        temp = "/tmp";
    }
    
    if (vm.count("qualfile")) {
        qualFile = vm["qualfile"].as<string>();
    } else {
        qualFile = "";
    }
    
    if (vm.count("seed")) {
        seed = vm["seed"].as<int>();
    } else {
        seed = static_cast<unsigned> (std::time(0));
    }
    
    if (vm.count("solexa")) {
        qualityAdjust = 64;
    } else {
        qualityAdjust = 33;
    }
    
    if (vm.count("snp_prob")) {
        snpProb = vm["snp_prob"].as<double>();
    } else {
        snpProb = (double) 1 / 1000;
    }
    /** END BOOST Program Options **/
    
    // seed the random number generator
    // this is only used for the tmp file names, so don't use the seed
    srand( std::time(0) );
        
    time(&start);
    std::string tmpQuals1 = createTmpQualitiesFile2(temp, qualitiesFile1, readLength1, qualFile);
    time(&end);
    printf("Elapsed time: %.0f seconds\n", difftime(end, start));
    
    time(&start);
    std::string tmpQuals2 = createTmpQualitiesFile2(temp, qualitiesFile2, readLength2, qualFile);    
    time(&end);
    printf("Elapsed time: %.0f seconds\n", difftime(end, start));

    time(&start);
    Genome gen = readGenome(inFile);
    time(&end);
    printf("Elapsed time: %.0f seconds\n", difftime(end, start));

    time(&start);
    Reads reads = gen.generateReads(numberOfReads, readLength1, readLength2, insertSizeMean, insertSizeSD, seed);
    time(&end);
    printf("Elapsed time: %.0f seconds\n", difftime(end, start));
    
    time(&start);
    reads.generateQualities(tmpQuals1, tmpQuals2, seed);
    time(&end);
    printf("Elapsed time: %.0f seconds\n", difftime(end, start));
   // reads.generateQualities(tmpQuals2, 2);
    
    time(&start);
    #if defined (_OPENMP)
        reads.applyQualities_MP(seed, qualityAdjust, snpProb);
    #else
        reads.applyQualities(seed, qualityAdjust, snpProb);
    #endif
    time(&end);
    printf("Elapsed time: %.0f seconds\n", difftime(end, start));
     
    reads.writeFASTQ(outFile);

    remove(tmpQuals1.c_str());
    remove(tmpQuals2.c_str());

    return 0;

}
Example #4
0
/**
 * Runs the automata construction benchmark and writes one row per test to
 * data/benchmark.txt.
 *
 * Each test entry is a pair (K, L): K is passed to AhoCorasick::filterOverlaps()
 * and L to readGenome(). For every entry the function takes a prefix of
 * data/ecoli.seq (R = 300000), builds the Aho-Corasick structure, the NFA and
 * the DFA in turn, times each construction, collects size statistics through
 * the Statistics helpers and streams the resulting TestResult to the output
 * file. Progress markers are printed to std::cout.
 *
 * If the output file cannot be opened the function reports it on std::cerr
 * and returns without running anything; a test whose read set is empty is
 * reported and skipped.
 */
void runBenchmark() {
    using TestEntry = std::pair<size_t, size_t>;

    std::ofstream out("data/benchmark.txt");
    if (!out) {
        // Without this check the whole benchmark would run and lose its results.
        std::cerr << "runBenchmark: cannot open data/benchmark.txt for writing" << std::endl;
        return;
    }

    // Header row: every column is left-aligned in a field of width tval.
    const int tval = 17;
    const char * const columns[] = {
        "AC_SZ", "AC_TIME",
        "NOA_ST_SZ", "NOA_TR_SZ", "NOA_TIME",
        "DOA_ST_SZ", "DOA_TR_SZ", "DOA_PATH_LEN", "DOA_CNT_LOOPS",
        "DOA_CNT_BR_NO", "DOA_CNT_ACC_NO", "DOA_TIME",
        "REF_LEN", "NUM_READS", "LEN_READS", "SAMPLING",
    };
    out << std::left;
    for (const char * column : columns) {
        out << std::setw(tval) << column;
    }
    out << std::endl;

    // seq length
    const size_t R = 300000;

    const std::vector<TestEntry> Tests = {
        TestEntry(60, 180),
        TestEntry(120, 180),
    };

    Timer timer;
    double dt = 0.0;
    size_t statesCnt = 0;
    size_t transitionsCnt = 0;

    for (size_t i = 0; i < Tests.size(); ++i) {
        // prepare and load test
        const size_t K = Tests[i].first;
        const size_t L = Tests[i].second;
        takePrefix("data/ecoli.seq", "data/test.seq", R);
        Genome genome;
        readGenome("data/test.seq", L, genome);

        const size_t numReads = genome.generatedReads.size();
        if (numReads == 0) {
            // numReads - 1 below is unsigned and would wrap around.
            std::cerr << "runBenchmark: no reads loaded for test " << i
                      << " (K = " << K << ", L = " << L << "), skipping" << std::endl;
            continue;
        }

        const double coverage = static_cast<double>(numReads * L) / static_cast<double>(R);

        TestResult tmpResult;

        // general stuff
        tmpResult.REFERENCE_LENGTH = R;
        tmpResult.NUMBER_OF_READS = numReads;
        tmpResult.SAMPLING = SAMPLING_TYPE::REGULARLY_SPACED;
        tmpResult.TOTAL_LENGTH_OF_READS = numReads * L;

        std::cout << "START AHO" << std::endl;
        // AHO
        timer.start();
        AhoCorasick * aho = new AhoCorasick(genome);
        aho->filterOverlaps(K);
        timer.stop();
        dt = timer.getElapsedTimeInSeconds();
        Statistics::calcACStates(aho, statesCnt);
        tmpResult.AHO_SIZE = statesCnt;
        tmpResult.AHO_TIME = dt;
        std::cout << "END AHO" << std::endl;

        std::cout << "START NFA" << std::endl;
        timer.start();
        NFA_Automata * nfa = new NFA_Automata(*aho, genome, 0, numReads - 1);
        timer.stop();
        dt = timer.getElapsedTimeInSeconds();
        std::cout << "END NFA" << std::endl;

        Statistics::calcNOAStatesAndTransitions(nfa, statesCnt, transitionsCnt);
        tmpResult.NOA_STATES_SIZE = statesCnt;
        tmpResult.NOA_TRANSITIONS_SIZE = transitionsCnt;
        tmpResult.NOA_TIME = dt;

        // The Aho-Corasick structure is released before the DFA is built.
        delete aho;

        std::cout << "START DFA" << std::endl;
        timer.start();
        DFA_Automata * dfa = new DFA_Automata(*nfa, genome, coverage);
        timer.stop();
        // dt now holds the DFA construction time; it is stored in DOA_TIME below.
        dt = timer.getElapsedTimeInSeconds();
        std::cout << "END DFA" << std::endl;

        // The NFA, the reads and the sequence are released once the DFA exists.
        delete nfa;
        genome.generatedReads.clear();
        genome.sequence.clear();

        std::cout << "START STORING REV EDGES" << std::endl;
        dfa->storeReversedEdges();
        std::cout << "END STORING REV EDGES" << std::endl;

        std::cout << "START UPDATING UNITIG PATHS" << std::endl;
        dfa->updateUnitigPaths();
        std::cout << "STOP UPDATING UNITIG PATHS" << std::endl;

        Statistics::calcDOAStatesAndTransitions(dfa, statesCnt, transitionsCnt);
        tmpResult.DOA_STATES_SIZE = statesCnt;
        tmpResult.DOA_TRANSITIONS_SIZE = transitionsCnt;
        tmpResult.DOA_TIME = dt;
        tmpResult.DOA_NUMBER_BRANCHING_NODES = Statistics::getBranchingNodesCount(dfa);
        tmpResult.DOA_NUMBER_OF_LOOPS = Statistics::getNumberOfLoops(dfa);
        tmpResult.DOA_SHORTEST_PATH_LEN = Statistics::getShortestPathToAllAcceptStates(dfa);
        tmpResult.DOA_NUM_ACCEPT_STATES = dfa->m_acceptStates.size();

        // clean not needed data
        delete dfa;

        std::cout << "DONE" << std::endl;
        out << tmpResult << std::endl;
    }
    out.close();
}