static void mem_collect_intv(const SalmonOpts& sopt, const mem_opt_t *opt, SalmonIndex* sidx, int len, const uint8_t *seq, smem_aux_t *a) { const bwt_t* bwt = sidx->bwaIndex()->bwt; int i, k, x = 0, old_n; int start_width = (opt->flag & MEM_F_SELF_OVLP)? 2 : 1; int split_len = (int)(opt->min_seed_len * opt->split_factor + .499); a->mem.n = 0; // first pass: find all SMEMs if (sidx->hasAuxKmerIndex()) { KmerIntervalMap& auxIdx = sidx->auxIndex(); uint32_t klen = auxIdx.k(); while (x < len) { if (seq[x] < 4) { // Make sure there are at least k bases left if (len - x < klen) { x = len; continue; } // search for this key in the auxiliary index KmerKey kmer(const_cast<uint8_t*>(&(seq[x])), klen); auto it = auxIdx.find(kmer); // if we can't find it, move to the next key if (it == auxIdx.end()) { ++x; continue; } // otherwise, start the search using the initial interval @it->second from the hash int xb = x; x = bwautils::bwt_smem1_with_kmer(bwt, len, seq, x, start_width, it->second, &a->mem1, a->tmpv); for (i = 0; i < a->mem1.n; ++i) { bwtintv_t *p = &a->mem1.a[i]; int slen = (uint32_t)p->info - (p->info>>32); // seed length if (slen >= opt->min_seed_len) kv_push(bwtintv_t, a->mem, *p); } } else ++x; }
// Rebuilds `present` with the index of every kmer marked as seen in `kmerString`.
// A position holding '!' means the kmer at that index does not occur in the query.
// The final position is skipped on purpose: it is where kmers containing an N are
// counted. The bound is computed as a signed value so that an empty string yields
// no iterations instead of wrapping around (length() - 1 on an unsigned 0).
static void collectQueryKmers(const std::string& kmerString, std::vector<int>& present) {
    present.clear();
    const int lastIndex = static_cast<int>(kmerString.length()) - 1;
    for (int i = 0; i < lastIndex; i++) {
        if (kmerString[i] != '!') { //this kmer is in the query
            present.push_back(i);
        }
    }
}

// Classifies `seq` and returns its taxonomy string.
//
// Steps:
//   1. Collect the kmers present in the unaligned sequence.
//   2. If `flip` is set and isReversed() says so, reverse-complement `seq`
//      in place, set `flipped`, and collect the kmers again.
//   3. If no kmers were found, report the sequence as bad, set `simpleTax`
//      to "unknown;" and return "unknown;".
//   4. Otherwise pick the most probable taxonomy and bootstrap it using
//      one eighth of the query kmers.
//
// Returns an empty string when m->control_pressed is set after step 4's lookup.
// Any std::exception is reported through m->errorOut and terminates the process.
string Bayesian::getTaxonomy(Sequence* seq) {
    try {
        string tax = "";
        Kmer kmer(kmerSize);
        flipped = false;

        //get words contained in query
        //getKmerString returns a string where the index in the string is the kmer number
        //and the character at that index can be converted to be the number of times that kmer was seen
        string queryKmerString = kmer.getKmerString(seq->getUnaligned());

        vector<int> queryKmers;
        collectQueryKmers(queryKmerString, queryKmers);

        //if user wants to test reverse complement and it is reversed, use that instead
        if (flip && isReversed(queryKmers)) {
            flipped = true;
            seq->reverseComplement();
            queryKmerString = kmer.getKmerString(seq->getUnaligned());
            collectQueryKmers(queryKmerString, queryKmers);
        }

        if (queryKmers.empty()) {
            m->mothurOut(seq->getName() + " is bad. It has no kmers of length " + toString(kmerSize) + ".");
            m->mothurOutEndLine();
            simpleTax = "unknown;";
            return "unknown;";
        }

        int index = getMostProbableTaxonomy(queryKmers);

        if (m->control_pressed) { return tax; }

        //bootstrap - to set confidenceScore
        int numToSelect = static_cast<int>(queryKmers.size() / 8);

        if (m->debug) { m->mothurOut(seq->getName() + "\t"); }

        tax = bootstrapResults(queryKmers, index, numToSelect);

        if (m->debug) { m->mothurOut("\n"); }

        return tax;
    }
    catch(exception& e) {
        m->errorOut(e, "Bayesian", "getTaxonomy");
        exit(1);
    }
}
int main(int argc, char** argv) { Options opt(argc, argv); kmers_map_type kmers_map; MMAPReads readfile(opt.readFName); #pragma omp parallel { #pragma omp single { std::cout << "Using " << omp_get_num_threads() << " threads" << std::endl; } #pragma omp for for(int read_index = opt.read_st; read_index < opt.read_ed; read_index++) { bool orig = readfile.isOrig(read_index); inner_map_type occ_map; std::tr1::shared_ptr<DNASeq> cur_read(new DNASeq(readfile[read_index])); for (int pos = 0; pos <= cur_read->size() - opt.K; pos++) { Kmer kmer(cur_read, pos, opt.K); if (occ_map.find(kmer) == occ_map.end() || !orig) { occ_map[kmer] = KmerOccurrence(read_index, pos); } } for(inner_map_type::iterator it = occ_map.begin(); it != occ_map.end(); it++) { kmers_map_type::const_accessor ac; if (kmers_map.find(ac, it->first)) { if (ac->second.size() > KmerHashMap::MaxBinSize) { continue; } } else { kmers_map.insert(ac, it->first); } ac.release(); kmers_map_type::accessor acw; if (kmers_map.find(acw, it->first)) { acw->second.push_back(it->second); } else { std::cout << "ERROR: read is not found" << std::endl; } acw.release(); } } } dump_hash(kmers_map, opt.fpre, opt.fsuf); return 0; }
//******************************************************************************************************************** int Bayesian::generateWordPairDiffArr(){ try{ Kmer kmer(kmerSize); for (int i = 0; i < WordPairDiffArr.size(); i++) { int reversedWord = kmer.getReverseKmerNumber(i); WordPairDiffArr[i].reverseProb = WordPairDiffArr[reversedWord].prob; } return 0; }catch(exception& e) { m->errorOut(e, "Bayesian", "generateWordPairDiffArr"); exit(1); } }
// Converts a 2-bit-per-base packed integer back into a k-mer string.
// The lowest two bits of `intval` give the LAST base of the k-mer; each following
// pair of bits gives the base one position to the left. Codes are translated
// through the _int_to_base table. Returns a string of exactly `kmer_length` chars.
string decode_kmer_from_intval(kmer_int_type_t intval, unsigned int kmer_length) {

    string decoded(kmer_length, ' ');

    // Walk the output right-to-left, consuming two bits per position.
    for (unsigned int remaining = kmer_length; remaining > 0; remaining--) {
        int code = intval & 3ll;
        decoded[remaining - 1] = _int_to_base[code];
        intval = intval >> 2;
    }

    return decoded;
}
/*
 * Reads an integer k and a text token from "rosalind_ba3a.txt", echoes them,
 * then prints every k-mer returned by kmer(k, txt), one per line.
 *
 * Returns 0 on success and 1 when the file cannot be opened, the input cannot
 * be parsed, k is outside 1..strlen(txt), or kmer() returns NULL.
 */
int main(void)
{
    FILE *fp = fopen("rosalind_ba3a.txt", "r");
    if (fp == NULL) {
        printf("File open failed!");
        return 1;
    }

    int status = 0;
    int k = 0;
    char txt[TXTLEN];
    txt[0] = '\0';

    /* fscanf returns the number of items stored; anything other than 1 means
     * k or txt was not filled in and must not be used.
     * "%9s" stores at most 9 characters plus the terminator -- assumes
     * TXTLEN >= 10; TODO confirm against the TXTLEN definition. */
    if (fscanf(fp, "%d", &k) != 1 || fscanf(fp, "%9s", txt) != 1) {
        printf("Input parse failed!");
        status = 1;
    } else {
        printf("%d %s\n", k, txt);

        size_t len = strlen(txt);
        if (k < 1 || (size_t)k > len) {
            /* Without this guard len - k + 1 wraps around in unsigned
             * arithmetic and the loops below run far past the array. */
            printf("Invalid k!");
            status = 1;
        } else {
            size_t count = len - (size_t)k + 1;  /* number of k-mers in txt */

            /* kmer() is defined elsewhere; its result and each element are
             * released with free() below. NOTE(review): presumably it returns
             * NULL on allocation failure -- confirm. */
            char **kmers = kmer(k, txt);
            if (kmers == NULL) {
                printf("K-mer generation failed!");
                status = 1;
            } else {
                for (size_t i = 0; i < count; i++) {
                    printf("%s\n", kmers[i]);
                }
                for (size_t i = 0; i < count; i++) {
                    free(kmers[i]);
                }
                free(kmers);
            }
        }
    }

    if (fclose(fp) != 0) {
        printf("File close failed!");
    }

    return status;
}
// Test case: builds a succinct de Bruijn graph (k = 4) from the example string
// "TACGTCGACGACT" and checks navigation (forward/backward/outgoing/incoming),
// degrees, node labels and the node count against hard-coded expectations, then
// cross-checks the succinct graph against debruijn_basic via debruijn_comparer.
void operator() () {
    section("simple debruijn test based on the example data in the paper");

    std::string test = "TACGTCGACGACT";
    std::string alphabet = "$ACGT";

    // Slide a 4-character window over "$$$" + test + "$", collecting each k-mer.
    std::vector<std::string> kmers;
    std::string kmer("$$$$");
    for (std::string::const_iterator base = test.begin(); base != test.end(); ++base) {
        // shift kmer to the left by one
        kmer.erase(0,1);
        // append the next nucleotide
        kmer.push_back(*base);
        // and store
        kmers.push_back(kmer);
    }
    // and shift it once more and add the $
    kmer.erase(0,1);
    kmer.push_back('$');
    kmers.push_back(kmer);

    // build the graph!
    debruijn_succinct db(4,alphabet,kmers.begin(),kmers.end());

    // and .. test it
    // Each label names the call actually being checked so a failure report
    // points at the right query.
    check_equal(5UL, db.forward(2), "forward(2)");
    check_equal(2UL, db.backward(5), "backward(5)");
    check_equal(7UL, db.backward(1), "backward(1)");
    check_equal(3UL, db.backward(7), "backward(7)");
    check_equal(db.no_node, db.backward(0), "backward(0)");
    check_equal(2, db.outdegree(6), "outdegree(6)");
    check_equal(1, db.outdegree(0), "outdegree(0)");
    check_equal(12UL, db.forward(8), "forward(8)");
    check_equal(10UL, db.outgoing(6,'T'), "outgoing(6,T)");

    const unsigned long forward_expected[13] = {
        10,                 //0  $$$T  -> $$TA
        4,                  //1  CGAC  -> GACT
        5,                  //2  $TAC  -> TACG-
        8,                  //3  GACG  -> ACGT
        11,                 //4  GACT  -> ACT$
        8,                  //5  TACG- -> ACGT
        9,                  //6  GTCG  -> TCGA-
        1,                  //7  ACGA  -> CGAC
        12,                 //8  ACGT  -> CGTC
        1,                  //9  TCGA  -> CGAC
        2,                  //10 $$TA  -> $TAC
        (unsigned long)-1,  //11 ACT$  -> -1
        6,                  //12 CGTC  -> GTCG
    };
    for (int edge = 0; edge < 13; edge++)
        check_equal(forward_expected[edge], db.forward(edge), "forward(i)");

    check_equal(std::string("$$$"), db.label(0), "label(0)");
    check_equal(std::string("CGA"), db.label(1), "label(1)");
    check_equal(std::string("$TA"), db.label(2), "label(2)");
    check_equal(std::string("GAC"), db.label(3), "label(3)");
    check_equal(std::string("TAC"), db.label(4), "label(4)");
    check_equal(std::string("GTC"), db.label(5), "label(5)");
    check_equal(std::string("ACG"), db.label(6), "label(6)");
    check_equal(std::string("TCG"), db.label(7), "label(7)");
    check_equal(std::string("$$T"), db.label(8), "label(8)");
    check_equal(std::string("ACT"), db.label(9), "label(9)");
    check_equal(std::string("CGT"), db.label(10), "label(10)");

    check_equal(0,db.indegree(0), "indegree(0)");
    check_equal(2,db.indegree(1), "indegree(1)");
    check_equal(2,db.indegree(6), "indegree(6)");
    check_equal(1,db.indegree(2), "indegree(2)");
    check_equal(4UL,db.incoming(6,'T'), "incoming(6,T)");
    check_equal(11UL, db.num_nodes(), "num_nodes");

    section("test succinct against basic debruijn");
    debruijn_basic db_basic(4,kmers.begin(),kmers.end());
    debruijn_comparer dc(db_basic,db,alphabet);
    dc.run(this);
}