void verify_node_orig(kg_node_t * node, unsigned kmer_length) { assert( false && "TODO FIX! REVERSED KMER ENDIANNESS" ); int double_kmer_length = kmer_length << 1; #ifdef LARGE_KMERS Kmer mask; mask.createMask(double_kmer_length); #else Kmer mask = (Kmer(1) << double_kmer_length) - 1; #endif Kmer kmer = node->kmer; Kmer rc_kmer = reverseComplement(kmer, kmer_length); char leftmost_base = (kmer >> (double_kmer_length - 2)) & 0x3; char rightmost_base = kmer & 0x3; for (int i = 0 ; i < 4 ; ++ i) { // check on the left side kg_node_t * node2 = node->left[i]; int count = node->left_count[i]; if (node2) { assert (count != 0); if (count > 0) { Kmer kmer2 = KMER_PREPEND(kmer, i, double_kmer_length, mask); assert(kmer2 == node2->kmer); assert(node2->right[(int)rightmost_base] == node); assert(node2->right_count[(int)rightmost_base] == count); } else { Kmer kmer2 = KMER_APPEND(rc_kmer, i ^ 0x3, double_kmer_length, mask); assert(kmer2 == node2->kmer); assert(node2->left[rightmost_base ^ 0x3] == node); assert(node2->left_count[rightmost_base ^ 0x3] == count); } } else { assert (count == 0); } // check on the right side node2 = node->right[i]; count = node->right_count[i]; if (node2) { assert (count != 0); if (count > 0) { Kmer kmer2 = KMER_APPEND(kmer, i, double_kmer_length, mask); assert(kmer2 == node2->kmer); assert(node2->left[(int)leftmost_base] == node); assert(node2->left_count[(int)leftmost_base] == count); } else { Kmer kmer2 = KMER_PREPEND(rc_kmer, i ^ 0x3, double_kmer_length, mask); assert(kmer2 == node2->kmer); assert(node2->right[leftmost_base ^ 0x3] == node); assert(node2->right_count[leftmost_base ^ 0x3] == count); } } else { assert (count == 0); } } }
void printTranscripts(const KmerIndex& index, std::stringstream& o, const std::string s, const std::vector<std::pair<KmerEntry,int>>& v, const std::vector<int> u) { Kmer km; KmerEntry val; int p; // find first mapping k-mer if (!v.empty()) { p = findFirstMappingKmer(v,val); km = Kmer((s.c_str()+p)); } for (int i = 0; i < u.size(); i++) { int tr = u[i]; if (i > 0) { o << ";"; } std::pair<int, bool> xp = index.findPosition(tr, km, val, p); o << "(" << index.target_names_[tr] << "," << xp.first << ","; if (xp.second) { o << "FW)"; } else { o << "RC)"; } } }
/**
 * Walks over every k-mer of `nucls` and calls DeleteIfEqual() for each one
 * together with edge `e`.
 *
 * The first k-mer is built from the leading k nucleotides; each following
 * k-mer is obtained by shifting the next nucleotide into the key.
 *
 * @param nucls  nucleotide sequence; must contain at least k nucleotides
 * @param e      edge handed to DeleteIfEqual for every k-mer
 */
void DeleteKMers(const Sequence &nucls, EdgeId e) {
    VERIFY(nucls.size() >= index_.k());

    const auto k = index_.k();
    const size_t total = nucls.size();

    // Key for the first window [0, k).
    KeyWithHash kwh = index_.ConstructKWH(Kmer(k, nucls));
    DeleteIfEqual(kwh, e);

    // Slide the window one nucleotide at a time over the rest of the sequence.
    size_t pos = k;
    while (pos < total) {
        kwh <<= nucls[pos];
        DeleteIfEqual(kwh, e);
        ++pos;
    }
}
void verify_node(KmerNode * node, KmerGraph *hashtable, unsigned kmer_length) { int double_kmer_length = kmer_length << 1; #ifdef LARGE_KMERS Kmer mask; mask.createMask(double_kmer_length); #else Kmer mask = (Kmer(1) << double_kmer_length) - 1; #endif Kmer kmer = node->kmer; Kmer rc_kmer = reverseComplement(kmer, kmer_length); char rightmost_base = KMER_GET_TAIL_BASE(kmer, kmer_length); char leftmost_base = KMER_GET_HEAD_BASE(kmer, kmer_length); KmerNode *node2; for (int i = 0 ; i < 4 ; ++ i) { // check on the left side int count = node->left_count[i]; int color = node->left_color[i]; assert( color == 0 || count != 0 ); // count must be non-zero if color is non-zero if (color == 0) { if (count > 0) { Kmer kmer2 = KMER_PREPEND(kmer, i, double_kmer_length, mask); node2 = hashtable->findNode(canonicalKmer(kmer2, kmer_length)); assert( node2 != NULL ); assert(cnorm(node2->right_count[static_cast<int>(rightmost_base)]) == cnorm(count)); assert(node2->right_color[static_cast<int>(rightmost_base)] == 0); } else if (count < 0) { Kmer kmer2 = KMER_APPEND(rc_kmer, i ^ 0x3, double_kmer_length); node2 = hashtable->findNode(canonicalKmer(kmer2, kmer_length)); assert( node2 != NULL ); assert(cnorm(node2->left_count[static_cast<int>( COMPLEMENT(rightmost_base) )]) == cnorm(count)); assert(node2->left_color[static_cast<int>( COMPLEMENT(rightmost_base) )] == 0); } } // check on the right side count = node->right_count[i]; color = node->right_color[i]; assert( color == 0 || count != 0 ); // count must be non-zero if color is non-zero if (color == 0) { if (count > 0) { Kmer kmer2 = KMER_APPEND(kmer, i, double_kmer_length); node2 = hashtable->findNode(canonicalKmer(kmer2, kmer_length)); assert( node2 != NULL ); assert(cnorm(node2->left_count[static_cast<int>(leftmost_base)]) == cnorm(count)); assert(node2->left_color[static_cast<int>(leftmost_base)] == 0); } else if (count < 0) { Kmer kmer2 = KMER_PREPEND(rc_kmer, i ^ 0x3, double_kmer_length, mask); node2 = 
hashtable->findNode(canonicalKmer(kmer2, kmer_length)); assert( node2 != NULL ); assert(cnorm(node2->right_count[static_cast<int>( COMPLEMENT(leftmost_base) )]) == cnorm(count)); assert(node2->right_color[static_cast<int>( COMPLEMENT(leftmost_base) )] == 0); } } } }
void testSimpleInsertACAutomaton() { PointerACAutomaton<Kmer> aho; Kmer count_acag = Kmer("ACAG"); count_acag.count = 5; Kmer count_caga = Kmer("CAGA"); count_caga.count = 2; Kmer count_caca = Kmer("CACA"); count_caca.count = 3; Kmer count_gca = Kmer("GCA"); count_gca.count = 1; aho.insert("ACAG", count_acag); aho.insert("CAGA", count_caga); aho.insert("CACA", count_caca); aho.insert("GCA", count_gca); aho.build_failure_functions(); pointer_state<Kmer> *state_ac = aho.goto_state("ac"); pointer_state<Kmer> *state_aca = aho.goto_state("aca"); pointer_state<Kmer> *state_c = aho.goto_state("c"); pointer_state<Kmer> *state_g = aho.goto_state("g"); pointer_state<Kmer> *state_cag = aho.goto_state("cag"); TAP_TEST(aho.getInitialState()->transitions[T] == aho.getInitialState() && aho.getInitialState()->transitions[N] == aho.getInitialState(), TEST_AC_TRANSITIONS, ""); TAP_TEST(state_ac->transitions[A] == state_aca, TEST_AC_TRANSITIONS, ""); TAP_TEST(state_ac->transitions[C] == state_c, TEST_AC_TRANSITIONS, ""); TAP_TEST(state_aca->transitions[A] == state_g->transitions[A], TEST_AC_TRANSITIONS, ""); TAP_TEST(state_c->transitions[C] == state_c, TEST_AC_TRANSITIONS, ""); TAP_TEST(state_g->transitions[G] == state_g, TEST_AC_TRANSITIONS, ""); TAP_TEST(state_g->transitions[C]->transitions[C] == state_c, TEST_AC_TRANSITIONS, ""); TAP_TEST(state_cag->transitions[G] == state_g, TEST_AC_TRANSITIONS, ""); TAP_TEST(state_cag->transitions[A]->is_final, TEST_AC_FINAL, ""); string caga = "caga"; string caca = "caca"; string acag = "acag"; TAP_TEST(aho.get(caga).count == 2, TEST_AC_GET, ""); TAP_TEST(aho.get(caca).count == 3, TEST_AC_GET, ""); TAP_TEST(aho.get(acag).count == 5, TEST_AC_GET, ""); }
void dump_hash(const kmers_map_type& kmers_map, const char* prefix, const char* suffix) { FILE* fout; std::ostringstream fname; fname << prefix << suffix << ".hash"; fout = fopen(fname.str().c_str(), "wb"); FILE* findexout; std::ostringstream findexname; findexname << prefix << suffix << ".index"; findexout = fopen(findexname.str().c_str(), "wb"); // compute index std::vector<std::tr1::tuple<Kmer, const occ_list_type* > > index; index.push_back(std::tr1::make_tuple(Kmer("Z"), (occ_list_type*) 0)); for (kmers_map_type::const_iterator it = kmers_map.begin(); it != kmers_map.end(); it++) { index.push_back(std::tr1::make_tuple(it->first, &(it->second))); } // Remove the (Kmer("Z"),0) tuple. index.erase(index.begin()); std::sort(index.begin(), index.end(), index_less()); // Write out index. // Format: kmer string, number of reads with kmer // Writes out: // num kmers in block, (kmer, num reads with kmer)+ const unsigned int nKmer = index.size(); fwrite(&nKmer, sizeof(unsigned int), 1, findexout); // number of kmers for (unsigned int kid = 0; kid < nKmer; kid++) { // index of kmer const char* kmer = std::tr1::get<0>(index[kid]).getKmer().c_str(); unsigned int nread = std::tr1::get<1>(index[kid])->size(); fwrite(kmer, sizeof(char), strlen(kmer) + 1, findexout); fwrite(&nread, sizeof(unsigned int), 1, findexout); } // Write out reads. // Format: read ID, hit position // Writes out: // read id, pos within read for (unsigned int kid = 0; kid < nKmer; kid++) { const Kmer& kmer = std::tr1::get<0>(index[kid]); const occ_list_type& occ_list = *std::tr1::get<1>(index[kid]); for (occ_list_type::const_iterator it = occ_list.begin(); it != occ_list.end(); it++) { const unsigned int& readid = it->readID; const unsigned int& pos = it->pos; fwrite(&readid, sizeof(unsigned int), 1, fout); fwrite(&pos, sizeof(unsigned int), 1, fout); } } fclose(fout); fclose(findexout); }
/**
 * Loads a CountDB from a binary file.
 *
 * File layout, as read here:
 *   uint32_t     k-mer size
 *   size_t       number of entries N
 *   Kmer[N]      raw k-mer values
 *   AtomicCount[N] raw counts
 *
 * Every field is read as a raw memory image, so the file must have been
 * produced with the same type sizes and byte order as the reading program.
 *
 * @param fname  path of the file to read
 * @return the database built from the k-mers and counts in the file
 * @throws std::ios_base::failure if the file cannot be opened or ends before
 *         all announced data has been read
 */
static CountDB fromFile( const std::string& fname ) {
    std::ifstream in(fname, std::ios::in | std::ios::binary );
    if ( !in ) {
        throw std::ios_base::failure( "CountDB::fromFile: cannot open '" + fname + "'" );
    }

    // Header: k-mer size followed by the number of entries.
    uint32_t merSize = 0;
    in.read( reinterpret_cast<char*>(&merSize), sizeof(merSize) );
    size_t numCounts = 0;
    in.read( reinterpret_cast<char*>(&numCounts), sizeof(size_t) );
    if ( !in ) {
        // Without this check a short file would leave the sizes undefined and
        // drive the allocations below with garbage.
        throw std::ios_base::failure( "CountDB::fromFile: truncated header in '" + fname + "'" );
    }

    auto kmers = std::make_shared<std::vector<Kmer>>(numCounts, Kmer(0));
    std::vector<AtomicCount> counts(numCounts);

    // Payload. Skipped for an empty database so no element of an empty
    // vector is ever addressed.
    if ( numCounts > 0 ) {
        in.read( reinterpret_cast<char*>(kmers->data()), sizeof(Kmer) * numCounts );
        in.read( reinterpret_cast<char*>(counts.data()), sizeof(AtomicCount) * numCounts );
        if ( !in ) {
            throw std::ios_base::failure( "CountDB::fromFile: truncated data in '" + fname + "'" );
        }
    }
    in.close();

    CountDB index(kmers, merSize);
    index._counts = std::move(counts);
    return index;
}
bool checkMapability(const KmerIndex& index, const std::string &s, const std::vector<std::pair<KmerEntry,int>>& v, std::vector<int> &u) { const int maxMismatch = 2; const int maxSoftclip = 5; Kmer km; KmerEntry val; int p; if (!v.empty()) { p = findFirstMappingKmer(v,val); km = Kmer(s.c_str()+p); } else { return false; } std::vector<int> vtmp; vtmp.reserve(u.size()); for (auto tr : u) { auto trpos = index.findPosition(tr, km, val, p); int tpos = (int)trpos.first; int sz = (int)s.size(); bool add = true; if (trpos.second) { if (tpos < 1 || tpos + sz - 1 > index.target_seqs_[tr].size()) { add = false; } else { //std::cout << index.target_seqs_[tr].substr(tpos,sz) << std::endl; //std::cout << s << std::endl; int mis = 0; for (int i = 0; i < sz - maxSoftclip; i++) { if (index.target_seqs_[tr][tpos-1 + i] != s[i]) { ++mis; if (mis > maxMismatch) { break; } } } add = (mis <= maxMismatch); } } else { if (tpos > index.target_seqs_[tr].size() || tpos - sz < 1) { add = false; } else { std::string rs = revcomp(s); //std::cout << index.target_seqs_[tr].substr(tpos - sz, sz) << std::endl; //std::cout << rs << std::endl; int mis = 0; for (int i = sz-1; i >= maxSoftclip; i--) { if (index.target_seqs_[tr][tpos-sz+i] != rs[sz]) { ++mis; if (mis > maxMismatch) { break; } } } add = (mis <= maxMismatch); } } if (add) { vtmp.push_back(tr); } } if (vtmp.empty()) { return false; } if (vtmp.size() < u.size()) { u = vtmp; // copy } return true; }