/**
 * Write every reference sequence recoverable from the index to fout, one
 * FASTA record per reference.
 *
 * The text produced by ebwt.restore() is walked position by position.  Each
 * position is passed to ebwt.joinedToTextOff(), which fills in a reference
 * index, an offset inside that reference and the reference length.
 * Offsets that are skipped between two consecutive characters are emitted
 * as runs of 'N', and every sequence is padded with 'N' up to the reported
 * reference length before it is handed to print_fasta_record().
 *
 * @param fout  stream the FASTA records are written to
 * @param ebwt  index to read reference names and text from
 */
static void print_index_sequences(ostream& fout, Ebwt& ebwt)
{
	// Marker meaning "no reference / no offset yet"
	const uint32_t kNone = 0xffffffff;

	EList<string>& names = ebwt.refnames();
	TStr joined;
	ebwt.restore(joined);
	const size_t joined_len = joined.length();

	uint32_t ref_idx  = kNone; // reference currently being assembled
	uint32_t ref_len  = kNone; // length reported for that reference
	uint32_t prev_off = 0;     // offset of the last character appended
	bool at_start = true;      // nothing appended to this reference yet
	string seq;

	for(size_t pos = 0; pos < joined_len; ++pos) {
		uint32_t tidx    = kNone;
		uint32_t textoff = kNone;
		uint32_t tlen    = kNone;
		bool straddled = false;
		ebwt.joinedToTextOff(1 /* qlen */, (uint32_t)pos, tidx, textoff, tlen, true, straddled);

		// Skip positions that did not resolve to a character inside a reference
		if(tidx == kNone || textoff >= tlen) {
			continue;
		}

		if(tidx != ref_idx) {
			// Crossed into another reference: flush the finished one first
			if(ref_idx != kNone) {
				// Add trailing gaps, if any exist
				if(seq.length() < ref_len) {
					seq.resize(ref_len, 'N');
				}
				print_fasta_record(fout, names[ref_idx], seq);
			}
			ref_idx  = tidx;
			ref_len  = tlen;
			prev_off = 0;
			at_start = true;
			seq.clear();
		}

		// For the first character of a reference prev_off is 0 even though
		// nothing sits at offset 0 yet; bumping the offset by one makes the
		// gap below come out as exactly `textoff` leading Ns.
		const uint32_t adj = (at_start && textoff > 0) ? textoff + 1 : textoff;
		const uint32_t step = adj - prev_off;
		if(step > 1) {
			seq.append(step - 1, 'N');
		}
		seq.push_back(joined[pos]);

		prev_off = textoff;
		at_start = false;
	}

	// Flush the last reference that was being assembled
	if(ref_idx < names.size()) {
		// Add trailing gaps, if any exist
		if(seq.length() < ref_len) {
			seq.resize(ref_len, 'N');
		}
		print_fasta_record(fout, names[ref_idx], seq);
	}
}
/**
 * Count the k-mers (k = 32) found in the reference text stored in the index
 * and return kmer_counter.cardinality().
 *
 * The text from ebwt.restore() is walked position by position, and each
 * character code is shifted into a 64-bit word two bits at a time.  The
 * running word is discarded whenever a new reference starts or offsets are
 * skipped inside a reference (a gap), so that no k-mer spans such a break.
 * Once at least k characters have been accumulated since the last break,
 * the word is added to the counter at every position.
 *
 * NOTE(review): packing assumes every character code fits in two bits --
 * confirm against the TStr alphabet.
 *
 * When TEST_KMER_COUNTING is defined, every word is also stored in a
 * std::set and both counts are reported on stderr.
 *
 * @param ebwt  index to read the reference text from
 * @return      value of kmer_counter.cardinality() after the scan
 */
static uint64_t count_idx_kmers ( Ebwt<index_t>& ebwt)
{
	TStr joined;
	ebwt.restore(joined);
	cerr << "Index loaded" << endl;
#ifdef TEST_KMER_COUNTING
	std::set<uint64_t> my_set;
#endif
	HyperLogLogPlusMinus<uint64_t> kmer_counter(16);

	const uint8_t k = 32;
	uint64_t word = 0;     // last characters, two bits each
	uint64_t run_len = 0;  // characters accumulated since the last break

	TIndexOffU ref_idx  = OFF_MASK; // reference currently being scanned
	TIndexOffU prev_off = 0;        // offset of the previous character
	bool at_start = true;           // no character seen in this reference yet
	const size_t joined_len = joined.length();

	for(size_t pos = 0; pos < joined_len; ++pos) {
		TIndexOffU tidx    = OFF_MASK;
		TIndexOffU textoff = OFF_MASK;
		TIndexOffU tlen    = OFF_MASK;
		bool straddled = false;
		ebwt.joinedToTextOff(1 /* qlen */, (TIndexOffU)pos, tidx, textoff, tlen, true, straddled);

		// Skip positions that did not resolve to a character inside a reference
		if(tidx == OFF_MASK || textoff >= tlen) {
			continue;
		}

		const bool new_ref = (tidx != ref_idx);
		if(new_ref) {
			ref_idx  = tidx;
			prev_off = 0;
			at_start = true;
		}

		// Same offset adjustment as when printing sequences: the first
		// character of a reference at a non-zero offset counts as a gap.
		const TIndexOffU adj = (at_start && textoff > 0) ? textoff + 1 : textoff;
		const bool gap = (adj - prev_off) > 1;

		if(new_ref || gap) {
			// End of the sequence or an N - reset word and counter
			word = 0;
			run_len = 0;
		}

		// Shift the oldest two bits off the word and append this character
		const int bp = (int)joined[pos];
		word = (word << 2) | bp;
		++run_len;
		//cerr << "[" << pos << "; " << run_len << "; " << word << ":" << kmer_counter.cardinality() << "]" << endl;

		if(run_len >= k) {
			kmer_counter.add(word);
#ifdef TEST_KMER_COUNTING
			my_set.insert(word);
			cerr << " " << kmer_counter.cardinality() << " vs " << my_set.size() << endl;
#endif
		}

		prev_off = textoff;
		at_start = false;
	}

	// The word still held after the scan is added once more
	if(run_len >= k) {
		kmer_counter.add(word);
#ifdef TEST_KMER_COUNTING
		my_set.insert(word);
#endif
	}

#ifdef TEST_KMER_COUNTING
	cerr << "Exact count: " << my_set.size() << endl;
#endif
	return kmer_counter.cardinality();
}
static void print_index_sequences(ostream& fout, Ebwt<index_t>& ebwt) { EList<string>* refnames = &(ebwt.refnames()); TStr cat_ref; ebwt.restore(cat_ref); HyperLogLogPlusMinus<uint64_t> kmer_counter; TIndexOffU curr_ref = OFF_MASK; string curr_ref_seq = ""; TIndexOffU curr_ref_len = OFF_MASK; TIndexOffU last_text_off = 0; size_t orig_len = cat_ref.length(); TIndexOffU tlen = OFF_MASK; bool first = true; for(size_t i = 0; i < orig_len; i++) { TIndexOffU tidx = OFF_MASK; TIndexOffU textoff = OFF_MASK; tlen = OFF_MASK; bool straddled = false; ebwt.joinedToTextOff(1 /* qlen */, (TIndexOffU)i, tidx, textoff, tlen, true, straddled); if (tidx != OFF_MASK && textoff < tlen) { if (curr_ref != tidx) { if (curr_ref != OFF_MASK) { // Add trailing gaps, if any exist if(curr_ref_seq.length() < curr_ref_len) { curr_ref_seq += string(curr_ref_len - curr_ref_seq.length(), 'N'); } print_fasta_record(fout, (*refnames)[curr_ref], curr_ref_seq); } curr_ref = tidx; curr_ref_seq = ""; curr_ref_len = tlen; last_text_off = 0; first = true; } TIndexOffU textoff_adj = textoff; if(first && textoff > 0) textoff_adj++; if (textoff_adj - last_text_off > 1) curr_ref_seq += string(textoff_adj - last_text_off - 1, 'N'); curr_ref_seq.push_back("ACGT"[int(cat_ref[i])]); last_text_off = textoff; first = false; } } if (curr_ref < refnames->size()) { // Add trailing gaps, if any exist if(curr_ref_seq.length() < curr_ref_len) { curr_ref_seq += string(curr_ref_len - curr_ref_seq.length(), 'N'); } print_fasta_record(fout, (*refnames)[curr_ref], curr_ref_seq); } }
void print_index_sequences( ostream& fout, Ebwt<TStr>& ebwt, const BitPairReference& refs) { vector<string>* refnames = &(ebwt.refnames()); TStr cat_ref; ebwt.restore(cat_ref); TIndexOffU curr_ref = OFF_MASK; string curr_ref_seq = ""; TIndexOffU curr_ref_len = OFF_MASK; uint32_t last_text_off = 0; size_t orig_len = seqan::length(cat_ref); TIndexOffU tlen = OFF_MASK; bool first = true; for(size_t i = 0; i < orig_len; i++) { TIndexOffU tidx = OFF_MASK; TIndexOffU textoff = OFF_MASK; tlen = OFF_MASK; ebwt.joinedToTextOff(1 /* qlen */, (TIndexOffU)i, tidx, textoff, tlen); if (tidx != OFF_MASK && textoff < tlen) { if (curr_ref != tidx) { if (curr_ref != OFF_MASK) { // Add trailing gaps, if any exist if(curr_ref_seq.length() < curr_ref_len) { curr_ref_seq += string(curr_ref_len - curr_ref_seq.length(), 'N'); } print_fasta_record(fout, (*refnames)[curr_ref], curr_ref_seq); } curr_ref = tidx; curr_ref_seq = ""; curr_ref_len = tlen; last_text_off = 0; first = true; } TIndexOffU textoff_adj = textoff; if(first && textoff > 0) textoff_adj++; if (textoff_adj - last_text_off > 1) curr_ref_seq += string(textoff_adj - last_text_off - 1, 'N'); curr_ref_seq.push_back(getValue(cat_ref,i)); last_text_off = textoff; first = false; } } if (curr_ref < refnames->size()) { // Add trailing gaps, if any exist if(curr_ref_seq.length() < curr_ref_len) { curr_ref_seq += string(curr_ref_len - curr_ref_seq.length(), 'N'); } print_fasta_record(fout, (*refnames)[curr_ref], curr_ref_seq); } }