예제 #1
0
/**
 * Rebuild every reference sequence held in the index and emit each one to
 * 'fout' through print_fasta_record(), using the matching entry of
 * ebwt.refnames() as the record name.
 *
 * The joined text restored from the index is walked one position at a time.
 * Wherever the offsets reported for consecutive positions of one reference
 * skip ahead, and wherever the collected sequence ends up shorter than the
 * reported reference length, the missing positions are filled with 'N'.
 *
 * NOTE(review): joinedToTextOff() is assumed to report (reference index,
 * offset within that reference, reference length) for a joined-text position
 * and to leave the index at 0xffffffff when the position belongs to no
 * reference -- confirm against its definition.
 *
 * @param fout  stream that receives the FASTA records
 * @param ebwt  index to reconstruct the references from
 */
static void print_index_sequences(ostream& fout, Ebwt& ebwt)
{
    const uint32_t UNSET = 0xffffffff;  // "no value yet" marker for indices/lengths

    EList<string>& names = ebwt.refnames();

    TStr joined;
    ebwt.restore(joined);
    const size_t joined_len = joined.length();

    // State of the reference currently being assembled.
    uint32_t active_ref = UNSET;   // index of the reference, UNSET before the first one
    uint32_t active_len = UNSET;   // length reported for that reference
    string   seq;                  // characters (and 'N' padding) gathered so far
    uint32_t prev_off   = 0;       // offset of the previously appended character
    bool     at_start   = true;    // true until the first character of a reference is appended

    for(size_t pos = 0; pos < joined_len; ++pos) {
        uint32_t ref_idx = UNSET;
        uint32_t ref_off = UNSET;
        uint32_t ref_len = UNSET;
        bool straddled = false;
        ebwt.joinedToTextOff(1 /* qlen */, (uint32_t)pos, ref_idx, ref_off, ref_len, true, straddled);

        // Ignore positions that were not resolved to a spot inside a reference.
        if(ref_idx == UNSET || ref_off >= ref_len) {
            continue;
        }

        if(ref_idx != active_ref) {
            // Crossed into another reference: finish the previous one, if any.
            if(active_ref != UNSET) {
                // Pad with trailing gaps up to the reported length.
                if(seq.length() < active_len) {
                    seq.append(active_len - seq.length(), 'N');
                }
                print_fasta_record(fout, names[active_ref], seq);
            }
            active_ref = ref_idx;
            active_len = ref_len;
            seq.clear();
            prev_off = 0;
            at_start = true;
        }

        // For the first character of a reference the offset is bumped by one
        // so that the "- 1" below yields exactly ref_off leading gaps.
        uint32_t gap_end = ref_off;
        if(at_start && ref_off > 0) {
            gap_end++;
        }
        if(gap_end - prev_off > 1) {
            seq.append(gap_end - prev_off - 1, 'N');
        }

        seq.push_back(joined[pos]);
        prev_off = ref_off;
        at_start = false;
    }

    // Emit the reference that was still being assembled when the text ended.
    if(active_ref < names.size()) {
        if(seq.length() < active_len) {
            seq.append(active_len - seq.length(), 'N');
        }
        print_fasta_record(fout, names[active_ref], seq);
    }
}
예제 #2
0
/**
 * Estimate how many distinct 32-mers occur in the reference text stored in
 * the index.
 *
 * The joined text restored from the index is scanned position by position.
 * Each character code is shifted into a 64-bit word two bits at a time; once
 * at least k = 32 characters have been accumulated without interruption the
 * word is handed to a HyperLogLogPlusMinus counter. The accumulation restarts
 * whenever the scan moves to another reference or the reported offsets skip
 * ahead within a reference.
 *
 * NOTE(review): assumes every element of the restored text is a code in
 * 0..3, so that two bits per character suffice -- confirm. joinedToTextOff()
 * is assumed to report (reference index, offset in reference, reference
 * length) and to leave the index at OFF_MASK for unresolved positions.
 *
 * When TEST_KMER_COUNTING is defined, the words are additionally stored in a
 * std::set and the exact count is written to cerr for comparison.
 *
 * @param ebwt  index whose reference text is scanned
 * @return      kmer_counter.cardinality() after the scan
 */
static uint64_t count_idx_kmers(Ebwt<index_t>& ebwt)
{
	TStr joined;
	ebwt.restore(joined);
	cerr << "Index loaded" << endl;
#ifdef TEST_KMER_COUNTING
	std::set<uint64_t> exact_kmers;
#endif

	HyperLogLogPlusMinus<uint64_t> kmer_counter(16);
	const uint8_t k = 32;     // 32 characters * 2 bits fill the 64-bit word
	uint64_t kmer = 0;        // rolling word of the most recent characters
	uint64_t run_len = 0;     // characters accumulated since the last restart

	TIndexOffU active_ref = OFF_MASK;  // reference currently being scanned
	TIndexOffU prev_off = 0;           // offset of the previously consumed character
	bool at_start = true;              // true until a reference's first character is consumed
	const size_t joined_len = joined.length();

	for(size_t pos = 0; pos < joined_len; ++pos) {
		TIndexOffU ref_idx = OFF_MASK;
		TIndexOffU ref_off = OFF_MASK;
		TIndexOffU ref_len = OFF_MASK;
		bool straddled = false;
		ebwt.joinedToTextOff(1 /* qlen */, (TIndexOffU)pos, ref_idx, ref_off, ref_len, true, straddled);

		// Ignore positions that were not resolved to a spot inside a reference.
		if(ref_idx == OFF_MASK || ref_off >= ref_len) {
			continue;
		}

		if(ref_idx != active_ref) {
			// Entered another reference: k-mers must not span the boundary.
			active_ref = ref_idx;
			kmer = 0;
			run_len = 0;
			prev_off = 0;
			at_start = true;
		}

		// The first character of a reference gets its offset bumped by one,
		// so any non-zero starting offset counts as a gap below.
		TIndexOffU gap_end = ref_off;
		if(at_start && ref_off > 0) {
			gap_end++;
		}
		if(gap_end - prev_off > 1) {
			// Offsets skipped ahead (a gap in the reference): restart the word.
			kmer = 0;
			run_len = 0;
		}

		// Drop the oldest two bits and append the code of the current character.
		const int bp = (int)joined[pos];
		kmer = (kmer << 2) | bp;
		++run_len;

		if(run_len >= k) {
			kmer_counter.add(kmer);
#ifdef TEST_KMER_COUNTING
			exact_kmers.insert(kmer);
			cerr << " " << kmer_counter.cardinality()  << " vs " << exact_kmers.size() << endl;
#endif
		}

		prev_off = ref_off;
		at_start = false;
	}

	// Whenever this condition holds, the same word was already passed to
	// add() by the last loop iteration that consumed a character; the call is
	// kept as in the original implementation.
	if(run_len >= k) {
		kmer_counter.add(kmer);
#ifdef TEST_KMER_COUNTING
		exact_kmers.insert(kmer);
#endif
	}

#ifdef TEST_KMER_COUNTING
	cerr << "Exact count: " << exact_kmers.size() << endl;
#endif

	return kmer_counter.cardinality();
}
예제 #3
0
/**
 * Rebuild every reference sequence held in the index and emit each one to
 * 'fout' through print_fasta_record(), using the matching entry of
 * ebwt.refnames() as the record name.
 *
 * The joined text restored from the index is walked one position at a time;
 * each element is treated as an index into "ACGT". Wherever the offsets
 * reported for consecutive positions of one reference skip ahead, and
 * wherever the collected sequence ends up shorter than the reported
 * reference length, the missing positions are filled with 'N'.
 *
 * NOTE(review): joinedToTextOff() is assumed to report (reference index,
 * offset within that reference, reference length) for a joined-text position
 * and to leave the index at OFF_MASK when the position belongs to no
 * reference -- confirm against its definition.
 *
 * @param fout  stream that receives the FASTA records
 * @param ebwt  index to reconstruct the references from
 */
static void print_index_sequences(ostream& fout, Ebwt<index_t>& ebwt)
{
	EList<string>* refnames = &(ebwt.refnames());

	TStr cat_ref;
	ebwt.restore(cat_ref);

	TIndexOffU curr_ref = OFF_MASK;      // reference being assembled; OFF_MASK before the first one
	string curr_ref_seq = "";            // characters (and 'N' padding) gathered so far
	TIndexOffU curr_ref_len = OFF_MASK;  // length reported for the current reference
	TIndexOffU last_text_off = 0;        // offset of the previously appended character
	size_t orig_len = cat_ref.length();
	TIndexOffU tlen = OFF_MASK;
	bool first = true;                   // true until a reference's first character is appended
	for(size_t i = 0; i < orig_len; i++) {
		TIndexOffU tidx = OFF_MASK;
		TIndexOffU textoff = OFF_MASK;
		tlen = OFF_MASK;
		bool straddled = false;
		ebwt.joinedToTextOff(1 /* qlen */, (TIndexOffU)i, tidx, textoff, tlen, true, straddled);

		// Only positions resolved to a spot inside a reference contribute.
		if (tidx != OFF_MASK && textoff < tlen)
		{
			if (curr_ref != tidx)
			{
				// Crossed into another reference: finish the previous one, if any.
				if (curr_ref != OFF_MASK)
				{
					// Add trailing gaps, if any exist
					if(curr_ref_seq.length() < curr_ref_len) {
						curr_ref_seq += string(curr_ref_len - curr_ref_seq.length(), 'N');
					}
					print_fasta_record(fout, (*refnames)[curr_ref], curr_ref_seq);
				}
				curr_ref = tidx;
				curr_ref_seq = "";
				curr_ref_len = tlen;
				last_text_off = 0;
				first = true;
			}

			// For the first character of a reference the offset is bumped by
			// one so that the "- 1" below yields exactly textoff leading gaps.
			TIndexOffU textoff_adj = textoff;
			if(first && textoff > 0) textoff_adj++;
			if (textoff_adj - last_text_off > 1)
				curr_ref_seq += string(textoff_adj - last_text_off - 1, 'N');

			// Translate the stored code to its letter. Codes outside 0..3 are
			// written as 'N' rather than indexing past the four letters of
			// "ACGT" (index 4 would yield the literal's NUL terminator).
			const int code = int(cat_ref[i]);
			curr_ref_seq.push_back((code >= 0 && code < 4) ? "ACGT"[code] : 'N');
			last_text_off = textoff;
			first = false;
		}
	}
	// Emit the reference that was still being assembled when the text ended.
	if (curr_ref < refnames->size())
	{
		// Add trailing gaps, if any exist
		if(curr_ref_seq.length() < curr_ref_len) {
			curr_ref_seq += string(curr_ref_len - curr_ref_seq.length(), 'N');
		}
		print_fasta_record(fout, (*refnames)[curr_ref], curr_ref_seq);
	}

}
예제 #4
0
/**
 * Rebuild every reference sequence held in the index and emit each one to
 * 'fout' through print_fasta_record(), using the matching entry of
 * ebwt.refnames() as the record name.
 *
 * The joined text restored from the index is walked one position at a time.
 * Wherever the offsets reported for consecutive positions of one reference
 * skip ahead, and wherever the collected sequence ends up shorter than the
 * reported reference length, the missing positions are filled with 'N'.
 *
 * All offsets are kept in TIndexOffU so that the gap arithmetic stays exact
 * when TIndexOffU is wider than 32 bits.
 *
 * NOTE(review): joinedToTextOff() is assumed to report (reference index,
 * offset within that reference, reference length) for a joined-text position
 * and to leave the index at OFF_MASK when the position belongs to no
 * reference -- confirm against its definition.
 *
 * @param fout  stream that receives the FASTA records
 * @param ebwt  index to reconstruct the references from
 * @param refs  not used by this function body
 */
void print_index_sequences(
    ostream& fout,
    Ebwt<TStr>& ebwt,
    const BitPairReference& refs)
{
    vector<string>* refnames = &(ebwt.refnames());

    TStr cat_ref;
    ebwt.restore(cat_ref);

    TIndexOffU curr_ref = OFF_MASK;      // reference being assembled; OFF_MASK before the first one
    string curr_ref_seq = "";            // characters (and 'N' padding) gathered so far
    TIndexOffU curr_ref_len = OFF_MASK;  // length reported for the current reference
    // Same type as textoff: a narrower type would truncate the stored offset
    // and corrupt the gap computation below.
    TIndexOffU last_text_off = 0;
    size_t orig_len = seqan::length(cat_ref);
    TIndexOffU tlen = OFF_MASK;
    bool first = true;                   // true until a reference's first character is appended
    for(size_t i = 0; i < orig_len; i++) {
        TIndexOffU tidx = OFF_MASK;
        TIndexOffU textoff = OFF_MASK;
        tlen = OFF_MASK;

        ebwt.joinedToTextOff(1 /* qlen */, (TIndexOffU)i, tidx, textoff, tlen);

        // Only positions resolved to a spot inside a reference contribute.
        if (tidx != OFF_MASK && textoff < tlen)
        {
            if (curr_ref != tidx)
            {
                // Crossed into another reference: finish the previous one, if any.
                if (curr_ref != OFF_MASK)
                {
                    // Add trailing gaps, if any exist
                    if(curr_ref_seq.length() < curr_ref_len) {
                        curr_ref_seq += string(curr_ref_len - curr_ref_seq.length(), 'N');
                    }
                    print_fasta_record(fout, (*refnames)[curr_ref], curr_ref_seq);
                }
                curr_ref = tidx;
                curr_ref_seq = "";
                curr_ref_len = tlen;
                last_text_off = 0;
                first = true;
            }

            // For the first character of a reference the offset is bumped by
            // one so that the "- 1" below yields exactly textoff leading gaps.
            TIndexOffU textoff_adj = textoff;
            if(first && textoff > 0) textoff_adj++;
            if (textoff_adj - last_text_off > 1)
                curr_ref_seq += string(textoff_adj - last_text_off - 1, 'N');

            curr_ref_seq.push_back(getValue(cat_ref,i));
            last_text_off = textoff;
            first = false;
        }
    }
    // Emit the reference that was still being assembled when the text ended.
    if (curr_ref < refnames->size())
    {
        // Add trailing gaps, if any exist
        if(curr_ref_seq.length() < curr_ref_len) {
            curr_ref_seq += string(curr_ref_len - curr_ref_seq.length(), 'N');
        }
        print_fasta_record(fout, (*refnames)[curr_ref], curr_ref_seq);
    }

}