static unsigned dbget_raw(struct fheader * h, unsigned busy){ uint32_t idx; if( h->wbit){ idx = search(busy,blist[h->rank],cnk[h->rank]); return twobit_get(h->data,idx); } if (h->sorted){ idx = search(busy,(uint32_t *)(h->data),h->count0); if (idx != -1) return r0[h->value]; idx = search(busy,(uint32_t *)(h->data)+h->count0,h->count1); if (idx != -1) return r1[h->value]; }else { for(idx=0;idx<h->count0;idx++) if(((uint32_t*)(h->data))[idx] == busy) return r0[h->value]; for(;idx<h->count0+h->count1;idx++) if(((uint32_t*)(h->data))[idx] == busy) return r1[h->value]; } if (h->full){ return h->value; } error("Not in db"); }
/*
 * Encode a read as an alignment against the current supercontig motif.
 *
 * E          encoder state
 * spos       offset written to the stream via d_contig_off; asserted to be
 *            smaller than the supercontig motif length
 * strand     written via d_aln_strand; when non-zero each nucleotide is
 *            passed through kmer_comp1() and positions are counted from the
 *            end of the motif
 * query_str  character form of the read, consulted only for 'N' positions
 * query      two-bit form of the read
 *
 * Stream order: record type, per-position N mask, strand, offset, then one
 * conditional symbol per non-N position.
 */
void seqenc_encode_alignment(
        seqenc_t* E,
        uint32_t spos, uint8_t strand,
        const unsigned char* query_str,
        const twobit_t* query)
{
    const size_t query_len = twobit_len(query);
    const size_t motif_len = E->supercontig_motif.n;

    assert(spos < motif_len);

    dist2_encode(E->ac, &E->d_type, SEQENC_TYPE_ALIGNMENT);

    /* encode N mask */
    reserve_nmask(E, query_len);
    for (size_t k = 0; k < query_len; ++k) {
        dist2_encode(E->ac, &E->d_nmask[k], query_str[k] == 'N' ? 1 : 0);
    }

    dist2_encode(E->ac, &E->d_aln_strand, strand);
    uint32_enc_encode(E->ac, &E->d_contig_off, spos);

    /* encode every non-N nucleotide conditioned on its motif position */
    for (size_t k = 0; k < query_len; ++k) {
        if (query_str[k] == 'N') continue;

        kmer_t nuc;
        size_t motif_pos;

        if (strand) {
            nuc       = kmer_comp1(twobit_get(query, k));
            motif_pos = motif_len - (spos + k) - 1;
        }
        else {
            nuc       = twobit_get(query, k);
            motif_pos = spos + k;
        }

        cond_dist4_encode(E->ac, &E->supercontig_motif, motif_pos, nuc);
    }
}
/*
 * Initialise the encoder's supercontig motif from a two-bit sequence.
 *
 * Does nothing when the sequence is empty.  Otherwise the motif is
 * (re)initialised with one position per nucleotide, its update rate is set to
 * motif_update_rate, and for every position the count of the nucleotide
 * observed in `supercontig` is set to contig_motif_prior before the
 * position's distribution is refreshed with dist4_update().
 */
void seqenc_set_supercontig(seqenc_t* E, const twobit_t* supercontig)
{
    const size_t contig_len = twobit_len(supercontig);
    if (contig_len == 0) {
        return;
    }

    cond_dist4_init(&E->supercontig_motif, contig_len);
    cond_dist4_set_update_rate(&E->supercontig_motif, motif_update_rate);

    for (size_t pos = 0; pos < contig_len; ++pos) {
        const kmer_t nuc = twobit_get(supercontig, pos);

        E->supercontig_motif.xss[pos].xs[nuc].count = contig_motif_prior;
        dist4_update(&E->supercontig_motif.xss[pos]);
    }
}
/*
 * Encode a read as a plain nucleotide sequence.
 *
 * E      encoder state
 * x_str  character form of the read, consulted only for 'N' positions
 * x      two-bit form of the read
 *
 * The record type is always written.  An empty read writes nothing else.
 * Nucleotides are consumed two at a time as a 4-bit symbol; each symbol is
 * coded conditioned on a rolling context (previous symbols, masked with
 * E->ctx_mask).  Pairs whose pair index is below prefix_len use the
 * per-position models E->cs0[], all later pairs use E->cs.  A trailing
 * unpaired nucleotide is coded on its own with E->cs.  The per-position
 * N mask is written last.
 */
void seqenc_encode_twobit_seq(seqenc_t* E, const unsigned char* x_str, const twobit_t* x)
{
    dist2_encode(E->ac, &E->d_type, SEQENC_TYPE_SEQUENCE);

    const size_t len = twobit_len(x);
    if (len == 0) {
        return;
    }

    uint32_t context = 0;
    size_t   pos     = 0;
    kmer_t   sym;

    /* leading pairs: one model per pair position */
    while (pos < len - 1 && pos / 2 < prefix_len) {
        sym = (twobit_get(x, pos) << 2) | twobit_get(x, pos + 1);
        cond_dist16_encode(E->ac, &E->cs0[pos / 2], context, sym);
        context = ((context << 4) | sym) & E->ctx_mask;
        pos += 2;
    }

    /* remaining pairs: shared model */
    while (pos < len - 1) {
        sym = (twobit_get(x, pos) << 2) | twobit_get(x, pos + 1);
        cond_dist16_encode(E->ac, &E->cs, context, sym);
        context = ((context << 4) | sym) & E->ctx_mask;
        pos += 2;
    }

    /* handle odd read lengths */
    if (pos < len) {
        sym = twobit_get(x, pos);
        cond_dist16_encode(E->ac, &E->cs, context, sym);
    }

    /* encode N mask */
    reserve_nmask(E, len);
    for (size_t k = 0; k < len; ++k) {
        dist2_encode(E->ac, &E->d_nmask[k], x_str[k] == 'N' ? 1 : 0);
    }
}
int main(int argc, char ** argv){ uint32_t x,b,w,d; unsigned char * array; int rank; int idx = 0; if(argc!=4) panic(); b = getarg(1); w = getarg(2); d = getarg(3); rank = _popc(b); array = malloc_file(ARRAY_SIZE_S(rank),FMODE_RO,DATA_FORMAT,rank); // search index for(x=ALLONE(rank);x!=b;x=_permut(x)) idx++; if ( blist_get(b) != idx ) error("What's f***a with blist\n"); // int res = twobit_get(array+(uint64_t)((w<<rank)|d)*cnk(32,rank)/4,idx); printf("%08X %X %X = %d\n",b,w,d,res); return 0; }
/*
 * Decode a read that was encoded relative to a reference sequence.
 *
 * E       decoder state (E->ref maps sequence names to two-bit references)
 * r       read being reconstructed; r->seqname, r->pos, r->cigar and
 *         r->strand are read, r->seq is overwritten
 * seqlen  number of nucleotides in the read
 *
 * The N mask is decoded first, then the cigar is walked: aligned positions
 * are either copied from the reference or decoded explicitly, inserted and
 * soft-clipped positions are always decoded explicitly.  Positions already
 * marked 'N' consume no symbols.  Finally the read is reverse-complemented
 * when r->strand is set.
 *
 * Errors are reported through quip_error(): unknown reference name, a PAD
 * cigar operation, or a cigar whose read-consuming operations do not add up
 * to exactly `seqlen`.  The length is checked *before* each operation is
 * applied so that a cigar longer than the read can never write past the
 * seqlen + 1 bytes reserved in r->seq.
 *
 * NOTE(review): hard clips advance ref_pos here.  The SAM specification says
 * H consumes neither query nor reference, but
 * seqenc_encode_reference_alignment does the same, so this is kept to stay
 * stream-compatible.
 */
static void seqenc_decode_reference_alignment(seqenc_t* E, short_read_t* r, size_t seqlen)
{
    const twobit_t* refseq = seqmap_get(E->ref, (const char*) r->seqname.s);
    if (refseq == NULL) {
        quip_error(
            "A read was aligned to sequence %s, which was not found in the reference.",
            r->seqname.s);
    }

    str_reserve(&r->seq, seqlen + 1);
    r->seq.n = 0;

    /* decode N mask */
    size_t i;
    reserve_nmask(E, seqlen);
    memset(r->seq.s, '\0', seqlen + 1);
    for (i = 0; i < seqlen; ++i) {
        if (dist2_decode(E->ac, &E->d_nmask[i])) r->seq.s[i] = 'N';
    }

    uint32_t ref_pos  = r->pos;
    uint32_t read_pos = 0;

    size_t j; /* position within the cigar op */
    kmer_t y; /* reference nucleotide */

    for (i = 0; i < r->cigar.n; ++i) { /* i: cigar operation */
        switch (r->cigar.ops[i]) {
            case BAM_CEQUAL:
            case BAM_CDIFF:
            case BAM_CMATCH:
                /* read_pos <= seqlen holds here, so the subtraction cannot wrap */
                if (r->cigar.lens[i] > seqlen - read_pos) {
                    quip_error("Cigar operations do not account for full read length.");
                }

                for (j = 0; j < r->cigar.lens[i]; ++j, ++read_pos, ++ref_pos) {
                    if (r->seq.s[read_pos] == 'N') continue;

                    if (dist2_decode(E->ac, &E->d_ref_match) == SEQENC_REF_MATCH) {
                        y = twobit_get(refseq, ref_pos);
                        r->seq.s[read_pos] = kmertochar[y];
                    }
                    else {
                        r->seq.s[read_pos] =
                            kmertochar[dist4_decode(E->ac, &E->d_ref_ins_nuc)];
                    }
                }
                break;

            /* both consume read only, and every non-N base is coded explicitly */
            case BAM_CINS:
            case BAM_CSOFT_CLIP:
                if (r->cigar.lens[i] > seqlen - read_pos) {
                    quip_error("Cigar operations do not account for full read length.");
                }

                for (j = 0; j < r->cigar.lens[i]; ++j, ++read_pos) {
                    if (r->seq.s[read_pos] == 'N') continue;

                    r->seq.s[read_pos] =
                        kmertochar[dist4_decode(E->ac, &E->d_ref_ins_nuc)];
                }
                break;

            /* these only move the reference cursor */
            case BAM_CDEL:
            case BAM_CREF_SKIP:
            case BAM_CHARD_CLIP:
                ref_pos += r->cigar.lens[i];
                break;

            case BAM_CPAD:
                quip_error("Unsupported cigar operation.");
                break;

            default:
                /* unrecognised operations are ignored, as before */
                break;
        }
    }

    r->seq.s[seqlen] = '\0';
    r->seq.n = seqlen;

    if (read_pos != seqlen) {
        quip_error("Cigar operations do not account for full read length.");
    }

    if (r->strand) str_revcomp(r->seq.s, r->seq.n);
}
/*
 * Encode a read relative to the reference sequence it was aligned to.
 *
 * E  encoder state (E->ref maps sequence names to two-bit references;
 *    E->tmpseq is used as scratch space)
 * r  the aligned read; r->seqname, r->seq, r->pos, r->cigar and r->strand
 *    are read
 *
 * The read is copied into E->tmpseq and reverse-complemented when r->strand
 * is set.  The N mask is written first, then the cigar is walked: for aligned
 * positions a match/mismatch flag is written, followed by the nucleotide on a
 * mismatch; inserted and soft-clipped positions always write the nucleotide.
 * 'N' positions write nothing beyond the mask.
 *
 * Errors are reported through quip_error(): unknown reference name, a PAD
 * cigar operation, or a cigar whose read-consuming operations do not add up
 * to exactly r->seq.n.  The length is checked *before* each operation is
 * applied so that a cigar longer than the read can never index past the end
 * of the sequence buffer.
 *
 * NOTE(review): hard clips advance ref_pos here.  The SAM specification says
 * H consumes neither query nor reference, but
 * seqenc_decode_reference_alignment does the same, so this is kept to stay
 * stream-compatible.
 */
void seqenc_encode_reference_alignment(
        seqenc_t* E, const short_read_t* r)
{
    const twobit_t* refseq = seqmap_get(E->ref, (const char*) r->seqname.s);
    if (refseq == NULL) {
        quip_error(
            "A read was aligned to sequence %s, which was not found in the reference.\n",
            r->seqname.s);
    }

    str_copy(&E->tmpseq, &r->seq);
    if (r->strand) {
        str_revcomp(E->tmpseq.s, E->tmpseq.n);
    }

    /* encode N mask */
    size_t i;
    reserve_nmask(E, r->seq.n);
    for (i = 0; i < r->seq.n; ++i) {
        dist2_encode(E->ac, &E->d_nmask[i], E->tmpseq.s[i] == 'N' ? 1 : 0);
    }

    uint32_t ref_pos  = r->pos;
    uint32_t read_pos = 0;

    size_t j; /* position within the cigar op */
    kmer_t x; /* read nucleotide */
    kmer_t y; /* reference nucleotide */

    for (i = 0; i < r->cigar.n; ++i) { /* i: cigar operation */
        switch (r->cigar.ops[i]) {
            case BAM_CEQUAL:
            case BAM_CDIFF:
            case BAM_CMATCH:
                /* read_pos <= r->seq.n holds here, so the subtraction cannot wrap */
                if (r->cigar.lens[i] > r->seq.n - read_pos) {
                    quip_error("Cigar operations do not account for full read length.");
                }

                for (j = 0; j < r->cigar.lens[i]; ++j, ++read_pos, ++ref_pos) {
                    if (E->tmpseq.s[read_pos] == 'N') continue;

                    x = chartokmer[E->tmpseq.s[read_pos]];
                    y = twobit_get(refseq, ref_pos);

                    if (x == y) {
                        dist2_encode(E->ac, &E->d_ref_match, SEQENC_REF_MATCH);
                    }
                    else {
                        dist2_encode(E->ac, &E->d_ref_match, SEQENC_REF_MISMATCH);
                        dist4_encode(E->ac, &E->d_ref_ins_nuc, x);
                    }
                }
                break;

            /* both consume read only, and every non-N base is coded explicitly */
            case BAM_CINS:
            case BAM_CSOFT_CLIP:
                if (r->cigar.lens[i] > r->seq.n - read_pos) {
                    quip_error("Cigar operations do not account for full read length.");
                }

                for (j = 0; j < r->cigar.lens[i]; ++j, ++read_pos) {
                    if (E->tmpseq.s[read_pos] == 'N') continue;

                    dist4_encode(E->ac, &E->d_ref_ins_nuc,
                                 chartokmer[E->tmpseq.s[read_pos]]);
                }
                break;

            /* these only move the reference cursor */
            case BAM_CDEL:
            case BAM_CREF_SKIP:
            case BAM_CHARD_CLIP:
                ref_pos += r->cigar.lens[i];
                break;

            case BAM_CPAD:
                quip_error("Cigar PAD operation is unsupported.");
                break;

            default:
                /* unrecognised operations are ignored, as before */
                break;
        }
    }

    if (read_pos != r->seq.n) {
        quip_error("Cigar operations do not account for full read length.");
    }
}