/**
 * Pack every record of a FASTA stream into "<prefix>.pac" and dump the
 * accompanying annotation through bns_dump().
 *
 * Each record returned by kseq_read() is handed to add1(), which returns the
 * (possibly moved) packed buffer. The buffer is sized at four bases per byte.
 * Unless for_only is non-zero, the forward sequence is then walked backwards
 * and 3 minus each packed code is appended, so bns->l_pac doubles.
 *
 * File layout: ceil(l_pac/4) payload bytes, one extra zero byte when l_pac is
 * a multiple of 4, then one byte holding l_pac % 4.
 *
 * @param fp_fa    FASTA input stream handed to kseq_init()
 * @param prefix   output prefix; ".pac" is appended to form the file name
 * @param for_only non-zero: keep the forward strand only
 * @return bns->l_pac after the optional reverse step, or -1 when the file
 *         name does not fit the internal buffer or the packed buffer cannot
 *         be resized (in which case bns_dump() is not called).
 */
int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only)
{
	kseq_t *seq;
	char name[1024];
	bntseq_t *bns;
	uint8_t *pac = 0;
	int32_t m_seqs, m_holes;
	int64_t ret = -1, m_pac, l;
	bntamb1_t *q;
	FILE *fp;

	// reject a prefix that would overrun name[] before acquiring anything
	if (strlen(prefix) + sizeof(".pac") > sizeof(name)) {
		fprintf(stderr, "[bns_fasta2bntseq] prefix is too long\n");
		return -1;
	}

	// initialization
	seq = kseq_init(fp_fa);
	bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
	bns->seed = 11; // fixed seed for random generator
	srand48(bns->seed);
	m_seqs = m_holes = 8; m_pac = 0x10000;
	bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t));
	bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t));
	pac = calloc(m_pac/4, 1);
	q = bns->ambs;
	snprintf(name, sizeof(name), "%s.pac", prefix);
	fp = xopen(name, "wb");

	// read sequences
	while (kseq_read(seq) >= 0)
		pac = add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q);

	// Add the reverse strand. With l_pac == 0 there is nothing to append, and
	// skipping the step avoids realloc(pac, 0), whose result is
	// implementation-defined.
	if (!for_only && bns->l_pac > 0) {
		uint8_t *grown;
		m_pac = (bns->l_pac * 2 + 3) / 4 * 4;
		grown = realloc(pac, m_pac/4);
		if (grown == 0) { // pac is still valid here and must be released
			fprintf(stderr, "[bns_fasta2bntseq] failed to resize the packed buffer\n");
			err_fclose(fp);
			bns_destroy(bns);
			kseq_destroy(seq);
			free(pac);
			return -1;
		}
		pac = grown;
		// clear the bytes past the forward strand before bits are set in them
		memset(pac + (bns->l_pac+3)/4, 0, (m_pac - (bns->l_pac+3)/4*4) / 4);
		for (l = bns->l_pac - 1; l >= 0; --l, ++bns->l_pac)
			_set_pac(pac, bns->l_pac, 3-_get_pac(pac, l));
	}
	ret = bns->l_pac;

	{ // finalize .pac file
		ubyte_t ct;
		err_fwrite(pac, 1, (bns->l_pac>>2) + ((bns->l_pac&3) == 0? 0 : 1), fp);
		// the following codes make the pac file size always (l_pac/4+1+1)
		if (bns->l_pac % 4 == 0) {
			ct = 0;
			err_fwrite(&ct, 1, 1, fp);
		}
		ct = bns->l_pac % 4;
		err_fwrite(&ct, 1, 1, fp);
		// close .pac file
		err_fflush(fp);
		err_fclose(fp);
	}

	bns_dump(bns, prefix);
	bns_destroy(bns);
	kseq_destroy(seq);
	free(pac);
	return ret;
}
/**
 * Copy bases [beg, end) of a packed sequence into a freshly malloc'ed array,
 * one base per byte.
 *
 * Coordinates live in a doubled space of length 2*l_pac. A range entirely
 * below l_pac is read straight from pac. A range entirely at or above l_pac
 * is mapped back to 2*l_pac-1-pos and each value is emitted as 3 minus the
 * packed code (presumably the complement under a 0..3 base coding — confirm
 * against _get_pac). A range that straddles l_pac yields nothing.
 *
 * beg and end are swapped when given in the wrong order, then clamped to
 * [0, 2*l_pac].
 *
 * @param l_pac length of the forward strand, in bases
 * @param pac   packed sequence read through _get_pac()
 * @param beg   start coordinate (inclusive)
 * @param end   end coordinate (exclusive)
 * @param len   out: number of bases written; 0 whenever NULL is returned
 * @return malloc'ed array owned by the caller, or NULL when the range
 *         straddles the strand boundary, is empty after clamping, or the
 *         allocation fails.
 */
uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len)
{
	uint8_t *seq;
	int64_t k, n = 0;

	if (end < beg) { // if end is smaller, swap
		int64_t tmp = beg;
		beg = end;
		end = tmp;
	}
	if (end > l_pac<<1) end = l_pac<<1;
	if (beg < 0) beg = 0;

	*len = 0;
	// if bridging the forward-reverse boundary, return nothing
	if (beg < l_pac && end > l_pac) return 0;
	// clamping can invert the range (both ends below 0 or past 2*l_pac);
	// without this guard malloc() would receive a negative size
	if (beg > end) return 0;

	seq = malloc(end - beg);
	if (seq == 0) return 0;
	*len = end - beg;

	if (beg >= l_pac) { // reverse strand
		int64_t beg_f = (l_pac<<1) - 1 - end;
		int64_t end_f = (l_pac<<1) - 1 - beg;
		for (k = end_f; k > beg_f; --k)
			seq[n++] = 3 - _get_pac(pac, k);
	} else { // forward strand
		for (k = beg; k < end; ++k)
			seq[n++] = _get_pac(pac, k);
	}
	return seq;
}
/*
 * Debug driver: restore the index named by argv[1] with bns_restore(), load
 * l_pac/2+2 bytes of packed sequence from bns->fp_pac and print one character
 * per base to stdout.
 *
 * Returns 0 on success and 1 on a usage, restore or allocation error.
 */
int main(int argc, char *argv[])
{
	bntseq_t *bns;
	uint8_t *pac;
	size_t n_bytes, n_read;
	int64_t i; // 64-bit: bns->l_pac is compared against it below

	if (argc < 2) {
		fprintf(stderr, "Usage: %s <prefix>\n", argc > 0 ? argv[0] : "main");
		return 1;
	}

	bns = bns_restore(argv[1]);
	if (bns == NULL || bns->fp_pac == NULL) {
		fprintf(stderr, "[main] failed to restore '%s'\n", argv[1]);
		if (bns != NULL) bns_destroy(bns);
		return 1;
	}

	n_bytes = (size_t)(bns->l_pac/2 + 2);
	pac = calloc(n_bytes, 1);
	if (pac == NULL) {
		fprintf(stderr, "[main] out of memory\n");
		bns_destroy(bns);
		return 1;
	}

	// A short read leaves the tail zero-filled by calloc(). Only a real
	// stream error is reported, since the file may legitimately be shorter
	// than the request.
	n_read = fread(pac, 1, n_bytes, bns->fp_pac);
	if (n_read < n_bytes && ferror(bns->fp_pac))
		fprintf(stderr, "[main] read error after %zu of %zu bytes\n", n_read, n_bytes);

	// NOTE(review): the lookup string has 5 symbols (plus NUL); this assumes
	// _get_pac() never yields a code above 4 — confirm against the macro.
	for (i = 0; i < bns->l_pac; ++i)
		putchar("ACGT#"[_get_pac(pac, i)]);

	free(pac);
	bns_destroy(bns);
	return 0;
}
/**
 * Pack a FASTA stream into a converted, double-stranded .pac file.
 *
 * The stream is rewound with gzseek(), every record is handed to bis_add1(),
 * and the resulting forward buffer is copied into a second buffer twice:
 * once as is, then backwards with each code replaced by 3 minus itself.
 * After that (i.e. after complementing on the second pass) each code is
 * converted:
 *   parent != 0 : code 1 becomes 3
 *   parent == 0 : code 2 becomes 0
 * (under an A,C,G,T = 0..3 coding that is C->T and G->A respectively —
 * presumably bisulfite conversion; confirm against bis_add1).
 *
 * Output is "<prefix>.par.pac" when parent is non-zero, "<prefix>.dau.pac"
 * otherwise, laid out as ceil(l/4) payload bytes, one extra zero byte when l
 * is a multiple of 4, then one byte holding l % 4, where l is the doubled
 * length. bis_bns_dump() is called only when parent is non-zero, and
 * bns->l_pac itself is left at the forward length.
 *
 * @param fp_fa  FASTA input stream; rewound to offset 0 first
 * @param prefix output prefix
 * @param parent non-zero selects the ".par.pac" conversion, zero ".dau.pac"
 * @return the doubled length 2 * bns->l_pac, or -1 when the file name does
 *         not fit the internal buffer, the stream cannot be rewound, or the
 *         output buffer cannot be allocated.
 */
int64_t bis_bns_fasta2bntseq(gzFile fp_fa, const char *prefix, uint8_t parent)
{
	kseq_t *seq;
	char name[1024];
	bntseq_t *bns;
	uint8_t *pac = 0, *_pac = 0;
	int32_t m_seqs, m_holes;
	int64_t m_pac, l, k;
	bntamb1_t *q;
	FILE *fp;
	const char *suffix = parent ? ".par.pac" : ".dau.pac";

	// reject a prefix that would overrun name[] before acquiring anything
	if (strlen(prefix) + strlen(suffix) + 1 > sizeof(name)) {
		fprintf(stderr, "[bis_bns_fasta2bntseq] prefix is too long\n");
		return -1;
	}
	// a failed rewind would silently pack only the remainder of the stream
	if (gzseek(fp_fa, 0, SEEK_SET) < 0) {
		fprintf(stderr, "[bis_bns_fasta2bntseq] cannot rewind the input stream\n");
		return -1;
	}

	// initialization
	seq = kseq_init(fp_fa);
	bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
	bns->seed = 11; // fixed seed for random generator
	srand48(bns->seed);
	m_seqs = m_holes = 8; m_pac = 0x10000;
	bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t));
	bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t));
	_pac = calloc(m_pac/4, 1);
	q = bns->ambs;
	snprintf(name, sizeof(name), "%s%s", prefix, suffix);
	fp = xopen(name, "wb");

	// read sequences
	while (kseq_read(seq) >= 0)
		_pac = bis_add1(seq, bns, _pac, &m_pac, &m_seqs, &m_holes, &q);

	// output buffer for both strands, rounded up to whole bytes
	m_pac = (bns->l_pac*2+3)/4*4; // in bases
	pac = calloc(m_pac/4, sizeof(uint8_t));
	if (pac == 0 && m_pac > 0) { // calloc(0, ..) may legitimately return NULL
		fprintf(stderr, "[bis_bns_fasta2bntseq] out of memory\n");
		free(_pac);
		err_fclose(fp);
		bns_destroy(bns);
		kseq_destroy(seq);
		return -1;
	}

	// forward strand
	for (l = 0; l < bns->l_pac; ++l) {
		uint8_t c = _get_pac(_pac, l);
		if (parent && c == 1) c = 3;
		if (!parent && c == 2) c = 0;
		_set_pac(pac, l, c);
	}
	// reverse strand: complement first, then convert
	for (k = bns->l_pac - 1; k >= 0; --k, ++l) {
		uint8_t c = 3 - _get_pac(_pac, k);
		if (parent && c == 1) c = 3;
		if (!parent && c == 2) c = 0;
		_set_pac(pac, l, c);
	}
	free(_pac);
	assert(bns->l_pac<<1 == l);

	{ // finalize .pac file
		ubyte_t ct;
		err_fwrite(pac, 1, (l>>2) + ((l&3) == 0? 0 : 1), fp);
		// the following codes make the pac file size always (l/4+1+1)
		if (l % 4 == 0) {
			ct = 0;
			err_fwrite(&ct, 1, 1, fp);
		}
		ct = l % 4;
		err_fwrite(&ct, 1, 1, fp);
		// close .pac file
		err_fflush(fp);
		err_fclose(fp);
	}

	if (parent) bis_bns_dump(bns, prefix);
	bns_destroy(bns);
	kseq_destroy(seq);
	free(pac);
	return l;
}
int64_t R_bns_fasta2bntseq(gzFile fp_fa, const char *prefix) { // extern void seq_reverse(int len, ubyte_t *seq, int is_comp); // in bwaseqio.c kseq_t *seq; char name[1024]; bntseq_t *bns; uint8_t *pac = 0; uint8_t *reverse_pac = NULL; int32_t m_seqs, m_holes; int64_t ret = -1, m_pac, l, m_r_pac; bntamb1_t *q; FILE *fp, *fp_r; int i; // initialization seq = kseq_init(fp_fa); bns = (bntseq_t*)calloc(1, sizeof(bntseq_t)); bns->seed = 11; // fixed seed for random generator srand48(bns->seed); m_seqs = m_holes = 8; m_pac = 0x10000; bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t)); bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t)); pac = calloc(m_pac/NT_PER_BYTE, 1); q = bns->ambs; strcpy(name, prefix); strcat(name, ".pac"); fp = xopen(name, "wb"); memset(name, '\0', 1024);strcpy(name, prefix); strcat(name, ".rpac"); fp_r = xopen(name, "wb"); // read sequences while (kseq_read(seq) >= 0) pac = R_add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q); // add the reverse complemented sequence ret = bns->l_pac; fprintf(stderr, "[R_bns_fasta2bntseq]:reverse_pac!\n"); m_r_pac = (bns->l_pac +NT_PER_BYTE-1)/NT_PER_BYTE *NT_PER_BYTE; reverse_pac = calloc(m_r_pac/NT_PER_BYTE, sizeof(uint8_t)); for(l = bns->l_pac-1, i =0; l>=0, i < bns->l_pac; --l, ++i) { _set_pac(reverse_pac, i, _get_pac(pac, l)); } ubyte_t ct; { // finalize .pac and file fwrite(pac, 1, (bns->l_pac>>1) + ((bns->l_pac&1) == 0? 0 : 1), fp); // the following codes make the pac file size always (l_pac/4+1+1) if (bns->l_pac % NT_PER_BYTE == 0) { ct = 0; fwrite(&ct, 1, 1, fp); } ct = bns->l_pac % NT_PER_BYTE; fwrite(&ct, 1, 1, fp); // close .pac file fclose(fp); } { // finalize .rpac and file fwrite(reverse_pac, 1, (bns->l_pac>>1) + ((bns->l_pac&1) == 0? 
0 : 1), fp_r); // the following codes make the pac file size always (l_pac/4+1+1) if (bns->l_pac % NT_PER_BYTE == 0) { ct = 0; fwrite(&ct, 1, 1, fp_r); } ct = bns->l_pac % NT_PER_BYTE; fwrite(&ct, 1, 1, fp_r); // close .rpac file fclose(fp_r); } bns_dump(bns, prefix); bns_destroy(bns); kseq_destroy(seq); free(pac); free(reverse_pac); return ret; }