/**
 * Pack every record of a FASTA/FASTQ stream into "<prefix>.pac" and dump the
 * accompanying sequence metadata through bns_dump().
 *
 * Bases are stored four per byte. Unless `for_only` is set, the forward strand
 * is followed by its reverse complement (complement computed as 3 - code), so
 * the packed length doubles.
 *
 * File layout: the packed bytes, then one zero byte if the packed length is a
 * multiple of 4, then one byte holding (packed length % 4).
 *
 * @param fp_fa    open input stream; it is read to the end but not closed here
 * @param prefix   output path prefix; "<prefix>.pac" must fit in 1024 bytes
 * @param for_only non-zero to store the forward strand only
 * @return number of bases written to the .pac file, or -1 if the prefix is too
 *         long or the buffer could not be grown for the reverse strand
 */
int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only)
{
	kseq_t *seq;
	char name[1024];
	bntseq_t *bns;
	uint8_t *pac = 0;
	int32_t m_seqs, m_holes;
	int64_t ret = -1, m_pac, l;
	bntamb1_t *q;
	FILE *fp;
	int n;

	// build the output name first: an over-long prefix is rejected before
	// anything is allocated or opened (the old strcpy/strcat could overflow name[])
	n = snprintf(name, sizeof(name), "%s.pac", prefix);
	if (n < 0 || (size_t)n >= sizeof(name)) {
		fprintf(stderr, "[bns_fasta2bntseq] prefix does not fit in a %d-byte file name\n", (int)sizeof(name));
		return ret;
	}

	// initialization
	seq = kseq_init(fp_fa);
	bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
	bns->seed = 11; // fixed seed for random generator
	srand48(bns->seed);
	m_seqs = m_holes = 8; m_pac = 0x10000;
	bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t));
	bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t));
	pac = calloc(m_pac/4, 1);
	q = bns->ambs;
	fp = xopen(name, "wb");

	// read sequences
	while (kseq_read(seq) >= 0)
		pac = add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q);

	// add the reverse complemented sequence; skipped for empty input, where the
	// new size would be 0 and realloc(pac, 0) is not portable
	if (!for_only && bns->l_pac > 0) {
		uint8_t *grown;
		m_pac = (bns->l_pac * 2 + 3) / 4 * 4;
		grown = realloc(pac, m_pac/4);
		if (grown == NULL) { // pac is still valid and is released below
			fprintf(stderr, "[bns_fasta2bntseq] out of memory while adding the reverse strand\n");
			err_fclose(fp);
			goto cleanup;
		}
		pac = grown;
		// clear every byte that lies entirely past the forward strand
		memset(pac + (bns->l_pac+3)/4, 0, (m_pac - (bns->l_pac+3)/4*4) / 4);
		// walk the forward strand backwards, appending complements at l_pac
		for (l = bns->l_pac - 1; l >= 0; --l, ++bns->l_pac)
			_set_pac(pac, bns->l_pac, 3-_get_pac(pac, l));
	}
	ret = bns->l_pac;

	{ // finalize .pac file
		ubyte_t ct;
		err_fwrite(pac, 1, (bns->l_pac>>2) + ((bns->l_pac&3) == 0? 0 : 1), fp);
		// the following codes make the pac file size always (l_pac/4+1+1)
		if (bns->l_pac % 4 == 0) {
			ct = 0;
			err_fwrite(&ct, 1, 1, fp);
		}
		ct = bns->l_pac % 4;
		err_fwrite(&ct, 1, 1, fp);
		// close .pac file
		err_fflush(fp);
		err_fclose(fp);
	}
	bns_dump(bns, prefix);

cleanup:
	bns_destroy(bns);
	kseq_destroy(seq);
	free(pac);
	return ret;
}
/**
 * Append one sequence record to the reference being built.
 *
 * Records the name, comment ("(null)" when absent), length and offset of the
 * sequence in bns->anns, registers each run of identical ambiguous characters
 * as one entry in bns->ambs, and packs every base into `pac` at position
 * bns->l_pac. Ambiguous bases are stored as a random code drawn with lrand48().
 *
 * @param seq     record to add
 * @param bns     metadata being accumulated; n_seqs, n_holes and l_pac advance
 * @param pac     packed buffer, four bases per byte; may be reallocated
 * @param m_pac   capacity of `pac` in bases; doubled when full
 * @param m_seqs  capacity of bns->anns; doubled when full
 * @param m_holes capacity of bns->ambs; doubled when full
 * @param q       cursor to the most recent ambiguity entry, carried across calls
 * @return the (possibly moved) packed buffer; the caller must keep using it
 *
 * NOTE(review): the realloc() results below are stored unchecked, as before.
 */
static uint8_t *bis_add1(const kseq_t *seq, bntseq_t *bns, uint8_t *pac, int64_t *m_pac, int *m_seqs, int *m_holes, bntamb1_t **q)
{
	bntann1_t *p;
	int lasts;
	uint32_t i;

	if (bns->n_seqs == *m_seqs) { // annotation array is full
		*m_seqs <<= 1;
		bns->anns = (bntann1_t*)realloc(bns->anns, *m_seqs * sizeof(bntann1_t));
	}
	p = bns->anns + bns->n_seqs;
	p->name = strdup((char*)seq->name.s);
	p->anno = seq->comment.s? strdup((char*)seq->comment.s) : strdup("(null)");
	p->gi = 0; p->len = seq->seq.l;
	// sequences are laid out back to back
	p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len;
	p->n_ambs = 0;

	for (i = lasts = 0; i < seq->seq.l; ++i) {
		// Index through unsigned char: plain char may be signed, in which case a
		// byte >= 0x80 would produce a negative index and read before the table.
		// Assumes nst_nt4_table has 256 entries -- TODO confirm at its definition.
		int c = nst_nt4_table[(unsigned char)seq->seq.s[i]];
		if (c >= 4) { // N
			if (lasts == seq->seq.s[i]) { // contiguous N
				++(*q)->len;
			} else { // a new run starts here
				if (bns->n_holes == *m_holes) {
					(*m_holes) <<= 1;
					bns->ambs = (bntamb1_t*)realloc(bns->ambs, (*m_holes) * sizeof(bntamb1_t));
				}
				// re-derive the cursor: the array may have moved
				*q = bns->ambs + bns->n_holes;
				(*q)->len = 1;
				(*q)->offset = p->offset + i;
				(*q)->amb = seq->seq.s[i];
				++p->n_ambs;
				++bns->n_holes;
			}
		}
		lasts = seq->seq.s[i];
		{ // fill buffer
			if (c >= 4) c = lrand48()&3;
			if (bns->l_pac == *m_pac) { // double the pac size
				*m_pac <<= 1;
				pac = realloc(pac, *m_pac/4);
				// zero the new half
				memset(pac + bns->l_pac/4, 0, (*m_pac - bns->l_pac)/4);
			}
			_set_pac(pac, bns->l_pac, c);
			++bns->l_pac;
		}
	}
	++bns->n_seqs;
	return pac;
}
/**
 * Build a converted, double-stranded packed reference from a FASTA/FASTQ stream.
 *
 * The stream is rewound and packed through bis_add1(). A second buffer is then
 * filled with the forward strand followed by its reverse complement (3 - code),
 * each base being rewritten after strand handling:
 *   - parent != 0: code 1 becomes 3, output goes to "<prefix>.par.pac";
 *   - parent == 0: code 2 becomes 0, output goes to "<prefix>.dau.pac".
 * Metadata is dumped through bis_bns_dump() for the parent only.
 *
 * File layout: the packed bytes (four bases per byte), then one zero byte if
 * the length is a multiple of 4, then one byte holding (length % 4).
 *
 * @param fp_fa  open, seekable input stream; not closed here
 * @param prefix output path prefix; the full file name must fit in 1024 bytes
 * @param parent non-zero selects the parent conversion, zero the daughter one
 * @return number of bases written (twice the forward length), or -1 if the
 *         prefix is too long, the stream cannot be rewound, or memory runs out
 */
int64_t bis_bns_fasta2bntseq(gzFile fp_fa, const char *prefix, uint8_t parent)
{
	kseq_t *seq;
	char name[1024];
	bntseq_t *bns;
	uint8_t *pac = 0, *fwd_pac = 0;
	int32_t m_seqs, m_holes;
	int64_t m_pac, l, k;
	bntamb1_t *q;
	FILE *fp;
	int n;

	// bounded replacement for strcpy/strcat, which could overflow name[]
	n = snprintf(name, sizeof(name), "%s%s", prefix, parent? ".par.pac" : ".dau.pac");
	if (n < 0 || (size_t)n >= sizeof(name)) {
		fprintf(stderr, "[bis_bns_fasta2bntseq] prefix does not fit in a %d-byte file name\n", (int)sizeof(name));
		return -1;
	}
	// start from the beginning of the input; a failed rewind would otherwise
	// index whatever happens to be left in the stream
	if (gzseek(fp_fa, 0, SEEK_SET) < 0) {
		fprintf(stderr, "[bis_bns_fasta2bntseq] failed to rewind the input stream\n");
		return -1;
	}

	// initialization
	seq = kseq_init(fp_fa);
	bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
	bns->seed = 11; // fixed seed for random generator
	srand48(bns->seed);
	m_seqs = m_holes = 8; m_pac = 0x10000;
	bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t));
	bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t));
	fwd_pac = calloc(m_pac/4, 1);
	q = bns->ambs;
	fp = xopen(name, "wb");

	// read sequences (unconverted forward strand)
	while (kseq_read(seq) >= 0)
		fwd_pac = bis_add1(seq, bns, fwd_pac, &m_pac, &m_seqs, &m_holes, &q);

	// room for both strands, rounded up to a whole number of bytes
	m_pac = (bns->l_pac*2+3)/4*4;
	pac = calloc(m_pac/4, sizeof(uint8_t));
	if (pac == NULL && m_pac > 0) { // calloc(0, ...) may legitimately return NULL
		fprintf(stderr, "[bis_bns_fasta2bntseq] out of memory while building both strands\n");
		err_fclose(fp);
		free(fwd_pac);
		bns_destroy(bns);
		kseq_destroy(seq);
		return -1;
	}

	// forward strand
	for (l = 0; l < bns->l_pac; ++l) {
		uint8_t c = _get_pac(fwd_pac, l);
		if (parent && c == 1) c = 3;
		if (!parent && c == 2) c = 0;
		_set_pac(pac, l, c);
	}
	// reverse complement, converted after complementing; l keeps counting up
	for (k = bns->l_pac - 1; k >= 0; --k, ++l) {
		uint8_t c = 3 - _get_pac(fwd_pac, k);
		if (parent && c == 1) c = 3;
		if (!parent && c == 2) c = 0;
		_set_pac(pac, l, c);
	}
	free(fwd_pac);
	assert(bns->l_pac<<1 == l);

	{ // finalize .pac file
		ubyte_t ct;
		err_fwrite(pac, 1, (l>>2) + ((l&3) == 0? 0 : 1), fp);
		// the following codes make the pac file size always (l/4+1+1)
		if (l % 4 == 0) {
			ct = 0;
			err_fwrite(&ct, 1, 1, fp);
		}
		ct = l % 4;
		err_fwrite(&ct, 1, 1, fp);
		// close .pac file
		err_fflush(fp);
		err_fclose(fp);
	}
	if (parent) bis_bns_dump(bns, prefix);
	bns_destroy(bns);
	kseq_destroy(seq);
	free(pac);
	return l;
}
int64_t R_bns_fasta2bntseq(gzFile fp_fa, const char *prefix) { // extern void seq_reverse(int len, ubyte_t *seq, int is_comp); // in bwaseqio.c kseq_t *seq; char name[1024]; bntseq_t *bns; uint8_t *pac = 0; uint8_t *reverse_pac = NULL; int32_t m_seqs, m_holes; int64_t ret = -1, m_pac, l, m_r_pac; bntamb1_t *q; FILE *fp, *fp_r; int i; // initialization seq = kseq_init(fp_fa); bns = (bntseq_t*)calloc(1, sizeof(bntseq_t)); bns->seed = 11; // fixed seed for random generator srand48(bns->seed); m_seqs = m_holes = 8; m_pac = 0x10000; bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t)); bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t)); pac = calloc(m_pac/NT_PER_BYTE, 1); q = bns->ambs; strcpy(name, prefix); strcat(name, ".pac"); fp = xopen(name, "wb"); memset(name, '\0', 1024);strcpy(name, prefix); strcat(name, ".rpac"); fp_r = xopen(name, "wb"); // read sequences while (kseq_read(seq) >= 0) pac = R_add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q); // add the reverse complemented sequence ret = bns->l_pac; fprintf(stderr, "[R_bns_fasta2bntseq]:reverse_pac!\n"); m_r_pac = (bns->l_pac +NT_PER_BYTE-1)/NT_PER_BYTE *NT_PER_BYTE; reverse_pac = calloc(m_r_pac/NT_PER_BYTE, sizeof(uint8_t)); for(l = bns->l_pac-1, i =0; l>=0, i < bns->l_pac; --l, ++i) { _set_pac(reverse_pac, i, _get_pac(pac, l)); } ubyte_t ct; { // finalize .pac and file fwrite(pac, 1, (bns->l_pac>>1) + ((bns->l_pac&1) == 0? 0 : 1), fp); // the following codes make the pac file size always (l_pac/4+1+1) if (bns->l_pac % NT_PER_BYTE == 0) { ct = 0; fwrite(&ct, 1, 1, fp); } ct = bns->l_pac % NT_PER_BYTE; fwrite(&ct, 1, 1, fp); // close .pac file fclose(fp); } { // finalize .rpac and file fwrite(reverse_pac, 1, (bns->l_pac>>1) + ((bns->l_pac&1) == 0? 
0 : 1), fp_r); // the following codes make the pac file size always (l_pac/4+1+1) if (bns->l_pac % NT_PER_BYTE == 0) { ct = 0; fwrite(&ct, 1, 1, fp_r); } ct = bns->l_pac % NT_PER_BYTE; fwrite(&ct, 1, 1, fp_r); // close .rpac file fclose(fp_r); } bns_dump(bns, prefix); bns_destroy(bns); kseq_destroy(seq); free(pac); free(reverse_pac); return ret; }