/**
 * Pack the sequences read from a FASTA stream into "<prefix>.pac" and dump
 * the accompanying bntseq_t metadata through bns_dump().
 *
 * Each record returned by kseq_read() is handed to add1(), whose return value
 * becomes the current packed buffer.  Unless for_only is non-zero, the
 * reverse complement of everything packed so far is then appended, which
 * doubles bns->l_pac.
 *
 * On-disk layout of the .pac file: the packed bytes (4 bases per byte), one
 * extra zero byte when l_pac is a multiple of 4, and a final byte holding
 * l_pac % 4.  The file is therefore always l_pac/4 + 2 bytes long.
 *
 * @param fp_fa    input stream passed to kseq_init()
 * @param prefix   output path prefix; ".pac" is appended to it
 * @param for_only non-zero to store the forward strand only
 * @return the final bns->l_pac, or -1 when the output file name does not fit
 *         in the local buffer or a memory allocation fails
 */
int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only)
{
	kseq_t *seq;
	char name[1024];
	bntseq_t *bns;
	uint8_t *pac = 0;
	int32_t m_seqs, m_holes;
	int64_t ret = -1, m_pac, l;
	bntamb1_t *q;
	FILE *fp;
	int n;

	// Build the output name first: a bounded format instead of
	// strcpy()/strcat(), so an over-long prefix cannot overrun name[].
	n = snprintf(name, sizeof name, "%s.pac", prefix);
	if (n < 0 || (size_t)n >= sizeof name) {
		fprintf(stderr, "[bns_fasta2bntseq] output prefix is too long\n");
		return -1;
	}

	// initialization
	m_seqs = m_holes = 8;
	m_pac = 0x10000;
	seq = kseq_init(fp_fa);
	bns = calloc(1, sizeof(bntseq_t));
	pac = calloc(m_pac/4, 1);
	if (bns != NULL) {
		bns->anns = calloc(m_seqs, sizeof(bntann1_t));
		bns->ambs = calloc(m_holes, sizeof(bntamb1_t));
	}
	if (bns == NULL || pac == NULL || bns->anns == NULL || bns->ambs == NULL) {
		fprintf(stderr, "[bns_fasta2bntseq] out of memory\n");
		if (bns != NULL) {
			free(bns->anns);
			free(bns->ambs);
			free(bns);
		}
		free(pac);
		kseq_destroy(seq);
		return -1;
	}
	bns->seed = 11; // fixed seed for random generator
	srand48(bns->seed);
	q = bns->ambs;
	// NOTE(review): fp is used without a NULL check, as in the original;
	// xopen() is presumably fatal on failure -- confirm.
	fp = xopen(name, "wb");

	// read sequences
	while (kseq_read(seq) >= 0)
		pac = add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q);

	// Add the reverse complemented sequence.  Skipped for empty input, where
	// there is nothing to append and the resize would be realloc(pac, 0).
	if (!for_only && bns->l_pac > 0) {
		uint8_t *grown;

		m_pac = (bns->l_pac * 2 + 3) / 4 * 4;
		grown = realloc(pac, m_pac/4);
		if (grown == NULL) { // pac is still valid and is released below
			fprintf(stderr, "[bns_fasta2bntseq] out of memory\n");
			err_fclose(fp);
			goto cleanup;
		}
		pac = grown;
		// Clear every byte past the forward strand; _set_pac() is applied
		// to those bytes next.
		memset(pac + (bns->l_pac+3)/4, 0, (m_pac - (bns->l_pac+3)/4*4) / 4);
		// Walk the forward strand backwards, appending 3 - code (the
		// complement) at the growing end.
		for (l = bns->l_pac - 1; l >= 0; --l, ++bns->l_pac)
			_set_pac(pac, bns->l_pac, 3-_get_pac(pac, l));
	}
	ret = bns->l_pac;

	{ // finalize .pac file
		ubyte_t ct;
		err_fwrite(pac, 1, (bns->l_pac>>2) + ((bns->l_pac&3) == 0? 0 : 1), fp);
		// the following codes make the pac file size always (l_pac/4+1+1)
		if (bns->l_pac % 4 == 0) {
			ct = 0;
			err_fwrite(&ct, 1, 1, fp);
		}
		ct = bns->l_pac % 4;
		err_fwrite(&ct, 1, 1, fp);
		// close .pac file
		err_fflush(fp);
		err_fclose(fp);
	}
	bns_dump(bns, prefix);

cleanup:
	bns_destroy(bns);
	kseq_destroy(seq);
	free(pac);
	return ret;
}
/**
 * Stream the sequences of a FASTA input into "<prefix>.pac" and dump the
 * accompanying bntseq_t metadata through bns_dump().
 *
 * Bases are packed four per byte (first base in the two highest bits) into a
 * 64 KiB staging buffer that is flushed to disk whenever it fills up.  Every
 * base whose nst_nt4_table code is >= 4 is recorded as part of a "hole"
 * (a run of the same ambiguous character) in bns->ambs and replaced in the
 * packed output by a pseudo-random base drawn with lrand48().
 *
 * On-disk layout of the .pac file: the packed bytes, one extra zero byte when
 * l_pac is a multiple of 4, and a final byte holding l_pac % 4.
 *
 * Failures (over-long prefix, allocation failure, short write) are reported
 * through xassert(); NOTE(review): xassert() is presumed to be fatal, as its
 * existing "zero length sequence." use implies -- confirm.
 *
 * @param fp_fa  input stream passed to kseq_init()
 * @param prefix output path prefix; ".pac" is appended to it
 */
void bns_fasta2bntseq(gzFile fp_fa, const char *prefix)
{
	kseq_t *seq;
	char name[1024];
	bntseq_t *bns;
	bntamb1_t *q;
	int l_buf, n, ok;
	unsigned char buf[0x10000];
	int32_t m_seqs, m_holes, l, i;
	FILE *fp;

	// initialization
	seq = kseq_init(fp_fa);
	bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
	ok = (bns != NULL);
	xassert(ok, "out of memory.");
	bns->seed = 11; // fixed seed for random generator
	srand48(bns->seed);
	m_seqs = m_holes = 8;
	bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t));
	bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t));
	ok = (bns->anns != NULL && bns->ambs != NULL);
	xassert(ok, "out of memory.");
	q = bns->ambs;
	l_buf = 0;
	// Bounded formatting instead of strcpy()/strcat(): an over-long prefix
	// must not overrun name[].
	n = snprintf(name, sizeof name, "%s.pac", prefix);
	ok = (n >= 0 && (size_t)n < sizeof name);
	xassert(ok, "output prefix is too long.");
	fp = xopen(name, "wb");
	memset(buf, 0, sizeof buf);

	// read sequences; l, the value returned by kseq_read(), is used as the
	// record length
	while ((l = kseq_read(seq)) >= 0) {
		bntann1_t *p;
		int lasts;

		if (bns->n_seqs == m_seqs) { // annotation array is full: double it
			bntann1_t *grown;
			m_seqs <<= 1;
			grown = (bntann1_t*)realloc(bns->anns, m_seqs * sizeof(bntann1_t));
			ok = (grown != NULL);
			xassert(ok, "out of memory.");
			bns->anns = grown;
		}
		p = bns->anns + bns->n_seqs;
		p->name = strdup((char*)seq->name.s);
		p->anno = seq->comment.s? strdup((char*)seq->comment.s) : strdup("(null)");
		p->gi = 0;
		p->len = l;
		// each record starts right after the previous one
		p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len;
		p->n_ambs = 0;

		for (i = 0, lasts = 0; i < l; ++i) {
			// Index through unsigned char: where plain char is signed, a
			// byte >= 0x80 would otherwise become a negative table index.
			int c = nst_nt4_table[(unsigned char)seq->seq.s[i]];

			if (c >= 4) { // N
				if (lasts == seq->seq.s[i]) { // contiguous N
					++q->len;
				} else {
					if (bns->n_holes == m_holes) { // hole array is full: double it
						bntamb1_t *grown;
						m_holes <<= 1;
						grown = (bntamb1_t*)realloc(bns->ambs, m_holes * sizeof(bntamb1_t));
						ok = (grown != NULL);
						xassert(ok, "out of memory.");
						bns->ambs = grown;
					}
					q = bns->ambs + bns->n_holes;
					q->len = 1;
					q->offset = p->offset + i;
					q->amb = seq->seq.s[i];
					++p->n_ambs;
					++bns->n_holes;
				}
			}
			lasts = seq->seq.s[i];

			{ // fill buffer
				if (c >= 4) c = lrand48()&0x3;
				if (l_buf == 0x40000) { // 4 bases per byte: all 0x10000 bytes are used
					size_t n_written = fwrite(buf, 1, sizeof buf, fp);
					ok = (n_written == sizeof buf);
					xassert(ok, "fail to write the .pac file.");
					memset(buf, 0, sizeof buf);
					l_buf = 0;
				}
				buf[l_buf>>2] |= c << ((3 - (l_buf&3)) << 1);
				++l_buf;
			}
		}
		++bns->n_seqs;
		bns->l_pac += seq->seq.l;
	}
	xassert(bns->l_pac, "zero length sequence.");

	{ // finalize .pac file
		ubyte_t ct;
		// bytes still pending in the staging buffer, rounded up
		size_t tail = (l_buf>>2) + ((l_buf&3) == 0? 0 : 1);

		ok = (fwrite(buf, 1, tail, fp) == tail);
		xassert(ok, "fail to write the .pac file.");
		// the following codes make the pac file size always (l_pac/4+1+1)
		if (bns->l_pac % 4 == 0) {
			ct = 0;
			ok = (fwrite(&ct, 1, 1, fp) == 1);
			xassert(ok, "fail to write the .pac file.");
		}
		ct = bns->l_pac % 4;
		ok = (fwrite(&ct, 1, 1, fp) == 1);
		xassert(ok, "fail to write the .pac file.");
		// close .pac file; buffered data is flushed here, so check it too
		ok = (fclose(fp) == 0);
		xassert(ok, "fail to close the .pac file.");
	}
	bns_dump(bns, prefix);
	bns_destroy(bns);
	kseq_destroy(seq);
}
int64_t R_bns_fasta2bntseq(gzFile fp_fa, const char *prefix) { // extern void seq_reverse(int len, ubyte_t *seq, int is_comp); // in bwaseqio.c kseq_t *seq; char name[1024]; bntseq_t *bns; uint8_t *pac = 0; uint8_t *reverse_pac = NULL; int32_t m_seqs, m_holes; int64_t ret = -1, m_pac, l, m_r_pac; bntamb1_t *q; FILE *fp, *fp_r; int i; // initialization seq = kseq_init(fp_fa); bns = (bntseq_t*)calloc(1, sizeof(bntseq_t)); bns->seed = 11; // fixed seed for random generator srand48(bns->seed); m_seqs = m_holes = 8; m_pac = 0x10000; bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t)); bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t)); pac = calloc(m_pac/NT_PER_BYTE, 1); q = bns->ambs; strcpy(name, prefix); strcat(name, ".pac"); fp = xopen(name, "wb"); memset(name, '\0', 1024);strcpy(name, prefix); strcat(name, ".rpac"); fp_r = xopen(name, "wb"); // read sequences while (kseq_read(seq) >= 0) pac = R_add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q); // add the reverse complemented sequence ret = bns->l_pac; fprintf(stderr, "[R_bns_fasta2bntseq]:reverse_pac!\n"); m_r_pac = (bns->l_pac +NT_PER_BYTE-1)/NT_PER_BYTE *NT_PER_BYTE; reverse_pac = calloc(m_r_pac/NT_PER_BYTE, sizeof(uint8_t)); for(l = bns->l_pac-1, i =0; l>=0, i < bns->l_pac; --l, ++i) { _set_pac(reverse_pac, i, _get_pac(pac, l)); } ubyte_t ct; { // finalize .pac and file fwrite(pac, 1, (bns->l_pac>>1) + ((bns->l_pac&1) == 0? 0 : 1), fp); // the following codes make the pac file size always (l_pac/4+1+1) if (bns->l_pac % NT_PER_BYTE == 0) { ct = 0; fwrite(&ct, 1, 1, fp); } ct = bns->l_pac % NT_PER_BYTE; fwrite(&ct, 1, 1, fp); // close .pac file fclose(fp); } { // finalize .rpac and file fwrite(reverse_pac, 1, (bns->l_pac>>1) + ((bns->l_pac&1) == 0? 
0 : 1), fp_r); // the following codes make the pac file size always (l_pac/4+1+1) if (bns->l_pac % NT_PER_BYTE == 0) { ct = 0; fwrite(&ct, 1, 1, fp_r); } ct = bns->l_pac % NT_PER_BYTE; fwrite(&ct, 1, 1, fp_r); // close .rpac file fclose(fp_r); } bns_dump(bns, prefix); bns_destroy(bns); kseq_destroy(seq); free(pac); free(reverse_pac); return ret; }