// Convert the given reference nucleotide FASTA and GFF file to a protein FASTA file int writeIndexProtein(const char * passPrefix, const char * passProName, const char * passAnnName) { struct CDS currentCDS; gzFile inputSeqPtr; FILE * inputAnnPtr, * outputPtr; char * outputBuffer; kseq_t * seq; unsigned long outputSize, currentLine; // Prepare file handles inputSeqPtr = xzopen(passPrefix, "r"); inputAnnPtr = fopen(passAnnName, "r"); outputPtr = fopen(passProName, "w"); // Read in 1st sequence data seq = kseq_init(inputSeqPtr); kseq_read(seq); // Iterate through each CDS sequence in the annotation file currentLine = 0; while (getNextCDS(inputAnnPtr, ¤tCDS, ¤tLine)) { convertToAA(seq->seq.s, ¤tCDS, &outputBuffer, &outputSize); fprintf(outputPtr, ">%d\n%.*s\n", currentLine, outputSize, outputBuffer); free(outputBuffer); } // Close files fflush(outputPtr); err_gzclose(inputSeqPtr); err_fclose(inputAnnPtr); err_fclose(outputPtr); return 0; }
void bns_dump(const bntseq_t *bns, const char *prefix) { char str[1024]; FILE *fp; int i; { // dump .ann strcpy(str, prefix); strcat(str, ".ann"); fp = xopen(str, "w"); err_fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->seed); for (i = 0; i != bns->n_seqs; ++i) { bntann1_t *p = bns->anns + i; err_fprintf(fp, "%d %s", p->gi, p->name); if (p->anno[0]) err_fprintf(fp, " %s\n", p->anno); else err_fprintf(fp, "\n"); err_fprintf(fp, "%lld %d %d\n", (long long)p->offset, p->len, p->n_ambs); } err_fflush(fp); err_fclose(fp); } { // dump .amb strcpy(str, prefix); strcat(str, ".amb"); fp = xopen(str, "w"); err_fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->n_holes); for (i = 0; i != bns->n_holes; ++i) { bntamb1_t *p = bns->ambs + i; err_fprintf(fp, "%lld %d %c\n", (long long)p->offset, p->len, p->amb); } err_fflush(fp); err_fclose(fp); } }
int main(int argc, char *argv[]) { int i, ret; double t_real; kstring_t pg = {0,0,0}; t_real = realtime(); ksprintf(&pg, "@PG\tID:bwa\tPN:bwa\tVN:%s\tCL:%s", PACKAGE_VERSION, argv[0]); for (i = 1; i < argc; ++i) ksprintf(&pg, " %s", argv[i]); bwa_pg = pg.s; if (argc < 2) return usage(); if (strcmp(argv[1], "fa2pac") == 0) ret = bwa_fa2pac(argc-1, argv+1); else if (strcmp(argv[1], "pac2bwt") == 0) ret = bwa_pac2bwt(argc-1, argv+1); else if (strcmp(argv[1], "pac2bwtgen") == 0) ret = bwt_bwtgen_main(argc-1, argv+1); else if (strcmp(argv[1], "bwtupdate") == 0) ret = bwa_bwtupdate(argc-1, argv+1); else if (strcmp(argv[1], "bwt2sa") == 0) ret = bwa_bwt2sa(argc-1, argv+1); else if (strcmp(argv[1], "index") == 0) ret = bwa_index(argc-1, argv+1); else if (strcmp(argv[1], "aln") == 0) ret = bwa_aln(argc-1, argv+1); else if (strcmp(argv[1], "samse") == 0) ret = bwa_sai2sam_se(argc-1, argv+1); else if (strcmp(argv[1], "sampe") == 0) ret = bwa_sai2sam_pe(argc-1, argv+1); else if (strcmp(argv[1], "bwtsw2") == 0) ret = bwa_bwtsw2(argc-1, argv+1); else if (strcmp(argv[1], "dbwtsw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); else if (strcmp(argv[1], "fastmap") == 0) ret = main_fastmap(argc-1, argv+1); else if (strcmp(argv[1], "mem") == 0) ret = main_mem(argc-1, argv+1); else if (strcmp(argv[1], "pemerge") == 0) ret = main_pemerge(argc-1, argv+1); else { fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]); return 1; } #ifdef USE_HTSLIB if (strcmp(argv[1], "mem") != 0) { err_fflush(stdout); err_fclose(stdout); } #else err_fflush(stdout); err_fclose(stdout); #endif if (ret == 0) { fprintf(stderr, "[%s] Version: %s\n", __func__, PACKAGE_VERSION); fprintf(stderr, "[%s] CMD:", __func__); for (i = 0; i < argc; ++i) fprintf(stderr, " %s", argv[i]); fprintf(stderr, "\n[%s] Real time: %.3f sec; CPU: %.3f sec\n", __func__, realtime() - t_real, cputime()); } free(bwa_pg); return ret; }
int main(int argc, char *argv[]) { extern char *bwa_pg; int i, ret; double t_real; kstring_t pg = {0,0,0}; t_real = realtime(); ksprintf(&pg, "@PG\tID:biscuit\tPN:biscuit\tVN:%s\tCL:%s", PACKAGE_VERSION, argv[0]); for (i = 1; i < argc; ++i) ksprintf(&pg, " %s", argv[i]); bwa_pg = pg.s; if (argc < 2) return usage(); if (strcmp(argv[1], "index") == 0) ret = main_biscuit_index(argc-1, argv+1); else if (strcmp(argv[1], "align") == 0) ret = main_align(argc-1, argv+1); else if (strcmp(argv[1], "pileup") == 0) ret = main_pileup(argc-1, argv+1); else if (strcmp(argv[1], "somatic") == 0) ret = main_somatic(argc-1, argv+1); else { fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]); return 1; } err_fflush(stdout); err_fclose(stdout); if (ret == 0) { fprintf(stderr, "[%s] Version: %s\n", __func__, PACKAGE_VERSION); fprintf(stderr, "[%s] CMD:", __func__); for (i = 0; i < argc; ++i) fprintf(stderr, " %s", argv[i]); fprintf(stderr, "\n[%s] Real time: %.3f sec; CPU: %.3f sec\n", __func__, realtime() - t_real, cputime()); } free(bwa_pg); return ret; }
/*
 * Read every record from the FASTA stream fp_fa, pack it with add1(), and
 * write the packed sequence to "<prefix>.pac"; the annotations are then
 * written through bns_dump(bns, prefix).
 *
 * fp_fa    - input stream handed to kseq_init()
 * prefix   - output path prefix
 * for_only - non-zero: store only the sequence as read; zero: append the
 *            reversed sequence with every 2-bit code c replaced by 3-c
 *
 * The .pac file holds ceil(l_pac/4) packed bytes, then a zero byte when
 * l_pac is a multiple of 4, then one byte holding l_pac % 4.
 *
 * Returns the final l_pac (doubled when for_only is zero). Terminates the
 * process if the output path or the enlarged buffer cannot be allocated.
 */
int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only)
{
	kseq_t *seq;
	char *name;
	size_t name_size;
	bntseq_t *bns;
	uint8_t *pac = 0;
	int32_t m_seqs, m_holes;
	int64_t ret = -1, m_pac, l;
	bntamb1_t *q;
	FILE *fp;

	// initialization
	seq = kseq_init(fp_fa);
	bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
	bns->seed = 11; // fixed seed for random generator
	srand48(bns->seed);
	m_seqs = m_holes = 8; m_pac = 0x10000;
	bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t));
	bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t));
	pac = calloc(m_pac/4, 1);
	q = bns->ambs;

	// size the output path from the prefix so a long prefix cannot overflow it
	name_size = strlen(prefix) + sizeof(".pac");
	name = malloc(name_size);
	if (name == NULL) {
		perror(__func__);
		exit(EXIT_FAILURE);
	}
	snprintf(name, name_size, "%s.pac", prefix);
	fp = xopen(name, "wb");

	// read sequences; add1() hands back the buffer to keep using
	while (kseq_read(seq) >= 0) pac = add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q);

	// add the reverse complemented sequence (nothing to add for empty input,
	// which also avoids a zero-sized realloc)
	if (!for_only && bns->l_pac > 0) {
		uint8_t *grown;
		m_pac = (bns->l_pac * 2 + 3) / 4 * 4;
		// keep the old pointer until the resize is known to have succeeded
		grown = realloc(pac, m_pac/4);
		if (grown == NULL) {
			perror(__func__);
			exit(EXIT_FAILURE);
		}
		pac = grown;
		// clear the bytes past those already occupied by the forward sequence
		memset(pac + (bns->l_pac+3)/4, 0, (m_pac - (bns->l_pac+3)/4*4) / 4);
		// walk the forward part backwards, appending at the growing l_pac
		for (l = bns->l_pac - 1; l >= 0; --l, ++bns->l_pac)
			_set_pac(pac, bns->l_pac, 3-_get_pac(pac, l));
	}
	ret = bns->l_pac;

	{ // finalize .pac file
		ubyte_t ct;
		err_fwrite(pac, 1, (bns->l_pac>>2) + ((bns->l_pac&3) == 0? 0 : 1), fp);
		// the following codes make the pac file size always (l_pac/4+1+1)
		if (bns->l_pac % 4 == 0) {
			ct = 0;
			err_fwrite(&ct, 1, 1, fp);
		}
		ct = bns->l_pac % 4;
		err_fwrite(&ct, 1, 1, fp);
		// close .pac file
		err_fflush(fp);
		err_fclose(fp);
	}

	bns_dump(bns, prefix);
	bns_destroy(bns);
	kseq_destroy(seq);
	free(pac);
	free(name);
	return ret;
}
/*
 * Read every record from the FASTA stream fp_fa, pack it with add1(), and
 * write the packed sequence as read (no reversed copy) to
 * "<prefix>.bis.pac". No .ann/.amb files are written by this function.
 *
 * The .pac file holds ceil(l_pac/4) packed bytes, then a zero byte when
 * l_pac is a multiple of 4, then one byte holding l_pac % 4.
 *
 * Returns l_pac as accumulated by add1(). Terminates the process if the
 * output path cannot be allocated.
 */
int64_t dump_forward_pac(gzFile fp_fa, const char *prefix)
{
	kseq_t *seq;
	char *name;
	size_t name_size;
	bntseq_t *bns;
	uint8_t *pac = 0;
	int32_t m_seqs, m_holes;
	int64_t ret = -1, m_pac;
	bntamb1_t *q;
	FILE *fp;

	// initialization
	seq = kseq_init(fp_fa);
	bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
	bns->seed = 11; // fixed seed for random generator
	srand48(bns->seed);
	m_seqs = m_holes = 8; m_pac = 0x10000;
	bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t));
	bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t));
	pac = calloc(m_pac/4, 1);
	q = bns->ambs;

	// size the output path from the prefix so a long prefix cannot overflow it
	name_size = strlen(prefix) + sizeof(".bis.pac");
	name = malloc(name_size);
	if (name == NULL) {
		perror(__func__);
		exit(EXIT_FAILURE);
	}
	snprintf(name, name_size, "%s.bis.pac", prefix);
	fp = xopen(name, "wb");

	// read sequences; add1() hands back the buffer to keep using
	while (kseq_read(seq) >= 0) pac = add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q);
	ret = bns->l_pac;

	{ // finalize .pac file
		ubyte_t ct;
		err_fwrite(pac, 1, (bns->l_pac>>2) + ((bns->l_pac&3) == 0? 0 : 1), fp);
		// the following codes make the pac file size always (l_pac/4+1+1)
		if (bns->l_pac % 4 == 0) {
			ct = 0;
			err_fwrite(&ct, 1, 1, fp);
		}
		ct = bns->l_pac % 4;
		err_fwrite(&ct, 1, 1, fp);
		// close .pac file
		err_fflush(fp);
		err_fclose(fp);
	}

	bns_destroy(bns);
	kseq_destroy(seq);
	free(pac);
	free(name);
	return ret;
}
/*
 * Release everything owned by bns: the .pac file handle (if open), the
 * ambs array, the name/anno strings of each of the n_seqs annotations,
 * the anns array, and finally bns itself. A null bns is ignored.
 */
void bns_destroy(bntseq_t *bns)
{
	int idx;

	if (bns == 0) return;

	if (bns->fp_pac) err_fclose(bns->fp_pac);
	free(bns->ambs);

	// per-sequence strings must go before the array that holds them
	idx = 0;
	while (idx < bns->n_seqs) {
		free(bns->anns[idx].name);
		free(bns->anns[idx].anno);
		++idx;
	}
	free(bns->anns);
	free(bns);
}
int main(int argc, char *argv[]) { int i, ret; double t_real; t_real = realtime(); if (argc < 2) return usage(); /* if (strcmp(argv[1], "fa2pac") == 0) ret = bwa_fa2pac(argc-1, argv+1); */ /* else if (strcmp(argv[1], "pac2bwt") == 0) ret = bwa_pac2bwt(argc-1, argv+1); * else if (strcmp(argv[1], "pac2bwtgen") == 0) ret = bwt_bwtgen_main(argc-1, argv+1); * else if (strcmp(argv[1], "bwtupdate") == 0) ret = bwa_bwtupdate(argc-1, argv+1); * else if (strcmp(argv[1], "bwt2sa") == 0) ret = bwa_bwt2sa(argc-1, argv+1); */ else if (strcmp(argv[1], "index") == 0) ret = bwa_index_main(argc-1, argv+1); else if (strcmp(argv[1], "aln") == 0) ret = bwa_aln(argc-1, argv+1); /* else if (strcmp(argv[1], "samse") == 0) ret = bwa_sai2sam_se(argc-1, argv+1); */ /* else if (strcmp(argv[1], "sw") == 0) ret = bwa_stdsw(argc-1, argv+1); * * else if (strcmp(argv[1], "sampe") == 0) ret = bwa_sai2sam_pe(argc-1, argv+1); * else if (strcmp(argv[1], "pac2cspac") == 0) ret = bwa_pac2cspac(argc-1, argv+1); * else if (strcmp(argv[1], "stdsw") == 0) ret = bwa_stdsw(argc-1, argv+1); * else if (strcmp(argv[1], "bwtsw2") == 0) ret = bwa_bwtsw2(argc-1, argv+1); * else if (strcmp(argv[1], "dbwtsw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); * else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); * else if (strcmp(argv[1], "fastmap") == 0) ret = main_fastmap(argc-1, argv+1); */ else { fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]); return 1; } err_fflush(stdout); err_fclose(stdout); if (ret == 0) { fprintf(stderr, "[%s] Version: %s\n", __func__, PACKAGE_VERSION); fprintf(stderr, "[%s] CMD:", __func__); for (i = 0; i < argc; ++i) fprintf(stderr, " %s", argv[i]); fprintf(stderr, "\n[%s] Real time: %.3f sec; CPU: %.3f sec\n", __func__, realtime() - t_real, cputime()); } return 0; }
// Detects ORFs in the given nucleotide FASTA file and converts to a protein FASTA file int writeReadsProtein(const char * passPrefix, const char * passProName) { struct CDS * orfList; gzFile inputSeqPtr; FILE * outputPtr; char * outputBuffer; kseq_t * seq; unsigned long seqIdx, outputSize, orfCount, orfIdx; char testHeader[4096]; // Prepare file handles inputSeqPtr = xzopen(passPrefix, "r"); outputPtr = fopen(passProName, "w"); // Iterate through each read seqIdx = 0; seq = kseq_init(inputSeqPtr); while(kseq_read(seq) >= 0) { // Search for ORFs getSequenceORF(seq->seq.s, seq->seq.l, &orfList, &orfCount); //if (orfCount > 0) testReadHeader(seq->name.s, testHeader); // Write out the corresponding protein sequence for each ORF for (orfIdx = 0 ; orfIdx < orfCount ; orfIdx++) { convertToAA(seq->seq.s, orfList+orfIdx, &outputBuffer, &outputSize); fprintf(outputPtr, ">%d:%d\n%.*s\n", seqIdx, orfIdx, outputSize, outputBuffer); free(outputBuffer); } seqIdx++; } // Close files fflush(outputPtr); err_gzclose(inputSeqPtr); err_fclose(outputPtr); return 0; }
bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename) { char str[8192]; FILE *fp; const char *fname; bntseq_t *bns; long long xx; int i; int scanres; bns = (bntseq_t*)calloc(1, sizeof(bntseq_t)); { // read .ann fp = xopen(fname = ann_filename, "r"); scanres = fscanf(fp, "%lld%d%u", &xx, &bns->n_seqs, &bns->seed); if (scanres != 3) goto badread; bns->l_pac = xx; bns->anns = (bntann1_t*)calloc(bns->n_seqs, sizeof(bntann1_t)); for (i = 0; i < bns->n_seqs; ++i) { bntann1_t *p = bns->anns + i; char *q = str; int c; // read gi and sequence name scanres = fscanf(fp, "%u%s", &p->gi, str); if (scanres != 2) goto badread; p->name = strdup(str); // read fasta comments while (q - str < sizeof(str) - 1 && (c = fgetc(fp)) != '\n' && c != EOF) *q++ = c; while (c != '\n' && c != EOF) c = fgetc(fp); if (c == EOF) { scanres = EOF; goto badread; } *q = 0; if (q - str > 1 && strcmp(str, " (null)") != 0) p->anno = strdup(str + 1); // skip leading space else p->anno = strdup(""); // read the rest scanres = fscanf(fp, "%lld%d%d", &xx, &p->len, &p->n_ambs); if (scanres != 3) goto badread; p->offset = xx; } err_fclose(fp); } { // read .amb int64_t l_pac; int32_t n_seqs; fp = xopen(fname = amb_filename, "r"); scanres = fscanf(fp, "%lld%d%d", &xx, &n_seqs, &bns->n_holes); if (scanres != 3) goto badread; l_pac = xx; xassert(l_pac == bns->l_pac && n_seqs == bns->n_seqs, "inconsistent .ann and .amb files."); bns->ambs = bns->n_holes? (bntamb1_t*)calloc(bns->n_holes, sizeof(bntamb1_t)) : 0; for (i = 0; i < bns->n_holes; ++i) { bntamb1_t *p = bns->ambs + i; scanres = fscanf(fp, "%lld%d%s", &xx, &p->len, str); if (scanres != 3) goto badread; p->offset = xx; p->amb = str[0]; } err_fclose(fp); } { // open .pac bns->fp_pac = xopen(pac_filename, "rb"); } return bns; badread: if (EOF == scanres) { err_fatal(__func__, "Error reading %s : %s\n", fname, ferror(fp) ? 
strerror(errno) : "Unexpected end of file"); } err_fatal(__func__, "Parse error reading %s\n", fname); }
/*
 * Rewind the FASTA stream fp_fa, pack every record with bis_add1(), and
 * write a converted double-length sequence to "<prefix>.par.pac" (parent
 * non-zero) or "<prefix>.dau.pac" (parent zero).
 *
 * The output is the forward sequence followed by the reversed sequence with
 * every 2-bit code c replaced by 3-c. In both halves one code is then
 * collapsed onto another: 1 becomes 3 when parent is non-zero, 2 becomes 0
 * otherwise (presumably C->T and G->A under an A=0,C=1,G=2,T=3 encoding -
 * confirm against the packing table).
 *
 * The .pac file holds ceil(l/4) packed bytes, then a zero byte when l is a
 * multiple of 4, then one byte holding l % 4, where l is twice l_pac.
 * When parent is non-zero the annotations are also written through
 * bis_bns_dump(bns, prefix).
 *
 * Returns l, the number of bases written. Terminates the process if the
 * output path cannot be allocated.
 */
int64_t bis_bns_fasta2bntseq(gzFile fp_fa, const char *prefix, uint8_t parent)
{
	const char *suffix = parent ? ".par.pac" : ".dau.pac";
	kseq_t *seq;
	char *name;
	size_t name_size;
	bntseq_t *bns;
	uint8_t *pac = 0, *fwd_pac = 0;
	int32_t m_seqs, m_holes;
	int64_t m_pac, l, k;
	bntamb1_t *q;
	FILE *fp;

	// initialization
	gzseek(fp_fa, 0, SEEK_SET);
	seq = kseq_init(fp_fa);
	bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
	bns->seed = 11; // fixed seed for random generator
	srand48(bns->seed);
	m_seqs = m_holes = 8; m_pac = 0x10000;
	bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t));
	bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t));
	fwd_pac = calloc(m_pac/4, 1);
	q = bns->ambs;

	// size the output path from the prefix so a long prefix cannot overflow it
	name_size = strlen(prefix) + strlen(suffix) + 1;
	name = malloc(name_size);
	if (name == NULL) {
		perror(__func__);
		exit(EXIT_FAILURE);
	}
	snprintf(name, name_size, "%s%s", prefix, suffix);
	fp = xopen(name, "wb");

	// read sequences; bis_add1() hands back the buffer to keep using
	while (kseq_read(seq) >= 0) {
		fwd_pac = bis_add1(seq, bns, fwd_pac, &m_pac, &m_seqs, &m_holes, &q);
	}

	// room for both strands, rounded up so that m_pac/4 is a whole byte count
	m_pac = (bns->l_pac*2+3)/4*4;
	pac = calloc(m_pac/4,sizeof(uint8_t));

	// forward half: copy with the conversion applied
	for (l=0; l<bns->l_pac; ++l) {
		uint8_t c = _get_pac(fwd_pac,l);
		if (parent && c == 1) c = 3;
		if (!parent && c == 2) c = 0;
		_set_pac(pac, l, c);
	}
	// reverse half: walk the forward sequence backwards, complement, convert;
	// l keeps counting so it ends at twice l_pac
	for (k=bns->l_pac-1; k>=0; --k,++l) {
		uint8_t c = 3-_get_pac(fwd_pac,k);
		if (parent && c == 1) c = 3;
		if (!parent && c == 2) c = 0;
		_set_pac(pac, l, c);
	}
	free(fwd_pac);

	assert(bns->l_pac<<1 == l);

	{ // finalize .pac file
		ubyte_t ct;
		err_fwrite(pac, 1, (l>>2) + ((l&3) == 0? 0 : 1), fp);
		// the following codes make the pac file size always (l_pac/4+1+1)
		if (l % 4 == 0) {
			ct = 0;
			err_fwrite(&ct, 1, 1, fp);
		}
		ct = l % 4;
		err_fwrite(&ct, 1, 1, fp);
		// close .pac file
		err_fflush(fp);
		err_fclose(fp);
	}

	if (parent) bis_bns_dump(bns, prefix);
	bns_destroy(bns);
	kseq_destroy(seq);
	free(pac);
	free(name);
	return l;
}