// Convert the given reference nucleotide FASTA and GFF file to a protein FASTA file int writeIndexProtein(const char * passPrefix, const char * passProName, const char * passAnnName) { struct CDS currentCDS; gzFile inputSeqPtr; FILE * inputAnnPtr, * outputPtr; char * outputBuffer; kseq_t * seq; unsigned long outputSize, currentLine; // Prepare file handles inputSeqPtr = xzopen(passPrefix, "r"); inputAnnPtr = fopen(passAnnName, "r"); outputPtr = fopen(passProName, "w"); // Read in 1st sequence data seq = kseq_init(inputSeqPtr); kseq_read(seq); // Iterate through each CDS sequence in the annotation file currentLine = 0; while (getNextCDS(inputAnnPtr, ¤tCDS, ¤tLine)) { convertToAA(seq->seq.s, ¤tCDS, &outputBuffer, &outputSize); fprintf(outputPtr, ">%d\n%.*s\n", currentLine, outputSize, outputBuffer); free(outputBuffer); } // Close files fflush(outputPtr); err_gzclose(inputSeqPtr); err_fclose(inputAnnPtr); err_fclose(outputPtr); return 0; }
static seqs_t *load_seqs(const char *fn) { seqs_t *s; seq1_t *p; gzFile fp; int l; kseq_t *seq; fp = xzopen(fn, "r"); seq = kseq_init(fp); s = (seqs_t*)calloc(1, sizeof(seqs_t)); s->m_seqs = 256; s->seqs = (seq1_t*)calloc(s->m_seqs, sizeof(seq1_t)); while ((l = kseq_read(seq)) >= 0) { if (s->n_seqs == s->m_seqs) { s->m_seqs <<= 1; s->seqs = (seq1_t*)realloc(s->seqs, s->m_seqs * sizeof(seq1_t)); } p = s->seqs + (s->n_seqs++); p->l = seq->seq.l; p->s = (unsigned char*)malloc(p->l + 1); memcpy(p->s, seq->seq.s, p->l); p->s[p->l] = 0; p->n = strdup((const char*)seq->name.s); } kseq_destroy(seq); gzclose(fp); fprintf(stderr, "[load_seqs] %d sequences are loaded.\n", s->n_seqs); return s; }
/*
 * Allocate a zero-initialised bwa_seqio_t and attach a kseq reader for the
 * file `fn`, which is opened for reading through xzopen().
 *
 * Returns the new handle, or NULL when the handle cannot be allocated; in
 * that case the file is not opened.
 */
bwa_seqio_t *bwa_seq_open(const char *fn)
{
	gzFile fp;
	bwa_seqio_t *bs;

	bs = (bwa_seqio_t*)calloc(1, sizeof(bwa_seqio_t));
	if (bs == NULL)
		return NULL; /* the original dereferenced the unchecked result */

	fp = xzopen(fn, "r");
	bs->ks = kseq_init(fp);
	return bs;
}
/*
 * Open `path` with xzopen() in the given `mode` and wrap the resulting
 * handle in an FD_t whose top I/O layer is xzdio.
 *
 * Returns the linked FD_t, or NULL when xzopen() fails.
 */
static FD_t xzdOpen(const char * path, const char * mode)
{
    LZFILE *handle = xzopen(path, mode);
    FD_t fd;

    if (handle == NULL)
        return NULL;

    fd = fdNew(path);
    /* swap the layer fdNew() installed for the xzdio one carrying our handle */
    fdPop(fd);
    fdPush(fd, xzdio, handle, -1);

    return fdLink(fd);
}
/*
 * `bwa fa2pac <in.fasta> [<out.prefix>]`
 *
 * Opens argv[1] through xzopen() and hands it to bns_fasta2bntseq() together
 * with the output prefix: argv[2] when given, otherwise argv[1] itself.
 *
 * Returns 1 after printing the usage line when no input is given, else 0.
 */
int bwa_fa2pac(int argc, char *argv[])
{
	char *out_prefix;
	gzFile fasta;

	if (argc < 2) {
		fprintf(stderr, "Usage: bwa fa2pac <in.fasta> [<out.prefix>]\n");
		return 1;
	}

	if (argc >= 3)
		out_prefix = argv[2];
	else
		out_prefix = argv[1];

	fasta = xzopen(argv[1], "r");
	bns_fasta2bntseq(fasta, out_prefix);
	gzclose(fasta);

	return 0;
}
/*@-globuse@*/ static /*@null@*/ FD_t xzdOpen(const char * path, const char * fmode) /*@globals fileSystem @*/ /*@modifies fileSystem @*/ { FD_t fd; mode_t mode = (fmode && fmode[0] == 'w' ? O_WRONLY : O_RDONLY); XZFILE * xzfile = xzopen(path, fmode); if (xzfile == NULL) return NULL; fd = fdNew("open (xzdOpen)"); fdPop(fd); fdPush(fd, xzdio, xzfile, -1); fdSetOpen(fd, path, fileno(xzfile->fp), mode); return fdLink(fd, "xzdOpen"); }
/*
 * `bwa fa2pac [-f] <in.fasta> [<out.prefix>]`
 *
 * Opens the input through xzopen() and passes it to bns_fasta2bntseq() with
 * the output prefix (the second positional argument when present, otherwise
 * the input name) and the -f flag as its last argument.
 *
 * Returns 1 after printing the usage line when no input is given, else 0.
 */
int bwa_fa2pac(int argc, char *argv[])
{
	int opt, forward_only = 0;
	char *in_name, *out_prefix;
	gzFile fasta;

	while ((opt = getopt(argc, argv, "f")) >= 0) {
		/* any other value returned by getopt() is ignored */
		if (opt == 'f')
			forward_only = 1;
	}

	if (optind == argc) {
		fprintf(stderr, "Usage: bwa fa2pac [-f] <in.fasta> [<out.prefix>]\n");
		return 1;
	}

	in_name = argv[optind];
	if (optind + 1 < argc)
		out_prefix = argv[optind + 1];
	else
		out_prefix = in_name;

	fasta = xzopen(in_name, "r");
	bns_fasta2bntseq(fasta, out_prefix, forward_only);
	err_gzclose(fasta);

	return 0;
}
ts_file_t *ts_fopen(const char *filepath, const char *mode) { ts_file_t *qfile=0; int len=0; TS_TRY(filepath); TS_TRY(*filepath); TS_TRY(mode); TS_TRY(*mode); if(filepath){ len=strlen(filepath); } TS_TRY( qfile=calloc(1, sizeof(ts_file_t)) ); if( ts_file_is_gz(filepath) ) { qfile->type = TS_FILE_ZLB; } else if( ts_file_is_xz(filepath) ) { qfile->type = TS_FILE_XZ; } else { qfile->type = TS_FILE_STD; } switch(qfile->type){ case TS_FILE_STD: TS_TRY( qfile->fp.std = fopen(filepath, mode)); break; case TS_FILE_ZLB: TS_TRY( qfile->fp.zlb = gzopen(filepath, mode)); break; case TS_FILE_XZ: TS_TRY( qfile->fp.xz = xzopen(filepath, mode)); break; default: (void)ts_warn(stderr, "\n"); goto fail; } return(qfile); fail: if(qfile) { free(qfile); qfile=0; } return(0); }
// Detects ORFs in the given nucleotide FASTA file and converts to a protein FASTA file int writeReadsProtein(const char * passPrefix, const char * passProName) { struct CDS * orfList; gzFile inputSeqPtr; FILE * outputPtr; char * outputBuffer; kseq_t * seq; unsigned long seqIdx, outputSize, orfCount, orfIdx; char testHeader[4096]; // Prepare file handles inputSeqPtr = xzopen(passPrefix, "r"); outputPtr = fopen(passProName, "w"); // Iterate through each read seqIdx = 0; seq = kseq_init(inputSeqPtr); while(kseq_read(seq) >= 0) { // Search for ORFs getSequenceORF(seq->seq.s, seq->seq.l, &orfList, &orfCount); //if (orfCount > 0) testReadHeader(seq->name.s, testHeader); // Write out the corresponding protein sequence for each ORF for (orfIdx = 0 ; orfIdx < orfCount ; orfIdx++) { convertToAA(seq->seq.s, orfList+orfIdx, &outputBuffer, &outputSize); fprintf(outputPtr, ">%d:%d\n%.*s\n", seqIdx, orfIdx, outputSize, outputBuffer); free(outputBuffer); } seqIdx++; } // Close files fflush(outputPtr); err_gzclose(inputSeqPtr); err_fclose(outputPtr); return 0; }
int bwa_index(int argc, char *argv[]) { char *prefix = 0, *str, *str2, *str3; int c, algo_type = 0, is_color = 0, is_64 = 0; clock_t t; int64_t l_pac; while ((c = getopt(argc, argv, "6ca:p:")) >= 0) { switch (c) { case 'a': // if -a is not set, algo_type will be determined later if (strcmp(optarg, "div") == 0) algo_type = 1; else if (strcmp(optarg, "bwtsw") == 0) algo_type = 2; else if (strcmp(optarg, "is") == 0) algo_type = 3; else err_fatal(__func__, "unknown algorithm: '%s'.", optarg); break; case 'p': prefix = strdup(optarg); break; case 'c': is_color = 1; break; case '6': is_64 = 1; break; default: return 1; } } if (optind + 1 > argc) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: bwa index [-a bwtsw|is] [-c] <in.fasta>\n\n"); fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw or is [auto]\n"); fprintf(stderr, " -p STR prefix of the index [same as fasta name]\n"); fprintf(stderr, " -6 index files named as <in.fasta>.64.* instead of <in.fasta>.* \n"); // fprintf(stderr, " -c build color-space index\n"); fprintf(stderr, "\n"); fprintf(stderr, "Warning: `-a bwtsw' does not work for short genomes, while `-a is' and\n"); fprintf(stderr, " `-a div' do not work not for long genomes. Please choose `-a'\n"); fprintf(stderr, " according to the length of the genome.\n\n"); return 1; } if (prefix == 0) { prefix = malloc(strlen(argv[optind]) + 4); strcpy(prefix, argv[optind]); if (is_64) strcat(prefix, ".64"); } str = (char*)calloc(strlen(prefix) + 10, 1); str2 = (char*)calloc(strlen(prefix) + 10, 1); str3 = (char*)calloc(strlen(prefix) + 10, 1); if (is_color == 0) { // nucleotide indexing gzFile fp = xzopen(argv[optind], "r"); t = clock(); fprintf(stderr, "[bwa_index] Pack FASTA... 
"); l_pac = bns_fasta2bntseq(fp, prefix, 0); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); gzclose(fp); } else { // color indexing gzFile fp = xzopen(argv[optind], "r"); strcat(strcpy(str, prefix), ".nt"); t = clock(); fprintf(stderr, "[bwa_index] Pack nucleotide FASTA... "); l_pac = bns_fasta2bntseq(fp, str, 0); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); gzclose(fp); { char *tmp_argv[3]; tmp_argv[0] = argv[0]; tmp_argv[1] = str; tmp_argv[2] = prefix; t = clock(); fprintf(stderr, "[bwa_index] Convert nucleotide PAC to color PAC... "); bwa_pac2cspac(3, tmp_argv); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); } } if (algo_type == 0) algo_type = l_pac > 50000000? 2 : 3; // set the algorithm for generating BWT { strcpy(str, prefix); strcat(str, ".pac"); strcpy(str2, prefix); strcat(str2, ".bwt"); t = clock(); fprintf(stderr, "[bwa_index] Construct BWT for the packed sequence...\n"); if (algo_type == 2) bwt_bwtgen(str, str2); else if (algo_type == 1 || algo_type == 3) { bwt_t *bwt; bwt = bwt_pac2bwt(str, algo_type == 3); bwt_dump_bwt(str2, bwt); bwt_destroy(bwt); } fprintf(stderr, "[bwa_index] %.2f seconds elapse.\n", (float)(clock() - t) / CLOCKS_PER_SEC); } { bwt_t *bwt; strcpy(str, prefix); strcat(str, ".bwt"); t = clock(); fprintf(stderr, "[bwa_index] Update BWT... "); bwt = bwt_restore_bwt(str); bwt_bwtupdate_core(bwt); bwt_dump_bwt(str, bwt); bwt_destroy(bwt); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); } { gzFile fp = xzopen(argv[optind], "r"); t = clock(); fprintf(stderr, "[bwa_index] Pack forward-only FASTA... "); l_pac = bns_fasta2bntseq(fp, prefix, 1); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); gzclose(fp); } { bwt_t *bwt; strcpy(str, prefix); strcat(str, ".bwt"); strcpy(str3, prefix); strcat(str3, ".sa"); t = clock(); fprintf(stderr, "[bwa_index] Construct SA from BWT and Occ... 
"); bwt = bwt_restore_bwt(str); bwt_cal_sa(bwt, 32); bwt_dump_sa(str3, bwt); bwt_destroy(bwt); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); } free(str3); free(str2); free(str); free(prefix); return 0; }