// Convert the given reference nucleotide FASTA and GFF file to a protein FASTA file int writeIndexProtein(const char * passPrefix, const char * passProName, const char * passAnnName) { struct CDS currentCDS; gzFile inputSeqPtr; FILE * inputAnnPtr, * outputPtr; char * outputBuffer; kseq_t * seq; unsigned long outputSize, currentLine; // Prepare file handles inputSeqPtr = xzopen(passPrefix, "r"); inputAnnPtr = fopen(passAnnName, "r"); outputPtr = fopen(passProName, "w"); // Read in 1st sequence data seq = kseq_init(inputSeqPtr); kseq_read(seq); // Iterate through each CDS sequence in the annotation file currentLine = 0; while (getNextCDS(inputAnnPtr, ¤tCDS, ¤tLine)) { convertToAA(seq->seq.s, ¤tCDS, &outputBuffer, &outputSize); fprintf(outputPtr, ">%d\n%.*s\n", currentLine, outputSize, outputBuffer); free(outputBuffer); } // Close files fflush(outputPtr); err_gzclose(inputSeqPtr); err_fclose(inputAnnPtr); err_fclose(outputPtr); return 0; }
// Close the input behind a sequence reader and release the reader itself.
// A null reader is accepted and ignored.
void bwa_seq_close(bwa_seqio_t *bs)
{
	if (!bs) return;

	if (!bs->is_bam) {
		// Text input: close the gz stream held by the kseq parser, then the parser
		err_gzclose(bs->ks->f->f);
		kseq_destroy(bs->ks);
	} else if (bam_close(bs->fp) != 0) {
		// BAM input: a failed close is reported as a fatal error
		err_fatal_simple("Error closing bam file");
	}

	free(bs);
}
int main(int argc, char *argv[]) { bwaidx_t *idx; gzFile fp; kseq_t *ks; mem_opt_t *opt; if (argc < 3) { fprintf(stderr, "Usage: bwamem-lite <idx.base> <reads.fq>\n"); return 1; } idx = bwa_idx_load(argv[1], BWA_IDX_ALL); // load the BWA index if (NULL == idx) { fprintf(stderr, "Index load failed.\n"); exit(EXIT_FAILURE); } fp = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r"); if (NULL == fp) { fprintf(stderr, "Couldn't open %s : %s\n", strcmp(argv[2], "-") ? argv[2] : "stdin", errno ? strerror(errno) : "Out of memory"); exit(EXIT_FAILURE); } ks = kseq_init(fp); // initialize the FASTA/Q parser opt = mem_opt_init(); // initialize the BWA-MEM parameters to the default values while (kseq_read(ks) >= 0) { // read one sequence mem_alnreg_v ar; int i, k; ar = mem_align1(opt, idx->bwt, idx->bns, idx->pac, ks->seq.l, ks->seq.s); // get all the hits for (i = 0; i < ar.n; ++i) { // traverse each hit mem_aln_t a; if (ar.a[i].secondary >= 0) continue; // skip secondary alignments a = mem_reg2aln(opt, idx->bns, idx->pac, ks->seq.l, ks->seq.s, &ar.a[i]); // get forward-strand position and CIGAR // print alignment err_printf("%s\t%c\t%s\t%ld\t%d\t", ks->name.s, "+-"[a.is_rev], idx->bns->anns[a.rid].name, (long)a.pos, a.mapq); for (k = 0; k < a.n_cigar; ++k) // print CIGAR err_printf("%d%c", a.cigar[k]>>4, "MIDSH"[a.cigar[k]&0xf]); err_printf("\t%d\n", a.NM); // print edit distance free(a.cigar); // don't forget to deallocate CIGAR } free(ar.a); // and deallocate the hit list } free(opt); kseq_destroy(ks); err_gzclose(fp); bwa_idx_destroy(idx); return 0; }
// Close the input behind a sequence reader and release the reader itself.
// A null reader is accepted and ignored. When built with USE_HTSLIB the
// SAM/BAM handle is closed with sam_close() and the header is destroyed too;
// otherwise bam_close() is used.
void bwa_seq_close(bwa_seqio_t *bs)
{
	if (!bs) return;

	if (!bs->is_bam) {
		// Text input: close the gz stream held by the kseq parser, then the parser
		err_gzclose(bs->ks->f->f);
		kseq_destroy(bs->ks);
	} else {
#ifdef USE_HTSLIB
		if (sam_close(bs->fp) != 0)
			err_fatal_simple("Error closing sam/bam file");
		bam_hdr_destroy(bs->h);
#else
		if (bam_close(bs->fp) != 0)
			err_fatal_simple("Error closing bam file");
#endif
	}

	free(bs);
}
// Entry point of the "fa2pac" sub-command: hand a FASTA file to
// bns_fasta2bntseq().
//
// Options:
//   -f   sets the for_only flag passed on to bns_fasta2bntseq()
// Positional arguments:
//   <in.fasta>      input file (required)
//   <out.prefix>    output prefix (optional; the input path is used when absent)
//
// Returns 1 after printing the usage text when no input is given, else 0.
int bwa_fa2pac(int argc, char *argv[])
{
	int opt;
	int for_only = 0;
	const char *in_name, *out_prefix;
	gzFile fp;

	// Any option other than -f is ignored here
	while ((opt = getopt(argc, argv, "f")) >= 0) {
		if (opt == 'f')
			for_only = 1;
	}

	if (optind == argc) {
		fprintf(stderr, "Usage: bwa fa2pac [-f] <in.fasta> [<out.prefix>]\n");
		return 1;
	}

	in_name = argv[optind];
	// Fall back to the input name when no explicit prefix follows it
	if (optind + 1 < argc)
		out_prefix = argv[optind + 1];
	else
		out_prefix = in_name;

	fp = xzopen(in_name, "r");
	bns_fasta2bntseq(fp, out_prefix, for_only);
	err_gzclose(fp);
	return 0;
}
// Detects ORFs in the given nucleotide FASTA file and converts to a protein FASTA file int writeReadsProtein(const char * passPrefix, const char * passProName) { struct CDS * orfList; gzFile inputSeqPtr; FILE * outputPtr; char * outputBuffer; kseq_t * seq; unsigned long seqIdx, outputSize, orfCount, orfIdx; char testHeader[4096]; // Prepare file handles inputSeqPtr = xzopen(passPrefix, "r"); outputPtr = fopen(passProName, "w"); // Iterate through each read seqIdx = 0; seq = kseq_init(inputSeqPtr); while(kseq_read(seq) >= 0) { // Search for ORFs getSequenceORF(seq->seq.s, seq->seq.l, &orfList, &orfCount); //if (orfCount > 0) testReadHeader(seq->name.s, testHeader); // Write out the corresponding protein sequence for each ORF for (orfIdx = 0 ; orfIdx < orfCount ; orfIdx++) { convertToAA(seq->seq.s, orfList+orfIdx, &outputBuffer, &outputSize); fprintf(outputPtr, ">%d:%d\n%.*s\n", seqIdx, orfIdx, outputSize, outputBuffer); free(outputBuffer); } seqIdx++; } // Close files fflush(outputPtr); err_gzclose(inputSeqPtr); err_fclose(outputPtr); return 0; }