Exemple #1
0
// Convert the given reference nucleotide FASTA and GFF file to a protein FASTA file.
//
// passPrefix  - path of the nucleotide sequence file (opened through xzopen)
// passProName - path of the protein FASTA file to create or overwrite
// passAnnName - path of the annotation file scanned with getNextCDS
//
// Only the first record of the sequence file is read; every CDS reported by
// getNextCDS is translated against that single record. Each CDS becomes one
// output record whose header is the line counter maintained by getNextCDS.
//
// Returns 0 on success, 1 if a file could not be opened or no sequence was read.
int writeIndexProtein(const char * passPrefix, const char * passProName, const char * passAnnName) {
	struct CDS currentCDS;
	gzFile inputSeqPtr;
	FILE * inputAnnPtr = NULL;
	FILE * outputPtr = NULL;
	char * outputBuffer;
	kseq_t * seq = NULL;
	unsigned long outputSize, currentLine;
	int result = 1;

	// Prepare file handles
	// NOTE(review): xzopen is assumed to report its own failures - confirm
	inputSeqPtr = xzopen(passPrefix, "r");

	inputAnnPtr = fopen(passAnnName, "r");
	if (inputAnnPtr == NULL) {
		perror(passAnnName);
		goto cleanup;
	}

	outputPtr = fopen(passProName, "w");
	if (outputPtr == NULL) {
		perror(passProName);
		goto cleanup;
	}

	// Read in 1st sequence data; without it there is nothing to translate
	seq = kseq_init(inputSeqPtr);
	if (kseq_read(seq) < 0) {
		fprintf(stderr, "No sequence could be read from %s\n", passPrefix);
		goto cleanup;
	}

	// Iterate through each CDS sequence in the annotation file
	currentLine = 0;
	while (getNextCDS(inputAnnPtr, &currentCDS, &currentLine)) {
		convertToAA(seq->seq.s, &currentCDS, &outputBuffer, &outputSize);
		// %lu matches unsigned long; the precision of %.*s must be an int
		fprintf(outputPtr, ">%lu\n%.*s\n", currentLine, (int) outputSize, outputBuffer);
		free(outputBuffer);
	}

	result = 0;

cleanup:
	// Release the parser before closing the stream it reads from
	if (seq != NULL) kseq_destroy(seq);
	err_gzclose(inputSeqPtr);
	if (inputAnnPtr != NULL) err_fclose(inputAnnPtr);
	if (outputPtr != NULL) err_fclose(outputPtr);

	return result;
}
Exemple #2
0
/* Close the input behind a sequence reader and free the reader itself.
 * A null reader is accepted and ignored. */
void bwa_seq_close(bwa_seqio_t *bs)
{
	if (!bs)
		return;

	if (!bs->is_bam) {
		/* Close the gz stream first, then release the parser that wrapped it. */
		err_gzclose(bs->ks->f->f);
		kseq_destroy(bs->ks);
	} else if (bam_close(bs->fp) != 0) {
		err_fatal_simple("Error closing bam file");
	}

	free(bs);
}
Exemple #3
0
/* bwamem-lite: align every read of <reads.fq> ("-" selects stdin) against the
 * index <idx.base> and print one line per non-secondary hit:
 * read name, strand, reference name, position, mapping quality, CIGAR, NM. */
int main(int argc, char *argv[])
{
	bwaidx_t *index;
	gzFile reads;
	kseq_t *record;
	mem_opt_t *params;
	int fromStdin;

	if (argc < 3) {
		fprintf(stderr, "Usage: bwamem-lite <idx.base> <reads.fq>\n");
		return 1;
	}

	/* Load the BWA index. */
	index = bwa_idx_load(argv[1], BWA_IDX_ALL);
	if (index == NULL) {
		fprintf(stderr, "Index load failed.\n");
		exit(EXIT_FAILURE);
	}

	/* Open the reads, either from the named file or from standard input. */
	fromStdin = (strcmp(argv[2], "-") == 0);
	if (fromStdin)
		reads = gzdopen(fileno(stdin), "r");
	else
		reads = gzopen(argv[2], "r");
	if (reads == NULL) {
		const char *reason = errno ? strerror(errno) : "Out of memory";
		fprintf(stderr, "Couldn't open %s : %s\n", fromStdin ? "stdin" : argv[2], reason);
		exit(EXIT_FAILURE);
	}

	record = kseq_init(reads);  /* FASTA/Q parser */
	params = mem_opt_init();    /* default BWA-MEM parameters */

	/* One iteration per sequence read from the input. */
	while (kseq_read(record) >= 0) {
		mem_alnreg_v hits = mem_align1(params, index->bwt, index->bns, index->pac, record->seq.l, record->seq.s);
		int hitIdx;

		for (hitIdx = 0; hitIdx < hits.n; ++hitIdx) {
			/* Only hits that are not flagged as secondary are reported. */
			if (hits.a[hitIdx].secondary < 0) {
				int opIdx;
				/* Forward-strand position and CIGAR for this hit. */
				mem_aln_t aln = mem_reg2aln(params, index->bns, index->pac, record->seq.l, record->seq.s, &hits.a[hitIdx]);

				err_printf("%s\t%c\t%s\t%ld\t%d\t", record->name.s, "+-"[aln.is_rev], index->bns->anns[aln.rid].name, (long)aln.pos, aln.mapq);
				/* Each CIGAR word holds the length above bit 4 and the op code in the low 4 bits. */
				for (opIdx = 0; opIdx < aln.n_cigar; ++opIdx) {
					err_printf("%d%c", aln.cigar[opIdx]>>4, "MIDSH"[aln.cigar[opIdx]&0xf]);
				}
				err_printf("\t%d\n", aln.NM);
				free(aln.cigar);  /* the CIGAR array is released by the caller */
			}
		}
		free(hits.a);  /* release the hit list for this read */
	}

	free(params);
	kseq_destroy(record);
	err_gzclose(reads);
	bwa_idx_destroy(index);
	return 0;
}
Exemple #4
0
/* Close the input behind a sequence reader and free the reader itself.
 * A null reader is accepted and ignored. When built with USE_HTSLIB the
 * BAM path closes through sam_close and also destroys the stored header. */
void bwa_seq_close(bwa_seqio_t *bs)
{
	if (!bs)
		return;

	if (!bs->is_bam) {
		/* Close the gz stream first, then release the parser that wrapped it. */
		err_gzclose(bs->ks->f->f);
		kseq_destroy(bs->ks);
		free(bs);
		return;
	}

#ifdef USE_HTSLIB
	if (sam_close(bs->fp) != 0)
		err_fatal_simple("Error closing sam/bam file");
	bam_hdr_destroy(bs->h);
#else
	if (bam_close(bs->fp) != 0)
		err_fatal_simple("Error closing bam file");
#endif
	free(bs);
}
Exemple #5
0
/* Entry point of "bwa fa2pac": pass the FASTA file named on the command line
 * to bns_fasta2bntseq. The output prefix defaults to the input path when no
 * second positional argument is given; -f sets the for_only flag.
 * Returns 1 after printing usage when no input is named, 0 otherwise. */
int bwa_fa2pac(int argc, char *argv[])
{
	int opt;
	int forwardOnly = 0;
	char *prefix;
	gzFile fasta;

	/* Any option other than -f leaves the flag untouched. */
	while ((opt = getopt(argc, argv, "f")) >= 0) {
		if (opt == 'f')
			forwardOnly = 1;
	}

	if (optind == argc) {
		fprintf(stderr, "Usage: bwa fa2pac [-f] <in.fasta> [<out.prefix>]\n");
		return 1;
	}

	if (optind + 1 < argc)
		prefix = argv[optind + 1];
	else
		prefix = argv[optind];

	fasta = xzopen(argv[optind], "r");
	bns_fasta2bntseq(fasta, prefix, forwardOnly);
	err_gzclose(fasta);
	return 0;
}
Exemple #6
0
// Detects ORFs in the given nucleotide FASTA file and converts to a protein FASTA file.
//
// passPrefix  - path of the nucleotide sequence file (opened through xzopen)
// passProName - path of the protein FASTA file to create or overwrite
//
// Every ORF reported by getSequenceORF is written as one record whose header
// is ">S:O", where S is the zero-based index of the read within the input and
// O is the zero-based index of the ORF within that read.
//
// Returns 0 on success, 1 if the output file could not be opened.
int writeReadsProtein(const char * passPrefix, const char * passProName) {
	struct CDS * orfList;
	gzFile inputSeqPtr;
	FILE * outputPtr;
	char * outputBuffer;
	kseq_t * seq;
	unsigned long seqIdx, outputSize, orfCount, orfIdx;

	// Prepare file handles
	// NOTE(review): xzopen is assumed to report its own failures - confirm
	inputSeqPtr = xzopen(passPrefix, "r");

	outputPtr = fopen(passProName, "w");
	if (outputPtr == NULL) {
		perror(passProName);
		err_gzclose(inputSeqPtr);
		return 1;
	}

	// Iterate through each read
	seqIdx = 0;
	seq = kseq_init(inputSeqPtr);

	while (kseq_read(seq) >= 0) {
		// Search for ORFs
		// NOTE(review): orfList is filled in by getSequenceORF and never released
		// here; if that helper allocates it, it should be freed once per read -
		// confirm ownership before adding the free.
		getSequenceORF(seq->seq.s, seq->seq.l, &orfList, &orfCount);

		// Write out the corresponding protein sequence for each ORF
		for (orfIdx = 0 ; orfIdx < orfCount ; orfIdx++) {
			convertToAA(seq->seq.s, orfList + orfIdx, &outputBuffer, &outputSize);
			// %lu matches unsigned long; the precision of %.*s must be an int
			fprintf(outputPtr, ">%lu:%lu\n%.*s\n", seqIdx, orfIdx, (int) outputSize, outputBuffer);
			free(outputBuffer);
		}

		seqIdx++;
	}

	// Release the parser before closing the stream it reads from
	kseq_destroy(seq);
	err_gzclose(inputSeqPtr);
	err_fclose(outputPtr);

	return 0;
}