Beispiel #1
0
// Convert the given reference nucleotide FASTA and GFF file to a protein FASTA file.
//
// passPrefix  - path of the nucleotide sequence file (opened with xzopen)
// passProName - path of the protein FASTA file to write (created/truncated)
// passAnnName - path of the annotation file consumed by getNextCDS
//
// Only the first record of the sequence file is read; every CDS reported by
// getNextCDS is translated against that record and written as one FASTA entry
// whose header is the current annotation line counter.
//
// Returns 0 on success, 1 if a file cannot be opened or no sequence can be read.
int writeIndexProtein(const char * passPrefix, const char * passProName, const char * passAnnName) {
	struct CDS currentCDS;
	gzFile  inputSeqPtr;
	FILE * inputAnnPtr, * outputPtr;
	char * outputBuffer;
	kseq_t * seq;
	unsigned long outputSize, currentLine;

	// Prepare file handles
	// NOTE(review): xzopen's failure behaviour is not visible here; its result
	// is used unchecked, exactly as before.
	inputSeqPtr = xzopen(passPrefix, "r");

	inputAnnPtr = fopen(passAnnName, "r");
	if (inputAnnPtr == NULL) {
		fprintf(stderr, "[writeIndexProtein] unable to open annotation file '%s'\n", passAnnName);
		err_gzclose(inputSeqPtr);
		return 1;
	}

	outputPtr = fopen(passProName, "w");
	if (outputPtr == NULL) {
		fprintf(stderr, "[writeIndexProtein] unable to open output file '%s'\n", passProName);
		err_fclose(inputAnnPtr);
		err_gzclose(inputSeqPtr);
		return 1;
	}

	// Read in 1st sequence data; without a record there is nothing to translate
	// NOTE(review): seq is never released in this function (no kseq_destroy call);
	// confirm whether that is intentional.
	seq = kseq_init(inputSeqPtr);
	if (kseq_read(seq) < 0) {
		fprintf(stderr, "[writeIndexProtein] no sequence could be read from '%s'\n", passPrefix);
		err_gzclose(inputSeqPtr);
		err_fclose(inputAnnPtr);
		err_fclose(outputPtr);
		return 1;
	}

	// Iterate through each CDS sequence in the annotation file
	currentLine = 0;
	while (getNextCDS(inputAnnPtr, &currentCDS, &currentLine)) {
		convertToAA(seq->seq.s, &currentCDS, &outputBuffer, &outputSize);
		// currentLine is unsigned long (%lu); the '*' precision must be an int
		fprintf(outputPtr, ">%lu\n%.*s\n", currentLine, (int) outputSize, outputBuffer);
		free(outputBuffer);
	}

	// Close files
	fflush(outputPtr);
	err_gzclose(inputSeqPtr);
	err_fclose(inputAnnPtr);
	err_fclose(outputPtr);

	return 0;
}
Beispiel #2
0
static seqs_t *load_seqs(const char *fn)
{
	seqs_t *s;
	seq1_t *p;
	gzFile fp;
	int l;
	kseq_t *seq;

	fp = xzopen(fn, "r");
	seq = kseq_init(fp);
	s = (seqs_t*)calloc(1, sizeof(seqs_t));
	s->m_seqs = 256;
	s->seqs = (seq1_t*)calloc(s->m_seqs, sizeof(seq1_t));
	while ((l = kseq_read(seq)) >= 0) {
		if (s->n_seqs == s->m_seqs) {
			s->m_seqs <<= 1;
			s->seqs = (seq1_t*)realloc(s->seqs, s->m_seqs * sizeof(seq1_t));
		}
		p = s->seqs + (s->n_seqs++);
		p->l = seq->seq.l;
		p->s = (unsigned char*)malloc(p->l + 1);
		memcpy(p->s, seq->seq.s, p->l);
		p->s[p->l] = 0;
		p->n = strdup((const char*)seq->name.s);
	}
	kseq_destroy(seq);
	gzclose(fp);
	fprintf(stderr, "[load_seqs] %d sequences are loaded.\n", s->n_seqs);
	return s;
}
Beispiel #3
0
/*
 * Create a sequence reader for the file `fn`.
 *
 * A zero-initialised bwa_seqio_t is allocated, the file is opened for reading
 * with xzopen(), and the kseq parser built on that handle is stored in the
 * reader's `ks` field. The reader is returned to the caller.
 */
bwa_seqio_t *bwa_seq_open(const char *fn) {
	bwa_seqio_t *reader = (bwa_seqio_t*) calloc(1, sizeof(bwa_seqio_t));
	gzFile in = xzopen(fn, "r");

	reader->ks = kseq_init(in);
	return reader;
}
Beispiel #4
0
/*
 * Open `path` with xzopen() using `mode` and wrap the handle in a new FD_t.
 *
 * Returns NULL when xzopen() fails; otherwise the freshly created descriptor
 * has its top entry replaced by an (xzdio, handle, -1) entry and the result
 * of fdLink() is returned.
 */
static FD_t xzdOpen(const char * path, const char * mode)
{
    LZFILE *handle = xzopen(path, mode);
    FD_t fd;

    if (handle == NULL)
	return NULL;

    fd = fdNew(path);
    fdPop(fd);
    fdPush(fd, xzdio, handle, -1);
    return fdLink(fd);
}
Beispiel #5
0
/*
 * Command-line entry point for `bwa fa2pac`.
 *
 * argv[1] is the input FASTA; argv[2], when present, is the output prefix,
 * otherwise the input name doubles as the prefix. Prints the usage line and
 * returns 1 when no input is given, returns 0 after the conversion call.
 */
int bwa_fa2pac(int argc, char *argv[])
{
	char *out_prefix;
	gzFile in;

	if (argc < 2) {
		fprintf(stderr, "Usage: bwa fa2pac <in.fasta> [<out.prefix>]\n");
		return 1;
	}

	in = xzopen(argv[1], "r");
	if (argc < 3)
		out_prefix = argv[1];
	else
		out_prefix = argv[2];
	bns_fasta2bntseq(in, out_prefix);
	gzclose(in);
	return 0;
}
Beispiel #6
0
/*@-globuse@*/
/*
 * Open `path` with xzopen() using `fmode` and wrap the handle in a new FD_t.
 *
 * Returns NULL when xzopen() fails. Otherwise the descriptor's top entry is
 * replaced by an (xzdio, handle, -1) entry, and the path, the file number of
 * the handle's stream and the access mode are passed to fdSetOpen(). The
 * access mode is O_WRONLY when `fmode` starts with 'w' and O_RDONLY otherwise.
 */
static /*@null@*/ FD_t xzdOpen(const char * path, const char * fmode)
	/*@globals fileSystem @*/
	/*@modifies fileSystem @*/
{
    XZFILE * handle = xzopen(path, fmode);
    mode_t mode = O_RDONLY;
    FD_t fd;

    if (handle == NULL)
	return NULL;

    if (fmode && fmode[0] == 'w')
	mode = O_WRONLY;

    fd = fdNew("open (xzdOpen)");
    fdPop(fd);
    fdPush(fd, xzdio, handle, -1);
    fdSetOpen(fd, path, fileno(handle->fp), mode);
    return fdLink(fd, "xzdOpen");
}
Beispiel #7
0
/*
 * Command-line entry point for `bwa fa2pac`.
 *
 * The -f flag sets the for_only argument handed to bns_fasta2bntseq(). The
 * first non-option argument is the input FASTA; the next one, when present,
 * is the output prefix, otherwise the input name is used as the prefix.
 * Prints the usage line and returns 1 when no input is given, returns 0
 * after the conversion call.
 */
int bwa_fa2pac(int argc, char *argv[])
{
	int opt, for_only = 0;
	char *out_prefix;
	gzFile in;

	while ((opt = getopt(argc, argv, "f")) >= 0) {
		if (opt == 'f')
			for_only = 1;
	}

	if (optind == argc) {
		fprintf(stderr, "Usage: bwa fa2pac [-f] <in.fasta> [<out.prefix>]\n");
		return 1;
	}

	in = xzopen(argv[optind], "r");
	if (optind + 1 < argc)
		out_prefix = argv[optind + 1];
	else
		out_prefix = argv[optind];
	bns_fasta2bntseq(in, out_prefix, for_only);
	err_gzclose(in);
	return 0;
}
Beispiel #8
0
/*
 * Open `filepath` with `mode` and return a ts_file_t wrapping the handle.
 *
 * The backend is chosen from the path: gzopen() when ts_file_is_gz() accepts
 * it, xzopen() when ts_file_is_xz() accepts it, plain fopen() otherwise.
 *
 * Returns the new ts_file_t, or 0 after freeing any partially built object
 * when control reaches the `fail` label.
 * NOTE(review): TS_TRY is defined elsewhere; presumably it jumps to `fail`
 * when its argument evaluates to false -- confirm against the macro.
 */
ts_file_t *ts_fopen(const char *filepath, const char *mode)
{
  ts_file_t *qfile=0;

  /* both strings must be present and non-empty */
  TS_TRY(filepath); TS_TRY(*filepath); TS_TRY(mode); TS_TRY(*mode);

  TS_TRY( qfile=calloc(1, sizeof(ts_file_t)) );

  /* pick the backend from the file name */
  if( ts_file_is_gz(filepath) ) {
    qfile->type = TS_FILE_ZLB;
  } else if( ts_file_is_xz(filepath) ) {
    qfile->type = TS_FILE_XZ;
  }
  else {
    qfile->type = TS_FILE_STD;
  }

  switch(qfile->type){
  case TS_FILE_STD: TS_TRY( qfile->fp.std =  fopen(filepath, mode)); break;
  case TS_FILE_ZLB: TS_TRY( qfile->fp.zlb = gzopen(filepath, mode)); break;
  case TS_FILE_XZ:  TS_TRY( qfile->fp.xz =  xzopen(filepath, mode)); break;
  default:
    /* not reachable with the three types assigned above; kept as a guard */
    (void)ts_warn(stderr, "\n");
    goto fail;
  }
  
  return(qfile);
 fail:
  if(qfile) {
    free(qfile);
    qfile=0;
  }
  return(0);
}
Beispiel #9
0
// Detects ORFs in the given nucleotide FASTA file and converts to a protein FASTA file.
//
// passPrefix  - path of the nucleotide sequence file (opened with xzopen)
// passProName - path of the protein FASTA file to write (created/truncated)
//
// Every record of the input is scanned with getSequenceORF; each ORF found is
// translated with convertToAA and written as one FASTA entry whose header is
// "<record index>:<ORF index>", both counted from 0.
//
// Returns 0 on success, 1 if the output file cannot be opened.
int writeReadsProtein(const char * passPrefix, const char * passProName) {
	struct CDS * orfList;
	gzFile inputSeqPtr;
	FILE * outputPtr;
	char * outputBuffer;
	kseq_t * seq;
	unsigned long seqIdx, outputSize, orfCount, orfIdx;

	// Prepare file handles
	// NOTE(review): xzopen's failure behaviour is not visible here; its result
	// is used unchecked, exactly as before.
	inputSeqPtr = xzopen(passPrefix, "r");

	outputPtr = fopen(passProName, "w");
	if (outputPtr == NULL) {
		fprintf(stderr, "[writeReadsProtein] unable to open output file '%s'\n", passProName);
		err_gzclose(inputSeqPtr);
		return 1;
	}

	// Iterate through each read
	// NOTE(review): seq is never released in this function (no kseq_destroy call);
	// confirm whether that is intentional.
	seqIdx = 0;
	seq = kseq_init(inputSeqPtr);

	while(kseq_read(seq) >= 0) {
		// Search for ORFs
		// NOTE(review): orfList is filled in by getSequenceORF and never released
		// here; if it is heap-allocated per call this leaks -- confirm ownership.
		getSequenceORF(seq->seq.s, seq->seq.l, &orfList, &orfCount);

		// Write out the corresponding protein sequence for each ORF
		for (orfIdx = 0 ; orfIdx < orfCount ; orfIdx++) {
			convertToAA(seq->seq.s, orfList+orfIdx, &outputBuffer, &outputSize);
			// indices are unsigned long (%lu); the '*' precision must be an int
			fprintf(outputPtr, ">%lu:%lu\n%.*s\n", seqIdx, orfIdx, (int) outputSize, outputBuffer);
			free(outputBuffer);
		}

		seqIdx++;
	}

	// Close files
	fflush(outputPtr);
	err_gzclose(inputSeqPtr);
	err_fclose(outputPtr);

	return 0;
}
Beispiel #10
0
/*
 * Command-line entry point for `bwa index`.
 *
 * Options (parsed with getopt):
 *   -a div|bwtsw|is  BWT construction algorithm (any other value is fatal)
 *   -p STR           output prefix; defaults to the FASTA name
 *   -c               color-space indexing
 *   -6               append ".64" to the default prefix
 *
 * Steps, in the order they run:
 *   1. pack the input FASTA via bns_fasta2bntseq() (in color mode it is packed
 *      under "<prefix>.nt" and then passed through bwa_pac2cspac());
 *   2. build "<prefix>.bwt" from "<prefix>.pac"; when -a is not given the
 *      algorithm is chosen from the packed length: bwtsw above 50000000,
 *      otherwise is;
 *   3. reload "<prefix>.bwt", run bwt_bwtupdate_core() and write it back;
 *   4. pack the FASTA again with the forward-only flag set;
 *   5. compute the suffix array from "<prefix>.bwt" and dump it to
 *      "<prefix>.sa".
 *
 * Returns 0 when all steps have run, 1 on an unknown option or when no input
 * file is given (the usage text is printed in the latter case).
 */
int bwa_index(int argc, char *argv[])
{
	char *prefix = 0, *str, *str2, *str3;
	int c, algo_type = 0, is_color = 0, is_64 = 0;
	clock_t t;
	int64_t l_pac;

	while ((c = getopt(argc, argv, "6ca:p:")) >= 0) {
		switch (c) {
		case 'a': // if -a is not set, algo_type will be determined later
			if (strcmp(optarg, "div") == 0) algo_type = 1;
			else if (strcmp(optarg, "bwtsw") == 0) algo_type = 2;
			else if (strcmp(optarg, "is") == 0) algo_type = 3;
			else err_fatal(__func__, "unknown algorithm: '%s'.", optarg);
			break;
		case 'p':
			free(prefix); // a repeated -p must not leak the earlier copy
			prefix = strdup(optarg);
			break;
		case 'c': is_color = 1; break;
		case '6': is_64 = 1; break;
		default:
			free(prefix); // release a prefix set by an earlier -p
			return 1;
		}
	}

	if (optind + 1 > argc) {
		fprintf(stderr, "\n");
		fprintf(stderr, "Usage:   bwa index [-a bwtsw|is] [-c] <in.fasta>\n\n");
		fprintf(stderr, "Options: -a STR    BWT construction algorithm: bwtsw or is [auto]\n");
		fprintf(stderr, "         -p STR    prefix of the index [same as fasta name]\n");
		fprintf(stderr, "         -6        index files named as <in.fasta>.64.* instead of <in.fasta>.* \n");
//		fprintf(stderr, "         -c        build color-space index\n");
		fprintf(stderr, "\n");
		fprintf(stderr,	"Warning: `-a bwtsw' does not work for short genomes, while `-a is' and\n");
		fprintf(stderr, "         `-a div' do not work not for long genomes. Please choose `-a'\n");
		fprintf(stderr, "         according to the length of the genome.\n\n");
		free(prefix); // release a prefix set by -p before bailing out
		return 1;
	}
	if (prefix == 0) {
		// room for the FASTA name, an optional ".64" and the terminator
		prefix = malloc(strlen(argv[optind]) + 4);
		strcpy(prefix, argv[optind]);
		if (is_64) strcat(prefix, ".64");
	}
	// scratch buffers for "<prefix><suffix>"; the longest suffix used is 4 characters
	str  = (char*)calloc(strlen(prefix) + 10, 1);
	str2 = (char*)calloc(strlen(prefix) + 10, 1);
	str3 = (char*)calloc(strlen(prefix) + 10, 1);

	if (is_color == 0) { // nucleotide indexing
		gzFile fp = xzopen(argv[optind], "r");
		t = clock();
		fprintf(stderr, "[bwa_index] Pack FASTA... ");
		l_pac = bns_fasta2bntseq(fp, prefix, 0);
		fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
		gzclose(fp);
	} else { // color indexing
		gzFile fp = xzopen(argv[optind], "r");
		strcat(strcpy(str, prefix), ".nt");
		t = clock();
		fprintf(stderr, "[bwa_index] Pack nucleotide FASTA... ");
		l_pac = bns_fasta2bntseq(fp, str, 0);
		fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
		gzclose(fp);
		{
			// invoke the pac2cspac sub-command in-process: <prog> <prefix>.nt <prefix>
			char *tmp_argv[3];
			tmp_argv[0] = argv[0]; tmp_argv[1] = str; tmp_argv[2] = prefix;
			t = clock();
			fprintf(stderr, "[bwa_index] Convert nucleotide PAC to color PAC... ");
			bwa_pac2cspac(3, tmp_argv);
			fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
		}
	}
	if (algo_type == 0) algo_type = l_pac > 50000000? 2 : 3; // set the algorithm for generating BWT
	{
		strcpy(str, prefix); strcat(str, ".pac");
		strcpy(str2, prefix); strcat(str2, ".bwt");
		t = clock();
		fprintf(stderr, "[bwa_index] Construct BWT for the packed sequence...\n");
		if (algo_type == 2) bwt_bwtgen(str, str2);
		else if (algo_type == 1 || algo_type == 3) {
			bwt_t *bwt;
			bwt = bwt_pac2bwt(str, algo_type == 3);
			bwt_dump_bwt(str2, bwt);
			bwt_destroy(bwt);
		}
		fprintf(stderr, "[bwa_index] %.2f seconds elapse.\n", (float)(clock() - t) / CLOCKS_PER_SEC);
	}
	{
		// reload the BWT just written, update it and write it back in place
		bwt_t *bwt;
		strcpy(str, prefix); strcat(str, ".bwt");
		t = clock();
		fprintf(stderr, "[bwa_index] Update BWT... ");
		bwt = bwt_restore_bwt(str);
		bwt_bwtupdate_core(bwt);
		bwt_dump_bwt(str, bwt);
		bwt_destroy(bwt);
		fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
	}
	{
		// second pass over the FASTA, this time with the forward-only flag set
		gzFile fp = xzopen(argv[optind], "r");
		t = clock();
		fprintf(stderr, "[bwa_index] Pack forward-only FASTA... ");
		l_pac = bns_fasta2bntseq(fp, prefix, 1);
		fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
		gzclose(fp);
	}
	{
		bwt_t *bwt;
		strcpy(str, prefix); strcat(str, ".bwt");
		strcpy(str3, prefix); strcat(str3, ".sa");
		t = clock();
		fprintf(stderr, "[bwa_index] Construct SA from BWT and Occ... ");
		bwt = bwt_restore_bwt(str);
		bwt_cal_sa(bwt, 32);
		bwt_dump_sa(str3, bwt);
		bwt_destroy(bwt);
		fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
	}
	free(str3); free(str2); free(str); free(prefix);
	return 0;
}