Beispiel #1
0
// Convert the given reference nucleotide FASTA and GFF file to a protein FASTA file.
//
// The first sequence record of passPrefix is read once; every CDS that
// getNextCDS() yields from passAnnName is then translated with convertToAA()
// and written to passProName as one FASTA record whose header is the value
// getNextCDS() left in currentLine.
//
// Returns 0 on success, 1 when the annotation or output file cannot be opened
// or when no sequence record can be read.
int writeIndexProtein(const char * passPrefix, const char * passProName, const char * passAnnName) {
	struct CDS currentCDS;
	gzFile inputSeqPtr;
	FILE * inputAnnPtr, * outputPtr;
	char * outputBuffer;
	kseq_t * seq;
	unsigned long outputSize, currentLine;
	int status = 1;

	// Prepare file handles
	// NOTE(review): the xzopen() result is not checked; presumably it aborts on failure - confirm.
	inputSeqPtr = xzopen(passPrefix, "r");
	inputAnnPtr = fopen(passAnnName, "r");
	if (inputAnnPtr == NULL) {
		perror(passAnnName);
		goto closeSequence;
	}
	outputPtr = fopen(passProName, "w");
	if (outputPtr == NULL) {
		perror(passProName);
		goto closeAnnotation;
	}

	// Read in 1st sequence data; every CDS below is translated against it
	seq = kseq_init(inputSeqPtr);
	if (kseq_read(seq) < 0) {
		fprintf(stderr, "[%s] unable to read a sequence from '%s'\n", __func__, passPrefix);
		goto closeOutput;
	}

	// Iterate through each CDS sequence in the annotation file
	currentLine = 0;
	while (getNextCDS(inputAnnPtr, &currentCDS, &currentLine)) {
		convertToAA(seq->seq.s, &currentCDS, &outputBuffer, &outputSize);
		// %lu matches unsigned long; the '*' precision must be an int
		fprintf(outputPtr, ">%lu\n%.*s\n", currentLine, (int) outputSize, outputBuffer);
		free(outputBuffer);
	}
	status = 0;

	// NOTE(review): seq is never released; klib pairs kseq_init() with
	// kseq_destroy() - consider calling it before closing the input.

	// Close files
closeOutput:
	fflush(outputPtr);
	err_fclose(outputPtr);
closeAnnotation:
	err_fclose(inputAnnPtr);
closeSequence:
	err_gzclose(inputSeqPtr);

	return status;
}
Beispiel #2
0
/*
 * Write the text metadata of bns to "<prefix>.ann" and "<prefix>.amb".
 *
 * .ann: header line "l_pac n_seqs seed", then two lines per sequence:
 *       "gi name[ anno]" and "offset len n_ambs".
 * .amb: header line "l_pac n_seqs n_holes", then one line per hole:
 *       "offset len amb".
 *
 * The file names are assembled in a fixed 1024-byte buffer; a prefix that
 * does not fit terminates the program instead of overflowing the buffer.
 */
void bns_dump(const bntseq_t *bns, const char *prefix)
{
	char str[1024];
	FILE *fp;
	int i, n;
	{ // dump .ann
		n = snprintf(str, sizeof(str), "%s.ann", prefix);
		if (n < 0 || (size_t)n >= sizeof(str)) {
			fprintf(stderr, "[%s] prefix is too long: %s\n", __func__, prefix);
			exit(EXIT_FAILURE);
		}
		fp = xopen(str, "w");
		err_fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->seed);
		for (i = 0; i != bns->n_seqs; ++i) {
			bntann1_t *p = bns->anns + i;
			err_fprintf(fp, "%d %s", p->gi, p->name);
			// the annotation is optional; the first line ends after the name when it is empty
			if (p->anno[0]) err_fprintf(fp, " %s\n", p->anno);
			else err_fprintf(fp, "\n");
			err_fprintf(fp, "%lld %d %d\n", (long long)p->offset, p->len, p->n_ambs);
		}
		err_fflush(fp);
		err_fclose(fp);
	}
	{ // dump .amb
		n = snprintf(str, sizeof(str), "%s.amb", prefix);
		if (n < 0 || (size_t)n >= sizeof(str)) {
			fprintf(stderr, "[%s] prefix is too long: %s\n", __func__, prefix);
			exit(EXIT_FAILURE);
		}
		fp = xopen(str, "w");
		err_fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->n_holes);
		for (i = 0; i != bns->n_holes; ++i) {
			bntamb1_t *p = bns->ambs + i;
			err_fprintf(fp, "%lld %d %c\n", (long long)p->offset, p->len, p->amb);
		}
		err_fflush(fp);
		err_fclose(fp);
	}
}
Beispiel #3
0
/*
 * Entry point: builds the "@PG" header line from the command line, stores it
 * in bwa_pg, dispatches argv[1] to the matching sub-command and, when that
 * sub-command returns 0, prints version, command line and timing to stderr.
 *
 * Returns the sub-command's result, usage()'s result when no sub-command was
 * given, or 1 for an unrecognized sub-command. The @PG string is released on
 * every exit path.
 */
int main(int argc, char *argv[])
{
	int i, ret;
	double t_real;
	kstring_t pg = {0,0,0};
	t_real = realtime();
	ksprintf(&pg, "@PG\tID:bwa\tPN:bwa\tVN:%s\tCL:%s", PACKAGE_VERSION, argv[0]);
	for (i = 1; i < argc; ++i) ksprintf(&pg, " %s", argv[i]);
	bwa_pg = pg.s;
	if (argc < 2) {
		ret = usage();
		goto cleanup;
	}
	if (strcmp(argv[1], "fa2pac") == 0) ret = bwa_fa2pac(argc-1, argv+1);
	else if (strcmp(argv[1], "pac2bwt") == 0) ret = bwa_pac2bwt(argc-1, argv+1);
	else if (strcmp(argv[1], "pac2bwtgen") == 0) ret = bwt_bwtgen_main(argc-1, argv+1);
	else if (strcmp(argv[1], "bwtupdate") == 0) ret = bwa_bwtupdate(argc-1, argv+1);
	else if (strcmp(argv[1], "bwt2sa") == 0) ret = bwa_bwt2sa(argc-1, argv+1);
	else if (strcmp(argv[1], "index") == 0) ret = bwa_index(argc-1, argv+1);
	else if (strcmp(argv[1], "aln") == 0) ret = bwa_aln(argc-1, argv+1);
	else if (strcmp(argv[1], "samse") == 0) ret = bwa_sai2sam_se(argc-1, argv+1);
	else if (strcmp(argv[1], "sampe") == 0) ret = bwa_sai2sam_pe(argc-1, argv+1);
	else if (strcmp(argv[1], "bwtsw2") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
	else if (strcmp(argv[1], "dbwtsw") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
	else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
	else if (strcmp(argv[1], "fastmap") == 0) ret = main_fastmap(argc-1, argv+1);
	else if (strcmp(argv[1], "mem") == 0) ret = main_mem(argc-1, argv+1);
	else if (strcmp(argv[1], "pemerge") == 0) ret = main_pemerge(argc-1, argv+1);
	else {
		fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);
		ret = 1;
		goto cleanup;
	}
	// stdout is flushed and closed here for every sub-command, except "mem"
	// when built with USE_HTSLIB
#ifdef USE_HTSLIB
	if (strcmp(argv[1], "mem") != 0) { 
		err_fflush(stdout);
		err_fclose(stdout);
	}
#else
	err_fflush(stdout);
	err_fclose(stdout);
#endif
	if (ret == 0) {
		fprintf(stderr, "[%s] Version: %s\n", __func__, PACKAGE_VERSION);
		fprintf(stderr, "[%s] CMD:", __func__);
		for (i = 0; i < argc; ++i)
			fprintf(stderr, " %s", argv[i]);
		fprintf(stderr, "\n[%s] Real time: %.3f sec; CPU: %.3f sec\n", __func__, realtime() - t_real, cputime());
	}
cleanup:
	free(bwa_pg);
	return ret;
}
Beispiel #4
0
/*
 * Entry point: builds the "@PG" header line from the command line, stores it
 * in bwa_pg, dispatches argv[1] to the matching sub-command and, when that
 * sub-command returns 0, prints version, command line and timing to stderr.
 *
 * Returns the sub-command's result, usage()'s result when no sub-command was
 * given, or 1 for an unrecognized sub-command. The @PG string is released on
 * every exit path.
 */
int main(int argc, char *argv[])
{
    extern char *bwa_pg;
    int i, ret;
    double t_real;
    kstring_t pg = {0,0,0};
    t_real = realtime();
    ksprintf(&pg, "@PG\tID:biscuit\tPN:biscuit\tVN:%s\tCL:%s", PACKAGE_VERSION, argv[0]);
    for (i = 1; i < argc; ++i) ksprintf(&pg, " %s", argv[i]);
    bwa_pg = pg.s;
    if (argc < 2) {
        ret = usage();
        goto cleanup;
    }
    if (strcmp(argv[1], "index") == 0) ret = main_biscuit_index(argc-1, argv+1);
    else if (strcmp(argv[1], "align") == 0) ret = main_align(argc-1, argv+1);
    else if (strcmp(argv[1], "pileup") == 0) ret = main_pileup(argc-1, argv+1);
    else if (strcmp(argv[1], "somatic") == 0) ret = main_somatic(argc-1, argv+1);
    else {
        fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);
        ret = 1;
        goto cleanup;
    }

    err_fflush(stdout);
    err_fclose(stdout);
    if (ret == 0) {
        fprintf(stderr, "[%s] Version: %s\n", __func__, PACKAGE_VERSION);
        fprintf(stderr, "[%s] CMD:", __func__);
        for (i = 0; i < argc; ++i)
            fprintf(stderr, " %s", argv[i]);
        fprintf(stderr, "\n[%s] Real time: %.3f sec; CPU: %.3f sec\n", __func__, realtime() - t_real, cputime());
    }

cleanup:
    free(bwa_pg);

    return ret;
}
Beispiel #5
0
/*
 * Pack every sequence of the FASTA stream fp_fa into "<prefix>.pac" and write
 * the accompanying metadata with bns_dump().
 *
 * fp_fa     open FASTA stream, consumed through kseq
 * prefix    output path prefix; must fit, with its suffix, into 1024 bytes
 * for_only  when zero, the reverse complement (3 - code, in reverse order) is
 *           appended after the forward sequence
 *
 * Returns the number of bases stored (bns->l_pac after the optional doubling).
 * An over-long prefix or a failed allocation terminates the program.
 */
int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only)
{
	kseq_t *seq;
	char name[1024];
	bntseq_t *bns;
	uint8_t *pac = 0;
	int32_t m_seqs, m_holes;
	int64_t ret = -1, m_pac, l;
	bntamb1_t *q;
	FILE *fp;
	int n;

	// assemble the output name with a bound so a long prefix cannot overflow it
	n = snprintf(name, sizeof(name), "%s.pac", prefix);
	if (n < 0 || (size_t)n >= sizeof(name)) {
		fprintf(stderr, "[%s] prefix is too long: %s\n", __func__, prefix);
		exit(EXIT_FAILURE);
	}
	// initialization
	seq = kseq_init(fp_fa);
	m_seqs = m_holes = 8; m_pac = 0x10000;
	bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
	pac = calloc(m_pac/4, 1);
	if (bns == 0 || pac == 0) {
		fprintf(stderr, "[%s] out of memory\n", __func__);
		exit(EXIT_FAILURE);
	}
	bns->seed = 11; // fixed seed for random generator
	srand48(bns->seed);
	bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t));
	bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t));
	if (bns->anns == 0 || bns->ambs == 0) {
		fprintf(stderr, "[%s] out of memory\n", __func__);
		exit(EXIT_FAILURE);
	}
	q = bns->ambs;
	fp = xopen(name, "wb");
	// read sequences
	while (kseq_read(seq) >= 0) pac = add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q);
	// add the reverse complemented sequence; nothing to append (and no
	// zero-sized realloc) when no base was read
	if (!for_only && bns->l_pac > 0) {
		uint8_t *grown;
		m_pac = (bns->l_pac * 2 + 3) / 4 * 4; // bases, rounded up to a whole byte (4 bases per byte)
		grown = realloc(pac, m_pac/4);
		if (grown == 0) { // pac is still valid here, but there is nothing sensible left to do
			fprintf(stderr, "[%s] out of memory\n", __func__);
			exit(EXIT_FAILURE);
		}
		pac = grown;
		// clear the bytes beyond the forward strand before they are filled base by base
		memset(pac + (bns->l_pac+3)/4, 0, (m_pac - (bns->l_pac+3)/4*4) / 4);
		for (l = bns->l_pac - 1; l >= 0; --l, ++bns->l_pac)
			_set_pac(pac, bns->l_pac, 3-_get_pac(pac, l));
	}
	ret = bns->l_pac;
	{ // finalize .pac file
		ubyte_t ct;
		err_fwrite(pac, 1, (bns->l_pac>>2) + ((bns->l_pac&3) == 0? 0 : 1), fp);
		// the following codes make the pac file size always (l_pac/4+1+1)
		if (bns->l_pac % 4 == 0) {
			ct = 0;
			err_fwrite(&ct, 1, 1, fp);
		}
		// the last byte records how many bases the final data byte holds
		ct = bns->l_pac % 4;
		err_fwrite(&ct, 1, 1, fp);
		// close .pac file
		err_fflush(fp);
		err_fclose(fp);
	}
	bns_dump(bns, prefix);
	bns_destroy(bns);
	kseq_destroy(seq);
	free(pac);
	return ret;
}
Beispiel #6
0
/*
 * Pack every sequence of the FASTA stream fp_fa, forward strand only, into
 * "<prefix>.bis.pac".
 *
 * No .ann/.amb metadata is written by this function.
 *
 * Returns the number of bases stored (bns->l_pac). An over-long prefix or a
 * failed allocation terminates the program.
 */
int64_t dump_forward_pac(gzFile fp_fa, const char *prefix)
{
	kseq_t *seq;
	char name[1024];
	bntseq_t *bns;
	uint8_t *pac = 0;
	int32_t m_seqs, m_holes;
	int64_t ret = -1, m_pac;
	bntamb1_t *q;
	FILE *fp;
	int n;

	// assemble the output name with a bound so a long prefix cannot overflow it
	n = snprintf(name, sizeof(name), "%s.bis.pac", prefix);
	if (n < 0 || (size_t)n >= sizeof(name)) {
		fprintf(stderr, "[%s] prefix is too long: %s\n", __func__, prefix);
		exit(EXIT_FAILURE);
	}
	// initialization
	seq = kseq_init(fp_fa);
	m_seqs = m_holes = 8; m_pac = 0x10000;
	bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
	pac = calloc(m_pac/4, 1);
	if (bns == 0 || pac == 0) {
		fprintf(stderr, "[%s] out of memory\n", __func__);
		exit(EXIT_FAILURE);
	}
	bns->seed = 11; // fixed seed for random generator
	srand48(bns->seed);
	bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t));
	bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t));
	if (bns->anns == 0 || bns->ambs == 0) {
		fprintf(stderr, "[%s] out of memory\n", __func__);
		exit(EXIT_FAILURE);
	}
	q = bns->ambs;
	fp = xopen(name, "wb");
	// read sequences
	while (kseq_read(seq) >= 0) pac = add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q);

	ret = bns->l_pac;
	{ // finalize .pac file
		ubyte_t ct;
		err_fwrite(pac, 1, (bns->l_pac>>2) + ((bns->l_pac&3) == 0? 0 : 1), fp);
		// the following codes make the pac file size always (l_pac/4+1+1)
		if (bns->l_pac % 4 == 0) {
			ct = 0;
			err_fwrite(&ct, 1, 1, fp);
		}
		// the last byte records how many bases the final data byte holds
		ct = bns->l_pac % 4;
		err_fwrite(&ct, 1, 1, fp);
		// close .pac file
		err_fflush(fp);
		err_fclose(fp);
	}

	bns_destroy(bns);
	kseq_destroy(seq);
	free(pac);
	return ret;
}
Beispiel #7
0
/*
 * Release a bntseq_t and everything it owns: the open .pac stream (if any),
 * the ambiguity array, each annotation's name and anno strings, the
 * annotation array and finally the structure itself.
 * A null pointer is accepted and ignored.
 */
void bns_destroy(bntseq_t *bns)
{
	int idx = 0;

	if (!bns)
		return;

	if (bns->fp_pac != 0)
		err_fclose(bns->fp_pac);
	free(bns->ambs);

	// per-sequence strings must go before the array that holds them
	while (idx < bns->n_seqs) {
		free(bns->anns[idx].name);
		free(bns->anns[idx].anno);
		idx++;
	}
	free(bns->anns);
	free(bns);
}
Beispiel #8
0
/*
 * Entry point: dispatches argv[1] to the matching sub-command and, when that
 * sub-command returns 0, prints version, command line and timing to stderr.
 *
 * Only "index" and "aln" are enabled in this build; the remaining bwa
 * sub-commands (fa2pac, pac2bwt, pac2bwtgen, bwtupdate, bwt2sa, samse, sampe,
 * sw, stdsw, pac2cspac, bwtsw2, dbwtsw, bwasw, fastmap) are disabled.
 *
 * Returns usage()'s result when no sub-command was given, 1 for an
 * unrecognized sub-command, otherwise the sub-command's own result so that a
 * failure is visible in the process exit status.
 */
int main(int argc, char *argv[])
{
	int i, ret;
	double t_real;
	t_real = realtime();
	if (argc < 2) return usage();

	if (strcmp(argv[1], "index") == 0) ret = bwa_index_main(argc-1, argv+1);
	else if (strcmp(argv[1], "aln") == 0) ret = bwa_aln(argc-1, argv+1);
	else {
		fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);
		return 1;
	}
	err_fflush(stdout);
	err_fclose(stdout);
	if (ret == 0) {
		fprintf(stderr, "[%s] Version: %s\n", __func__, PACKAGE_VERSION);
		fprintf(stderr, "[%s] CMD:", __func__);
		for (i = 0; i < argc; ++i)
			fprintf(stderr, " %s", argv[i]);
		fprintf(stderr, "\n[%s] Real time: %.3f sec; CPU: %.3f sec\n", __func__, realtime() - t_real, cputime());
	}
	// propagate the sub-command status instead of unconditionally reporting success
	return ret;
}
Beispiel #9
0
// Detects ORFs in the given nucleotide FASTA file and converts to a protein FASTA file.
//
// For every record read from passPrefix, getSequenceORF() supplies a list of
// ORFs; each one is translated with convertToAA() and written to passProName
// as a FASTA record with the header "<record index>:<ORF index>".
//
// Returns 0 on success, 1 when the output file cannot be opened.
int writeReadsProtein(const char * passPrefix, const char * passProName) {
	struct CDS * orfList;
	gzFile inputSeqPtr;
	FILE * outputPtr;
	char * outputBuffer;
	kseq_t * seq;
	unsigned long seqIdx, outputSize, orfCount, orfIdx;

	// Prepare file handles
	// NOTE(review): the xzopen() result is not checked; presumably it aborts on failure - confirm.
	inputSeqPtr = xzopen(passPrefix, "r");
	outputPtr = fopen(passProName, "w");
	if (outputPtr == NULL) {
		perror(passProName);
		err_gzclose(inputSeqPtr);
		return 1;
	}

	// Iterate through each read
	seqIdx = 0;
	seq = kseq_init(inputSeqPtr);

	while(kseq_read(seq) >= 0) {
		// Search for ORFs
		getSequenceORF(seq->seq.s, seq->seq.l, &orfList, &orfCount);
		// Write out the corresponding protein sequence for each ORF
		for (orfIdx = 0 ; orfIdx < orfCount ; orfIdx++) {
			convertToAA(seq->seq.s, orfList+orfIdx, &outputBuffer, &outputSize);
			// %lu matches unsigned long; the '*' precision must be an int
			fprintf(outputPtr, ">%lu:%lu\n%.*s\n", seqIdx, orfIdx, (int) outputSize, outputBuffer);
			free(outputBuffer);
		}
		// NOTE(review): orfList is not released here; if getSequenceORF()
		// allocates it for the caller, it leaks once per read - confirm ownership.

		seqIdx++;
	}

	// NOTE(review): seq is never released; klib pairs kseq_init() with
	// kseq_destroy() - consider calling it before closing the input.

	// Close files
	fflush(outputPtr);
	err_gzclose(inputSeqPtr);
	err_fclose(outputPtr);

	return 0;
}
Beispiel #10
0
/*
 * Rebuild a bntseq_t from its on-disk files.
 *
 * ann_filename  text file: header "l_pac n_seqs seed", then per sequence a
 *               "gi name[ anno]" line and an "offset len n_ambs" line
 * amb_filename  text file: header "l_pac n_seqs n_holes", then per hole an
 *               "offset len amb" line
 * pac_filename  packed sequence file; only opened here (bns->fp_pac), not read
 *
 * Returns the newly allocated structure. Truncated or malformed input is
 * reported through err_fatal(). Every string token is read with a field width
 * so an over-long token cannot overflow the scratch buffer.
 */
bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename)
{
	char str[8192]; // scratch buffer; the "%8191s" widths below must stay in sync with this size
	FILE *fp;
	const char *fname;
	bntseq_t *bns;
	long long xx;
	int i;
	int scanres;
	bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
	if (bns == NULL) err_fatal(__func__, "Out of memory\n");
	{ // read .ann
		fp = xopen(fname = ann_filename, "r");
		scanres = fscanf(fp, "%lld%d%u", &xx, &bns->n_seqs, &bns->seed);
		// a negative count is a malformed file, not an allocation size
		if (scanres != 3 || bns->n_seqs < 0) goto badread;
		bns->l_pac = xx;
		bns->anns = (bntann1_t*)calloc(bns->n_seqs, sizeof(bntann1_t));
		if (bns->anns == NULL && bns->n_seqs > 0) err_fatal(__func__, "Out of memory\n");
		for (i = 0; i < bns->n_seqs; ++i) {
			bntann1_t *p = bns->anns + i;
			char *q = str;
			int c = 0;
			// read gi and sequence name
			scanres = fscanf(fp, "%u%8191s", &p->gi, str);
			if (scanres != 2) goto badread;
			p->name = strdup(str);
			// read fasta comments: keep what fits in str, then discard the rest of the line
			while (q - str < sizeof(str) - 1 && (c = fgetc(fp)) != '\n' && c != EOF) *q++ = c;
			while (c != '\n' && c != EOF) c = fgetc(fp);
			if (c == EOF) {
				scanres = EOF;
				goto badread;
			}
			*q = 0;
			if (q - str > 1 && strcmp(str, " (null)") != 0) p->anno = strdup(str + 1); // skip leading space
			else p->anno = strdup("");
			// read the rest
			scanres = fscanf(fp, "%lld%d%d", &xx, &p->len, &p->n_ambs);
			if (scanres != 3) goto badread;
			p->offset = xx;
		}
		err_fclose(fp);
	}
	{ // read .amb
		int64_t l_pac;
		int32_t n_seqs;
		fp = xopen(fname = amb_filename, "r");
		scanres = fscanf(fp, "%lld%d%d", &xx, &n_seqs, &bns->n_holes);
		if (scanres != 3 || bns->n_holes < 0) goto badread;
		l_pac = xx;
		xassert(l_pac == bns->l_pac && n_seqs == bns->n_seqs, "inconsistent .ann and .amb files.");
		bns->ambs = bns->n_holes? (bntamb1_t*)calloc(bns->n_holes, sizeof(bntamb1_t)) : 0;
		if (bns->ambs == NULL && bns->n_holes > 0) err_fatal(__func__, "Out of memory\n");
		for (i = 0; i < bns->n_holes; ++i) {
			bntamb1_t *p = bns->ambs + i;
			scanres = fscanf(fp, "%lld%d%8191s", &xx, &p->len, str);
			if (scanres != 3) goto badread;
			p->offset = xx;
			p->amb = str[0]; // only the first character of the token is kept
		}
		err_fclose(fp);
	}
	{ // open .pac
		bns->fp_pac = xopen(pac_filename, "rb");
	}
	return bns;

 badread:
	if (EOF == scanres) {
		err_fatal(__func__, "Error reading %s : %s\n", fname, ferror(fp) ? strerror(errno) : "Unexpected end of file");
	}
	err_fatal(__func__, "Parse error reading %s\n", fname);
	// presumably err_fatal() does not return; this keeps every path returning a value regardless
	return NULL;
}
Beispiel #11
0
/*
 * Pack a converted copy of the FASTA stream fp_fa, forward strand followed by
 * its reverse complement, into "<prefix>.par.pac" (parent != 0) or
 * "<prefix>.dau.pac" (parent == 0).
 *
 * The stream is rewound, packed once with bis_add1(), and then copied into
 * the output buffer base by base. During that copy, on both strands, code 1
 * becomes 3 when parent is set, otherwise code 2 becomes 0.
 * NOTE(review): presumably the C->T / G->A bisulfite conversions under the
 * usual A/C/G/T = 0/1/2/3 packing - confirm.
 *
 * Metadata is written with bis_bns_dump() only when parent is set.
 *
 * Returns the number of bases written (twice the forward length). An
 * over-long prefix or a failed allocation terminates the program.
 */
int64_t bis_bns_fasta2bntseq(gzFile fp_fa, const char *prefix, uint8_t parent) {

  kseq_t *seq;
  char name[1024];
  bntseq_t *bns;
  uint8_t *pac = 0, *_pac = 0;
  int32_t m_seqs, m_holes;
  int64_t m_pac, l, k;
  bntamb1_t *q;
  FILE *fp;
  int n;

  // assemble the output name with a bound so a long prefix cannot overflow it
  n = snprintf(name, sizeof(name), "%s%s", prefix, parent ? ".par.pac" : ".dau.pac");
  if (n < 0 || (size_t)n >= sizeof(name)) {
    fprintf(stderr, "[%s] prefix is too long: %s\n", __func__, prefix);
    exit(EXIT_FAILURE);
  }

  // initialization
  gzseek(fp_fa, 0, SEEK_SET);
  seq = kseq_init(fp_fa);
  m_seqs = m_holes = 8; m_pac = 0x10000;
  bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
  _pac = calloc(m_pac/4, 1);
  if (bns == 0 || _pac == 0) {
    fprintf(stderr, "[%s] out of memory\n", __func__);
    exit(EXIT_FAILURE);
  }
  bns->seed = 11; // fixed seed for random generator
  srand48(bns->seed);
  bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t));
  bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t));
  if (bns->anns == 0 || bns->ambs == 0) {
    fprintf(stderr, "[%s] out of memory\n", __func__);
    exit(EXIT_FAILURE);
  }
  q = bns->ambs;
  fp = xopen(name, "wb");

  // read sequences
  while (kseq_read(seq) >= 0) {
    _pac = bis_add1(seq, bns, _pac, &m_pac, &m_seqs, &m_holes, &q);
  }

  // output buffer for both strands: bases rounded up to a whole byte (4 bases per byte)
  m_pac = (bns->l_pac*2+3)/4*4;
  pac = calloc(m_pac/4,sizeof(uint8_t));
  if (pac == 0 && m_pac > 0) { // this buffer scales with the input, so failure is realistic
    fprintf(stderr, "[%s] out of memory\n", __func__);
    exit(EXIT_FAILURE);
  }

  // forward strand
  for (l=0; l<bns->l_pac; ++l) {
    uint8_t c = _get_pac(_pac,l);
    if (parent && c == 1) c = 3;
    if (!parent && c == 2) c = 0;
    _set_pac(pac, l, c);
  }

  // reverse complement, appended after the forward strand; the conversion is
  // applied to the already complemented base
  for (k=bns->l_pac-1; k>=0; --k,++l) {
    uint8_t c = 3-_get_pac(_pac,k);
    if (parent && c == 1) c = 3;
    if (!parent && c == 2) c = 0;
    _set_pac(pac, l, c);
  }
  free(_pac);

  assert(bns->l_pac<<1 == l);
  { // finalize .pac file
    ubyte_t ct;
    err_fwrite(pac, 1, (l>>2) + ((l&3) == 0? 0 : 1), fp);
    // the following codes make the pac file size always (l_pac/4+1+1)
    if (l % 4 == 0) {
      ct = 0;
      err_fwrite(&ct, 1, 1, fp);
    }
    // the last byte records how many bases the final data byte holds
    ct = l % 4;
    err_fwrite(&ct, 1, 1, fp);
    // close .pac file
    err_fflush(fp);
    err_fclose(fp);
  }
  if (parent) bis_bns_dump(bns, prefix);
  bns_destroy(bns);
  kseq_destroy(seq);
  free(pac);
  return l;
}