Esempio n. 1
0
File: bntseq.c Progetto: a113n/bwa
/**
 * Read every record of a FASTA stream, pack the bases four per byte and write
 * the result to "<prefix>.pac"; the annotation tables are then handed to
 * bns_dump().
 *
 * @param fp_fa    Input stream passed to kseq_init().
 * @param prefix   Output path prefix; ".pac" is appended for the packed file
 *                 and the bare prefix is forwarded to bns_dump().
 * @param for_only When zero, the reverse complement of everything packed so
 *                 far is appended behind the forward strand.
 * @return The final bns->l_pac (forward plus reverse length when for_only is
 *         zero).
 *
 * The process is terminated with a message on stderr when the output path
 * does not fit the name buffer or when an allocation fails.
 */
int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only)
{
	kseq_t *seq;
	char name[1024];
	bntseq_t *bns;
	uint8_t *pac = 0;
	int32_t m_seqs, m_holes;
	int64_t ret, m_pac, l;
	bntamb1_t *q;
	FILE *fp;
	int n;

	// Bounded formatting: strcpy/strcat would overrun name[] for a long prefix.
	n = snprintf(name, sizeof name, "%s.pac", prefix);
	if (n < 0 || (size_t)n >= sizeof name) {
		fprintf(stderr, "[%s] output path for prefix '%s' is too long\n", __func__, prefix);
		exit(EXIT_FAILURE);
	}

	// initialization
	seq = kseq_init(fp_fa);
	bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
	if (bns == NULL) goto oom;
	bns->seed = 11; // fixed seed for random generator
	srand48(bns->seed);
	m_seqs = m_holes = 8; m_pac = 0x10000;
	bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t));
	bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t));
	pac = calloc(m_pac/4, 1);
	if (bns->anns == NULL || bns->ambs == NULL || pac == NULL) goto oom;
	q = bns->ambs;
	fp = xopen(name, "wb");

	// read sequences
	while (kseq_read(seq) >= 0)
		pac = add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q);

	// Add the reverse complemented sequence.  Skipped for an empty input so
	// that realloc() is never asked for zero bytes (which may free the buffer).
	if (!for_only && bns->l_pac > 0) {
		uint8_t *grown;

		m_pac = (bns->l_pac * 2 + 3) / 4 * 4; // entries, rounded up to whole bytes
		grown = realloc(pac, m_pac/4);
		if (grown == NULL) goto oom;
		pac = grown;
		// _set_pac needs the bytes behind the forward strand to start out zeroed
		memset(pac + (bns->l_pac+3)/4, 0, (m_pac - (bns->l_pac+3)/4*4) / 4);
		for (l = bns->l_pac - 1; l >= 0; --l, ++bns->l_pac)
			_set_pac(pac, bns->l_pac, 3-_get_pac(pac, l));
	}
	ret = bns->l_pac;
	{ // finalize .pac file
		ubyte_t ct;
		err_fwrite(pac, 1, (bns->l_pac>>2) + ((bns->l_pac&3) == 0? 0 : 1), fp);
		// the following codes make the pac file size always (l_pac/4+1+1)
		if (bns->l_pac % 4 == 0) {
			ct = 0;
			err_fwrite(&ct, 1, 1, fp);
		}
		ct = bns->l_pac % 4;
		err_fwrite(&ct, 1, 1, fp);
		// close .pac file
		err_fflush(fp);
		err_fclose(fp);
	}
	bns_dump(bns, prefix);
	bns_destroy(bns);
	kseq_destroy(seq);
	free(pac);
	return ret;

oom:
	fprintf(stderr, "[%s] out of memory\n", __func__);
	exit(EXIT_FAILURE);
}
Esempio n. 2
0
/*
 * Append one sequence record to the annotation tables of `bns` and to the
 * packed buffer `pac`.
 *
 * seq      record whose name, comment and sequence are read
 * bns      tables to extend: anns[], ambs[], n_seqs, n_holes and l_pac are updated
 * pac      packed buffer (four entries per byte); may be reallocated
 * m_pac    in/out capacity of `pac` in entries; doubled when l_pac reaches it
 * m_seqs   in/out capacity of bns->anns; doubled when full
 * m_holes  in/out capacity of bns->ambs; doubled when full
 * q        in/out pointer to the ambiguity record currently being extended
 *
 * Characters that nst_nt4_table maps to a code >= 4 are recorded as an
 * ambiguity run and stored in `pac` as a random code from lrand48().
 *
 * Returns the (possibly moved) packed buffer.  Terminates the process with a
 * message on stderr if an allocation fails.
 */
static uint8_t *bis_add1(const kseq_t *seq, bntseq_t *bns, uint8_t *pac, int64_t *m_pac, int *m_seqs, int *m_holes, bntamb1_t **q)
{
  bntann1_t *p;
  int lasts;
  uint32_t i;

  if (bns->n_seqs == *m_seqs) {
    bntann1_t *anns;
    *m_seqs <<= 1;
    anns = (bntann1_t*)realloc(bns->anns, *m_seqs * sizeof(bntann1_t));
    if (anns == NULL) goto oom;
    bns->anns = anns;
  }
  p = bns->anns + bns->n_seqs;
  p->name = strdup((char*)seq->name.s);
  p->anno = seq->comment.s? strdup((char*)seq->comment.s) : strdup("(null)");
  if (p->name == NULL || p->anno == NULL) goto oom;
  p->gi = 0; p->len = seq->seq.l;
  p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len;
  p->n_ambs = 0;

  for (i = lasts = 0; i < seq->seq.l; ++i) {
    // Index the table with an unsigned byte: a plain (possibly signed) char
    // would yield a negative index for input bytes >= 0x80.
    const unsigned char base = (unsigned char)seq->seq.s[i];
    int c = nst_nt4_table[base];

    if (c >= 4) { // N
      if (lasts == base) { // contiguous N
        ++(*q)->len;
      } else {
        if (bns->n_holes == *m_holes) {
          bntamb1_t *ambs;
          (*m_holes) <<= 1;
          ambs = (bntamb1_t*)realloc(bns->ambs, (*m_holes) * sizeof(bntamb1_t));
          if (ambs == NULL) goto oom;
          bns->ambs = ambs;
        }
        // *q is refreshed after every possible move of bns->ambs
        *q = bns->ambs + bns->n_holes;
        (*q)->len = 1;
        (*q)->offset = p->offset + i;
        (*q)->amb = base;
        ++p->n_ambs;
        ++bns->n_holes;
      }
    }
    lasts = base;
    { // fill buffer
      if (c >= 4) c = lrand48()&3;
      if (bns->l_pac == *m_pac) { // double the pac size
        uint8_t *grown;
        *m_pac <<= 1;
        grown = realloc(pac, *m_pac/4);
        if (grown == NULL) goto oom;
        pac = grown;
        // _set_pac needs the newly added half to start out zeroed
        memset(pac + bns->l_pac/4, 0, (*m_pac - bns->l_pac)/4);
      }
      _set_pac(pac, bns->l_pac, c);
      ++bns->l_pac;
    }
  }
  ++bns->n_seqs;
  return pac;

oom:
  fprintf(stderr, "[%s] out of memory\n", __func__);
  exit(EXIT_FAILURE);
}
Esempio n. 3
0
/* Apply the strand-specific code substitution used by bis_bns_fasta2bntseq:
 * code 1 becomes 3 when `parent` is non-zero, code 2 becomes 0 otherwise.
 * NOTE(review): presumably C->T / G->A bisulfite conversion - confirm. */
static uint8_t bis_convert_code(uint8_t c, uint8_t parent)
{
  if (parent && c == 1) return 3;
  if (!parent && c == 2) return 0;
  return c;
}

/**
 * Rewind the FASTA stream, pack all of its records, and write a converted
 * forward copy followed by a converted reverse-complement copy to
 * "<prefix>.par.pac" (parent != 0) or "<prefix>.dau.pac" (parent == 0).
 *
 * @param fp_fa  Input stream; it is seeked back to offset 0 before reading.
 * @param prefix Output path prefix.
 * @param parent Selects the output suffix and the substitution rule; the
 *               annotation tables are dumped through bis_bns_dump() only
 *               when it is non-zero.
 * @return Number of packed entries written, i.e. twice bns->l_pac.
 *
 * The process is terminated with a message on stderr when the output path
 * does not fit the name buffer or when an allocation fails.
 */
int64_t bis_bns_fasta2bntseq(gzFile fp_fa, const char *prefix, uint8_t parent) {
  kseq_t *seq;
  char name[1024];
  bntseq_t *bns;
  uint8_t *pac = 0, *_pac = 0;
  int32_t m_seqs, m_holes;
  int64_t m_pac, l, k;
  bntamb1_t *q;
  FILE *fp;
  int n;

  // Bounded formatting: strcpy/strcat would overrun name[] for a long prefix.
  n = snprintf(name, sizeof name, "%s%s", prefix, parent ? ".par.pac" : ".dau.pac");
  if (n < 0 || (size_t)n >= sizeof name) {
    fprintf(stderr, "[%s] output path for prefix '%s' is too long\n", __func__, prefix);
    exit(EXIT_FAILURE);
  }

  // initialization
  gzseek(fp_fa, 0, SEEK_SET);
  seq = kseq_init(fp_fa);
  bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
  if (bns == NULL) goto oom;
  bns->seed = 11; // fixed seed for random generator
  srand48(bns->seed);
  m_seqs = m_holes = 8; m_pac = 0x10000;
  bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t));
  bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t));
  _pac = calloc(m_pac/4, 1);
  if (bns->anns == NULL || bns->ambs == NULL || _pac == NULL) goto oom;
  q = bns->ambs;
  fp = xopen(name, "wb");

  // read sequences into the unconverted scratch buffer
  while (kseq_read(seq) >= 0) {
    _pac = bis_add1(seq, bns, _pac, &m_pac, &m_seqs, &m_holes, &q);
  }

  // Output buffer: room for both strands, in packed entries rounded up to a
  // whole byte (four entries per byte).
  m_pac = (bns->l_pac*2+3)/4*4;
  pac = calloc(m_pac/4, sizeof(uint8_t));
  if (pac == NULL && m_pac > 0) goto oom; // a zero-size calloc may return NULL

  // forward strand
  for (l = 0; l < bns->l_pac; ++l) {
    _set_pac(pac, l, bis_convert_code(_get_pac(_pac, l), parent));
  }
  // reverse complement; the substitution is applied after complementing
  for (k = bns->l_pac-1; k >= 0; --k, ++l) {
    _set_pac(pac, l, bis_convert_code(3-_get_pac(_pac, k), parent));
  }
  free(_pac);

  assert(bns->l_pac<<1 == l);
  { // finalize .pac file
    ubyte_t ct;
    err_fwrite(pac, 1, (l>>2) + ((l&3) == 0? 0 : 1), fp);
    // the following codes make the pac file size always (l/4+1+1)
    if (l % 4 == 0) {
      ct = 0;
      err_fwrite(&ct, 1, 1, fp);
    }
    ct = l % 4;
    err_fwrite(&ct, 1, 1, fp);
    // close .pac file
    err_fflush(fp);
    err_fclose(fp);
  }
  if (parent) bis_bns_dump(bns, prefix);
  bns_destroy(bns);
  kseq_destroy(seq);
  free(pac);
  return l;

oom:
  fprintf(stderr, "[%s] out of memory\n", __func__);
  exit(EXIT_FAILURE);
}
Esempio n. 4
0
int64_t R_bns_fasta2bntseq(gzFile fp_fa, const char *prefix)
{
//	extern void seq_reverse(int len, ubyte_t *seq, int is_comp); // in bwaseqio.c
	kseq_t *seq;
	char name[1024];
	bntseq_t *bns;
	uint8_t *pac = 0;
    uint8_t *reverse_pac = NULL;
	int32_t m_seqs, m_holes;
	int64_t ret = -1, m_pac, l, m_r_pac;
	bntamb1_t *q;
	FILE *fp, *fp_r;
    int i;
	// initialization
	seq = kseq_init(fp_fa);
	bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
	bns->seed = 11; // fixed seed for random generator
	srand48(bns->seed);
	m_seqs = m_holes = 8; m_pac = 0x10000;
	bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t));
	bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t));
	pac = calloc(m_pac/NT_PER_BYTE, 1);
	q = bns->ambs;
	strcpy(name, prefix); strcat(name, ".pac");
	fp = xopen(name, "wb");
    memset(name, '\0', 1024);strcpy(name, prefix); strcat(name, ".rpac");
    fp_r = xopen(name, "wb");
	// read sequences
	while (kseq_read(seq) >= 0) pac = R_add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q);
    
   // add the reverse complemented sequence





    ret = bns->l_pac;
    fprintf(stderr, "[R_bns_fasta2bntseq]:reverse_pac!\n");
    m_r_pac = (bns->l_pac +NT_PER_BYTE-1)/NT_PER_BYTE *NT_PER_BYTE;
    reverse_pac = calloc(m_r_pac/NT_PER_BYTE, sizeof(uint8_t));
    for(l = bns->l_pac-1, i =0; l>=0, i < bns->l_pac; --l, ++i)
    {
        _set_pac(reverse_pac, i, _get_pac(pac, l)); 
    } 
	ubyte_t ct;
    { // finalize .pac and  file
		fwrite(pac, 1, (bns->l_pac>>1) + ((bns->l_pac&1) == 0? 0 : 1), fp);
		// the following codes make the pac file size always (l_pac/4+1+1)
		if (bns->l_pac % NT_PER_BYTE == 0) {
			ct = 0;
			fwrite(&ct, 1, 1, fp);
		}
		ct = bns->l_pac % NT_PER_BYTE;
		fwrite(&ct, 1, 1, fp);
		// close .pac file
		fclose(fp);
	}	
    { // finalize .rpac and  file
		fwrite(reverse_pac, 1, (bns->l_pac>>1) + ((bns->l_pac&1) == 0? 0 : 1), fp_r);
		// the following codes make the pac file size always (l_pac/4+1+1)
		if (bns->l_pac % NT_PER_BYTE == 0) {
			ct = 0;
			fwrite(&ct, 1, 1, fp_r);
		}
		ct = bns->l_pac % NT_PER_BYTE;
		fwrite(&ct, 1, 1, fp_r);
		// close .rpac file
		fclose(fp_r);
	}
	bns_dump(bns, prefix);
	bns_destroy(bns);
	kseq_destroy(seq);
	free(pac);
    free(reverse_pac);
	return ret;
}