Exemplo n.º 1
0
void
tmap_seqs_destroy(tmap_seqs_t *seqs)
{
  int32_t i;
  for(i=0;i<seqs->m;i++) {
      tmap_seq_destroy(seqs->seqs[i]);
  }
  free(seqs->seqs);
  free(seqs);
}
Exemplo n.º 2
0
tmap_map_sams_t *
tmap_map2_aux_core(tmap_map_opt_t *_opt,
                   tmap_seq_t *seqs[4],
                   tmap_refseq_t *refseq,
                   tmap_bwt_t *bwt,
                   tmap_sa_t *sa,
                   tmap_bwt_match_hash_t *hash,
                   tmap_rand_t *rand,
                   tmap_map2_global_mempool_t *pool)
{
  tmap_map_opt_t opt;
  tmap_seq_t *orig_seq = NULL;
  tmap_string_t *seq[2]={NULL, NULL};
  tmap_string_t *rseq[2]={NULL, NULL};
  tmap_map_sams_t *sams = NULL;
  tmap_map2_aln_t *b[2]={NULL,NULL};
  tmap_string_t *bases = NULL;
  int32_t i, k, l, num_n;

  opt = (*_opt);

  // sequence length
  bases = tmap_seq_get_bases(seqs[0]);
  l = bases->l;

  // update the local opt
  tmap_map2_aux_core_update_opt(&opt, _opt, l);

  // set opt->score_thr
  if(pool->max_l < l) { // then enlarge working space for tmap_sw_extend_core()
      int32_t tmp;
      if(0 == opt.pen_gape) {
          tmp = ((l + 1) / 2 * opt.score_match + opt.pen_gape) + l;
      }
      else {
          tmp = ((l + 1) / 2 * opt.score_match + opt.pen_gape) / opt.pen_gape + l;
      }
      pool->max_l = l;
      pool->aln_mem = tmap_realloc(pool->aln_mem, sizeof(uint8_t) * (tmp + 2) * 24, "pool->aln_mem");
  }

  // get the number of Ns
  for(i=num_n=0;i<l;i++) {
      uint8_t c = (uint8_t)tmap_nt_char_to_int[(int)bases->s[i]];
      if(c >= 4) num_n++; // FIXME: ambiguous bases are not properly handled
  }

  // will we always be lower than the score threshold
  if((l*opt.score_match) + (num_n*opt.pen_mm) < opt.score_thr) {
      return tmap_map_sams_init(NULL);
  }
  
  // save sequences
  seq[0] = tmap_seq_get_bases(seqs[0]); 
  seq[1] = tmap_seq_get_bases(seqs[1]);
  rseq[0] = tmap_seq_get_bases(seqs[2]); 
  rseq[1] = tmap_seq_get_bases(seqs[3]);

  // handle ambiguous bases
  if(0 < num_n) {
      // save original to de-randomize later
      orig_seq = tmap_seq_clone(seqs[0]);

      // randomize
      for(i=0;i<l;i++) {
          uint8_t c = (uint8_t)bases->s[i];
          if(c >= 4) {
              c = (int)(tmap_rand_get(rand) * 4); // FIXME: ambiguous bases are not properly handled
              seq[0]->s[i] = c; // original
              seq[1]->s[l-1-i] = 3 - c; // reverse compliment
              rseq[0]->s[l-1-i] = 3 - c; // reverse compliment
              rseq[1]->s[i] = c; // original
              //rseq[0]->s[l-1-i] = c; // reverse 
              //rseq[1]->s[i] = 3 - c; // compliment
          }
      }
  }

  // alignment
  b[0] = tmap_map2_aux_aln(&opt, refseq, bwt, sa, hash, seq, 0, pool);
  for(k = 0; k < b[0]->n; ++k) {
      if(b[0]->hits[k].n_seeds < opt.seeds_rev) break;
  } 
  if(k < b[0]->n) {
      b[1] = tmap_map2_aux_aln(&opt, refseq, bwt, sa, hash, rseq, 1, pool);
      for(i = 0; i < b[1]->n; ++i) {
          tmap_map2_hit_t *p = b[1]->hits + i;
          int x = p->beg;
          p->flag ^= 0x10, p->is_rev ^= 1; // flip the strand
          p->beg = l - p->end;
          p->end = l - x;
          if(p->l == 0) {
              if(refseq->len * 2 < (p->k + p->tlen)) p->k = 0;
              else p->k = 2 * refseq->len - (p->k + p->tlen);
          }
      }
      tmap_map2_aux_merge_hits(b, l, 0);
  } else b[1] = 0;
  
  // set the flag to forward/reverse
  tmap_map2_aux_flag_fr(b);
  
  // tlen may overestimated due to not counting insertions properly, bound it!
  for(i = 0; i < b[0]->n; ++i) {
      if(refseq->len * 2 <= b[0]->hits[i].k + b[0]->hits[i].tlen) {
          b[0]->hits[i].tlen = (refseq->len * 2) - b[0]->hits[i].k;
      }
      else if(b[0]->hits[i].k < refseq->len && refseq->len <= b[0]->hits[i].k + b[0]->hits[i].tlen) {
          b[0]->hits[i].tlen = refseq->len - b[0]->hits[i].k;
      }
  }
  
  // make one-based for pac2real
  for(i = 0; i < b[0]->n; ++i) {
      b[0]->hits[i].k++;
  }
  
  // store in SAM 
  sams = tmap_map2_aux_store_hits(refseq, &opt, b[0], l);

  // free
  tmap_map2_aln_destroy(b[0]);

  // revert ambiguous bases 
  if(0 < num_n) {
      // de-randomize
      bases = tmap_seq_get_bases(orig_seq);
      for(i=0;i<l;i++) {
          uint8_t c = (uint8_t)bases->s[i];
          if(c >= 4) { 
              // NB: always keep them at "4"
              seq[0]->s[i] = c; // original
              seq[1]->s[l-1-i] = c; // reverse compliment
              rseq[0]->s[l-1-i] = c; // reverse compliment
              rseq[1]->s[i] = c; // original
              //rseq[0]->s[l-1-i] = c; // reverse 
              //rseq[1]->s[i] = 3 - c; // compliment
          }
      }
      tmap_seq_destroy(orig_seq);
  }

  return sams;
}
Exemplo n.º 3
0
uint64_t
tmap_refseq_fasta2pac(const char *fn_fasta, int32_t compression)
{
  tmap_file_t *fp_pac = NULL, *fp_anno = NULL;
  tmap_seq_io_t *seqio = NULL;
  tmap_seq_t *seq = NULL;
  tmap_refseq_t *refseq = NULL;
  char *fn_pac = NULL, *fn_anno = NULL;
  uint8_t buffer[TMAP_REFSEQ_BUFFER_SIZE];
  int32_t i, j, l, buffer_length;
  uint32_t num_IUPAC_found= 0, amb_bases_mem = 0;
  uint8_t x = 0;
  uint64_t ref_len;

  tmap_progress_print("packing the reference FASTA");

  refseq = tmap_calloc(1, sizeof(tmap_refseq_t), "refseq");

  refseq->version_id = TMAP_VERSION_ID; 
  refseq->package_version = tmap_string_clone2(PACKAGE_VERSION);
  refseq->seq = buffer; // IMPORTANT: must nullify later
  refseq->annos = NULL;
  refseq->num_annos = 0;
  refseq->len = 0;
  refseq->is_rev = 0;
  refseq->is_shm = 0;
  memset(buffer, 0, TMAP_REFSEQ_BUFFER_SIZE);
  buffer_length = 0;

  // input files
  seqio = tmap_seq_io_init(fn_fasta, TMAP_SEQ_TYPE_FQ, 0, compression);
  seq = tmap_seq_init(TMAP_SEQ_TYPE_FQ);

  // output files
  fn_pac = tmap_get_file_name(fn_fasta, TMAP_PAC_FILE);
  fp_pac = tmap_file_fopen(fn_pac, "wb", TMAP_PAC_COMPRESSION);

  // read in sequences
  while(0 <= (l = tmap_seq_io_read(seqio, seq))) {
      tmap_anno_t *anno = NULL;
      tmap_progress_print2("packing contig [%s:1-%d]", seq->data.fq->name->s, l);

      refseq->num_annos++;
      refseq->annos = tmap_realloc(refseq->annos, sizeof(tmap_anno_t)*refseq->num_annos, "refseq->annos");
      anno = &refseq->annos[refseq->num_annos-1];
      
      anno->name = tmap_string_clone(seq->data.fq->name); 
      anno->len = l;
      anno->offset = (1 == refseq->num_annos) ? 0 : refseq->annos[refseq->num_annos-2].offset + refseq->annos[refseq->num_annos-2].len;
      anno->amb_positions_start = NULL;
      anno->amb_positions_end = NULL;
      anno->amb_bases = NULL;
      anno->num_amb = 0;
      amb_bases_mem = 0;

      // fill the buffer
      for(i=0;i<l;i++) {
          uint8_t c = tmap_nt_char_to_int[(int)seq->data.fq->seq->s[i]];
          // handle IUPAC codes 
          if(4 <= c) {
              int32_t k;
              // warn users about IUPAC codes
              if(0 == num_IUPAC_found) { 
                  tmap_error("IUPAC codes were found and will be converted to non-matching DNA bases", Warn, OutOfRange);
                  for(j=4;j<15;j++) {
                      c = tmap_iupac_char_to_bit_string[(int)tmap_iupac_int_to_char[j]];
                      // get the lexicographically smallest base not compatible with this code
                      for(k=0;k<4;k++) {
                          if(!(c & (0x1 << k))) {
                              break;
                          }
                      } 
                      tmap_progress_print2("IUPAC code %c will be converted to %c", tmap_iupac_int_to_char[j], "ACGTN"[k & 3]);
                  }
              }
              num_IUPAC_found++;
              
              // change it to a mismatched base than the IUPAC code
              c = tmap_iupac_char_to_bit_string[(int)seq->data.fq->seq->s[i]];

              // store IUPAC bases
              if(amb_bases_mem <= anno->num_amb) { // allocate more memory if necessary
                  amb_bases_mem = anno->num_amb + 1;
                  tmap_roundup32(amb_bases_mem);
                  anno->amb_positions_start = tmap_realloc(anno->amb_positions_start, sizeof(uint32_t) * amb_bases_mem, "anno->amb_positions_start");
                  anno->amb_positions_end = tmap_realloc(anno->amb_positions_end, sizeof(uint32_t) * amb_bases_mem, "anno->amb_positions_end");
                  anno->amb_bases = tmap_realloc(anno->amb_bases, sizeof(uint8_t) * amb_bases_mem, "anno->amb_bases");
              }
              // encode stretches of the same base
              if(0 < anno->num_amb
                 && anno->amb_positions_end[anno->num_amb-1] == i
                 && anno->amb_bases[anno->num_amb-1] == tmap_iupac_char_to_int[(int)seq->data.fq->seq->s[i]]) {
                 anno->amb_positions_end[anno->num_amb-1]++; // expand the range 
              }
              else {
                  // new ambiguous base and range
                  anno->num_amb++;
                  anno->amb_positions_start[anno->num_amb-1] = i+1; // one-based
                  anno->amb_positions_end[anno->num_amb-1] = i+1; // one-based
                  anno->amb_bases[anno->num_amb-1] = tmap_iupac_char_to_int[(int)seq->data.fq->seq->s[i]];
              }
              
              // get the lexicographically smallest base not compatible with
              // this code
              for(j=0;j<4;j++) {
                  if(!(c & (0x1 << j))) {
                      break;
                  }
              } 
              c = j & 3; // Note: Ns will go to As
          }
          if(3 < c) {
              tmap_error("bug encountered", Exit, OutOfRange);
          }
          if(buffer_length == (TMAP_REFSEQ_BUFFER_SIZE << 2)) { // 2-bit
              if(tmap_refseq_seq_memory(buffer_length) != tmap_file_fwrite(buffer, sizeof(uint8_t), tmap_refseq_seq_memory(buffer_length), fp_pac)) {
                  tmap_error(fn_pac, Exit, WriteFileError);
              }
              memset(buffer, 0, TMAP_REFSEQ_BUFFER_SIZE);
              buffer_length = 0;
          }
          tmap_refseq_seq_store_i(refseq, buffer_length, c);
          buffer_length++;
      }
      refseq->len += l;
      // re-size the amibiguous bases
      if(anno->num_amb < amb_bases_mem) {
          amb_bases_mem = anno->num_amb;
          anno->amb_positions_start = tmap_realloc(anno->amb_positions_start, sizeof(uint32_t) * amb_bases_mem, "anno->amb_positions_start");
          anno->amb_positions_end = tmap_realloc(anno->amb_positions_end, sizeof(uint32_t) * amb_bases_mem, "anno->amb_positions_end");
          anno->amb_bases = tmap_realloc(anno->amb_bases, sizeof(uint8_t) * amb_bases_mem, "anno->amb_bases");
      }
  }
  // write out the buffer
  if(tmap_refseq_seq_memory(buffer_length) != tmap_file_fwrite(buffer, sizeof(uint8_t), tmap_refseq_seq_memory(buffer_length), fp_pac)) {
      tmap_error(fn_pac, Exit, WriteFileError);
  }
  if(refseq->len % 4 == 0) { // add an extra byte if we completely filled all bits
      if(1 != tmap_file_fwrite(&x, sizeof(uint8_t), 1, fp_pac)) {
          tmap_error(fn_pac, Exit, WriteFileError);
      }
  }
  // store number of unused bits at the last byte
  x = refseq->len % 4;
  if(1 != tmap_file_fwrite(&x, sizeof(uint8_t), 1, fp_pac)) {
      tmap_error(fn_pac, Exit, WriteFileError);
  }
  refseq->seq = NULL; // IMPORTANT: nullify this
  ref_len = refseq->len; // save for return
      
  tmap_progress_print2("total genome length [%u]", refseq->len);
  if(0 < num_IUPAC_found) {
      if(1 == num_IUPAC_found) {
          tmap_progress_print("%u IUPAC base was found and converted to a DNA base", num_IUPAC_found);
      }
      else {
          tmap_progress_print("%u IUPAC bases were found and converted to DNA bases", num_IUPAC_found);
      }
  }

  // write annotation file
  fn_anno = tmap_get_file_name(fn_fasta, TMAP_ANNO_FILE);
  fp_anno = tmap_file_fopen(fn_anno, "wb", TMAP_ANNO_COMPRESSION);
  tmap_refseq_write_anno(fp_anno, refseq); 

  // close files
  tmap_file_fclose(fp_pac);
  tmap_file_fclose(fp_anno);

  // check sequence name uniqueness
  for(i=0;i<refseq->num_annos;i++) {
      for(j=i+1;j<refseq->num_annos;j++) {
          if(0 == strcmp(refseq->annos[i].name->s, refseq->annos[j].name->s)) {
              tmap_file_fprintf(tmap_file_stderr, "Contigs have the same name: #%d [%s] and #%d [%s]\n",
                                i+1, refseq->annos[i].name->s, 
                                j+1, refseq->annos[j].name->s); 
              tmap_error("Contig names must be unique", Exit, OutOfRange);
          }
      }
  }

  tmap_refseq_destroy(refseq); 
  tmap_seq_io_destroy(seqio);
  tmap_seq_destroy(seq);
  free(fn_pac);
  free(fn_anno);

  tmap_progress_print2("packed the reference FASTA");

  tmap_refseq_pac2revpac(fn_fasta);

  return ref_len;
}