Пример #1
0
// Reads one annotation record from "fp" into "anno": the name (stored on
// disk with its length, null-terminator included), the length, the offset,
// and the arrays describing the ambiguous bases.
// Any short read is reported through tmap_error with ReadFileError.
static inline void
tmap_refseq_read_annos(tmap_file_t *fp, tmap_anno_t *anno) 
{
  uint32_t len = 0; // includes the null-terminator
  
  // a zero length cannot include the terminator and would make "len-1"
  // below wrap around, so it is treated like any other unreadable record
  if(1 != tmap_file_fread(&len, sizeof(uint32_t), 1, fp) || 0 == len) {
      tmap_error(NULL, Exit, ReadFileError);
  }

  anno->name = tmap_string_init(len);

  if(len != tmap_file_fread(anno->name->s, sizeof(char), len, fp)
     || 1 != tmap_file_fread(&anno->len, sizeof(uint64_t), 1, fp)
     || 1 != tmap_file_fread(&anno->offset, sizeof(uint64_t), 1, fp)
     || 1 != tmap_file_fread(&anno->num_amb, sizeof(uint32_t), 1, fp)) {
      tmap_error(NULL, Exit, ReadFileError);
  }
  if(0 < anno->num_amb) {
      // three parallel arrays with one element per ambiguous stretch
      anno->amb_positions_start = tmap_malloc(sizeof(uint32_t) * anno->num_amb, "anno->amb_positions_start");
      anno->amb_positions_end = tmap_malloc(sizeof(uint32_t) * anno->num_amb, "anno->amb_positions_end");
      anno->amb_bases = tmap_malloc(sizeof(uint8_t) * anno->num_amb, "anno->amb_bases");
      if(anno->num_amb != tmap_file_fread(anno->amb_positions_start, sizeof(uint32_t), anno->num_amb, fp)
         || anno->num_amb != tmap_file_fread(anno->amb_positions_end, sizeof(uint32_t), anno->num_amb, fp)
         || anno->num_amb != tmap_file_fread(anno->amb_bases, sizeof(uint8_t), anno->num_amb, fp)) {
          // this is a failed read, so report it as such
          tmap_error(NULL, Exit, ReadFileError);
      }
  }
  else {
      anno->amb_positions_start = NULL;
      anno->amb_positions_end = NULL;
      anno->amb_bases = NULL;
  }
  // set name length (excludes the null-terminator)
  anno->name->l = len-1;
}
Пример #2
0
// Copies the bases of contig "seqid" (one-based) between "start" and "end"
// (both inclusive) into "target", allocating "target" when it is NULL.
// Positions covered by an ambiguity record are then overwritten with the
// stored ambiguity code (to_n == 0) or with 4 (to_n != 0); the number of
// overwritten positions is stored in "*conv" when "conv" is not NULL.
// Returns the buffer, or NULL when the arguments are out of range or the
// underlying fetch came back short.
// NOTE(review): on a short fetch "target" is freed even when the caller
// supplied it -- confirm callers expect to lose ownership in that case.
inline uint8_t*
tmap_refseq_subseq2(const tmap_refseq_t *refseq, uint32_t seqid, uint32_t start, uint32_t end, uint8_t *target, int32_t to_n, int32_t *conv)
{
  uint32_t pos, amb_idx;
  uint32_t span;

  if(0 == seqid || refseq->num_annos < seqid || end < start) {
      return NULL;
  }
  span = end - start + 1; // number of bases requested

  if(NULL == target) { 
      target = tmap_malloc(sizeof(char) * span, "target");
  }
  if(span != tmap_refseq_subseq(refseq, refseq->annos[seqid-1].offset + start, span, target)) {
      free(target);
      return NULL;
  }

  if(NULL != conv) (*conv) = 0;

  // nothing more to do unless an IUPAC base falls within the range
  // NB: this could be done more efficiently, since we know start <= end
  if(!(0 < tmap_refseq_amb_bases(refseq, seqid, start, end))) {
      return target;
  }

  // substitute every ambiguous position, one at a time
  for(pos=start;pos<=end;pos++) {
      amb_idx = tmap_refseq_amb_bases(refseq, seqid, pos, pos); // Note: one-based
      if(0 == amb_idx) continue;
      if(0 == to_n) {
          target[pos-start] = refseq->annos[seqid-1].amb_bases[amb_idx-1];
      }
      else {
          target[pos-start] = 4;
      }
      if(NULL != conv) (*conv)++;
  }

  return target;
}
Пример #3
0
// Loads a reference sequence from the index files derived from "fn_fasta":
// first the annotation file, then the packed sequence (the forward or the
// reverse file, depending on "is_rev").  Returns the newly allocated
// structure; a short read of the packed file is reported via tmap_error.
tmap_refseq_t *
tmap_refseq_read(const char *fn_fasta, uint32_t is_rev)
{
  tmap_refseq_t *refseq = tmap_calloc(1, sizeof(tmap_refseq_t), "refseq");
  char *fn_anno = NULL, *fn_pac = NULL;
  tmap_file_t *fp_anno = NULL, *fp_pac = NULL;

  refseq->is_shm = 0;
  refseq->is_rev = is_rev;

  // annotations first
  fn_anno = tmap_get_file_name(fn_fasta, TMAP_ANNO_FILE);
  fp_anno = tmap_file_fopen(fn_anno, "rb", TMAP_ANNO_COMPRESSION);
  tmap_refseq_read_anno(fp_anno, refseq); 
  tmap_file_fclose(fp_anno);
  free(fn_anno);

  // then the packed sequence, from the file matching the requested strand
  if(0 != is_rev) {
      fn_pac = tmap_get_file_name(fn_fasta, TMAP_REV_PAC_FILE);
      fp_pac = tmap_file_fopen(fn_pac, "rb", TMAP_REV_PAC_COMPRESSION);
  }
  else {
      fn_pac = tmap_get_file_name(fn_fasta, TMAP_PAC_FILE);
      fp_pac = tmap_file_fopen(fn_pac, "rb", TMAP_PAC_COMPRESSION);
  }
  refseq->seq = tmap_malloc(sizeof(uint8_t)*tmap_refseq_seq_memory(refseq->len), "refseq->seq");
  if(!(tmap_refseq_seq_memory(refseq->len)
       == tmap_file_fread(refseq->seq, sizeof(uint8_t), tmap_refseq_seq_memory(refseq->len), fp_pac))) {
      tmap_error(NULL, Exit, ReadFileError);
  }
  tmap_file_fclose(fp_pac);
  free(fn_pac);

  return refseq;
}
Пример #4
0
// Reads the data section of one SFF read from "fp": the flowgram
// (gh->flow_length values), then the flow index, bases and qualities
// (rh->n_bases values each), followed by the section padding.
// Flowgram values are converted with ntohs and qualities with QUAL2CHAR.
// A short read is reported through tmap_error.
tmap_sff_read_t *
tmap_sff_read_read(tmap_file_t *fp, tmap_sff_header_t *gh, tmap_sff_read_header_t *rh)
{
  uint32_t k, n_bytes;
  tmap_sff_read_t *sff_read = tmap_calloc(1, sizeof(tmap_sff_read_t), "r");

  sff_read->flowgram = tmap_malloc(sizeof(uint16_t)*gh->flow_length, "r->flowgram");
  sff_read->flow_index = tmap_malloc(sizeof(uint8_t)*rh->n_bases, "r->flow_index");

  // one extra byte each for the null-terminator
  sff_read->bases = tmap_string_init(rh->n_bases+1);
  sff_read->quality = tmap_string_init(rh->n_bases+1);

  if(!(gh->flow_length == tmap_file_fread(sff_read->flowgram, sizeof(uint16_t), gh->flow_length, fp)
       && rh->n_bases == tmap_file_fread(sff_read->flow_index, sizeof(uint8_t), rh->n_bases, fp)
       && rh->n_bases == tmap_file_fread(sff_read->bases->s, sizeof(char), rh->n_bases, fp)
       && rh->n_bases == tmap_file_fread(sff_read->quality->s, sizeof(char), rh->n_bases, fp))) {
      tmap_error("tmap_file_fread", Exit, ReadFileError);
  }
  // bytes consumed so far, needed to work out the padding
  n_bytes = sizeof(uint16_t)*gh->flow_length + 3*sizeof(uint8_t)*rh->n_bases;

  // lengths and null-terminators
  sff_read->bases->l = rh->n_bases;
  sff_read->bases->s[sff_read->bases->l]='\0';
  sff_read->quality->l = rh->n_bases;
  sff_read->quality->s[sff_read->quality->l]='\0';

  // flowgram to host order
  k = 0;
  while(k < gh->flow_length) {
      sff_read->flowgram[k] = ntohs(sff_read->flowgram[k]);
      k++;
  }

  // qualities from int to char
  k = 0;
  while(k < sff_read->quality->l) {
      sff_read->quality->s[k] = QUAL2CHAR(sff_read->quality->s[k]);
      k++;
  }

  n_bytes += tmap_sff_read_padding(fp, n_bytes);

#ifdef TMAP_SFF_DEBUG
  tmap_sff_read_print(stderr, sff_read, gh, rh);
#endif

  return sff_read;
}
Пример #5
0
// Converts the key sequence of the SFF global header to integer codes
// using the tmap_nt_char_to_int table.
// Stores a newly allocated array in "*key_seq" (the caller frees it) and
// returns its length.
int32_t
tmap_sff_get_key_seq_int(tmap_sff_t *sff, uint8_t **key_seq)
{
  int32_t i;
  int32_t key_seq_len = sff->gheader->key->l;
  (*key_seq) = tmap_malloc(sizeof(uint8_t) * key_seq_len, "key_seq");
  for(i=0;i<key_seq_len;i++) {
      // go through uint8_t: plain char may be signed, and a negative
      // value would index before the start of the table
      (*key_seq)[i] = tmap_nt_char_to_int[(uint8_t)sff->gheader->key->s[i]];
  }
  return key_seq_len;
}
Пример #6
0
// Converts the flow order of the SFF global header to integer codes
// using the tmap_nt_char_to_int table.
// Stores a newly allocated array in "*flow_order" (the caller frees it)
// and returns its length.
int32_t
tmap_sff_get_flow_order_int(tmap_sff_t *sff, uint8_t **flow_order)
{
  int32_t i;
  int32_t flow_order_len = sff->gheader->flow->l;
  (*flow_order) = tmap_malloc(sizeof(uint8_t) * flow_order_len, "flow_order");
  for(i=0;i<flow_order_len;i++) {
      // go through uint8_t: plain char may be signed, and a negative
      // value would index before the start of the table
      (*flow_order)[i] = tmap_nt_char_to_int[(uint8_t)sff->gheader->flow->s[i]];
  }
  return flow_order_len;
}
Пример #7
0
// Tells the user on stderr that "cmd" is not a tmap command and, when some
// command names are similar enough, lists the closest ones.
// A command that has "cmd" as a prefix gets distance zero; every other
// command gets its Levenshtein distance plus one.  Each distance is packed
// together with the command index into one 64-bit value.
// Calls tmap_bug() if "cmd" is in fact a known command name.
// NB: would require sorting to get the commands sorted
void
tmap_help_unknown_cmd(const char *cmd)
{
  int32_t i, n, best=INT32_MAX, best_n=0;
  uint64_t *distances = NULL;
  tmap_command_t *c = NULL;
  
  // get # of commands (the table ends at the first negative type)
  n = 0;
  c = commands;
  while(0 <= c->type) {
      n++;
      c++;
  }

  distances = tmap_malloc(sizeof(uint64_t)*n, "distances");

  for(i=0;i<n;i++) {
      if(0 == strcmp(cmd, commands[i].name)) tmap_bug();
      if(!tmap_prefixcmp(commands[i].name, cmd)) {
          distances[i] = ((uint64_t)i << 32); // zero score
      }
      else {
          distances[i] = ((uint64_t)i << 32) | (tmap_levenshtein(cmd, commands[i].name, 0, 2, 1, 4) + 1); // pack
      }
      // best_n is the number of commands at the best distance seen so far
      if(__get_distance(distances[i]) < best) {
          best = __get_distance(distances[i]);
          best_n = 1; // this command is the first one at the new best
      }
      else if(__get_distance(distances[i]) == best) {
          best_n++;
      }
  }

  if(0 == best && n == best_n) { // matches everything
      // a prefix of every command is not informative: suppress suggestions
      best = TMAP_HELP_SIMILARITY_FLOOR + 1; 
  }

  // output similar matches
  fprintf(stderr, "%s: '%s' is not a tmap command.  See 'tmap --help'.\n", PACKAGE, cmd);
  if(TMAP_HELP_SIMILAR_ENOUGH(best)) {
      // the wording depends on how many suggestions follow, not on the
      // total number of commands
      fprintf(stderr, "\nDid you mean %s?\n",
              best_n < 2 ? "this": "one of these");
      for (i = 0; i < n; i++) {
          if(best == __get_distance(distances[i])) {
              fprintf(stderr, "\t%s\n", commands[__get_name_idx(distances[i])].name);
          }
      }
  }

  free(distances);
}
Пример #8
0
// Returns a deep copy of the SFF read "r": the flowgram (gh->flow_length
// values), the flow index (rh->n_bases values), and clones of the base and
// quality strings.
static tmap_sff_read_t *
tmap_sff_read_clone(tmap_sff_read_t *r, tmap_sff_header_t *gh, tmap_sff_read_header_t *rh)
{
  uint32_t k;
  tmap_sff_read_t *copy = tmap_calloc(1, sizeof(tmap_sff_read_t), "r");

  // flow signal values
  copy->flowgram = tmap_malloc(sizeof(uint16_t)*gh->flow_length, "ret->flowgram");
  k = 0;
  while(k < gh->flow_length) {
      copy->flowgram[k] = r->flowgram[k];
      k++;
  }

  // per-base flow indexes
  copy->flow_index = tmap_malloc(sizeof(uint8_t)*rh->n_bases, "ret->flow_index");
  k = 0;
  while(k < rh->n_bases) {
      copy->flow_index[k] = r->flow_index[k];
      k++;
  }

  // the strings know how to duplicate themselves
  copy->bases = tmap_string_clone(r->bases);
  copy->quality = tmap_string_clone(r->quality);

  return copy;
}
Пример #9
0
// from bam_md.c in SAMtools
// Copies the first "len" characters of "ref" with every '-' and 'H'
// character removed into a temporary null-terminated string, and hands that
// string to tmap_sam_md1_core together with "b".
void 
tmap_sam_md1(bam1_t *b, char *ref, int32_t len)
{
  int32_t src, dst = 0;
  char *stripped = tmap_malloc(sizeof(char) * (1 + len), "ref_tmp");

  for(src=0;src<len;src++) {
      if('-' == ref[src] || 'H' == ref[src]) continue; // dropped
      stripped[dst++] = ref[src];
  }
  stripped[dst]='\0';

  tmap_sam_md1_core(b, stripped);
  free(stripped);
}
Пример #10
0
// TODO: memory pools?
tmap_map_sams_t *
tmap_map3_aux_core(tmap_seq_t *seq, 
                   uint8_t *flow_order,
                   int32_t flow_order_len,
                   tmap_refseq_t *refseq,
                   tmap_bwt_t *bwt,
                   tmap_sa_t *sa,
                   tmap_bwt_match_hash_t *hash,
                   tmap_map_opt_t *opt)
{
  int32_t i, j, n, seed_length, hp_diff = 0;
  int32_t seq_len;
  tmap_string_t *bases;
  uint8_t *query;
  uint8_t *flow=NULL;
  tmap_map3_aux_seed_t *seeds;
  int32_t m_seeds, n_seeds;
  tmap_map_sams_t *sams = NULL;

  if(0 < opt->hp_diff) {
      // set up the flow order to be used
      if(NULL == flow_order) {
          hp_diff = 0;
      }
      else {
          flow = tmap_malloc(sizeof(uint8_t)*flow_order_len, "flow[0]");
          for(i=0;i<flow_order_len;i++) {
              flow[i] = flow_order[i]; // forward
          }
      }
  }

  // init
  sams = tmap_map_sams_init(NULL);

  // update the seed length based on the read length
  seed_length = opt->seed_length;
  if(0 == opt->seed_length_set) {
      i = tmap_seq_get_bases_length(seq);
      while(0 < i) {
          seed_length++;
          i >>= 1; // divide by two
      }
  }
Пример #11
0
// Returns a deep copy of "seqs".  The copy has room for exactly seqs->n
// sequences, each one cloned with tmap_seq_clone; with no sequences the
// array pointer is left as zeroed by tmap_calloc.
tmap_seqs_t *
tmap_seqs_clone(tmap_seqs_t *seqs)
{
  int32_t k;
  tmap_seqs_t *copy = tmap_calloc(1, sizeof(tmap_seqs_t), "ret");

  copy->type = seqs->type;
  copy->n = seqs->n;
  copy->m = seqs->n; // do not expand memory

  if(!(0 < seqs->n)) {
      return copy; // nothing to clone
  }

  copy->seqs = tmap_malloc(seqs->n * sizeof(tmap_seq_t*), "ret->seqs");
  k = 0;
  while(k < copy->n) {
      copy->seqs[k] = tmap_seq_clone(seqs->seqs[k]);
      k++;
  }

  return copy;
}
Пример #12
0
// Benchmark driver: builds a random query of "seq_len" bases, embeds it in
// the middle of a target of "tlen" bases whose flanks are random, and
// aligns the query against the target "n_iter" times.  The flanks are
// regenerated after every "n_sub_iter" alignments.
// With 0 <= vsw_type the vectorized aligner (tmap_vsw_process_fwd) is run,
// otherwise tmap_sw_clipping_core.
// NOTE(review): the code assumes seq_len <= tlen (otherwise the flank sizes
// become negative and the query copy overruns "target") and 0 < n_sub_iter
// (otherwise the outer loop never advances) -- confirm callers ensure both.
void
tmap_vsw_bm_core(int32_t seq_len, int32_t tlen, int32_t n_iter,
                 int32_t n_sub_iter, int32_t vsw_type)
{
  int32_t i, j, k;
  tmap_vsw_t *vsw = NULL;
  tmap_vsw_opt_t *vsw_opt = NULL;
  int32_t softclip_start, softclip_end;
  tmap_sw_param_t ap;
  int32_t matrix[25];

  tmap_map_opt_t *opt = tmap_map_opt_init(TMAP_MAP_ALGO_NONE);

  uint8_t *seq, *target;
  tmap_rand_t *rand = tmap_rand_init(0);

  seq = tmap_malloc(sizeof(uint8_t) * seq_len, "seq");
  target = tmap_malloc(sizeof(uint8_t) * tlen, "target");

  // random sequence
  // presumably tmap_rand_get returns a value in [0,1), giving codes 0..3
  for(i=0;i<seq_len;i++) {
      seq[i] = (uint8_t)(4*tmap_rand_get(rand));
  }

  softclip_start = 1;
  softclip_end = 1;

  // initialize opt
  if(0 <= vsw_type) { 
      vsw_opt = tmap_vsw_opt_init(opt->score_match, opt->pen_mm, opt->pen_gapo, opt->pen_gape, opt->score_thr);
      vsw = tmap_vsw_init(seq, seq_len, softclip_start, softclip_end, vsw_type, vsw_opt);
  }
  else {
      ap.matrix = matrix;
      __map_util_gen_ap(ap, opt);
  }

  // sizes of the random flanks on either side of the embedded query
  int32_t front = (tlen - seq_len) / 2;
  int32_t end = tlen - seq_len - front;

  // "i" was last used to fill the query and still holds seq_len; restart
  // the count so that exactly n_iter alignments are performed
  i = 0;
  while(i<n_iter) {
      tmap_map_sam_t tmp_sam;
      int32_t overflow;
      // new target: random flank, the query itself, random flank
      for(j=k=0;j<front;j++,k++) {
          target[k] = (uint8_t)(4*tmap_rand_get(rand));
      }
      for(j=0;j<seq_len;j++,k++) {
          target[k] = seq[j];
      }
      for(j=0;j<end;j++,k++) {
          target[k] = (uint8_t)(4*tmap_rand_get(rand));
      }
      for(j=0;j<n_sub_iter&&i<n_iter;j++,i++) {
          if(0 <= vsw_type) { 
              // initialize the bounds
              tmp_sam.result.query_start = tmp_sam.result.query_end = 0;
              tmp_sam.result.target_start = tmp_sam.result.target_end = 0;
              // run the vsw
              tmap_vsw_process_fwd(vsw, seq, seq_len, target, tlen,
                            &tmp_sam.result, &overflow, opt->score_thr, 0);
          }
          else {
              tmap_sw_clipping_core(seq, seq_len, target, tlen,
                                    &ap, softclip_start, softclip_end,
                                    NULL, NULL, 0);
          }
      }
  }

  // free memory
  free(target);
  free(seq);
  if(0 <= vsw_type) {
      tmap_vsw_opt_destroy(vsw_opt);
      tmap_vsw_destroy(vsw);
  }
  tmap_map_opt_destroy(opt);
  tmap_rand_destroy(rand);
}
Пример #13
0
// Builds the BAM header for the output.
//  - @HD: taken from the input header when the input is SAM/BAM, otherwise
//    a fresh header with VN:1.4 is created.
//  - @SQ: when "refseq" is given, existing @SQ records are removed and one
//    record per reference annotation is added.
//  - @RG: built from the "rg_sam" command line values ("rg_sam_num" of
//    them, each of the form "TG:value"), or dummy groups for non-SAM/BAM
//    input, or the input's groups completed with SM/PG for SAM/BAM input.
//  - @PG: a new program record whose ID does not clash with existing ones,
//    with the command line rebuilt from argc/argv and PP set to the
//    previously tried ID, if any.
// Returns the header produced by sam_header_to_bam_header.
bam_header_t *
tmap_seqs_io_to_bam_header(tmap_refseq_t *refseq,
                           tmap_seqs_io_t *io_in,
                           char **rg_sam, int32_t rg_sam_num,
                           int32_t argc, char *argv[])
{
  bam_header_t *bam_header = NULL;
  sam_header_t *header = NULL; // the output header
  sam_header_record_t *record = NULL;
  sam_header_record_t **record_list = NULL;
  // two-letter tag; kept null-terminated so that it is a valid C string
  // in case sam_header_record_add reads it as one
  char tag[3] = {'\0', '\0', '\0'};
  char *command_line= NULL;
  char *id = NULL;
  char *id_pp = NULL;
  int32_t i, j;

  // @HD
  if(io_in->type == TMAP_SEQ_TYPE_SAM || io_in->type == TMAP_SEQ_TYPE_BAM) {
      // should be only one input file
      if(1 != io_in->n) {
          tmap_bug();
      }
      // get the current header
      if(NULL == io_in->seqios[0]) tmap_bug();
      if(NULL == io_in->seqios[0]->io.samio) tmap_bug();
      if(NULL == io_in->seqios[0]->io.samio->fp->header) tmap_bug();
      if(NULL == io_in->seqios[0]->io.samio->fp->header->header) {
          header = sam_header_parse2(io_in->seqios[0]->io.samio->fp->header->text);
      }
      else {
          header = io_in->seqios[0]->io.samio->fp->header->header; // wow, that's a lot of pointers
          if(NULL == header) tmap_bug();
          header = sam_header_clone(header); // clone the header
      }
      if(NULL == header) tmap_bug();
  }
  else {
      // empty header
      header = sam_header_init();
      // @HD - header line
      record = sam_header_record_init("HD"); // new header line
      if(0 == sam_header_record_add(record, "VN", "1.4")) tmap_bug(); // version number
      if(0 == sam_header_add_record(header, record)) tmap_bug(); // add the header line
      // nullify
      record = NULL;
  }

  // Get the TMAP program ID
  id = tmap_malloc(sizeof(char) * (1 + strlen(PACKAGE_NAME)), "id"); 
  strcpy(id, PACKAGE_NAME); // default
  for(i=j=0;NULL != (record_list = sam_header_get_record(header, "PG", "ID", id, &i)) && 0 < i;i=0) { // while the id is found
      char *ptr = NULL;
      // swap id and id_pp, so that id_pp holds the ID that was just found
      ptr = id_pp;
      id_pp = id;
      id = ptr;
      // create the new ID: "<PACKAGE_NAME>.<j>"
      j++;
      id = tmap_realloc(id, sizeof(char) * (1 + (int)log10(j) + 1 + strlen(PACKAGE_NAME) + 1), "id"); 
      if(sprintf(id, "%s.%d", PACKAGE_NAME, j) < 0) tmap_bug();
      free(record_list);
      record_list = NULL;
  }

  // @SQ
  if(NULL != refseq) {
      sam_header_records_t *records = NULL;
      // NB: check to see if any SQ/SN records exist, if not, then ignore checking...
      // ZZ: We do no checking, but instead just remove all the old headers. The old way of checking is not working
      records = sam_header_get_records(header, "SQ");
      if (NULL != records) {
          // ZZ: remove the headers if they exist.
          sam_header_remove_records(header, "SQ");
          records = NULL;
      }
      // ZZ: Now we will just add all new tags
      for(i=0;i<refseq->num_annos;i++) { // for each reference sequence
          char num[32];
          record = sam_header_record_init("SQ"); // new reference sequence record
          if(0 == sam_header_record_add(record, "SN", refseq->annos[i].name->s)) tmap_bug(); // reference sequence name
          if(sprintf(num, "%u", (uint32_t)refseq->annos[i].len) < 0) tmap_bug(); // integer to string
          if(0 == sam_header_record_add(record, "LN", num)) tmap_bug(); // reference sequence length
          if(0 == sam_header_add_record(header, record)) tmap_bug(); // add the reference sequence record
      }
  }

  // @RG - read group
  if(0 < rg_sam_num) { // @RG specified on the command line
      // Check for SAM/BAM
      // TODO: this should be possible...
      if(io_in->type == TMAP_SEQ_TYPE_SAM || io_in->type == TMAP_SEQ_TYPE_BAM) {
          tmap_error("Cannot specify the read groups on the command line when using SAM/BAM as input."
                     "  Please embed in the SAM/BAM header instead.", Exit, OutOfRange);
      }
      record = NULL;
      // go through the command line arguments
      for(i=0;i<rg_sam_num;i++) {
          if(strlen(rg_sam[i]) < 4) tmap_error("Read group too small", Exit, OutOfRange);
          if(':' != rg_sam[i][2]) tmap_error("Read group improperly formatted (no colon)", Exit, OutOfRange);

          // check for id
          if('I' == rg_sam[i][0] && 'D' == rg_sam[i][1]) { // new read group
              if(NULL != record) { // add the record
                  tmap_seqs_io_init2_fs_and_add(io_in, header, record); // add @RG.KS and @RG.FO
              }
              record = sam_header_record_init("RG"); // new read group
          }
          // add the tag/value to the record
          if(NULL == record) {
              tmap_error("The read group ID must be specified first", Exit, OutOfRange);
          }
          tag[0]=rg_sam[i][0]; tag[1]=rg_sam[i][1]; // setup the tag
          if(0 == sam_header_record_add(record, tag, rg_sam[i]+3)) tmap_bug(); // add the tag/value
      }
      if(NULL != record) { // add the record
          tmap_seqs_io_init2_fs_and_add(io_in, header, record); // add @RG.KS and @RG.FO
      }
      // check that the # of read groups added was the same as the # of input files...
      sam_header_records_t *records = sam_header_get_records(header, "RG"); // get the header line
      // the lookup can come back NULL (see the @SQ handling above); treat
      // that as a count mismatch instead of dereferencing it
      if(NULL == records || records->n != io_in->n) tmap_error("The number of read groups did not match the number of input files", Exit, OutOfRange);
  }
  else if(io_in->type != TMAP_SEQ_TYPE_SAM && io_in->type != TMAP_SEQ_TYPE_BAM) { // dummy...
      for(i=0;i<io_in->n;i++) { // for each input file
          char buf[32];
          record = sam_header_record_init("RG"); // new read group
          if(1 == io_in->n) strcpy(buf, "NOID");
          else if(sprintf(buf, "NOID.%d", i+1) < 0) tmap_bug();
          if(0 == sam_header_record_add(record, "ID", buf)) tmap_bug(); // dummy ID
          if(0 == sam_header_record_add(record, "SM", "NOSM")) tmap_bug(); // dummy SM, for Picard validation
          if(0 == sam_header_record_add(record, "PG", id)) tmap_bug(); // dummy PG
          tmap_seqs_io_init2_fs_and_add(io_in, header, record); // add @RG.KS and @RG.FO
      }
  }
  else {
      // check that SM/PG are present
      sam_header_records_t *records = sam_header_get_records(header, "RG"); // get the header line
      // an input header without any @RG record yields no record set;
      // there is then nothing to complete
      if(NULL != records) {
          for(i=0;i<records->n;i++) {
              record = records->records[i];
              if(NULL == sam_header_record_get(record, "ID")) tmap_error("Missing @RG.ID in the SAM/BAM Header", Exit, OutOfRange);
              if(NULL == sam_header_record_get(record, "SM")) {
                  if(0 == sam_header_record_add(record, "SM", "NOSM")) tmap_bug(); // dummy SM, for Picard validation
              }
              if(NULL == sam_header_record_get(record, "PG")) {
                  if(0 == sam_header_record_add(record, "PG", id)) tmap_bug(); // dummy PG
              }
          }
      }
  }

  // @PG - program group
  // TODO: check for previous program group ID and set @PG.PP
  record = sam_header_record_init("PG"); // new program group
  if(0 == sam_header_record_add(record, "ID", id)) tmap_bug(); // @PG.ID
  if(0 == sam_header_record_add(record, "VN", PACKAGE_VERSION)) tmap_bug(); // @PG.VN
  // @PG.CL: the arguments joined by single spaces
  command_line = NULL;
  j = 1; // for the EOL
  command_line = tmap_realloc(command_line, sizeof(char) * j, "command_line");
  command_line[j-1] = '\0';
  for(i=0;i<argc;i++) {
      if(0 < i) j++; // room for the separating space
      j += strlen(argv[i]);
      command_line = tmap_realloc(command_line, sizeof(char) * j, "command_line");
      if(0 < i) strcat(command_line, " ");
      strcat(command_line, argv[i]);
      command_line[j-1] = '\0';
  }
  if(0 == sam_header_record_add(record, "CL", command_line)) tmap_bug(); // @PG.CL
  if(NULL != id_pp) { // @PG.PP
      if(0 == sam_header_record_add(record, "PP", id_pp)) tmap_bug(); // @PG.PP
  }
  if(0 == sam_header_add_record(header, record)) tmap_bug(); // add the record
  free(command_line);

  // Check the new SAM Header
  if(0 == sam_header_check(header)) {
      tmap_error("SAM Header was not consistent", Exit, OutOfRange);
  }

  // Create a BAM Header from the SAM Header
  bam_header = bam_header_init(); // empty
  bam_header->header = header; // soft-copy the header
  bam_header = sam_header_to_bam_header(bam_header); // convert

  // free memory
  free(id);
  free(id_pp);

  return bam_header;
}
Пример #14
0
// Pushes a new search state onto "stack".
// The entry is taken from the stack's entry pool (the pool is grown when it
// is exhausted), filled in from the arguments, and appended to the bin that
// corresponds to its alignment score (the bins are grown when the score is
// beyond the last bin).  "prev_entry" may be NULL for a state without a
// predecessor.  Duplicate states are not filtered out: doing so was judged
// too computationally expensive and not necessary.
static inline void
tmap_map1_aux_stack_push(tmap_map1_aux_stack_t *stack, 
                         int32_t offset,
                         tmap_bwt_match_occ_t *match_sa_prev,
                         int32_t n_mm, int32_t n_gapo, int32_t n_gape,
                         int32_t state, int32_t is_diff, 
                         tmap_map1_aux_stack_entry_t *prev_entry,
                         const tmap_map_opt_t *opt)
{
  int32_t i;
  int32_t n_bins_needed = 0;
  tmap_map1_aux_stack_entry_t *entry = NULL;
  tmap_map1_aux_bin_t *bin = NULL;

  // check to see if we need more memory
  if(stack->entry_pool_length <= stack->entry_pool_i) { 
      i = stack->entry_pool_length; // first slot without an entry
      stack->entry_pool_length <<= 2;
      stack->entry_pool = tmap_realloc(stack->entry_pool, 
                                       sizeof(tmap_map1_aux_stack_entry_t*)*stack->entry_pool_length, 
                                       "stack->entry_pool");
      while(i<stack->entry_pool_length) {
          stack->entry_pool[i] = tmap_malloc(sizeof(tmap_map1_aux_stack_entry_t), "stack->entry_pool[i]");
          i++;
      }
  }

  entry = stack->entry_pool[stack->entry_pool_i];
  entry->score = aln_score(n_mm, n_gapo, n_gape, opt);
  entry->n_mm = n_mm;
  entry->n_gapo = n_gapo;
  entry->n_gape = n_gape;
  entry->state = state;
  entry->match_sa = (*match_sa_prev); 
  entry->i = stack->entry_pool_i;
  entry->offset = offset;
  if(NULL == prev_entry) {
      entry->last_diff_offset = offset;
      entry->prev_i = -1;
  }
  else {
      entry->last_diff_offset = (1 == is_diff) ? (offset) : prev_entry->last_diff_offset; 
      entry->prev_i = prev_entry->i;
  }

  if(stack->n_bins <= entry->score) {
      // resize the bins if necessary
      n_bins_needed = entry->score + 1;
      // realloc
      tmap_roundup32(n_bins_needed);
      stack->bins = tmap_realloc(stack->bins, sizeof(tmap_map1_aux_bin_t) * n_bins_needed, "stack->bins"); 
      // initialize
      for(i=stack->n_bins;i<n_bins_needed;i++) {
          stack->bins[i].n_entries = stack->bins[i].m_entries = 0;
          stack->bins[i].entries = NULL;
      }
      stack->n_bins = n_bins_needed;
  }
  // sanity check: the bin for this score must exist by now
  if(stack->n_bins <= entry->score) {
      tmap_bug();
  }
  bin = &stack->bins[entry->score];
  
  // update best score
  if(stack->best_score > entry->score) stack->best_score = entry->score;

  if(bin->m_entries <= bin->n_entries) {
      bin->m_entries++;
      tmap_roundup32(bin->m_entries);
      // the array holds pointers to entries, so size it by its own element
      // type and not by the size of a whole bin
      bin->entries = tmap_realloc(bin->entries, sizeof(*bin->entries) * bin->m_entries, "bin->entries");
  }
  bin->entries[bin->n_entries] = entry;
  bin->n_entries++;

  stack->entry_pool_i++;
  stack->n_entries++;
}
Пример #15
0
// prepare internal structures for clipping and alignment
// returns true if realignment was performed
bool RealignImp::compute_alignment (
    const char* q_seq,
    unsigned q_len,
    const char* r_seq, 
    unsigned r_len,
    int r_pos, 
    bool forward, 
    const uint32_t* cigar, 
    unsigned cigar_sz, 
    uint32_t*& cigar_dest, 
    unsigned& cigar_dest_sz, 
    int& new_pos,
    bool& already_perfect,
    bool& clip_failed,
    bool& alignment_failed,
    bool& unclip_failed)
{
    already_perfect = false;
    alignment_failed = false;
    unclip_failed = false;
    unsigned oplen;

    const char* q_seq_clipped = q_seq;
    const uint32_t* cigar_clipped = cigar;
    unsigned cigar_sz_clipped = cigar_sz;

    unsigned sclip_q_len, sclip_r_len, sclip_al_len;

    assert (cigar_sz);
    // reset realigner
    Reset ();

    // set clipping 
    SetClipping ((int) cliptype_, forward);

    // clip out the hard and soft clipping zones from 5" and 3"
    // The 'cut out' of the q_seq is done by switching to downstream pointer.
    if (bam_cigar_op (*cigar) == BAM_CSOFT_CLIP)
    {
        oplen = bam_cigar_oplen (*cigar);
        ClipStart (oplen);
        q_seq_clipped += oplen;
        ++cigar_clipped;
        --cigar_sz_clipped;
    }

    if (cigar_sz > 1 && bam_cigar_op (cigar [cigar_sz - 1]) == BAM_CSOFT_CLIP)
    {
        oplen = bam_cigar_oplen (cigar [cigar_sz - 1]);
        ClipEnd (oplen);
        --cigar_sz_clipped;
    }

    // cigar defines q_seq and t_seq lengths
    sclip_al_len = seq_lens_from_bin_cigar (cigar_clipped, cigar_sz_clipped, &sclip_q_len, &sclip_r_len);

    const std::string query (q_seq_clipped, sclip_q_len);
    const std::string target (r_seq, sclip_r_len);
    std::string pretty_al; pretty_al.reserve (sclip_al_len);

    pretty_al_from_bin_cigar (cigar_clipped, cigar_sz_clipped, q_seq_clipped, r_seq, pretty_al);

    // Realigner requires strings of proper size to be passed to SetSequences
    SetSequences (query, target, pretty_al, forward);

    if (!ClipAnchors (clip_failed))
    {
        already_perfect = true;
        return false; // alignment already good, no imperfect zone to realign found
    }

    // TODO avoid automatic vectors to prevent unneeded heap usage
    vector<MDelement> new_md_vec; 
    vector<CigarOp> new_cigar_vec;
    unsigned int start_pos_shift;

    if (!computeSWalignment(new_cigar_vec, new_md_vec, start_pos_shift))
    {
        alignment_failed = true;
        return false;
    }

    if (!addClippedBasesToTags(new_cigar_vec, new_md_vec, q_len))
    {
        unclip_failed = true;
        return false; // error adding back clipped out zones
    }

    if (!LeftAnchorClipped () && start_pos_shift != 0) 
    {
        // build cigar data only if it is needed
        // TODO avoid automatic vectors to prevent unneeded heap usage
        std::vector <CigarOp> cigar_vec;
        cigar_vector_from_bin (cigar, cigar_sz, cigar_vec);
        new_pos = updateReadPosition (cigar_vec, start_pos_shift, r_pos);
    }
    else
        new_pos = r_pos;

    // free (cigar_dest);
    // TODO: switch to better alignment memory management, avoid heap operations
    cigar_dest = (uint32_t*) tmap_malloc (sizeof (uint32_t) * new_cigar_vec.size (), "cigar_dest");
    cigar_dest_sz = new_cigar_vec.size ();
    cigar_vector_to_bin (new_cigar_vec, cigar_dest);

    return true;
}
Пример #16
0
// Writes the SAM header to "fp": the @HD line, one @SQ line per reference
// annotation (when "refseq" is not NULL), the @RG line(s), and a @PG line
// that carries the command line rebuilt from argc/argv.
// The read group comes from two sources that are reconciled here:
//   header_a - parsed from the command line value "sam_rg"
//   header_b - read group(s) obtained from the input file via "seqio"
// With ignore_rg_sam_tags == 1 the input file is not consulted, except
// that with sam_flowspace_tags == 1 its FO and KS tags are still copied.
// A tag present in both sources, or a command line read group combined
// with more than one input read group, is reported through tmap_error.
// Side effects: may copy the command line RG.ID into the global
// tmap_sam_rg_id, and sets the global tmap_sam_rg_id_use.
void
tmap_sam_print_header(tmap_file_t *fp, tmap_refseq_t *refseq, tmap_seq_io_t *seqio, char *sam_rg, 
                      int32_t sam_flowspace_tags, int32_t ignore_rg_sam_tags, 
                      int argc, char *argv[])
{
  int32_t i, j, header_n = 0;
  char **header_a = NULL; // command line read group, indexed by tag
  char ***header_b = NULL; // input file read groups, header_n of them

  // SAM header
  tmap_file_fprintf(fp, "@HD\tVN:%s\tSO:unsorted\n",
                    TMAP_SAM_PRINT_VERSION);
  if(NULL != refseq) {
      for(i=0;i<refseq->num_annos;i++) {
          tmap_file_fprintf(fp, "@SQ\tSN:%s\tLN:%d\n",
                            refseq->annos[i].name->s, (int)refseq->annos[i].len);
      }
  }
  // RG
  header_a = tmap_sam_parse_rg(sam_rg); // parse the input read group line
  if(1 == ignore_rg_sam_tags) { // do not get the header from the input file
      if(1 == sam_flowspace_tags) { // ... except for the RG.FO/RG.KS
          // get the RG header from the input file
          header_b = tmap_seq_io_get_rg_header(seqio, &header_n);
          if(1 < header_n) { 
              // TODO: we could check to see that FO/KS are the same across all
              // input read groups
              tmap_error("Command line read group found with multiple read groups from the input file", Exit, OutOfRange);
          }
          else if(1 == header_n) {
              if(NULL == header_a) {
                  header_a = tmap_calloc(TMAP_SAM_RG_NUM, sizeof(char*), "header_a");
                  // copy over default RG.ID
                  header_a[TMAP_SAM_RG_ID] = tmap_malloc(sizeof(char) * (strlen(tmap_sam_rg_id) + 1), "header_a[TMAP_SAM_RG_ID]");
                  strcpy(header_a[TMAP_SAM_RG_ID], tmap_sam_rg_id);
              }
              for(i=0;i<TMAP_SAM_RG_NUM;i++) { // for each RG.TAG
                  switch(i) {
                    case TMAP_SAM_RG_FO:
                    case TMAP_SAM_RG_KS:
                      if(NULL != header_a[i] && NULL != header_b[0][i]) {
                          tmap_error("Command line and input read groups share tags", Exit, OutOfRange);
                      }
                      else if(NULL != header_b[0][i]) { // copy over
                          header_a[i] = tmap_malloc(sizeof(char) * (strlen(header_b[0][i]) + 1), "header_a[i]");
                          strcpy(header_a[i], header_b[0][i]);
                      }
                      // falls through to the default case, which only breaks
                    default:
                      break;
                  }
              }
          }
          // free header_b, it is no longer in use
          for(i=0;i<header_n;i++) {
              free(header_b[i]);
          }
          free(header_b);
          header_b = NULL;
          header_n = 0;
      }
  }
  else { 
      // get the RG header from the input file
      header_b = tmap_seq_io_get_rg_header(seqio, &header_n);
  }

  // reconcile the RG headers
  if(NULL != header_a) { // header a exists
      if(NULL != header_b && 1 == header_n) { // header b exists, and only one line...
          // check to see if they are mutually exclusive
          for(i=0;i<TMAP_SAM_RG_NUM;i++) {
              if(NULL != header_a[i] && NULL != header_b[0][i]) {
                  tmap_file_fprintf(tmap_file_stderr, "\nFound both command line and input file read group information for the same tag: %s.\n", TMAP_SAM_RG_TAGS[i]);
                  tmap_error(NULL, Exit, OutOfRange);
              }
              else if(NULL == header_a[i] && NULL != header_b[0][i]) { // copy over
                  header_a[i] = tmap_calloc(1+strlen(header_b[0][i]), sizeof(char), "header_a[i]");
                  strcpy(header_a[i], header_b[0][i]);
              }
          }
          // free; header_b has been merged into header_a
          free(header_b[0]);
          free(header_b);
          header_b = NULL;
          header_n = 0;
      }
      if(0 == header_n) { // no header b (or it was merged above)
          if(NULL != header_a[TMAP_SAM_RG_ID]) {
              // the command line RG.ID becomes the global read group ID
              strcpy(tmap_sam_rg_id, header_a[TMAP_SAM_RG_ID]);
          }
          else {
              header_a[TMAP_SAM_RG_ID] = tmap_malloc(sizeof(char) * (strlen(tmap_sam_rg_id) + 1), "header_a[i]");
              strcpy(header_a[TMAP_SAM_RG_ID], tmap_sam_rg_id);
          }
          if(NULL == header_a[TMAP_SAM_RG_SM]) { // for Picard
              header_a[TMAP_SAM_RG_SM] = tmap_malloc(sizeof(char) * (strlen(TMAP_SAM_NO_RG_SM) + 1), "header_a[i]");
              strcpy(header_a[TMAP_SAM_RG_SM], TMAP_SAM_NO_RG_SM);
          }
          tmap_sam_rg_id_use = 1;
          tmap_file_fprintf(fp, "@RG");
          for(i=0;i<TMAP_SAM_RG_NUM;i++) {
              if(NULL != header_a[i]) {
                  tmap_file_fprintf(fp, "\t%s:%s", TMAP_SAM_RG_TAGS[i], header_a[i]);
              }
          }
          tmap_file_fprintf(fp, "\n");
      }
      else { // both header_a and header_b exist 
          tmap_error("Found both command line and input file read group information", Exit, OutOfRange);
      }
  }
  else { // no header_a exists
      if(NULL != header_b) { // header_b exists
          tmap_sam_rg_id_use = 0;
          for(i=0;i<header_n;i++) { // for each RG.ID
              if(NULL == header_b[i][TMAP_SAM_RG_ID]) {
                  if(1 == header_n && TMAP_SEQ_TYPE_SFF == seqio->type) { // make an exception for SFF files
                      // points at the global buffer, not at a copy
                      header_b[i][TMAP_SAM_RG_ID] = tmap_sam_rg_id;
                      tmap_sam_rg_id_use = 1;
                  }
                  else {
                      tmap_error("missing RG.ID found in the RG SAM Header", Exit, OutOfRange);
                  }
              }
              // RG.SM for picard
              if(NULL == header_b[i][TMAP_SAM_RG_SM]) {
                  header_b[i][TMAP_SAM_RG_SM] = TMAP_SAM_NO_RG_SM;
              }
              tmap_file_fprintf(fp, "@RG");
              for(j=0;j<TMAP_SAM_RG_NUM;j++) { // for each RG.TAG
                  if(NULL != header_b[i][j]) {
                      tmap_file_fprintf(fp, "\t%s:%s", TMAP_SAM_RG_TAGS[j], header_b[i][j]);
                  }
              }
              tmap_file_fprintf(fp, "\n");
          }
      }
      else { // neither source exists: emit a default read group
          header_a = tmap_calloc(TMAP_SAM_RG_NUM, sizeof(char*), "header_a");
          // RG.ID
          header_a[TMAP_SAM_RG_ID] = tmap_malloc(sizeof(char) * (strlen(tmap_sam_rg_id) + 1), "header_a[i]");
          strcpy(header_a[TMAP_SAM_RG_ID], tmap_sam_rg_id);
          // RG.SM for Picard
          header_a[TMAP_SAM_RG_SM] = tmap_malloc(sizeof(char) * (strlen(TMAP_SAM_NO_RG_SM) + 1), "header_a[i]");
          strcpy(header_a[TMAP_SAM_RG_SM], TMAP_SAM_NO_RG_SM);
          tmap_sam_rg_id_use = 1;
          tmap_file_fprintf(fp, "@RG");
          for(i=0;i<TMAP_SAM_RG_NUM;i++) {
              if(NULL != header_a[i]) {
                  tmap_file_fprintf(fp, "\t%s:%s", TMAP_SAM_RG_TAGS[i], header_a[i]);
              }
          }
          tmap_file_fprintf(fp, "\n");
      }
  }

  // PG
  tmap_file_fprintf(fp, "@PG\tID:%s\tVN:%s\tCL:",
                    PACKAGE_NAME, PACKAGE_VERSION);
  for(i=0;i<argc;i++) {
      if(0 < i) tmap_file_fprintf(fp, " ");
      tmap_file_fprintf(fp, "%s", argv[i]);
  }
  tmap_file_fprintf(fp, "\n");

  // free
  // only the per-read-group arrays of header_b are freed, not the strings
  // they point to; the strings of header_a are freed individually
  for(i=0;i<header_n;i++) {
      free(header_b[i]);
  }
  free(header_b);
  if(NULL != header_a) {
      for(i=0;i<TMAP_SAM_RG_NUM;i++) {
          free(header_a[i]);
      }
  }
  free(header_a);
}