Exemplo n.º 1
0
/**
 * Completes a read group (@RG) record and appends it to the SAM header.
 *
 * For SFF input, @RG.KS and @RG.FO are filled in from the input file whose
 * index equals the number of records of the same type already present in the
 * header.  A missing @RG.SM is set to "NOSM" and a missing @RG.PG is set to
 * PACKAGE_NAME.
 *
 * @param  io_in   the input sequence readers
 * @param  header  the SAM header that receives the record
 * @param  record  the record to complete and add; it must already have an ID
 */
static void
tmap_seqs_io_init2_fs_and_add(tmap_seqs_io_t *io_in,
                              sam_header_t *header,
                              sam_header_record_t *record)
{
  // kept NUL-terminated: the same API is also called with string literals
  char tag[3] = {'\0', '\0', '\0'};

  // add @RG.KS and @RG.FO
  if(io_in->type == TMAP_SEQ_TYPE_SFF) {
      sam_header_records_t *records = sam_header_get_records(header, record->tag); // get the header line
      // other callers treat a NULL result as "no such records", so count it as zero
      int32_t n_added = (NULL == records) ? 0 : records->n;
      if(io_in->n <= n_added) tmap_error("Too many read groups specified", Exit, OutOfRange);
      // @RG.KS
      tag[0]='K';tag[1]='S';
      if(0 == sam_header_record_add(record, tag, tmap_sff_io_get_rg_ks(io_in->seqios[n_added]->io.sffio))) {
          tmap_error("Could not add the KS tag; most likely it is already present", Exit, OutOfRange);
      }
      // @RG.FO
      tag[0]='F';tag[1]='O';
      if(0 == sam_header_record_add(record, tag, tmap_sff_io_get_rg_fo(io_in->seqios[n_added]->io.sffio))) {
          tmap_error("Could not add the FO tag; most likely it is already present", Exit, OutOfRange);
      }
  }
  // check for the @RG.ID and @RG.SM tags
  if(NULL == sam_header_record_get(record, "ID")) tmap_bug(); // should not happen
  if(NULL == sam_header_record_get(record, "SM")) {
      if(0 == sam_header_record_add(record, "SM", "NOSM")) tmap_bug(); // dummy SM, for Picard validation
  }
  if(NULL == sam_header_record_get(record, "PG")) {
      if(0 == sam_header_record_add(record, "PG", PACKAGE_NAME)) tmap_bug(); // dummy PG
  }
  // add the read group
  if(0 == sam_header_add_record(header, record)) tmap_bug();
}
Exemplo n.º 2
0
/**
 * Looks up a packed-reference position through the suffix array.
 *
 * The implementation is chosen at compile time by TMAP_SA_RUN_TYPE:
 *   0 - the original routine only,
 *   1 - the optimized routine only,
 *   2 - both routines, with tmap_bug() called when their results differ;
 *       the original's result is returned.
 * Any other value runs both routines without comparing them and returns the
 * original's result.
 */
tmap_bwt_int_t
tmap_sa_pac_pos(const tmap_sa_t *sa, const tmap_bwt_t *bwt, tmap_bwt_int_t k)
{
#if TMAP_SA_RUN_TYPE == 0
    // original only
    return tmap_sa_pac_pos_orig(sa, bwt, k);
#elif TMAP_SA_RUN_TYPE == 1
    // optimized only
    return tmap_sa_pac_pos_aux(sa, bwt, k);
#else
    // both, original first
    tmap_bwt_int_t pos_orig = tmap_sa_pac_pos_orig(sa, bwt, k);
    tmap_bwt_int_t pos_opt = tmap_sa_pac_pos_aux(sa, bwt, k);
#if TMAP_SA_RUN_TYPE == 2
    // cross-check the two implementations
    if(pos_orig != pos_opt) {
        tmap_bug();
    }
#else
    (void)pos_opt;
#endif
    return pos_orig;
#endif
}
Exemplo n.º 3
0
/**
 * Reads the integer value of the "ZA" auxiliary tag from a SAM/BAM record.
 *
 * @param  sam  the record wrapper; tmap_bug() is called if it holds no BAM record
 * @return      the tag's value, or -1 if the tag is absent
 */
int32_t
tmap_sam_get_za(tmap_sam_t *sam)
{
  uint8_t *za = NULL;

  if(NULL == sam->b) tmap_bug();

  za = bam_aux_get(sam->b, "ZA");
  if(NULL == za) return -1;
  return bam_aux2i(za);
}
Exemplo n.º 4
0
/**
 * Reports that 'cmd' is not a known command and, when some known commands
 * are close enough, lists them on stderr as suggestions.
 *
 * A command that has 'cmd' as a prefix gets a distance of zero; every other
 * command gets its Levenshtein distance to 'cmd' plus one.  Only commands at
 * the smallest distance are suggested.  If 'cmd' is a prefix of every
 * command, nothing is suggested.
 *
 * NB: would require sorting to get the commands sorted
 *
 * @param  cmd  the unrecognized command name; must not equal a known command
 */
void
tmap_help_unknown_cmd(const char *cmd)
{
  int32_t i, n, best=INT32_MAX, best_n=0;
  uint64_t *distances = NULL;
  tmap_command_t *c = NULL;

  // get # of commands (the table ends at the first negative type)
  n = 0;
  for(c=commands;0 <= c->type;c++) {
      n++;
  }

  distances = tmap_malloc(sizeof(uint64_t)*n, "distances");

  for(i=0;i<n;i++) {
      if(0 == strcmp(cmd, commands[i].name)) tmap_bug();
      // pack the command index in the upper 32 bits, the distance in the lower
      if(!tmap_prefixcmp(commands[i].name, cmd)) {
          distances[i] = ((uint64_t)i << 32); // zero score
      }
      else {
          distances[i] = ((uint64_t)i << 32) | (tmap_levenshtein(cmd, commands[i].name, 0, 2, 1, 4) + 1); // pack
      }
      // track the best distance and how many commands share it
      if(__get_distance(distances[i]) < best) {
          best = __get_distance(distances[i]);
          best_n = 1; // this command is the first with the new best distance
      }
      else if(__get_distance(distances[i]) == best) {
          best_n++;
      }
  }

  if(0 == best && n == best_n) { // matches everything
      best = TMAP_HELP_SIMILARITY_FLOOR + 1;
  }

  // output similar matches
  fprintf(stderr, "%s: '%s' is not a tmap command.  See 'tmap --help'.\n", PACKAGE, cmd);
  if(TMAP_HELP_SIMILAR_ENOUGH(best)) {
      // singular or plural depends on the number of suggestions printed below
      fprintf(stderr, "\nDid you mean %s?\n",
              best_n < 2 ? "this": "one of these");
      for(i=0;i<n;i++) {
          if(best == __get_distance(distances[i])) {
              fprintf(stderr, "\t%s\n", commands[__get_name_idx(distances[i])].name);
          }
      }
  }

  free(distances);
}
Exemplo n.º 5
0
/**
 * Removes and returns the most recently pushed entry from the bin at the
 * stack's current best score.
 *
 * When that bin becomes empty, the best score moves to the next non-empty
 * bin at a higher index.  When the whole stack becomes empty, the best score
 * is reset to INT32_MAX.
 *
 * @param  stack  the stack to pop from
 * @return        the removed entry, or NULL if the stack is empty
 */
static inline tmap_map1_aux_stack_entry_t *
tmap_map1_aux_stack_pop(tmap_map1_aux_stack_t *stack)
{
  tmap_map1_aux_bin_t *top_bin = NULL;
  tmap_map1_aux_stack_entry_t *popped = NULL;
  int32_t score;

  if(0 == stack->n_entries) return NULL;

  // take the last entry of the best-scoring bin
  top_bin = &stack->bins[stack->best_score];
  if(0 == top_bin->n_entries) {
      tmap_bug();
  }
  top_bin->n_entries--;
  popped = top_bin->entries[top_bin->n_entries];
  top_bin->entries[top_bin->n_entries] = NULL;
  stack->n_entries--;

  // nothing left at all
  if(0 == stack->n_entries) {
      stack->best_score = INT32_MAX;
      return popped;
  }

  // the current bin still holds entries, so the best score is unchanged
  if(0 != top_bin->n_entries) return popped;

  // find the next best
  score = stack->best_score;
  while(score < stack->n_bins && !(0 < stack->bins[score].n_entries)) {
      score++;
  }
  if(score == stack->n_bins) {
      tmap_bug();
  }
  else {
      stack->best_score = score;
  }

  return popped;
}
Exemplo n.º 6
0
/**
 * Prints the version banner followed by the list of commands, grouped under
 * a heading for each command type, to stderr.
 *
 * Commands of type TMAP_COMMAND_NONE are not listed.  The command table is
 * read up to the first entry with a negative type.
 *
 * @return  always 1
 */
static int
tmap_usage(int argc, char *argv[])
{
  tmap_command_t *cmd = NULL;
  int last_type = -1;
  int pad;

  tmap_version(argc, argv);

  for(cmd=commands;0 <= cmd->type;cmd++) {
      // a new group starts whenever the type changes
      if(cmd->type != last_type) {
          if(0 <= last_type && TMAP_COMMAND_NONE != cmd->type) fprintf(stderr, "\n");
          switch(cmd->type) {
            case TMAP_COMMAND_PREPROCESSING:
              fprintf(stderr, "%sPre-processing:%s\n", KRED, KNRM);
              break;
            case TMAP_COMMAND_SERVER:
              fprintf(stderr, "%sServer:%s\n", KRED, KNRM);
              break;
            case TMAP_COMMAND_MAPPING:
              fprintf(stderr, "%sMapping:%s\n", KRED, KNRM);
              break;
            case TMAP_COMMAND_UTILITIES:
              fprintf(stderr, "%sUtilities:%s\n", KRED, KNRM);
              break;
#ifdef ENABLE_TMAP_DEBUG_FUNCTIONS
            case TMAP_COMMAND_DEBUG:
              fprintf(stderr, "%sDebugging:%s\n", KRED, KNRM);
              break;
#endif
            case TMAP_COMMAND_NONE:
              break;
            default:
              fprintf(stderr, "c->type=%d\n", cmd->type);
              tmap_bug();
          }
          last_type = cmd->type;
      }

      if(TMAP_COMMAND_NONE == cmd->type) continue;

      // name, padded with spaces up to 16 columns, then the help text
      fprintf(stderr, "         %s%s%s", KCYN, cmd->name, KNRM);
      for(pad=16-(int)strlen(cmd->name);0 < pad;pad--) fputc(' ', stderr);
      fprintf(stderr, "%s%s%s\n", KWHT, cmd->help, KNRM);
  }
  return 1;
}
Exemplo n.º 7
0
/**
 * Assigns mapping qualities to a set of map1 alignments.
 *
 * The alignments are sorted, and those whose score is not below the first
 * alignment's score get a mapping quality from tmap_map1_sam_mapq(); all the
 * others get zero.
 *
 * @param  sams     the alignments; sorted in place
 * @param  seq_len  not used
 * @param  opt      the mapping options (max_mm is read)
 * @return          always 0
 */
static int32_t
tmap_map1_mapq(tmap_map_sams_t *sams, int32_t seq_len, tmap_map_opt_t *opt)
{
  int32_t idx, n_top;

  if(0 == sams->n) return 0;

  // sort by decreasing score
  tmap_sort_introsort(tmap_map1_sam_sort_score, sams->n, sams->sams);

  // count the leading alignments that tie with the first one
  n_top = 0;
  while(n_top < sams->n) {
      // the scores must never increase along the sorted list
      if(0 < n_top && sams->sams[n_top-1].score < sams->sams[n_top].score) {
          tmap_bug();
      }
      if(sams->sams[n_top].score < sams->sams[0].score) break;
      n_top++;
  }

  for(idx=0;idx<sams->n;idx++) {
      if(idx < n_top) { // tied for best
          sams->sams[idx].mapq = tmap_map1_sam_mapq(n_top,
                                                    sams->sams[idx].aux.map1_aux->num_all_sa,
                                                    opt->max_mm,
                                                    sams->sams[idx].aux.map1_aux->n_mm);
      }
      else { // sub-optimal
          sams->sams[idx].mapq = 0;
      }
  }

  return 0;
}
Exemplo n.º 8
0
/**
 * Compares two BWTs by exact-matching every sequence of length 1..length.
 *
 * For each length i, every sequence of i values in 0..3 is enumerated.  Each
 * sequence, and the sequence obtained by reversing it and replacing every
 * value v with 3-v, is searched with tmap_bwt_match_exact_reverse() in both
 * BWTs; tmap_bug() is called when the two BWTs disagree.  The hit counts from
 * the first BWT for the enumerated sequences are summed and compared against
 * bwt[0]->seq_len - i + 1.
 *
 * @param  bwt        the two BWTs to compare
 * @param  length     the largest sequence length to test
 * @param  print_msg  1 to report the outcome for each length
 * @param  warn       0 to raise an exiting error when the sum does not match
 */
void 
tmap_bwt_compare_core2(tmap_bwt_t *bwt[2], int32_t length, int32_t print_msg, int32_t warn)
{
  uint8_t *seqs[2] = {NULL,NULL};
  char *str = NULL;
  int32_t i, asymmetric, k, m;
  uint64_t hash_j;
  int64_t sum, j;
  tmap_bwt_match_occ_t sa[2];
  tmap_bwt_int_t n[2][2];

  for(i=1;i<=length;i++) {
      // seqs[0] is the enumerated sequence, seqs[1] its reversed 3-v counterpart
      seqs[0] = tmap_calloc(i, sizeof(uint8_t), "seqs[0]");
      seqs[1] = tmap_calloc(i, sizeof(uint8_t), "seqs[1]");
      // NOTE(review): str is allocated and freed but not otherwise used in the visible code
      str = tmap_calloc(i+1, sizeof(char), "str");
      for(j=0;j<i;j++) {
          seqs[1][j] = 3;
      }

      asymmetric = 0;
      j = 0;
      // NOTE(review): hash_j is updated below but never read in the visible code
      hash_j = sum = 0;
      // j is the number of positions filled so far; a sequence is complete when j == i
      while(1) {
          if(i == j) {
              // build the reversed 3-v counterpart of the current sequence
              for(k=0;k<i;k++) {
                  seqs[1][k] = 3 - seqs[0][i-k-1];
              }
              // search both sequences (k) in both BWTs (m) and require identical results
              for(k=0;k<2;k++) {
                  for(m=0;m<2;m++) {
                      n[m][k] = tmap_bwt_match_exact_reverse(bwt[m], i, seqs[k], &sa[m]);
                  }
                  if(n[0][k] != n[1][k] || sa[0].k != sa[1].k || sa[0].l != sa[1].l || sa[0].hi != sa[1].hi || sa[0].offset != sa[1].offset) {
                      tmap_progress_print2("BWTs did not match");
                      tmap_progress_print2("n=[%llu,%llu]", n[0][k], n[1][k]);
                      tmap_progress_print2("k=[%llu,%llu]", sa[0].k, sa[1].k);
                      tmap_progress_print2("l=[%llu,%llu]", sa[0].l, sa[1].l);
                      tmap_progress_print2("hi=[%llu,%llu]", sa[0].hi, sa[1].hi);
                      tmap_progress_print2("offset=[%llu,%llu]", sa[0].offset, sa[1].offset);
                      tmap_bug();
                  }
              }
              // NOTE(review): sa[0] was last written for k == 1 above, while the count used is n[0][0]
              for(k=0;k<2;k++) {
                  // use m == 0 && k = 0
                  if(0 == k) {
                      if(0 < n[0][k] && TMAP_BWT_INT_MAX != sa[0].k && sa[0].k <= sa[0].l) {
                          sum += n[0][k];
                      }
                  }
              }
              // warn once per length if the two sequences have different hit counts
              if(0 == asymmetric && n[0][0] != n[0][1]) {
                  asymmetric = 1;
                  //fprintf(stderr, "n[0][0]=%u n[0][1]=%u\n", n[0][0], n[0][1]);
                  tmap_error("Asymmetry found", Warn, OutOfRange);
              }

              // advance to the next sequence: reset trailing 3s to 0, then
              // increment the last position that is not 3
              j--;
              while(0 <= j && 3 == seqs[0][j]) {
                  seqs[0][j] = 0;
                  hash_j >>= 2;
                  j--;
              }
              // every position was 3: all sequences of this length are done
              if(j < 0) break;
              seqs[0][j]++;
              hash_j++;
              j++;
          }
          else {
              // extend the sequence by one more position (its value is already 0)
              hash_j <<= 2;
              j++;
          }
      }

      free(seqs[0]);
      free(seqs[1]);
      free(str);

      j = (sum == (bwt[0]->seq_len - i + 1)) ? 0 : 1; // j==1 on fail
      if(1 == print_msg) {
          if(0 == j) tmap_progress_print2("%d-mer validation passed", i);
          // NOTE(review): sum is an int64_t but is printed with %llu
          else tmap_progress_print2("%d-mer validation failed: observed (%llu) != expected (%llu)\n", i, sum, bwt[0]->seq_len - i + 1);
      }
      if(0 == warn && 1 == j) {
          tmap_error("inconsistency found in the BWT", Exit, OutOfRange);
      }
  }
Exemplo n.º 9
0
/**
 * Frees the read group strings collected from the -R options.
 */
static void
tmap_seqs_io_sff2sam_free_rg(char **sam_rg, int32_t sam_rg_num)
{
  int32_t i;
  for(i=0;i<sam_rg_num;i++) {
      free(sam_rg[i]);
  }
  free(sam_rg);
}

/**
 * Entry point of the SFF-to-SAM command: reads one SFF file and writes every
 * read as an unmapped SAM record to standard output.
 *
 * Options: -D (bidirectional), -G (accepted, no effect), -R <value> (read
 * group field, may be repeated), -Y (flowspace tags), -v (verbose), -h (help).
 * Each record carries the clip points as the optional fields lq, rq (quality
 * clip left/right) and la, ra (adapter clip left/right).
 *
 * @return  0 on success, 1 on a usage or option error
 */
int
tmap_seqs_io_sff2sam_main(int argc, char *argv[])
{
  int c, help = 0;
  tmap_seqs_io_t *io_in = NULL;
  tmap_seqs_t *seqs = NULL;
  char **sam_rg = NULL;
  int32_t sam_rg_num = 0;
  int bidirectional = 0, sam_flowspace_tags = 0;
  int out_type = 0;
  tmap_sam_io_t *io_out = NULL;
  bam_header_t *header = NULL; // BAM Header

  while((c = getopt(argc, argv, "DGR:Yvh")) >= 0) {
      switch(c) {
        case 'D': bidirectional = 1; break;
        case 'G': break;
        case 'R':
                  sam_rg = tmap_realloc(sam_rg, (1+sam_rg_num) * sizeof(char*), "sam_rg");
                  sam_rg[sam_rg_num] = tmap_strdup(optarg);
                  sam_rg_num++;
                  break;
        case 'Y': sam_flowspace_tags = 1; break;
        case 'v': tmap_progress_set_verbosity(1); break;
        case 'h': help = 1; break;
        default:
                  // do not leak the read groups collected so far
                  tmap_seqs_io_sff2sam_free_rg(sam_rg, sam_rg_num);
                  return 1;
      }
  }
  if(1 != argc - optind || 1 == help) {
      tmap_file_fprintf(tmap_file_stderr, "Usage: %s %s [-R -Y -v -h] <in.sff>\n", PACKAGE, argv[0]);
      tmap_seqs_io_sff2sam_free_rg(sam_rg, sam_rg_num);
      return 1;
  }

  // input
  io_in = tmap_seqs_io_init(&argv[optind], 1, TMAP_SEQ_TYPE_SFF, TMAP_FILE_NO_COMPRESSION, 0l, 0l);

  // BAM Header
  header = tmap_seqs_io_to_bam_header(NULL, io_in, sam_rg, sam_rg_num, argc, argv);

  // open the output file
  switch(out_type) {
    case 0: // SAM
      io_out = tmap_sam_io_init2("-", "wh", header);
      break;
    case 1:
      io_out = tmap_sam_io_init2("-", "wb", header);
      break;
    case 2:
      io_out = tmap_sam_io_init2("-", "wbu", header);
      break;
    default:
      tmap_bug();
  }

  // destroy the BAM Header
  bam_header_destroy(header);
  header = NULL;

  seqs = tmap_seqs_init(TMAP_SEQ_TYPE_SFF);
  while(0 < tmap_seqs_io_read(io_in, seqs, io_out->fp->header->header)) {
      bam1_t *b = NULL;
      tmap_seq_t *seq = seqs->seqs[0];
      // the adapter clip points go out as la/ra; a second "rq" would duplicate
      // the tag already used for the right quality clip
      b = tmap_sam_convert_unmapped(seq, sam_flowspace_tags, bidirectional, NULL,
                                    0, 0, 0,
                                    0, 0, 0,
                                    "\tlq:i:%d\trq:i:%d\tla:i:%d\tra:i:%d",
                                    seq->data.sff->rheader->clip_qual_left,
                                    seq->data.sff->rheader->clip_qual_right,
                                    seq->data.sff->rheader->clip_adapter_left,
                                    seq->data.sff->rheader->clip_adapter_right);
      if(samwrite(io_out->fp, b) <= 0) {
          tmap_error("Error writing the SAM file", Exit, WriteFileError);
      }
      bam_destroy1(b);
      // start every read with a fresh container
      tmap_seqs_destroy(seqs);
      seqs = tmap_seqs_init(TMAP_SEQ_TYPE_SFF);
  }
  tmap_seqs_destroy(seqs);

  // free memory
  tmap_seqs_io_destroy(io_in);
  tmap_sam_io_destroy(io_out);
  tmap_seqs_io_sff2sam_free_rg(sam_rg, sam_rg_num);

  return 0;
}
Exemplo n.º 10
0
/**
 * Builds the BAM header for the output.
 *
 * - @HD: for SAM/BAM input the input's header is parsed or cloned; otherwise
 *   a new header with VN "1.4" is created.
 * - @SQ: when 'refseq' is given, any existing SQ records are removed and one
 *   is added for each reference sequence.
 * - @RG: taken from the command line values when given (not allowed for
 *   SAM/BAM input), otherwise a dummy record is created for each non-SAM/BAM
 *   input file; for SAM/BAM input the existing records are checked for ID and
 *   completed with dummy SM/PG values.
 * - @PG: a record for this program is added with a unique ID, the version,
 *   the command line and, when an earlier record of this program exists, PP.
 *
 * @param  refseq      the reference sequence, or NULL to leave @SQ untouched
 * @param  io_in       the input sequence readers
 * @param  rg_sam      read group fields of the form "XX:value"; each "ID"
 *                     field starts a new read group
 * @param  rg_sam_num  the number of entries in rg_sam
 * @param  argc        the number of command line arguments
 * @param  argv        the command line arguments, recorded in @PG.CL
 * @return             the new BAM header; it holds the SAM header built here
 */
bam_header_t *
tmap_seqs_io_to_bam_header(tmap_refseq_t *refseq,
                           tmap_seqs_io_t *io_in,
                           char **rg_sam, int32_t rg_sam_num,
                           int32_t argc, char *argv[])
{
  bam_header_t *bam_header = NULL;
  sam_header_t *header = NULL; // the output header
  sam_header_record_t *record = NULL;
  sam_header_record_t **record_list = NULL;
  // kept NUL-terminated: the same API is also called with string literals
  char tag[3] = {'\0', '\0', '\0'};
  char *command_line= NULL;
  char *id = NULL;
  char *id_pp = NULL;
  int32_t i, j;

  // @HD
  if(io_in->type == TMAP_SEQ_TYPE_SAM || io_in->type == TMAP_SEQ_TYPE_BAM) {
      // should be only one input file
      if(1 != io_in->n) {
          tmap_bug();
      }
      // get the current header
      if(NULL == io_in->seqios[0]) tmap_bug();
      if(NULL == io_in->seqios[0]->io.samio) tmap_bug();
      if(NULL == io_in->seqios[0]->io.samio->fp->header) tmap_bug();
      if(NULL == io_in->seqios[0]->io.samio->fp->header->header) {
          header = sam_header_parse2(io_in->seqios[0]->io.samio->fp->header->text);
      }
      else {
          header = io_in->seqios[0]->io.samio->fp->header->header; // wow, that's a lot of pointers
          if(NULL == header) tmap_bug();
          header = sam_header_clone(header); // clone the header
      }
      if(NULL == header) tmap_bug();
  }
  else {
      // empty header
      header = sam_header_init();
      // @HD - header line
      record = sam_header_record_init("HD"); // new header line
      if(0 == sam_header_record_add(record, "VN", "1.4")) tmap_bug(); // version number
      if(0 == sam_header_add_record(header, record)) tmap_bug(); // add the header line
      // nullify
      record = NULL;
  }

  // Get the TMAP program ID: PACKAGE_NAME, or PACKAGE_NAME.<j> for the
  // smallest j not yet used by a @PG record
  id = tmap_malloc(sizeof(char) * (1 + strlen(PACKAGE_NAME)), "id");
  strcpy(id, PACKAGE_NAME); // default
  for(i=j=0;NULL != (record_list = sam_header_get_record(header, "PG", "ID", id, &i)) && 0 < i;i=0) { // while the id is found
      char *ptr = NULL;
      // swap id and id_pp, so that id_pp holds the last ID that was found
      ptr = id_pp;
      id_pp = id;
      id = ptr;
      // create the new ID
      j++;
      id = tmap_realloc(id, sizeof(char) * (1 + (int)log10(j) + 1 + strlen(PACKAGE_NAME) + 1), "id");
      if(sprintf(id, "%s.%d", PACKAGE_NAME, j) < 0) tmap_bug();
      free(record_list);
      record_list = NULL;
  }
  // the loop also ends on a non-NULL list with zero hits; do not leak it
  free(record_list);
  record_list = NULL;

  // @SQ
  if(NULL != refseq) {
      sam_header_records_t *records = NULL;
      // ZZ: the old SQ records are not checked against the reference; they
      // are removed and replaced, since the old way of checking did not work
      records = sam_header_get_records(header, "SQ");
      if(NULL != records) {
          // ZZ: remove the headers if they exist
          sam_header_remove_records(header, "SQ");
          records = NULL;
      }
      // ZZ: now add all the new records
      for(i=0;i<refseq->num_annos;i++) { // for each reference sequence
          char num[32];
          record = sam_header_record_init("SQ"); // new reference sequence record
          if(0 == sam_header_record_add(record, "SN", refseq->annos[i].name->s)) tmap_bug(); // reference sequence name
          if(sprintf(num, "%u", (uint32_t)refseq->annos[i].len) < 0) tmap_bug(); // integer to string
          if(0 == sam_header_record_add(record, "LN", num)) tmap_bug(); // reference sequence length
          if(0 == sam_header_add_record(header, record)) tmap_bug(); // add the reference sequence record
      }
  }

  // @RG - read group
  if(0 < rg_sam_num) { // @RG specified on the command line
      sam_header_records_t *records = NULL;
      // Check for SAM/BAM
      // TODO: this should be possible...
      if(io_in->type == TMAP_SEQ_TYPE_SAM || io_in->type == TMAP_SEQ_TYPE_BAM) {
          tmap_error("Cannot specify the read groups on the command line when using SAM/BAM as input."
                     "  Please embed in the SAM/BAM header instead.", Exit, OutOfRange);
      }
      record = NULL;
      // go through the command line arguments
      for(i=0;i<rg_sam_num;i++) {
          if(strlen(rg_sam[i]) < 4) tmap_error("Read group too small", Exit, OutOfRange);
          if(':' != rg_sam[i][2]) tmap_error("Read group improperly formatted (no colon)", Exit, OutOfRange);

          // check for id
          if('I' == rg_sam[i][0] && 'D' == rg_sam[i][1]) { // new read group
              if(NULL != record) { // add the record
                  tmap_seqs_io_init2_fs_and_add(io_in, header, record); // add @RG.KS and @RG.FO
              }
              record = sam_header_record_init("RG"); // new read group
          }
          // add the tag/value to the record
          if(NULL == record) {
              tmap_error("The read group ID must be specified first", Exit, OutOfRange);
          }
          tag[0]=rg_sam[i][0]; tag[1]=rg_sam[i][1]; // setup the tag
          if(0 == sam_header_record_add(record, tag, rg_sam[i]+3)) tmap_bug(); // add the tag/value
      }
      if(NULL != record) { // add the record
          tmap_seqs_io_init2_fs_and_add(io_in, header, record); // add @RG.KS and @RG.FO
      }
      // check that the # of read groups added was the same as the # of input files...
      records = sam_header_get_records(header, "RG"); // get the header line
      if(NULL == records || records->n != io_in->n) tmap_error("The number of read groups did not match the number of input files", Exit, OutOfRange);
  }
  else if(io_in->type != TMAP_SEQ_TYPE_SAM && io_in->type != TMAP_SEQ_TYPE_BAM) { // dummy...
      for(i=0;i<io_in->n;i++) { // for each input file
          char buf[32];
          record = sam_header_record_init("RG"); // new read group
          if(1 == io_in->n) strcpy(buf, "NOID");
          else if(sprintf(buf, "NOID.%d", i+1) < 0) tmap_bug();
          if(0 == sam_header_record_add(record, "ID", buf)) tmap_bug(); // dummy ID
          if(0 == sam_header_record_add(record, "SM", "NOSM")) tmap_bug(); // dummy SM, for Picard validation
          if(0 == sam_header_record_add(record, "PG", id)) tmap_bug(); // dummy PG
          tmap_seqs_io_init2_fs_and_add(io_in, header, record); // add @RG.KS and @RG.FO
      }
  }
  else {
      // check that SM/PG are present
      sam_header_records_t *records = sam_header_get_records(header, "RG"); // get the header line
      // an input header without any @RG line has nothing to check
      if(NULL != records) {
          for(i=0;i<records->n;i++) {
              record = records->records[i];
              if(NULL == sam_header_record_get(record, "ID")) tmap_error("Missing @RG.ID in the SAM/BAM Header", Exit, OutOfRange);
              if(NULL == sam_header_record_get(record, "SM")) {
                  if(0 == sam_header_record_add(record, "SM", "NOSM")) tmap_bug(); // dummy SM, for Picard validation
              }
              if(NULL == sam_header_record_get(record, "PG")) {
                  if(0 == sam_header_record_add(record, "PG", id)) tmap_bug(); // dummy PG
              }
          }
      }
  }

  // @PG - program group
  record = sam_header_record_init("PG"); // new program group
  if(0 == sam_header_record_add(record, "ID", id)) tmap_bug(); // @PG.ID
  if(0 == sam_header_record_add(record, "VN", PACKAGE_VERSION)) tmap_bug(); // @PG.VN
  // @PG.CL: the arguments joined by single spaces
  command_line = NULL;
  j = 1; // for the EOL
  command_line = tmap_realloc(command_line, sizeof(char) * j, "command_line");
  command_line[j-1] = '\0';
  for(i=0;i<argc;i++) {
      if(0 < i) j++; // for the separating space
      j += strlen(argv[i]);
      command_line = tmap_realloc(command_line, sizeof(char) * j, "command_line");
      if(0 < i) strcat(command_line, " ");
      strcat(command_line, argv[i]);
      command_line[j-1] = '\0';
  }
  if(0 == sam_header_record_add(record, "CL", command_line)) tmap_bug(); // @PG.CL
  if(NULL != id_pp) { // @PG.PP
      if(0 == sam_header_record_add(record, "PP", id_pp)) tmap_bug(); // @PG.PP
  }
  if(0 == sam_header_add_record(header, record)) tmap_bug(); // add the record
  free(command_line);

  // Check the new SAM Header
  if(0 == sam_header_check(header)) {
      tmap_error("SAM Header was not consistent", Exit, OutOfRange);
  }

  // Create a BAM Header from the SAM Header
  bam_header = bam_header_init(); // empty
  bam_header->header = header; // soft-copy the header
  bam_header = sam_header_to_bam_header(bam_header); // convert

  // free memory
  free(id);
  free(id_pp);

  return bam_header;
}
Exemplo n.º 11
0
/**
 * Pushes a new search state onto the stack.
 *
 * The entry is taken from the stack's entry pool (which is grown when
 * exhausted), filled in from the arguments, and appended to the bin that
 * corresponds to its alignment score.  The bins are grown when the score is
 * beyond the current number of bins.  Duplicate states are not removed.
 *
 * @param  stack          the stack to push onto
 * @param  offset         the offset stored in the entry
 * @param  match_sa_prev  the SA interval, copied into the entry
 * @param  n_mm           the number of mismatches
 * @param  n_gapo         the number of gap opens
 * @param  n_gape         the number of gap extensions
 * @param  state          the state stored in the entry
 * @param  is_diff        1 if this step is a difference, in which case
 *                        'offset' becomes the entry's last difference offset
 * @param  prev_entry     the entry this one extends, or NULL for none
 * @param  opt            the options used to compute the score
 */
static inline void
tmap_map1_aux_stack_push(tmap_map1_aux_stack_t *stack, 
                         int32_t offset,
                         tmap_bwt_match_occ_t *match_sa_prev,
                         int32_t n_mm, int32_t n_gapo, int32_t n_gape,
                         int32_t state, int32_t is_diff, 
                         tmap_map1_aux_stack_entry_t *prev_entry,
                         const tmap_map_opt_t *opt)
{
  int32_t i;
  int32_t n_bins_needed = 0;
  tmap_map1_aux_stack_entry_t *entry = NULL;
  tmap_map1_aux_bin_t *bin = NULL;

  // check to see if we need more memory
  if(stack->entry_pool_length <= stack->entry_pool_i) {
      int32_t pool_i = stack->entry_pool_length; // first slot without an entry
      stack->entry_pool_length <<= 2;
      stack->entry_pool = tmap_realloc(stack->entry_pool,
                                       sizeof(tmap_map1_aux_stack_entry_t*)*stack->entry_pool_length,
                                       "stack->entry_pool");
      for(;pool_i<stack->entry_pool_length;pool_i++) {
          stack->entry_pool[pool_i] = tmap_malloc(sizeof(tmap_map1_aux_stack_entry_t), "stack->entry_pool[i]");
      }
  }

  entry = stack->entry_pool[stack->entry_pool_i];
  entry->score = aln_score(n_mm, n_gapo, n_gape, opt);
  entry->n_mm = n_mm;
  entry->n_gapo = n_gapo;
  entry->n_gape = n_gape;
  entry->state = state;
  entry->match_sa = (*match_sa_prev);
  entry->i = stack->entry_pool_i;
  entry->offset = offset;
  if(NULL == prev_entry) {
      entry->last_diff_offset = offset;
      entry->prev_i = -1;
  }
  else {
      entry->last_diff_offset = (1 == is_diff) ? (offset) : prev_entry->last_diff_offset;
      entry->prev_i = prev_entry->i;
  }

  if(stack->n_bins <= entry->score) {
      // resize the bins if necessary
      n_bins_needed = entry->score + 1;
      // realloc
      tmap_roundup32(n_bins_needed);
      stack->bins = tmap_realloc(stack->bins, sizeof(tmap_map1_aux_bin_t) * n_bins_needed, "stack->bins");
      // initialize
      for(i=stack->n_bins;i<n_bins_needed;i++) {
          stack->bins[i].n_entries = stack->bins[i].m_entries = 0;
          stack->bins[i].entries = NULL;
      }
      stack->n_bins = n_bins_needed;
  }
  if(stack->n_bins <= entry->score) {
      tmap_bug();
  }
  bin = &stack->bins[entry->score];

  // NB: duplicates (most likely formed by tandem repeats or indels) are not
  // removed: too computationally expensive, and not necessary

  // update best score
  if(stack->best_score > entry->score) stack->best_score = entry->score;

  if(bin->m_entries <= bin->n_entries) {
      bin->m_entries++;
      tmap_roundup32(bin->m_entries);
      // the bin stores pointers to entries, so size by the element type
      bin->entries = tmap_realloc(bin->entries, sizeof(*bin->entries) * bin->m_entries, "bin->entries");
  }
  bin->entries[bin->n_entries] = entry;
  bin->n_entries++;

  stack->entry_pool_i++;
  stack->n_entries++;
}
Exemplo n.º 12
0
/**
 * Attaches header-derived information to a read: its read group record, the
 * program group record, the key sequence, the flow order, the flow order
 * start index and the flowgram.
 *
 * For SAM/BAM reads that carry a read group ID, the record with that ID is
 * looked up and must exist exactly once.  Otherwise the idx-th @RG record of
 * the header is used, if the header has any.  The program group is the last
 * @PG record of the header, or NULL.
 *
 * @param  seq     the read to update
 * @param  idx     the index of the @RG record to use when the read itself
 *                 does not name a read group
 * @param  header  the SAM header
 */
void
tmap_seq_update(tmap_seq_t *seq, int32_t idx, sam_header_t *header)
{
  char *rg_id = NULL;
  sam_header_records_t *rg_records = NULL;
  sam_header_records_t *pg_records = NULL;
  sam_header_record_t **matches = NULL;
  int32_t n_matches = 0;

  // Read Group: only SAM/BAM reads can name one themselves
  switch(seq->type) {
    case TMAP_SEQ_TYPE_FQ:
    case TMAP_SEQ_TYPE_SFF:
      break;
    case TMAP_SEQ_TYPE_SAM:
    case TMAP_SEQ_TYPE_BAM:
      rg_id = tmap_sam_get_rg_id(seq->data.sam);
      break;
    default:
      tmap_error("type is unrecognized", Exit, OutOfRange);
      break;
  }

  if(NULL != rg_id) { // found in SAM/BAM
      matches = sam_header_get_record(header, "RG", "ID", rg_id, &n_matches);
      if(0 == n_matches) {
          fprintf(stderr, "Read Group Identifier: [%s]\n", rg_id);
          tmap_error("Did not find the @RG.ID in the SAM/BAM Header", Exit, OutOfRange);
      }
      else if(1 < n_matches) {
          fprintf(stderr, "Read Group Identifier: [%s]\n", rg_id);
          tmap_error("Found more than one @RG.ID in the SAM/BAM Header", Exit, OutOfRange);
      }
      seq->rg_record = matches[0];
      free(matches); // NB: only the list is freed, the record stays in the header
  }
  else { // did not find in SAM/BAM
      // NB: assume that it is from the idx-th record in the header
      rg_records = sam_header_get_records(header, "RG");
      if(NULL != rg_records) { // it exists
          if(idx < 0 || rg_records->n <= idx) {
              tmap_error("RG records index was out of bounds", Exit, OutOfRange);
          }
          seq->rg_record = rg_records->records[idx]; // copy over
          if(NULL == seq->rg_record) tmap_bug();
      }
  }

  // Program Group
  // NB: assumes the last item in the header
  pg_records = sam_header_get_records(header, "PG");
  seq->pg_record = (NULL != pg_records && 0 < pg_records->n)
    ? pg_records->records[pg_records->n-1]
    : NULL;

  // key sequence and flow order
  seq->fo_start_idx = -1;
  if(NULL == seq->rg_record) return; // nothing more without a read group

  seq->ks = sam_header_record_get(seq->rg_record, "KS");
  seq->fo = sam_header_record_get(seq->rg_record, "FO");

  // flow order index start
  if(NULL != seq->ks && NULL != seq->fo && TMAP_SEQ_TYPE_SFF == seq->type) { // only if it is an SFF
      // in addition, remove key sequence and trimming
      seq->fo_start_idx = tmap_seq_remove_key_sequence(seq, 1);
  }
  else if(TMAP_SEQ_TYPE_SAM == seq->type || TMAP_SEQ_TYPE_BAM == seq->type) { // Try the ZF tag...
      seq->fo_start_idx = tmap_sam_get_fo_start_idx(seq->data.sam);
  }

  // flowgram information...
  // NB: it is not checked whether all of the flowspace information is present
  seq->flowgram_len = tmap_seq_get_flowgram(seq, &seq->flowgram);
}
Exemplo n.º 13
0
/**
 * Adds the seeds that start at 'offset' in the query, allowing homopolymer
 * (hp) length differences of up to 'hp_diff' bases in one flow.
 *
 * The query is walked flow by flow from 'offset'.  In each flow, seeds with
 * bases deleted from the homopolymer and seeds with extra copies of the flow
 * base inserted are tried, each followed by an exact match of the remaining
 * bases.  Finally the seed without hp indels is added.  A seed is only added
 * when its SA interval holds at most opt->max_seed_hits entries.  The
 * function returns early as soon as the query bases no longer match.
 *
 * @param  query         the query sequence
 * @param  query_length  the query length
 * @param  offset        the query position at which the seed starts
 * @param  flow_order    the flow order; indexed cyclically with period four
 * @param  flow_i        the flow index matching query[offset]
 * @param  hp_diff       the maximum number of bases to delete or insert
 * @param  refseq        not used in this function
 * @param  bwt           the BWT
 * @param  sa            not used in this function
 * @param  hash          the occurrence hash passed on to the BWT matching routines
 * @param  opt           the options (max_seed_hits is read)
 * @param  seeds         the seed list to append to
 * @param  n_seeds       the number of seeds in the list
 * @param  m_seeds       presumably the allocated size of the seed list - TODO confirm
 * @param  seed_length   the seed length
 */
static inline void
tmap_map3_aux_core_seed_helper(uint8_t *query,
                               int32_t query_length,
                               int32_t offset,
                               uint8_t *flow_order,
                               uint8_t flow_i,
                               int32_t hp_diff,
                               tmap_refseq_t *refseq,
                               tmap_bwt_t *bwt,
                               tmap_sa_t *sa,
                               tmap_bwt_match_hash_t *hash,
                               tmap_map_opt_t *opt,
                               tmap_map3_aux_seed_t **seeds,
                               int32_t *n_seeds,
                               int32_t *m_seeds,
                               int32_t seed_length)
{ 
  int32_t i, k;
  int32_t n_bases;;
  tmap_bwt_match_occ_t prev_sa, cur_sa, next_sa, tmp_sa;

  // nothing to do past the end of the query
  if(query_length <= offset) return;
  // the caller must pass the flow that matches the first seed base
  if(flow_order[flow_i] != query[offset]) {
      tmap_bug();
  }

  // initialize prev
  // presumably the interval that spans the whole BWT - TODO confirm
  prev_sa.k = 0;
  prev_sa.l = bwt->seq_len;
  prev_sa.hi = 0;
  prev_sa.offset = 0;

  // i is the query position at the start of the current flow
  i = offset;
  while(i < query_length) {
      // reached the seed length
      if(seed_length < i - offset) { 
          break;
      }

      // get the homopolymer length
      // n_bases stays zero for an empty flow (flow base differs from query[i])
      n_bases = 0;
      if(flow_order[flow_i] == query[i]) { // non-empty flow
          n_bases = 1;
          while(i + n_bases < query_length && query[i] == query[i+n_bases]) {
              n_bases++;
          }
      }


      // move through the homopolymer, trying deletions if possible
      // next_sa is the interval after matching the first k bases of this hp
      next_sa = prev_sa;
      for(k=0;k<n_bases;k++) {
          // reached the seed length
          if(seed_length < i - offset + k) { 
              break;
          }

          // only delete if there are bases available and we are not deleting
          // the entire first flow 
          int32_t bases_to_align = seed_length - (i - offset + k);
          int32_t bases_left = query_length - i - n_bases;
          if(0 < n_bases - k // bases to delete
             && n_bases - k <= hp_diff // not too many to delete
             && (i != offset || 0 != k) // do not delete the entire flow
             && bases_to_align <= bases_left) { // enough bases 
              // match exactly from here onwards
              // the remaining n_bases - k bases of the hp are skipped
              tmp_sa = next_sa;
              if(0 < tmap_bwt_match_hash_exact_alt(bwt, bases_to_align, query + i + n_bases, &tmp_sa, hash)
                 && (tmp_sa.l - tmp_sa.k + 1) <= opt->max_seed_hits) {
                  tmap_map3_aux_seed_add(seeds, n_seeds, m_seeds, tmp_sa.k, tmp_sa.l, offset, seed_length + n_bases - k);
              }
          }

          // move past this base in the hp
          tmp_sa = next_sa;
          tmap_bwt_match_hash_2occ(bwt, &tmp_sa, flow_order[flow_i], &next_sa, hash);
          if(next_sa.l < next_sa.k) { // no match, return
              return;
          }
      }

      // insert hp bases
      if(i + n_bases < offset + seed_length) { // not the last flow
          cur_sa = next_sa; // already considered the 'n_bases' of this flow
          // insert
          for(k=1;k<=hp_diff;k++) { // # of bases to insert
              // extend by one more copy of the flow base
              tmap_bwt_match_hash_2occ(bwt, &cur_sa, flow_order[flow_i], &tmp_sa, hash);
              if(tmp_sa.l < tmp_sa.k) { // no match, do not continue
                  break;
              }
              // match exactly from here onwards
              if(0 < tmap_bwt_match_hash_exact_alt(bwt, seed_length - (i - offset) - n_bases, query + i + n_bases, &tmp_sa, hash)
                 && (tmp_sa.l - tmp_sa.k + 1) <= opt->max_seed_hits) {
                  tmap_map3_aux_seed_add(seeds, n_seeds, m_seeds, tmp_sa.k, tmp_sa.l, offset, seed_length + k);
              }
              // move to the next
              // NOTE(review): tmp_sa was also passed to the exact match above; whether
              // it still holds the k-base insertion interval here depends on that call
              cur_sa = tmp_sa;
          }
      }

      // next flow
      flow_i = (1+flow_i) & 3;
      i += n_bases;
      prev_sa = next_sa;
  }

  // the query ran out before a full seed was consumed
  if(i - offset < seed_length) tmap_bug();

  // add in the seed with no hp indels
  if((next_sa.l - next_sa.k + 1) <= opt->max_seed_hits) {
      tmap_map3_aux_seed_add(seeds, n_seeds, m_seeds, next_sa.k, next_sa.l, offset, seed_length);
  }
}
Exemplo n.º 14
0
/*
 * Collects seeds for one query by scanning its start positions and adding
 * every acceptable hit through tmap_map3_aux_seed_add().
 *
 * Two strategies, selected by opt->hp_diff (NOT by the hp_diff parameter,
 * which is only forwarded to the helper):
 *   - 0 < opt->hp_diff: walk the query one homopolymer run at a time, keep
 *     flow_i in step with flow_order, and let
 *     tmap_map3_aux_core_seed_helper() add the seeds for each run start.
 *   - otherwise: exact-match seeding of seed_length bases per start position,
 *     scanning left-to-right when fwd_search is 1 and right-to-left for any
 *     other value, followed by a repetitiveness filter on the whole list.
 *
 * query         bases of the read; compared directly against flow_order
 *               entries (assumes both use the same integer encoding --
 *               TODO confirm)
 * query_length  number of entries of query that are scanned
 * flow_order    indexed with flow_i, which is always masked with & 3, so
 *               entries 0..3 are read
 * hp_diff       forwarded unchanged to the helper
 * refseq, sa    only forwarded to the helper; unused in the exact-match path
 * bwt, hash     passed to the tmap_bwt_match_hash_* routines
 * opt           fields read here: hp_diff, max_seed_hits, skip_seed_frac,
 *               hit_frac
 * seeds, n_seeds, m_seeds
 *               seed list handed to tmap_map3_aux_seed_add(); *n_seeds may be
 *               reset to 0 by the final filter
 * seed_length   number of bases matched exactly for each seed
 * seed_step     when positive, over-represented seeds are retried with
 *               additional exact matches of seed_step bases; used only in the
 *               exact-match path
 * fwd_search    1 selects the left-to-right scan; used only in the
 *               exact-match path
 */
static inline void
tmap_map3_aux_core_seed(uint8_t *query,
                        int32_t query_length,
                        uint8_t *flow_order,
                        int32_t hp_diff,
                        tmap_refseq_t *refseq,
                        tmap_bwt_t *bwt,
                        tmap_sa_t *sa,
                        tmap_bwt_match_hash_t *hash,
                        tmap_map_opt_t *opt,
                        tmap_map3_aux_seed_t **seeds,
                        int32_t *n_seeds,
                        int32_t *m_seeds,
                        int32_t seed_length,
                        int32_t seed_step,
                        int32_t fwd_search)
{
  int32_t i, j, flow_i;

  if(0 < opt->hp_diff) {
      // homopolymer-aware seeding: one helper call per homopolymer run start
      i=flow_i=0;
      while(i<query_length - seed_length + 1) {
          // move to the next flow
          // (advance flow_i cyclically until the flow base equals query[i])
          j=0;
          while(query[i] != flow_order[flow_i]) {
              flow_i = (flow_i + 1) & 3;
              // sanity check
              // (after four steps every flow has been tried, so query[i] is
              // not one of the four flow bases)
              j++;
              if(4 <= j) tmap_bug();
          }

          // add seeds
          tmap_map3_aux_core_seed_helper(query, query_length, i, flow_order, flow_i, hp_diff,
                                         refseq, bwt, sa, hash, opt, seeds, n_seeds, m_seeds,
                                         seed_length);

          // skip over this hp
          // (i ends on the first base that differs from its predecessor, or
          // on query_length)
          i++;
          while(i<query_length) {
              if(query[i] != query[i-1]) {
                  break;
              }
              i++;
          }
      }
  }
  else {
      // exact-match seeding
      // j     = number of seeds added by this call
      // count = number of start positions whose seed_length-base exact match
      //         succeeded (regardless of how many hits it had)
      int k, count;
      tmap_bwt_match_occ_t cur_sa, prev_sa;
      j = count = 0;
      if(1 == fwd_search) {
          for(i=0;i<query_length-seed_length+1;i++) {
              if(0 < tmap_bwt_match_hash_exact(bwt, seed_length, query + i, &cur_sa, hash)) {
                  count++;
                  if((cur_sa.l - cur_sa.k + 1) <= opt->max_seed_hits) {
                      // extend further
                      // (keep narrowing the interval one base at a time until
                      // it becomes empty or k reaches query_length - seed_length)
                      // NOTE(review): the extension reads query[k] with k
                      // starting at i + 1, not at i + seed_length -- confirm
                      // this indexing is intended.
                      prev_sa = cur_sa;
                      k = i + 1;
                      while(k < query_length - seed_length) {
                          tmap_bwt_match_hash_2occ(bwt, &prev_sa, query[k], &cur_sa, hash);
                          if(cur_sa.l < cur_sa.k) {
                              // use prev
                              cur_sa = prev_sa;
                              break;
                          }
                          else {
                              // keep going
                              prev_sa = cur_sa;
                              k++;
                          }
                      }
                      k--; // k is always one greater
                      // NOTE(review): the seed start passed here is k, whereas
                      // every other call in this function passes i -- confirm.
                      tmap_map3_aux_seed_add(seeds, n_seeds, m_seeds, cur_sa.k, cur_sa.l, k, seed_length + k - i);
                      j++;
                      // skip over 
                      // (jump ahead by a fraction of the seed just added;
                      // presumably skip_seed_frac is fractional, in which case
                      // the product is truncated when stored back into i --
                      // TODO confirm the field type)
                      if(0 < opt->skip_seed_frac) {
                          i += opt->skip_seed_frac * (seed_length + k - i - 1); // - 1 since i will be incremented
                      }
                  }
                  else {
                      // seed stepping
                      // (too many hits for the plain seed: try exact matches of
                      // seed_step bases starting at i + seed_length, advancing
                      // by seed_step, and stop at the first one whose interval
                      // is small enough)
                      // NOTE(review): this branch calls
                      // tmap_bwt_match_hash_exact while the backward scan below
                      // calls tmap_bwt_match_hash_exact_alt -- confirm the
                      // asymmetry is intended.
                      if(0 < seed_step) {
                          k = i + seed_length;
                          int32_t n = 0; // number of steps taken; written but never read
                          while(k + seed_step < query_length && 0 < tmap_bwt_match_hash_exact(bwt, seed_step, query + k, &cur_sa, hash)) {
                              if((cur_sa.l - cur_sa.k + 1) <= opt->max_seed_hits) {
                                  tmap_map3_aux_seed_add(seeds, n_seeds, m_seeds, cur_sa.k, cur_sa.l, i, seed_length + k - i);
                                  j++;
                                  // skip over 
                                  if(0 < opt->skip_seed_frac) {
                                      i += opt->skip_seed_frac * (seed_length + k - i - 1); // - 1 since i will be incremented
                                  }
                                  break;
                              }
                              k += seed_step;
                              n++;
                          }
                      }
                  }
              }
          }
      }
      else {
          // backward scan: start positions from the last possible one down to 0;
          // accepted seeds are added as-is (no base-by-base extension here)
          for(i=query_length-seed_length;0<=i;i--) {
              if(0 < tmap_bwt_match_hash_exact(bwt, seed_length, query + i, &cur_sa, hash)) {
                  count++;
                  if((cur_sa.l - cur_sa.k + 1) <= opt->max_seed_hits) {
                      tmap_map3_aux_seed_add(seeds, n_seeds, m_seeds, cur_sa.k, cur_sa.l, i, seed_length);
                      j++;
                      if(0 < opt->skip_seed_frac) {
                          i -= opt->skip_seed_frac * (seed_length - 1); // -1 since i will be decremented by the loop
                      }
                  }
                  else {
                      // seed stepping
                      // (this k is a new block-scoped variable that shadows
                      // the outer 'int k')
                      if(0 < seed_step) {
                          int32_t k = i + seed_length;
                          int32_t n = 0; // number of steps taken; written but never read
                          while(k + seed_step < query_length && 0 < tmap_bwt_match_hash_exact_alt(bwt, seed_step, query + k, &cur_sa, hash)) {
                              if((cur_sa.l - cur_sa.k + 1) <= opt->max_seed_hits) {
                                  tmap_map3_aux_seed_add(seeds, n_seeds, m_seeds, cur_sa.k, cur_sa.l, i, seed_length + k - i);
                                  j++;
                                  if(0 < opt->skip_seed_frac) {
                                      i -= opt->skip_seed_frac * (seed_length + k - i - 1); // -1 since i will be decremented by the loop
                                  }
                                  // break when we find the first hit
                                  break;
                              }
                              k += seed_step;
                              n++;
                          }
                      }
                  }
              }
              else {
                  // skip over if we came up short
                  // (presumably cur_sa.offset is the number of bases the
                  // failed match consumed, as filled in by
                  // tmap_bwt_match_hash_exact -- TODO confirm)
                  i -= (seed_length - cur_sa.offset); 
              }
          }
      }
      // remove seeds if there were too many repetitive hits
      // NB: does count seed steps
      // NB: this zeroes the whole list, i.e. it also drops any seeds that were
      // already present before this call (see the commented-out alternative).
      // NB: when count is 0, j is 0 as well and the quotient is 0/0.0; under
      // IEEE 754 that is NaN and the comparison is false, so nothing is dropped.
      if(j / (double)count < opt->hit_frac) {
          (*n_seeds) = 0;
          //(*n_seeds) -= j;
      }
  }
}
Exemplo n.º 15
0
/*
 * Builds the MD string of an alignment and computes its edit distance.
 *
 * The cigar is walked twice: first to find the span of reference bases it
 * consumes (match, deletion and reference-skip operators), then, after
 * fetching that span with tmap_refseq_subseq2(), to compare read and
 * reference base by base.
 *
 * MD content produced:
 *   - a run of matching bases is emitted as its length;
 *   - a mismatch is emitted as the pending run length followed by the
 *     reference base;
 *   - a deletion is emitted as the pending run length, '^', and the deleted
 *     reference bases;
 *   - the pending run length is always appended at the very end.
 *
 * refseq         reference the target bases are fetched from
 * read_bases     read bases as characters
 * seqid, pos     0-based contig index and 0-based start position
 * cigar, n_cigar cigar operators (length in the upper bits, operator in the
 *                low 4 bits); n_cigar must be positive
 * nm             output: number of mismatches plus inserted and deleted bases
 * read_bases_eq  optional output (may be NULL): copy of the read in which
 *                bases that match the reference are replaced by '='; it is
 *                NUL-terminated after the last read base processed, so it
 *                must have room for that many characters plus one
 *
 * Returns the MD string created with tmap_string_init(); it is not freed
 * here. The fetched reference span is freed before returning.
 *
 * If the alignment runs past refseq->len, processing stops at that point and
 * the string built so far (plus the pending run length) is returned.
 */
static inline tmap_string_t *
tmap_sam_md(tmap_refseq_t *refseq, char *read_bases, // read bases are characters
            uint32_t seqid, uint32_t pos, // seqid and pos are 0-based
            uint32_t *cigar, int32_t n_cigar, int32_t *nm, char *read_bases_eq)
{
  int32_t i, j;
  uint32_t ref_i, read_i, ref_start, ref_end;
  int32_t l = 0; // the length of the last md op
  uint8_t read_base, ref_base;
  tmap_string_t *md=NULL;
  uint8_t *target = NULL;

  md = tmap_string_init(32);
  (*nm) = 0;

  // Reject an empty (or negative) cigar before touching the reference:
  // without any operator the span computed below would be inverted
  // (ref_end < ref_start).
  if(n_cigar <= 0) {
      tmap_bug();
  }

  // first pass: one-based, inclusive span of reference bases consumed
  ref_start = ref_end = pos + 1; // make one-based
  for(i=0;i<n_cigar;i++) { // go through each cigar operator
      int32_t op_len;
      op_len = cigar[i] >> 4;
      switch(cigar[i]&0xf) {
        case BAM_CMATCH:
        case BAM_CDEL:
        case BAM_CREF_SKIP:
          ref_end += op_len; break;
        default:
          break;
      }
  }
  ref_end--;

  target = tmap_refseq_subseq2(refseq, seqid+1, ref_start, ref_end, NULL, 0, NULL);
  if(NULL == target) {
      tmap_bug();
  }

  // second pass: read_i indexes read_bases, ref_i indexes target
  read_i = ref_i = 0;
  for(i=0;i<n_cigar;i++) { // go through each cigar operator
      int32_t op_len, op;

      op_len = cigar[i] >> 4;
      op = cigar[i] & 0xf;

      if(BAM_CMATCH == op) {
          for(j=0;j<op_len;j++) {
              if(refseq->len <= refseq->annos[seqid].offset + pos + ref_i) break; // out of boundary

              // index through unsigned char: plain char may be signed, and a
              // negative value would read before the start of the table
              read_base = tmap_nt_char_to_int[(unsigned char)read_bases[read_i]];
              ref_base = target[ref_i];

              if(read_base == ref_base) { // a match
                  if(NULL != read_bases_eq) read_bases_eq[read_i] = '=';
                  l++;
              }
              else {
                  if(NULL != read_bases_eq) read_bases_eq[read_i] = read_bases[read_i];
                  tmap_string_lsprintf(md, md->l, "%d%c", l, tmap_iupac_int_to_char[ref_base]);
                  l = 0;
                  (*nm)++;
              }
              read_i++;
              ref_i++;
          }
          if(j < op_len) break; // ran off the reference: stop processing the cigar
      }
      else if(BAM_CINS == op) {
          // inserted bases have no reference counterpart: copy them verbatim
          if(NULL != read_bases_eq) {
              for(j=0;j<op_len;j++) {
                  read_bases_eq[read_i+j] = read_bases[read_i+j];
              }
          }
          read_i += op_len;
          (*nm) += op_len;
      }
      else if(BAM_CDEL == op) {
          tmap_string_lsprintf(md, md->l, "%d^", l);
          for(j=0;j<op_len;j++) {
              if(refseq->len <= refseq->annos[seqid].offset + pos + ref_i) break; // out of boundary
              ref_base = target[ref_i];
              tmap_string_lsprintf(md, md->l, "%c", tmap_iupac_int_to_char[ref_base]);
              ref_i++;
          }
          if(j < op_len) break; // ran off the reference: stop processing the cigar
          (*nm) += op_len;
          l=0;
      }
      else if(BAM_CREF_SKIP == op) {
          // skipped reference bases are not reported in the MD string
          ref_i += op_len;
      }
      else if(BAM_CSOFT_CLIP == op) {
          // clipped bases stay in the read: copy them verbatim
          if(NULL != read_bases_eq) {
              for(j=0;j<op_len;j++) {
                  read_bases_eq[read_i+j] = read_bases[read_i+j];
              }
          }
          read_i += op_len;
      }
      else if(BAM_CHARD_CLIP == op) {
          // ignore
      }
      else if(BAM_CPAD == op) {
          // ignore
      }
      else {
          tmap_error("could not understand the cigar operator", Exit, OutOfRange);
      }
  }
  // flush the pending run of matches (emitted even when it is zero)
  tmap_string_lsprintf(md, md->l, "%d", l);
  if(NULL != read_bases_eq) read_bases_eq[read_i] = '\0';

  free(target);

  return md;
}