Beispiel #1
0
/*
 * Writes the optional "RG:Z:" field of a SAM record to fp.
 *
 * The id comes from the global tmap_sam_rg_id when tmap_sam_rg_id_use is 1,
 * and from the record itself (tmap_seq_get_rg_id) when it is 0; a record
 * without an id is reported through tmap_error in that case.  Any other value
 * of tmap_sam_rg_id_use prints nothing.
 */
static inline void
tmap_sam_print_rg(tmap_file_t *fp, tmap_seq_t *seq)
{
  const char *rg_id = NULL;

  switch(tmap_sam_rg_id_use) {
    case 1: // one id shared by every record
      rg_id = tmap_sam_rg_id;
      break;
    case 0: // per-record id taken from the input
      rg_id = tmap_seq_get_rg_id(seq);
      if(NULL == rg_id) {
          tmap_error("Missing Record RG.ID in the input file", Exit, OutOfRange);
      }
      break;
    default: // no read group output
      return;
  }

  tmap_file_fprintf(fp, "\tRG:Z:%s", rg_id);
}
Beispiel #2
0
/*
 * Command-line entry point that prints the reference header for <in.fasta>.
 *
 * Only the annotation file (its name is derived from the FASTA name) is read;
 * the packed sequence is never loaded.  Options: -v sets progress verbosity
 * to 1, -h prints the usage line.
 *
 * Returns 0 on success and 1 on a usage error or an unrecognised option.
 */
int
tmap_refseq_refinfo_main(int argc, char *argv[])
{
  int opt_char;
  int show_help = 0;
  char *fasta_name = NULL;
  char *anno_name = NULL;
  tmap_file_t *anno_file = NULL;
  tmap_refseq_t *ref = NULL;

  while(0 <= (opt_char = getopt(argc, argv, "vh"))) {
      if('v' == opt_char) {
          tmap_progress_set_verbosity(1);
      }
      else if('h' == opt_char) {
          show_help = 1;
      }
      else {
          return 1;
      }
  }

  // exactly one positional argument is expected
  if(1 == show_help || 1 != argc - optind) {
      tmap_file_fprintf(tmap_file_stderr, "Usage: %s %s [-vh] <in.fasta>\n", PACKAGE, argv[0]);
      return 1;
  }
  fasta_name = argv[optind];

  // Note: 'tmap_file_stdout' should not have been previously modified
  tmap_file_stdout = tmap_file_fdopen(fileno(stdout), "wb", TMAP_FILE_NO_COMPRESSION);

  // a bare reference structure, filled in from the annotation file only
  ref = tmap_calloc(1, sizeof(tmap_refseq_t), "refseq");
  ref->is_rev = 0;
  ref->is_shm = 0;

  anno_name = tmap_get_file_name(fasta_name, TMAP_ANNO_FILE);
  anno_file = tmap_file_fopen(anno_name, "rb", TMAP_ANNO_COMPRESSION);
  tmap_refseq_read_anno(anno_file, ref);
  tmap_file_fclose(anno_file);
  free(anno_name);

  // the packed sequence is not needed to print the header
  ref->seq = NULL;

  tmap_refseq_print_header(tmap_file_stdout, ref);

  // release the reference, then the output stream
  tmap_refseq_destroy(ref);
  tmap_file_fclose(tmap_file_stdout);

  return 0;
}
Beispiel #3
0
/*
 * Writes the flow-space fields of a SAM record to fp: the flowgram (through
 * tmap_sam_print_flowgram) when the read has one, followed by "ZF:i:" when
 * the flow start index is non-negative.
 */
static inline void
tmap_sam_print_fz_and_zf(tmap_file_t *fp, tmap_seq_t *seq)
{
  uint16_t *fz = NULL;
  int32_t fz_len = tmap_seq_get_flowgram(seq, &fz, 0);
  int32_t zf;

  if(NULL != fz) {
      tmap_sam_print_flowgram(fp, fz, fz_len);
      free(fz); // the buffer handed back by tmap_seq_get_flowgram is released here
  }

  zf = tmap_seq_get_flow_start_index(seq);
  if(zf < 0) { // a negative index means there is nothing to report
      return;
  }
  tmap_file_fprintf(fp, "\tZF:i:%d", zf);
}
Beispiel #4
0
/*
 * Prints a human-readable summary of a reference to fp, one "key:\tvalue"
 * line per item: version id, index format, package version, one line per
 * contig (1-based number, name, length), the total length, and whether
 * tmap_refseq_supported() accepts the reference.
 *
 * @param fp      output stream
 * @param refseq  reference whose annotation data has been loaded
 */
static inline void 
tmap_refseq_print_header(tmap_file_t *fp, tmap_refseq_t *refseq)
{
  uint32_t i;
  tmap_file_fprintf(fp, "version id:\t%llu\n", (unsigned long long int)refseq->version_id);
  tmap_file_fprintf(fp, "format:\t%s\n", tmap_refseq_get_version_format(refseq->package_version->s));
  tmap_file_fprintf(fp, "package version:\t%s\n", refseq->package_version->s);
  for(i=0;i<refseq->num_annos;i++) {
      // the contig number is unsigned, so it is printed with %u rather than %d
      tmap_file_fprintf(fp, "contig-%u:\t%s\t%u\n", i+1, refseq->annos[i].name->s, refseq->annos[i].len);
  }
  tmap_file_fprintf(fp, "length:\t%llu\n", (unsigned long long int)refseq->len);
  tmap_file_fprintf(fp, "supported:\t%s\n", (0 == tmap_refseq_supported(refseq)) ? "false" : "true");
}
Beispiel #5
0
/*
 * Command-line entry point that packs <in.fasta> by calling
 * tmap_refseq_fasta2pac() with no input compression.
 *
 * Options: -v sets progress verbosity to 1, -h prints the usage line.
 * Returns 0 on success and 1 on a usage error or an unrecognised option.
 */
int
tmap_refseq_fasta2pac_main(int argc, char *argv[])
{
  int opt_char;
  int show_help = 0;

  while(0 <= (opt_char = getopt(argc, argv, "vh"))) {
      if('v' == opt_char) {
          tmap_progress_set_verbosity(1);
      }
      else if('h' == opt_char) {
          show_help = 1;
      }
      else {
          return 1;
      }
  }

  // exactly one positional argument is expected
  if(1 == show_help || 1 != argc - optind) {
      tmap_file_fprintf(tmap_file_stderr, "Usage: %s %s [-vh] <in.fasta>\n", PACKAGE, argv[0]);
      return 1;
  }

  tmap_refseq_fasta2pac(argv[optind], TMAP_FILE_NO_COMPRESSION);

  return 0;
}
Beispiel #6
0
/*
 * Entry point of the "index" command.
 *
 * Parses the command line, validates the option values and hands them to
 * tmap_index_core().  Recognised options:
 *   -f FILE  FASTA file to index (required)
 *   -o INT   occurrence interval
 *   -i INT   suffix array interval
 *   -w INT   hash width
 *   -a STR   "is" or "bwtsw"; any other value is an error
 *   -H       clears opt.check_hash
 *   -v       sets progress verbosity to 1
 *   -h       prints the usage text
 * A lone "--version" argument prints the index format and returns 0.
 *
 * Returns 0 on success and 1 when the usage text was printed.  Invalid option
 * values are reported through tmap_error.
 */
int 
tmap_index(int argc, char *argv[])
{
  int c;
  tmap_index_opt_t opt;

  opt.fn_fasta = NULL;
  opt.occ_interval = TMAP_BWT_OCC_INTERVAL; 
  opt.hash_width = INT32_MAX;
  opt.sa_interval = TMAP_SA_INTERVAL; 
  opt.is_large = -1;
  opt.check_hash = 1;
      
  if(2 == argc && 0 == strcmp("--version", argv[1])) {
      tmap_file_stdout = tmap_file_fdopen(fileno(stdout), "wb", TMAP_FILE_NO_COMPRESSION);
      tmap_file_fprintf(tmap_file_stdout, "%s\n", tmap_refseq_get_version_format(PACKAGE_VERSION));
      tmap_file_fclose(tmap_file_stdout);
      return 0;
  }

  while((c = getopt(argc, argv, "f:o:i:w:a:hvH")) >= 0) {
      switch(c) {
        case 'f':
          free(opt.fn_fasta); // a repeated -f must not leak the earlier copy
          opt.fn_fasta = tmap_strdup(optarg); break;
        case 'o':
          opt.occ_interval = atoi(optarg); break;
        case 'i':
          opt.sa_interval = atoi(optarg); break;
        case 'w':
          opt.hash_width = atoi(optarg); break;
        case 'a':
          if(0 == strcmp("is", optarg)) opt.is_large = 0;
          else if(0 == strcmp("bwtsw", optarg)) opt.is_large = 1;
          else tmap_error("Option -a value not correct", Exit, CommandLineArgument); 
          break; 
        case 'v':
          tmap_progress_set_verbosity(1); break;
        case 'h':
          goto show_usage;
        case 'H':
          opt.check_hash = 0; break;
        default:
          goto show_usage;
      }
  }

  // no positional arguments are accepted, and at least one option is required
  if(argc != optind || 1 == argc) {
      goto show_usage;
  }
  if(NULL == opt.fn_fasta) {
      tmap_error("required option -f", Exit, CommandLineArgument);
  }
  // the occurrence interval must be an even, positive multiple of TMAP_BWT_OCC_MOD
  if(opt.occ_interval < TMAP_BWT_OCC_MOD || 0 != (opt.occ_interval % 2) || 0 != (opt.occ_interval % TMAP_BWT_OCC_MOD)) {
      tmap_error("option -o out of range", Exit, CommandLineArgument);
  }
  if(opt.hash_width < 0) {
      tmap_error("option -w out of range", Exit, CommandLineArgument);
  }
  // the suffix array interval must be 1 or a positive even number
  if(opt.sa_interval <= 0 || (1 < opt.sa_interval && 0 != (opt.sa_interval % 2))) {
      tmap_error("option -i out of range", Exit, CommandLineArgument);
  }

  tmap_index_core(&opt);

  free(opt.fn_fasta);
  
  tmap_progress_print2("terminating successfully");

  return 0;

show_usage:
  // print the usage text (it shows the current option values), then release
  // the FASTA name that an earlier -f may have allocated
  c = usage(&opt);
  free(opt.fn_fasta);
  return c;
}
Beispiel #7
0
/*
 * Prints the help text of the "index" command to the error stream; the
 * bracketed values are the current settings held in opt.
 * Always returns 1 so callers can return the result directly.
 */
static int 
usage(tmap_index_opt_t *opt)
{
  // banner and required options
  tmap_file_fprintf(tmap_file_stderr,
                    "\n"
                    "Usage: %s index [options]"
                    "\n"
                    "Options (required):\n"
                    "         -f FILE     the FASTA file name to index\n"
                    "Options (optional):\n",
                    PACKAGE);

  // numeric options together with their current values
  tmap_file_fprintf(tmap_file_stderr,
                    "         -o INT      the occurrence interval (use %d, %d, %d, ...) [%d]\n"
                    "         -w INT      the k-mer occurrence hash width [%d]\n"
                    "         -i INT      the suffix array interval (use 1, 2, 4, ...) [%d]\n",
                    TMAP_BWT_OCC_MOD, TMAP_BWT_OCC_MOD*2, TMAP_BWT_OCC_MOD*3, opt->occ_interval,
                    opt->hash_width,
                    opt->sa_interval);

  // algorithm choice, flags and the closing blank line
  tmap_file_fprintf(tmap_file_stderr,
                    "         -a STRING   override BWT construction algorithm:\n"
                    "                     \t\"bwtsw\" (large genomes)\n"
                    "                     \t\"is\" (short genomes)\n"
                    "         -H          do not validate the BWT hash [%d]\n"
                    "         --version   print the index format that will be created and exit\n"
                    "         -v          print verbose progress information\n"
                    "         -h          print this message\n"
                    "\n",
                    opt->check_hash);

  return 1;
}
Beispiel #8
0
/*
 * Prints the help text of the "vswbm" command to the error stream; each
 * bracketed value is the corresponding argument passed in.
 * Always returns 1 so callers can return the result directly.
 */
static int
usage(int32_t seq_len, int32_t tlen, int32_t n_iter, 
      int32_t n_sub_iter, int32_t vsw_type)
{
  // banner
  tmap_file_fprintf(tmap_file_stderr,
                    "\n"
                    "Usage: %s vswbm [options]"
                    "\n",
                    PACKAGE);

  // required options together with their current values
  tmap_file_fprintf(tmap_file_stderr,
                    "Options (required):\n"
                    "         -q INT      the query length [%d]\n"
                    "         -t INT      the target length [%d] (must be at least as long as the query)\n"
                    "         -n INT      the number of iterations [%d]\n"
                    "         -N INT      the number of re-evaluations of the same query/target combination [%d]\n"
                    "         -H INT      smith waterman algorithm [%d]\n",
                    seq_len, tlen, n_iter, n_sub_iter, vsw_type);

  // optional options and the closing blank line
  tmap_file_fprintf(tmap_file_stderr,
                    "Options (optional):\n"
                    "         -h          print this message\n"
                    "\n");

  return 1;
}
Beispiel #9
0
/*
 * Command-line entry point that converts an SFF file into unmapped SAM
 * records written to "-".
 *
 * Options:
 *   -D      sets the bidirectional flag passed to the record converter
 *   -G      accepted, has no effect here
 *   -R STR  adds one read-group string (may be repeated)
 *   -Y      enables the flow-space tags
 *   -v      sets progress verbosity to 1
 *   -h      prints the usage line
 *
 * Each record carries the four SFF clip values as the optional tags
 * lq/rq (quality clip left/right) and la/ra (adapter clip left/right).
 *
 * Returns 0 on success and 1 on a usage error or an unrecognised option.
 */
int
tmap_seqs_io_sff2sam_main(int argc, char *argv[])
{
  int c, help = 0;
  int ret = 1; // stays 1 on every early exit
  tmap_seqs_io_t *io_in = NULL;
  tmap_seqs_t *seqs = NULL;
  char **sam_rg = NULL;
  int32_t sam_rg_num = 0;
  int bidirectional = 0, sam_flowspace_tags = 0;
  int out_type = 0;
  tmap_sam_io_t *io_out = NULL;
  bam_header_t *header = NULL; // BAM Header
  int32_t i;

  while((c = getopt(argc, argv, "DGR:Yvh")) >= 0) {
      switch(c) {
        case 'D': bidirectional = 1; break;
        case 'G': break;
        case 'R':
                  sam_rg = tmap_realloc(sam_rg, (1+sam_rg_num) * sizeof(char*), "sam_rg");
                  sam_rg[sam_rg_num] = tmap_strdup(optarg);
                  sam_rg_num++;
                  break;
        case 'Y': sam_flowspace_tags = 1; break;
        case 'v': tmap_progress_set_verbosity(1); break;
        case 'h': help = 1; break;
        default: goto free_rg; // release any -R strings collected so far
      }
  }
  if(1 != argc - optind || 1 == help) {
      tmap_file_fprintf(tmap_file_stderr, "Usage: %s %s [-R -Y -v -h] <in.sff>\n", PACKAGE, argv[0]);
      goto free_rg;
  }

  // input
  io_in = tmap_seqs_io_init(&argv[optind], 1, TMAP_SEQ_TYPE_SFF, TMAP_FILE_NO_COMPRESSION, 0l, 0l);

  // BAM Header
  header = tmap_seqs_io_to_bam_header(NULL, io_in, sam_rg, sam_rg_num, argc, argv);

  // open the output file (out_type is fixed at 0 above, so SAM is written)
  switch(out_type) {
    case 0: // SAM
      io_out = tmap_sam_io_init2("-", "wh", header);
      break;
    case 1:
      io_out = tmap_sam_io_init2("-", "wb", header);
      break;
    case 2:
      io_out = tmap_sam_io_init2("-", "wbu", header);
      break;
    default:
      tmap_bug();
  }

  // destroy the BAM Header
  bam_header_destroy(header);
  header = NULL;

  // convert one read at a time; the container is rebuilt after every record
  seqs = tmap_seqs_init(TMAP_SEQ_TYPE_SFF);
  while(0 < tmap_seqs_io_read(io_in, seqs, io_out->fp->header->header)) {
      bam1_t *b = NULL;
      tmap_seq_t *seq = seqs->seqs[0];
      // the fourth tag is "ra" (adapter clip right); it must not repeat "rq",
      // since a tag may appear only once per SAM record
      b = tmap_sam_convert_unmapped(seq, sam_flowspace_tags, bidirectional, NULL,
                                    0, 0, 0,
                                    0, 0, 0,
                                    "\tlq:i:%d\trq:i:%d\tla:i:%d\tra:i:%d",
                                    seq->data.sff->rheader->clip_qual_left,
                                    seq->data.sff->rheader->clip_qual_right,
                                    seq->data.sff->rheader->clip_adapter_left,
                                    seq->data.sff->rheader->clip_adapter_right);
      if(samwrite(io_out->fp, b) <= 0) {
          tmap_error("Error writing the SAM file", Exit, WriteFileError);
      }
      bam_destroy1(b); 
      tmap_seqs_destroy(seqs);
      seqs = tmap_seqs_init(TMAP_SEQ_TYPE_SFF);
  }
  tmap_seqs_destroy(seqs);

  // free memory
  tmap_seqs_io_destroy(io_in);
  tmap_sam_io_destroy(io_out);
  ret = 0;

free_rg:
  // the read-group strings are owned by this function on every path
  for(i=0;i<sam_rg_num;i++) {
      free(sam_rg[i]);
  }
  free(sam_rg);

  return ret;
}
Beispiel #10
0
/*
 * Command-line entry point that reads the packed reference belonging to
 * <in.fasta> and writes it back to stdout in FASTA form, wrapping sequence
 * lines every TMAP_REFSEQ_FASTA_LINE_LENGTH bases.
 *
 * Options:
 *   -a  print the stored ambiguity code for positions inside a recorded
 *       ambiguous range instead of the packed base
 *   -v  sets progress verbosity to 1
 *   -h  prints the usage line
 *
 * Returns 0 on success and 1 on a usage error or an unrecognised option.
 */
int
tmap_refseq_pac2fasta_main(int argc, char *argv[])
{
  int opt_char;
  int show_help = 0;
  int with_amb = 0;
  uint32_t contig, pos, range;
  char *fasta_name = NULL;
  tmap_refseq_t *ref = NULL;

  while(0 <= (opt_char = getopt(argc, argv, "avh"))) {
      if('a' == opt_char) {
          with_amb = 1;
      }
      else if('v' == opt_char) {
          tmap_progress_set_verbosity(1);
      }
      else if('h' == opt_char) {
          show_help = 1;
      }
      else {
          return 1;
      }
  }

  // exactly one positional argument is expected
  if(1 == show_help || 1 != argc - optind) {
      tmap_file_fprintf(tmap_file_stderr, "Usage: %s %s [-avh] <in.fasta>\n", PACKAGE, argv[0]);
      return 1;
  }
  fasta_name = argv[optind];

  // Note: 'tmap_file_stdout' should not have been previously modified
  tmap_file_stdout = tmap_file_fdopen(fileno(stdout), "wb", TMAP_FILE_NO_COMPRESSION);

  // load the packed reference
  ref = tmap_refseq_read(fasta_name, 0);

  for(contig=0;contig<ref->num_annos;contig++) {
      // the header line is terminated by the line break emitted before the first base
      tmap_file_fprintf(tmap_file_stdout, ">%s", ref->annos[contig].name->s);

      range = 0; // index of the ambiguous range currently under consideration
      for(pos=0;pos<ref->annos[contig].len;pos++) {
          int wrote_amb = 0;

          // start a new output line every TMAP_REFSEQ_FASTA_LINE_LENGTH bases
          if(0 == (pos % TMAP_REFSEQ_FASTA_LINE_LENGTH)) {
              tmap_file_fprintf(tmap_file_stdout, "\n");
          }

          if(1 == with_amb && 0 < ref->annos[contig].num_amb) {
              // skip the ranges that end before this (one-based) position
              while(range < ref->annos[contig].num_amb && ref->annos[contig].amb_positions_end[range] < pos+1) {
                  range++;
              }
              // inside a recorded range: print its ambiguity code
              if(range < ref->annos[contig].num_amb
                 && 0 == tmap_interval_overlap(pos+1, pos+1, ref->annos[contig].amb_positions_start[range], ref->annos[contig].amb_positions_end[range])) {
                  tmap_file_fprintf(tmap_file_stdout, "%c", tmap_iupac_int_to_char[ref->annos[contig].amb_bases[range]]);
                  wrote_amb = 1;
              }
          }

          // otherwise print the packed base
          if(0 == wrote_amb) {
              tmap_file_fprintf(tmap_file_stdout, "%c", "ACGTN"[(int)tmap_refseq_seq_i(ref, pos + ref->annos[contig].offset)]);
          }
      }
      tmap_file_fprintf(tmap_file_stdout, "\n");
  }

  // release the reference, then the output stream
  tmap_refseq_destroy(ref);
  tmap_file_fclose(tmap_file_stdout);

  return 0;
}
Beispiel #11
0
/*
 * Packs a reference FASTA into the packed sequence file and writes the
 * matching annotation file, then calls tmap_refseq_pac2revpac() on the same
 * FASTA name.
 *
 * Bases are stored through tmap_refseq_seq_store_i() into a fixed-size buffer
 * that is flushed to the packed file whenever it is full.  Characters whose
 * code is 4 or above (IUPAC codes) are recorded per contig as one-based
 * ranges plus the original code, and are replaced in the packed sequence by
 * the first of A/C/G/T that the code does not cover.
 *
 * @param fn_fasta     FASTA file name; the output file names are derived from it
 * @param compression  compression type used when reading the FASTA
 * @return             the summed length of all contigs
 *
 * Write failures and duplicate contig names are reported through tmap_error.
 * Note that the duplicate-name check runs only after both files were written.
 */
uint64_t
tmap_refseq_fasta2pac(const char *fn_fasta, int32_t compression)
{
  tmap_file_t *fp_pac = NULL, *fp_anno = NULL;
  tmap_seq_io_t *seqio = NULL;
  tmap_seq_t *seq = NULL;
  tmap_refseq_t *refseq = NULL;
  char *fn_pac = NULL, *fn_anno = NULL;
  uint8_t buffer[TMAP_REFSEQ_BUFFER_SIZE];
  int32_t i, j, l, buffer_length;
  uint32_t num_IUPAC_found= 0, amb_bases_mem = 0;
  uint8_t x = 0;
  uint64_t ref_len;

  tmap_progress_print("packing the reference FASTA");

  refseq = tmap_calloc(1, sizeof(tmap_refseq_t), "refseq");

  refseq->version_id = TMAP_VERSION_ID; 
  refseq->package_version = tmap_string_clone2(PACKAGE_VERSION);
  refseq->seq = buffer; // IMPORTANT: must nullify later
  refseq->annos = NULL;
  refseq->num_annos = 0;
  refseq->len = 0;
  refseq->is_rev = 0;
  refseq->is_shm = 0;
  memset(buffer, 0, TMAP_REFSEQ_BUFFER_SIZE);
  buffer_length = 0; // number of bases currently held in the buffer

  // input files
  seqio = tmap_seq_io_init(fn_fasta, TMAP_SEQ_TYPE_FQ, 0, compression);
  seq = tmap_seq_init(TMAP_SEQ_TYPE_FQ);

  // output files
  fn_pac = tmap_get_file_name(fn_fasta, TMAP_PAC_FILE);
  fp_pac = tmap_file_fopen(fn_pac, "wb", TMAP_PAC_COMPRESSION);

  // read in sequences, one contig per iteration
  while(0 <= (l = tmap_seq_io_read(seqio, seq))) {
      tmap_anno_t *anno = NULL;
      tmap_progress_print2("packing contig [%s:1-%d]", seq->data.fq->name->s, l);

      // append a new annotation entry for this contig
      refseq->num_annos++;
      refseq->annos = tmap_realloc(refseq->annos, sizeof(tmap_anno_t)*refseq->num_annos, "refseq->annos");
      anno = &refseq->annos[refseq->num_annos-1];
      
      anno->name = tmap_string_clone(seq->data.fq->name); 
      anno->len = l;
      // the offset is the end of the previous contig (zero for the first one)
      anno->offset = (1 == refseq->num_annos) ? 0 : refseq->annos[refseq->num_annos-2].offset + refseq->annos[refseq->num_annos-2].len;
      anno->amb_positions_start = NULL;
      anno->amb_positions_end = NULL;
      anno->amb_bases = NULL;
      anno->num_amb = 0;
      amb_bases_mem = 0;

      // fill the buffer
      for(i=0;i<l;i++) {
          uint8_t c = tmap_nt_char_to_int[(int)seq->data.fq->seq->s[i]];
          // handle IUPAC codes 
          if(4 <= c) {
              int32_t k;
              // warn users about IUPAC codes, once, listing every conversion
              if(0 == num_IUPAC_found) { 
                  tmap_error("IUPAC codes were found and will be converted to non-matching DNA bases", Warn, OutOfRange);
                  for(j=4;j<15;j++) {
                      c = tmap_iupac_char_to_bit_string[(int)tmap_iupac_int_to_char[j]];
                      // get the lexicographically smallest base not compatible with this code
                      for(k=0;k<4;k++) {
                          if(!(c & (0x1 << k))) {
                              break;
                          }
                      } 
                      tmap_progress_print2("IUPAC code %c will be converted to %c", tmap_iupac_int_to_char[j], "ACGTN"[k & 3]);
                  }
              }
              num_IUPAC_found++;
              
              // bit string of the bases this IUPAC code is compatible with
              c = tmap_iupac_char_to_bit_string[(int)seq->data.fq->seq->s[i]];

              // store IUPAC bases
              if(amb_bases_mem <= anno->num_amb) { // allocate more memory if necessary
                  amb_bases_mem = anno->num_amb + 1;
                  tmap_roundup32(amb_bases_mem);
                  anno->amb_positions_start = tmap_realloc(anno->amb_positions_start, sizeof(uint32_t) * amb_bases_mem, "anno->amb_positions_start");
                  anno->amb_positions_end = tmap_realloc(anno->amb_positions_end, sizeof(uint32_t) * amb_bases_mem, "anno->amb_positions_end");
                  anno->amb_bases = tmap_realloc(anno->amb_bases, sizeof(uint8_t) * amb_bases_mem, "anno->amb_bases");
              }
              // encode stretches of the same base: the previous range ends at
              // one-based position i, i.e. directly before this base
              if(0 < anno->num_amb
                 && anno->amb_positions_end[anno->num_amb-1] == i
                 && anno->amb_bases[anno->num_amb-1] == tmap_iupac_char_to_int[(int)seq->data.fq->seq->s[i]]) {
                 anno->amb_positions_end[anno->num_amb-1]++; // expand the range 
              }
              else {
                  // new ambiguous base and range
                  anno->num_amb++;
                  anno->amb_positions_start[anno->num_amb-1] = i+1; // one-based
                  anno->amb_positions_end[anno->num_amb-1] = i+1; // one-based
                  anno->amb_bases[anno->num_amb-1] = tmap_iupac_char_to_int[(int)seq->data.fq->seq->s[i]];
              }
              
              // get the lexicographically smallest base not compatible with
              // this code
              for(j=0;j<4;j++) {
                  if(!(c & (0x1 << j))) {
                      break;
                  }
              } 
              c = j & 3; // Note: Ns will go to As
          }
          if(3 < c) {
              tmap_error("bug encountered", Exit, OutOfRange);
          }
          // flush the buffer once it holds four bases per byte
          if(buffer_length == (TMAP_REFSEQ_BUFFER_SIZE << 2)) { // 2-bit
              if(tmap_refseq_seq_memory(buffer_length) != tmap_file_fwrite(buffer, sizeof(uint8_t), tmap_refseq_seq_memory(buffer_length), fp_pac)) {
                  tmap_error(fn_pac, Exit, WriteFileError);
              }
              memset(buffer, 0, TMAP_REFSEQ_BUFFER_SIZE);
              buffer_length = 0;
          }
          tmap_refseq_seq_store_i(refseq, buffer_length, c);
          buffer_length++;
      }
      refseq->len += l;
      // shrink the ambiguous-base arrays to the number of ranges actually used
      if(anno->num_amb < amb_bases_mem) {
          amb_bases_mem = anno->num_amb;
          anno->amb_positions_start = tmap_realloc(anno->amb_positions_start, sizeof(uint32_t) * amb_bases_mem, "anno->amb_positions_start");
          anno->amb_positions_end = tmap_realloc(anno->amb_positions_end, sizeof(uint32_t) * amb_bases_mem, "anno->amb_positions_end");
          anno->amb_bases = tmap_realloc(anno->amb_bases, sizeof(uint8_t) * amb_bases_mem, "anno->amb_bases");
      }
  }
  // write out the buffer
  if(tmap_refseq_seq_memory(buffer_length) != tmap_file_fwrite(buffer, sizeof(uint8_t), tmap_refseq_seq_memory(buffer_length), fp_pac)) {
      tmap_error(fn_pac, Exit, WriteFileError);
  }
  if(refseq->len % 4 == 0) { // add an extra byte if we completely filled all bits
      if(1 != tmap_file_fwrite(&x, sizeof(uint8_t), 1, fp_pac)) {
          tmap_error(fn_pac, Exit, WriteFileError);
      }
  }
  // the final byte of the file holds the total length modulo four
  x = refseq->len % 4;
  if(1 != tmap_file_fwrite(&x, sizeof(uint8_t), 1, fp_pac)) {
      tmap_error(fn_pac, Exit, WriteFileError);
  }
  refseq->seq = NULL; // IMPORTANT: nullify this (it pointed at the stack buffer)
  ref_len = refseq->len; // save for return
      
  // the length is handled as a 64-bit value elsewhere, so print it as one
  tmap_progress_print2("total genome length [%llu]", (unsigned long long int)refseq->len);
  if(0 < num_IUPAC_found) {
      if(1 == num_IUPAC_found) {
          tmap_progress_print("%u IUPAC base was found and converted to a DNA base", num_IUPAC_found);
      }
      else {
          tmap_progress_print("%u IUPAC bases were found and converted to DNA bases", num_IUPAC_found);
      }
  }

  // write annotation file
  fn_anno = tmap_get_file_name(fn_fasta, TMAP_ANNO_FILE);
  fp_anno = tmap_file_fopen(fn_anno, "wb", TMAP_ANNO_COMPRESSION);
  tmap_refseq_write_anno(fp_anno, refseq); 

  // close files
  tmap_file_fclose(fp_pac);
  tmap_file_fclose(fp_anno);

  // check sequence name uniqueness (pairwise comparison of all contig names)
  for(i=0;i<refseq->num_annos;i++) {
      for(j=i+1;j<refseq->num_annos;j++) {
          if(0 == strcmp(refseq->annos[i].name->s, refseq->annos[j].name->s)) {
              tmap_file_fprintf(tmap_file_stderr, "Contigs have the same name: #%d [%s] and #%d [%s]\n",
                                i+1, refseq->annos[i].name->s, 
                                j+1, refseq->annos[j].name->s); 
              tmap_error("Contig names must be unique", Exit, OutOfRange);
          }
      }
  }

  tmap_refseq_destroy(refseq); 
  tmap_seq_io_destroy(seqio);
  tmap_seq_destroy(seq);
  free(fn_pac);
  free(fn_anno);

  tmap_progress_print2("packed the reference FASTA");

  tmap_refseq_pac2revpac(fn_fasta);

  return ref_len;
}
Beispiel #12
0
/*
 * Writes one SAM record for a mapped read to fp.
 *
 * @param fp                  output stream
 * @param seq                 the read; for strand 1 its bases and qualities
 *                            are reversed for printing and restored before
 *                            returning
 * @param sam_flowspace_tags  1 to append the flow-space tags
 * @param bidirectional       1 to append "XB:i:1"
 * @param seq_eq              1 to print the base string filled in by
 *                            tmap_sam_md() instead of the read bases
 * @param refseq              reference used for contig names and MD/NM
 * @param strand              0 forward, 1 reverse
 * @param seqid               zero-based contig index
 * @param pos                 zero-based position (printed one-based)
 * @param aln_num             greater than 0 marks a secondary alignment
 * @param end_num             0 for no mate, 1 for the first end, otherwise
 *                            the second end
 * @param m_unmapped, m_prop, m_strand, m_seqid, m_pos, m_tlen
 *                            mate state used for the flag and mate columns
 * @param m_num_std           printed as "YS:f:" when the mate is mapped
 * @param mapq                mapping quality
 * @param cigar, n_cigar      CIGAR entries (length << 4 | op) and their count
 * @param score               printed as "AS:i:"
 * @param ascore              printed as "XZ:i:" for SFF reads unless INT32_MIN
 * @param pscore              printed as "YP:i:" when there is a mate
 * @param nh                  printed as "NH:i:" when greater than 1
 * @param algo_id, algo_stage printed as "XA:Z:" when the stage is positive
 * @param format              optional printf-style format for extra tags
 *                            (may be NULL), followed by its arguments
 */
inline void
tmap_sam_print_mapped(tmap_file_t *fp, tmap_seq_t *seq, int32_t sam_flowspace_tags, int32_t bidirectional, int32_t seq_eq, tmap_refseq_t *refseq,
                      uint8_t strand, uint32_t seqid, uint32_t pos, int32_t aln_num,
                      uint32_t end_num, uint32_t m_unmapped, uint32_t m_prop, double m_num_std, uint32_t m_strand,
                      uint32_t m_seqid, uint32_t m_pos, uint32_t m_tlen,
                      uint8_t mapq, uint32_t *cigar, int32_t n_cigar,
                      int32_t score, int32_t ascore, int32_t pscore, int32_t nh, int32_t algo_id, int32_t algo_stage,
                      const char *format, ...)
{
  // One character per 4-bit CIGAR op code.  Codes 0-8 are the defined SAM
  // ops; the remaining entries keep every masked index inside the table.
  static const char cigar_ops[] = "MIDNSHP=X???????";
  va_list ap;
  int32_t i;
  tmap_string_t *name=NULL, *bases=NULL, *qualities=NULL;
  char *bases_eq=NULL;
  uint32_t flag;
  tmap_string_t *md;
  int32_t nm;

  name = tmap_seq_get_name(seq);
  bases = tmap_seq_get_bases(seq);
  qualities = tmap_seq_get_qualities(seq);

  if(1 == strand) { // reverse for the output
      tmap_string_reverse_compliment(bases, 0);
      tmap_string_reverse(qualities);
  }

  // pos + 1 wraps to zero only for the largest 32-bit value
  if(0 == pos + 1) {
      tmap_error("position is out of range", Exit, OutOfRange);
  }

  // compute the MD/NM
  if(1 == seq_eq && 0 < bases->l) {
      bases_eq = tmap_calloc((1 + bases->l), sizeof(char), "bases_eq");
  }
  else {
      bases_eq = NULL;
  }
  md = tmap_sam_md(refseq, bases->s, seqid, pos, cigar, n_cigar, &nm, bases_eq);

  // flag
  flag = 0;
  if(1 == strand) flag |= 0x10; // strand
  if(0 < aln_num) flag |= 0x100; // secondary alignment
  if(0 < end_num) { // mate info
      flag |= 0x1;
      if(0 == m_unmapped && 1 == m_prop) flag |= 0x2; // properly aligned
      if(1 == m_unmapped) flag |= 0x8; // unmapped
      else if(1 == m_strand) flag |= 0x20; // strand 
      flag |= (1 == end_num) ? 0x40 : 0x80; // first/second end
  }

  // name, flag, contig name, one-based position, mapping quality
  tmap_file_fprintf(fp, "%s\t%u\t%s\t%u\t%u\t",
                    name->s, flag, refseq->annos[seqid].name->s,
                    pos + 1,
                    mapq);

  // print out the cigar; for SFF reads the clipped ends are reported as hard
  // clips, swapped when the read is on the reverse strand
  if(TMAP_SEQ_TYPE_SFF == seq->type) {
      if(0 == strand && 0 < seq->data.sff->rheader->clip_left) {
          tmap_file_fprintf(fp, "%dH", seq->data.sff->rheader->clip_left);
      }
      else if(1 == strand && 0 < seq->data.sff->rheader->clip_right) {
          tmap_file_fprintf(fp, "%dH", seq->data.sff->rheader->clip_right);
      }
  }
  for(i=0;i<n_cigar;i++) {
      tmap_file_fprintf(fp, "%u%c",
                        cigar[i]>>4, cigar_ops[cigar[i]&0xf]);
  }
  if(TMAP_SEQ_TYPE_SFF == seq->type) {
      if(1 == strand && 0 < seq->data.sff->rheader->clip_left) {
          tmap_file_fprintf(fp, "%dH", seq->data.sff->rheader->clip_left);
      }
      else if(0 == strand && 0 < seq->data.sff->rheader->clip_right) {
          tmap_file_fprintf(fp, "%dH", seq->data.sff->rheader->clip_right);
      }
  }
  
  // mate info
  if(0 == end_num) { // no mate
      tmap_file_fprintf(fp, "\t*\t0\t0");
  }
  else if(1 == m_unmapped) { // unmapped mate: placed at this read's position
      tmap_file_fprintf(fp, "\t%s\t%u\t%u",
                        "=",
                        pos + 1,
                        0);
  }
  else { // mapped mate
      // NOTE(review): m_tlen is unsigned but printed with %d, presumably so a
      // negative template length shows with its sign - confirm with callers
      tmap_file_fprintf(fp, "\t%s\t%u\t%d",
                        refseq->annos[m_seqid].name->s,
                        m_pos+1,
                        m_tlen);
  }

  // bases and qualities
  if(1 == seq_eq && NULL != bases_eq) {
      tmap_file_fprintf(fp, "\t%s\t%s", bases_eq, (0 == qualities->l) ? "*" : qualities->s);
  }
  else {
      tmap_file_fprintf(fp, "\t%s\t%s", bases->s, (0 == qualities->l) ? "*" : qualities->s);
  }
  
  // RG 
  tmap_sam_print_rg(fp, seq);

  // PG
  tmap_file_fprintf(fp, "\tPG:Z:%s", PACKAGE_NAME);

  // MD and NM
  tmap_file_fprintf(fp, "\tMD:Z:%s\tNM:i:%d", md->s, nm);

  // AS
  tmap_file_fprintf(fp, "\tAS:i:%d", score);

  // NH
  if(1 < nh) tmap_file_fprintf(fp, "\tNH:i:%d", nh);
  
  // FZ and ZF
  if(1 == sam_flowspace_tags) {
      tmap_sam_print_fz_and_zf(fp, seq);
  }

  // XA
  if(0 < algo_stage) {
      tmap_file_fprintf(fp, "\tXA:Z:%s-%d", tmap_algo_id_to_name(algo_id), algo_stage);
  }
  
  // XZ
  if(TMAP_SEQ_TYPE_SFF == seq->type && INT32_MIN != ascore) {
      tmap_file_fprintf(fp, "\tXZ:i:%d", ascore);
  }
  
  if(0 < end_num) { // mate info
      tmap_file_fprintf(fp, "\tYP:i:%d", pscore);
      if(0 == m_unmapped) {
          tmap_file_fprintf(fp, "\tYS:f:%f", m_num_std);
      }
  }
  if(1 == bidirectional) {
      tmap_file_fprintf(fp, "\tXB:i:1");
  }

  // optional tags
  if(NULL != format) {
      va_start(ap, format);
      tmap_file_vfprintf(fp, format, ap);
      va_end(ap);
  }
  // new line
  tmap_file_fprintf(fp, "\n");
  if(1 == strand) { // reverse back
      tmap_string_reverse_compliment(bases, 0);
      tmap_string_reverse(qualities);
  }

  // free
  tmap_string_destroy(md);
  free(bases_eq);
}
Beispiel #13
0
/*
 * Writes one SAM record for an unmapped read to fp.
 *
 * @param fp                  output stream
 * @param seq                 the read (name, bases, qualities, read group)
 * @param sam_flowspace_tags  1 to append the flow-space tags
 * @param bidirectional       1 to append "XB:i:1"
 * @param refseq              reference used to name the mate's contig; may be
 *                            NULL, in which case the mate columns are left
 *                            unset
 * @param end_num             0 for no mate, 1 for the first end, otherwise
 *                            the second end
 * @param m_unmapped, m_prop, m_strand, m_seqid, m_pos
 *                            mate state used for the flag and mate columns
 * @param format              optional printf-style format for extra tags
 *                            (may be NULL), followed by its arguments
 */
inline void
tmap_sam_print_unmapped(tmap_file_t *fp, tmap_seq_t *seq, int32_t sam_flowspace_tags, int32_t bidirectional, tmap_refseq_t *refseq,
                      uint32_t end_num, uint32_t m_unmapped, uint32_t m_prop, 
                      uint32_t m_strand, uint32_t m_seqid, uint32_t m_pos,
                      const char *format, ...)
{
  uint32_t flag = 0;
  tmap_string_t *name=NULL, *bases=NULL, *qualities=NULL;
  va_list ap;

  name = tmap_seq_get_name(seq);
  bases = tmap_seq_get_bases(seq);
  qualities = tmap_seq_get_qualities(seq);
  
  // set the flag
  flag = 0x4;
  if(0 < end_num) { // mate info
      flag |= 0x1;
      if(1 == m_prop) flag |= 0x2; // properly aligned
      if(1 == m_unmapped) flag |= 0x8; // unmapped
      else if(1 == m_strand) flag |= 0x20; // strand 
      flag |= (1 == end_num) ? 0x40 : 0x80; // first/second end
  }

  // name, flag, seqid, pos, mapq, cigar
  tmap_file_fprintf(fp, "%s\t%u\t*\t%u\t%u\t*", name->s, flag, 0, 0);

  // NB: hard clipped portions of the read is not reported
  // mate info: the three mate columns are always written so the record keeps
  // its mandatory fields
  if(0 < end_num && 1 != m_unmapped && NULL != refseq) { // mapped mate
      tmap_file_fprintf(fp, "\t%s\t%u\t%u",
                        refseq->annos[m_seqid].name->s,
                        m_pos+1,
                        0);
  }
  else { // no mate, unmapped mate, or no reference to name the mate's contig
      tmap_file_fprintf(fp, "\t*\t0\t0");
  }

  // bases and qual
  tmap_file_fprintf(fp, "\t%s\t%s",
                    (0 == bases->l) ? "*" : bases->s, (0 == qualities->l) ? "*" : qualities->s);

  // RG 
  tmap_sam_print_rg(fp, seq);

  // PG
  tmap_file_fprintf(fp, "\tPG:Z:%s", PACKAGE_NAME);

  // FZ and ZF
  if(1 == sam_flowspace_tags) {
      tmap_sam_print_fz_and_zf(fp, seq);
  }
  if(1 == bidirectional) {
      tmap_file_fprintf(fp, "\tXB:i:1");
  }
  
  // optional tags
  if(NULL != format) {
      va_start(ap, format);
      tmap_file_vfprintf(fp, format, ap);
      va_end(ap);
  }
  tmap_file_fprintf(fp, "\n");
}
Beispiel #14
0
void
tmap_sam_print_header(tmap_file_t *fp, tmap_refseq_t *refseq, tmap_seq_io_t *seqio, char *sam_rg, 
                      int32_t sam_flowspace_tags, int32_t ignore_rg_sam_tags, 
                      int argc, char *argv[])
{
  int32_t i, j, header_n = 0;
  char **header_a = NULL;
  char ***header_b = NULL;

  // SAM header
  tmap_file_fprintf(fp, "@HD\tVN:%s\tSO:unsorted\n",
                    TMAP_SAM_PRINT_VERSION);
  if(NULL != refseq) {
      for(i=0;i<refseq->num_annos;i++) {
          tmap_file_fprintf(fp, "@SQ\tSN:%s\tLN:%d\n",
                            refseq->annos[i].name->s, (int)refseq->annos[i].len);
      }
  }
  // RG
  header_a = tmap_sam_parse_rg(sam_rg); // parse the input read group line
  if(1 == ignore_rg_sam_tags) { // do not get the header from the input file
      if(1 == sam_flowspace_tags) { // ... except for the RG.FS/RG.KO
          // get the RG header from the input file
          header_b = tmap_seq_io_get_rg_header(seqio, &header_n);
          if(1 < header_n) { 
              // TODO: we could check to see that FO/KS are the same across all
              // input read groups
              tmap_error("Command line read group found with multiple read groups from the input file", Exit, OutOfRange);
          }
          else if(1 == header_n) {
              if(NULL == header_a) {
                  header_a = tmap_calloc(TMAP_SAM_RG_NUM, sizeof(char*), "header_a");
                  // copy over default RG.ID
                  header_a[TMAP_SAM_RG_ID] = tmap_malloc(sizeof(char) * (strlen(tmap_sam_rg_id) + 1), "header_a[TMAP_SAM_RG_ID]");
                  strcpy(header_a[TMAP_SAM_RG_ID], tmap_sam_rg_id);
              }
              for(i=0;i<TMAP_SAM_RG_NUM;i++) { // for each RG.TAG
                  switch(i) {
                    case TMAP_SAM_RG_FO:
                    case TMAP_SAM_RG_KS:
                      if(NULL != header_a[i] && NULL != header_b[0][i]) {
                          tmap_error("Command line and input read groups share tags", Exit, OutOfRange);
                      }
                      else if(NULL != header_b[0][i]) { // copy over
                          header_a[i] = tmap_malloc(sizeof(char) * (strlen(header_b[0][i]) + 1), "header_a[i]");
                          strcpy(header_a[i], header_b[0][i]);
                      }
                    default:
                      break;
                  }
              }
          }
          // free header_b, it is no longer in use
          for(i=0;i<header_n;i++) {
              free(header_b[i]);
          }
          free(header_b);
          header_b = NULL;
          header_n = 0;
      }
  }
  else { 
      // get the RG header from the input file
      header_b = tmap_seq_io_get_rg_header(seqio, &header_n);
  }

  // reconcile the RG headers
  if(NULL != header_a) { // header a exists
      if(NULL != header_b && 1 == header_n) { // header b exists, and only one line...
          // check to see if they are mutually exclusive
          for(i=0;i<TMAP_SAM_RG_NUM;i++) {
              if(NULL != header_a[i] && NULL != header_b[0][i]) {
                  tmap_file_fprintf(tmap_file_stderr, "\nFound both command line and input file read group information for the same tag: %s.\n", TMAP_SAM_RG_TAGS[i]);
                  tmap_error(NULL, Exit, OutOfRange);
              }
              else if(NULL == header_a[i] && NULL != header_b[0][i]) { // copy over
                  header_a[i] = tmap_calloc(1+strlen(header_b[0][i]), sizeof(char), "header_a[i]");
                  strcpy(header_a[i], header_b[0][i]);
              }
          }
          // free
          free(header_b[0]);
          free(header_b);
          header_b = NULL;
          header_n = 0;
      }
      if(0 == header_n) { // no header b
          if(NULL != header_a[TMAP_SAM_RG_ID]) {
              strcpy(tmap_sam_rg_id, header_a[TMAP_SAM_RG_ID]);
          }
          else {
              header_a[TMAP_SAM_RG_ID] = tmap_malloc(sizeof(char) * (strlen(tmap_sam_rg_id) + 1), "header_a[i]");
              strcpy(header_a[TMAP_SAM_RG_ID], tmap_sam_rg_id);
          }
          if(NULL == header_a[TMAP_SAM_RG_SM]) { // for Picard
              header_a[TMAP_SAM_RG_SM] = tmap_malloc(sizeof(char) * (strlen(TMAP_SAM_NO_RG_SM) + 1), "header_a[i]");
              strcpy(header_a[TMAP_SAM_RG_SM], TMAP_SAM_NO_RG_SM);
          }
          tmap_sam_rg_id_use = 1;
          tmap_file_fprintf(fp, "@RG");
          for(i=0;i<TMAP_SAM_RG_NUM;i++) {
              if(NULL != header_a[i]) {
                  tmap_file_fprintf(fp, "\t%s:%s", TMAP_SAM_RG_TAGS[i], header_a[i]);
              }
          }
          tmap_file_fprintf(fp, "\n");
      }
      else { // both header_a and header_b exist 
          tmap_error("Found both command line and input file read group information", Exit, OutOfRange);
      }
  }
  else { // no header_a exists
      if(NULL != header_b) { // no header_b exists
          tmap_sam_rg_id_use = 0;
          for(i=0;i<header_n;i++) { // for each RG.ID
              if(NULL == header_b[i][TMAP_SAM_RG_ID]) {
                  if(1 == header_n && TMAP_SEQ_TYPE_SFF == seqio->type) { // make an exception for SFF files
                      header_b[i][TMAP_SAM_RG_ID] = tmap_sam_rg_id;
                      tmap_sam_rg_id_use = 1;
                  }
                  else {
                      tmap_error("missing RG.ID found in the RG SAM Header", Exit, OutOfRange);
                  }
              }
              // RG.SM for picard
              if(NULL == header_b[i][TMAP_SAM_RG_SM]) {
                  header_b[i][TMAP_SAM_RG_SM] = TMAP_SAM_NO_RG_SM;
              }
              tmap_file_fprintf(fp, "@RG");
              for(j=0;j<TMAP_SAM_RG_NUM;j++) { // for each RG.TAG
                  if(NULL != header_b[i][j]) {
                      tmap_file_fprintf(fp, "\t%s:%s", TMAP_SAM_RG_TAGS[j], header_b[i][j]);
                  }
              }
              tmap_file_fprintf(fp, "\n");
          }
      }
      else {
          header_a = tmap_calloc(TMAP_SAM_RG_NUM, sizeof(char*), "header_a");
          // RG.ID
          header_a[TMAP_SAM_RG_ID] = tmap_malloc(sizeof(char) * (strlen(tmap_sam_rg_id) + 1), "header_a[i]");
          strcpy(header_a[TMAP_SAM_RG_ID], tmap_sam_rg_id);
          // RG.SM for Picard
          header_a[TMAP_SAM_RG_SM] = tmap_malloc(sizeof(char) * (strlen(TMAP_SAM_NO_RG_SM) + 1), "header_a[i]");
          strcpy(header_a[TMAP_SAM_RG_SM], TMAP_SAM_NO_RG_SM);
          tmap_sam_rg_id_use = 1;
          tmap_file_fprintf(fp, "@RG");
          for(i=0;i<TMAP_SAM_RG_NUM;i++) {
              if(NULL != header_a[i]) {
                  tmap_file_fprintf(fp, "\t%s:%s", TMAP_SAM_RG_TAGS[i], header_a[i]);
              }
          }
          tmap_file_fprintf(fp, "\n");
      }
  }

  // PG
  tmap_file_fprintf(fp, "@PG\tID:%s\tVN:%s\tCL:",
                    PACKAGE_NAME, PACKAGE_VERSION);
  for(i=0;i<argc;i++) {
      if(0 < i) tmap_file_fprintf(fp, " ");
      tmap_file_fprintf(fp, "%s", argv[i]);
  }
  tmap_file_fprintf(fp, "\n");

  // free
  for(i=0;i<header_n;i++) {
      free(header_b[i]);
  }
  free(header_b);
  if(NULL != header_a) {
      for(i=0;i<TMAP_SAM_RG_NUM;i++) {
          free(header_a[i]);
      }
  }
  free(header_a);
}