bam_header_t *bam_header_dup(const bam_header_t *h0)
{
	bam_header_t *h;
	int i;
	h = bam_header_init();
	*h = *h0;
	h->hash = h->dict = h->rg2lib = 0;
	h->text = (char*)calloc(h->l_text + 1, 1);
	memcpy(h->text, h0->text, h->l_text);
	h->target_len = (uint32_t*)calloc(h->n_targets, 4);
	h->target_name = (char**)calloc(h->n_targets, sizeof(void*));
	for (i = 0; i < h->n_targets; ++i) {
		h->target_len[i] = h0->target_len[i];
		h->target_name[i] = strdup(h0->target_name[i]);
	}
	return h;
}
Exemplo n.º 2
0
static bam_header_t *hash2header(const kh_ref_t *hash)
{
	bam_header_t *header;
	khiter_t k;
	header = bam_header_init();
	header->n_targets = kh_size(hash);
	header->target_name = (char**)calloc(kh_size(hash), sizeof(char*));
	header->target_len = (uint32_t*)calloc(kh_size(hash), 4);
	for (k = kh_begin(hash); k != kh_end(hash); ++k) {
		if (kh_exist(hash, k)) {
			int i = (int)kh_value(hash, k);
			header->target_name[i] = (char*)kh_key(hash, k);
			header->target_len[i] = kh_value(hash, k)>>32;
		}
	}
	bam_init_header_hash(header);
	return header;
}
Exemplo n.º 3
0
bam_header_t *bam_header_read(bamFile fp)
{
	bam_header_t *header;
	char buf[4];
	int magic_len;
	int32_t i = 1, name_len;
	// check EOF
	i = bgzf_check_EOF(fp);
	if (i < 0) {
		// If the file is a pipe, checking the EOF marker will *always* fail
		// with ESPIPE.  Suppress the error message in this case.
		if (errno != ESPIPE) perror("[bam_header_read] bgzf_check_EOF");
	}
	else if (i == 0) fprintf(stderr, "[bam_header_read] EOF marker is absent. The input is probably truncated.\n");
	// read "BAM1"
	magic_len = bam_read(fp, buf, 4);
	if (magic_len != 4 || strncmp(buf, "BAM\001", 4) != 0) {
		fprintf(stderr, "[bam_header_read] invalid BAM binary header (this is not a BAM file).\n");
		return 0;
	}
	header = bam_header_init();
	// read plain text and the number of reference sequences
	bam_read(fp, &header->l_text, 4);
	if (bam_is_be) bam_swap_endian_4p(&header->l_text);
	header->text = (char*)calloc(header->l_text + 1, 1);
	bam_read(fp, header->text, header->l_text);
	bam_read(fp, &header->n_targets, 4);
	if (bam_is_be) bam_swap_endian_4p(&header->n_targets);
	// read reference sequence names and lengths
	header->target_name = (char**)calloc(header->n_targets, sizeof(char*));
	header->target_len = (uint32_t*)calloc(header->n_targets, 4);
	for (i = 0; i != header->n_targets; ++i) {
		bam_read(fp, &name_len, 4);
		if (bam_is_be) bam_swap_endian_4p(&name_len);
		header->target_name[i] = (char*)calloc(name_len, 1);
		bam_read(fp, header->target_name[i], name_len);
		bam_read(fp, &header->target_len[i], 4);
		if (bam_is_be) bam_swap_endian_4p(&header->target_len[i]);
	}

	bam_init_header_hash(header);

	return header;
}
samfile_t *samopen(const char *fn, const char *mode, const void *aux)
{
	samfile_t *fp;
	fp = (samfile_t*)calloc(1, sizeof(samfile_t));
	if (strchr(mode, 'r')) { // read
		fp->type |= TYPE_READ;
		if (strchr(mode, 'b')) { // binary
			fp->type |= TYPE_BAM;
			fp->x.bam = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r");
			if (fp->x.bam == 0) goto open_err_ret;
			fp->header = bam_header_read(fp->x.bam);
		} else { // text
			fp->x.tamr = sam_open(fn);
			if (fp->x.tamr == 0) goto open_err_ret;
			fp->header = sam_header_read(fp->x.tamr);
			if (fp->header->n_targets == 0) { // no @SQ fields
				if (aux) { // check if aux is present
					bam_header_t *textheader = fp->header;
					fp->header = sam_header_read2((const char*)aux);
					if (fp->header == 0) goto open_err_ret;
					append_header_text(fp->header, textheader->text, textheader->l_text);
					bam_header_destroy(textheader);
				}
				if (fp->header->n_targets == 0 && bam_verbose >= 1)
					fprintf(stderr, "[samopen] no @SQ lines in the header.\n");
			} //else if (bam_verbose >= 2) fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets);
		}
	} else if (strchr(mode, 'w')) { // write
		fp->header = bam_header_dup((const bam_header_t*)aux);
		if (strchr(mode, 'b')) { // binary
			char bmode[3];
			int i, compress_level = -1;
			for (i = 0; mode[i]; ++i) if (mode[i] >= '0' && mode[i] <= '9') break;
			if (mode[i]) compress_level = mode[i] - '0';
			if (strchr(mode, 'u')) compress_level = 0;
			bmode[0] = 'w'; bmode[1] = compress_level < 0? 0 : compress_level + '0'; bmode[2] = 0;
			fp->type |= TYPE_BAM;
			fp->x.bam = strcmp(fn, "-")? bam_open(fn, bmode) : bam_dopen(fileno(stdout), bmode);
			if (fp->x.bam == 0) goto open_err_ret;
			bam_header_write(fp->x.bam, fp->header);
		} else { // text
			// open file
			fp->x.tamw = strcmp(fn, "-")? fopen(fn, "w") : stdout;
			if (fp->x.tamw == 0) goto open_err_ret;
			if (strchr(mode, 'X')) fp->type |= BAM_OFSTR<<2;
			else if (strchr(mode, 'x')) fp->type |= BAM_OFHEX<<2;
			else fp->type |= BAM_OFDEC<<2;
			// write header
			if (strchr(mode, 'h')) {
				int i;
				bam_header_t *alt;
				// parse the header text 
				alt = bam_header_init();
				alt->l_text = fp->header->l_text; alt->text = fp->header->text;
				sam_header_parse(alt);
				alt->l_text = 0; alt->text = 0;
				// check if there are @SQ lines in the header
				fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw); // FIXME: better to skip the trailing NULL
				if (alt->n_targets) { // then write the header text without dumping ->target_{name,len}
					if (alt->n_targets != fp->header->n_targets && bam_verbose >= 1)
						fprintf(stderr, "[samopen] inconsistent number of target sequences. Output the text header.\n");
				} else { // then dump ->target_{name,len}
					for (i = 0; i < fp->header->n_targets; ++i)
						fprintf(fp->x.tamw, "@SQ\tSN:%s\tLN:%d\n", fp->header->target_name[i], fp->header->target_len[i]);
				}
				bam_header_destroy(alt);
			}
		}
	}
	return fp;

open_err_ret:
	free(fp);
	return 0;
}
Exemplo n.º 5
0
bam_header_t *
tmap_seqs_io_to_bam_header(tmap_refseq_t *refseq,
                           tmap_seqs_io_t *io_in,
                           char **rg_sam, int32_t rg_sam_num,
                           int32_t argc, char *argv[])
{
  bam_header_t *bam_header = NULL;
  sam_header_t *header = NULL; // the output header
  sam_header_record_t *record = NULL;
  sam_header_record_t **record_list = NULL;
  char tag[2];
  char *command_line= NULL;
  char *id = NULL;
  char *id_pp = NULL;
  int32_t i, j;

  // @HD
  if(io_in->type == TMAP_SEQ_TYPE_SAM || io_in->type == TMAP_SEQ_TYPE_BAM) {
      // should be only one input file
      if(1 != io_in->n) {
          tmap_bug();
      }
      // get the current header
      if(NULL == io_in->seqios[0]) tmap_bug();
      if(NULL == io_in->seqios[0]->io.samio) tmap_bug();
      if(NULL == io_in->seqios[0]->io.samio->fp->header) tmap_bug();
      if(NULL == io_in->seqios[0]->io.samio->fp->header->header) {
          header = sam_header_parse2(io_in->seqios[0]->io.samio->fp->header->text);
      }
      else {
          header = io_in->seqios[0]->io.samio->fp->header->header; // wow, that's a lot of pointers
          if(NULL == header) tmap_bug();
          header = sam_header_clone(header); // clone the header
      }
      if(NULL == header) tmap_bug();
  }
  else {
      // empty header
      header = sam_header_init();
      // @HD - header line
      record = sam_header_record_init("HD"); // new header line
      if(0 == sam_header_record_add(record, "VN", "1.4")) tmap_bug(); // version number
      if(0 == sam_header_add_record(header, record)) tmap_bug(); // add the header line
      // nullify
      record = NULL;
  }

  // Get the TMAP program ID
  id = tmap_malloc(sizeof(char) * (1 + strlen(PACKAGE_NAME)), "id"); 
  strcpy(id, PACKAGE_NAME); // default
  for(i=j=0;NULL != (record_list = sam_header_get_record(header, "PG", "ID", id, &i)) && 0 < i;i=0) { // while the id is found
      char *ptr = NULL;
      // swap id and id_pp
      ptr = id_pp;
      id_pp = id;
      id = ptr;
      // create the new ID
      j++;
      id = tmap_realloc(id, sizeof(char) * (1 + (int)log10(j) + 1 + strlen(PACKAGE_NAME) + 1), "id"); 
      if(sprintf(id, "%s.%d", PACKAGE_NAME, j) < 0) tmap_bug();
      free(record_list);
      record_list = NULL;
  }

  // @SQ
  if(NULL != refseq) {
      sam_header_records_t *records = NULL;
      // NB: check to see if any SQ/SN records exist, if not, then ignore checking...
	// ZZ: We will not checking, but instead just remove all the old header. The old way of checking is not working
      records = sam_header_get_records(header, "SQ");
      if (NULL != records) {
	// ZZ: remove the headers if exists.
	sam_header_remove_records(header, "SQ");
	records = NULL;
      }
      // ZZ: Now we will just add all new tags
      for(i=0;i<refseq->num_annos;i++) { // for each reference sequence
          char num[32];
          record = sam_header_record_init("SQ"); // new reference sequence record
          if(0 == sam_header_record_add(record, "SN", refseq->annos[i].name->s)) tmap_bug(); // reference sequence name
          if(sprintf(num, "%u", (uint32_t)refseq->annos[i].len) < 0) tmap_bug(); // integer to string
          if(0 == sam_header_record_add(record, "LN", num)) tmap_bug(); // reference sequence length
          if(0 == sam_header_add_record(header, record)) tmap_bug(); // add the reference sequence record
      }
  }

  // @RG - read group
  if(0 < rg_sam_num) { // @RG specified on the command line
      // Check for SAM/BAM
      // TODO: this should be possible...
      if(io_in->type == TMAP_SEQ_TYPE_SAM || io_in->type == TMAP_SEQ_TYPE_BAM) {
          tmap_error("Cannot specify the read groups on the command line when using SAM/BAM as input."
                     "  Please embed in the SAM/BAM header instead.", Exit, OutOfRange);
      }
      record = NULL;
      // go through the command line arguments
      for(i=0;i<rg_sam_num;i++) {
          if(strlen(rg_sam[i]) < 4) tmap_error("Read group too small", Exit, OutOfRange);
          if(':' != rg_sam[i][2]) tmap_error("Read group improperly formatted (no colon)", Exit, OutOfRange);

          // check for id
          if('I' == rg_sam[i][0] && 'D' == rg_sam[i][1]) { // new read group
              if(NULL != record) { // add the record
                  tmap_seqs_io_init2_fs_and_add(io_in, header, record); // add @RG.KS and @RG.FO
              }
              record = sam_header_record_init("RG"); // new read group
          }
          // add the tag/value to the record
          if(NULL == record) {
              tmap_error("The read group ID must be specified first", Exit, OutOfRange);
          }
          tag[0]=rg_sam[i][0]; tag[1]=rg_sam[i][1]; // setup the tag
          if(0 == sam_header_record_add(record, tag, rg_sam[i]+3)) tmap_bug(); // add the tag/value
      }
      if(NULL != record) { // add the record
          tmap_seqs_io_init2_fs_and_add(io_in, header, record); // add @RG.KS and @RG.FO
      }
      // check that the # of read groups added was the same as the # of input files...
      sam_header_records_t *records = sam_header_get_records(header, "RG"); // get the header line
      if(records->n != io_in->n) tmap_error("The number of read groups did not match the number of input files", Exit, OutOfRange);
  }
  else if(io_in->type != TMAP_SEQ_TYPE_SAM && io_in->type != TMAP_SEQ_TYPE_BAM) { // dummy...
      for(i=0;i<io_in->n;i++) { // for each input file
          char buf[32];
          record = sam_header_record_init("RG"); // new read group
          if(1 == io_in->n) strcpy(buf, "NOID");
          else if(sprintf(buf, "NOID.%d", i+1) < 0) tmap_bug();
          if(0 == sam_header_record_add(record, "ID", buf)) tmap_bug(); // dummy ID
          if(0 == sam_header_record_add(record, "SM", "NOSM")) tmap_bug(); // dummy SM, for Picard validation
          if(0 == sam_header_record_add(record, "PG", id)) tmap_bug(); // dummy PG
          tmap_seqs_io_init2_fs_and_add(io_in, header, record); // add @RG.KS and @RG.FO
      }
  }
  else {
      // check that SM/PG are present
      sam_header_records_t *records = sam_header_get_records(header, "RG"); // get the header line
      for(i=0;i<records->n;i++) {
          record = records->records[i];
          if(NULL == sam_header_record_get(record, "ID")) tmap_error("Missing @RG.ID in the SAM/BAM Header", Exit, OutOfRange);
          if(NULL == sam_header_record_get(record, "SM")) {
              if(0 == sam_header_record_add(record, "SM", "NOSM")) tmap_bug(); // dummy SM, for Picard validation
          }
          if(NULL == sam_header_record_get(record, "PG")) {
              if(0 == sam_header_record_add(record, "PG", id)) tmap_bug(); // dummy PG
          }
      }
  }

  // @PG - program group
  // TODO: check for previous program group ID and set @PG.PP
  record = sam_header_record_init("PG"); // new program group
  if(0 == sam_header_record_add(record, "ID", id)) tmap_bug(); // @PG.ID
  if(0 == sam_header_record_add(record, "VN", PACKAGE_VERSION)) tmap_bug(); // @PG.VN
  // @PG.CL
  command_line = NULL;
  j = 1; // for the EOL
  command_line = tmap_realloc(command_line, sizeof(char) * j, "command_line");
  command_line[j-1] = '\0';
  for(i=0;i<argc;i++) {
      if(0 < i) j++;
      j += strlen(argv[i]);
      command_line = tmap_realloc(command_line, sizeof(char) * j, "command_line");
      if(0 < i) strcat(command_line, " ");
      strcat(command_line, argv[i]);
      command_line[j-1] = '\0';
  }
  if(0 == sam_header_record_add(record, "CL", command_line)) tmap_bug(); // @PG.CL
  if(NULL != id_pp) { // @PG.PP
      if(0 == sam_header_record_add(record, "PP", id_pp)) tmap_bug(); // @PG.CL
  }
  if(0 == sam_header_add_record(header, record)) tmap_bug(); // add the record
  free(command_line);

  // Check the new SAM Header
  if(0 == sam_header_check(header)) {
      tmap_error("SAM Header was not consistent", Exit, OutOfRange);
  }

  // Create a BAM Header from the SAM Header
  bam_header = bam_header_init(); // empty
  bam_header->header = header; // soft-copy the header
  bam_header = sam_header_to_bam_header(bam_header); // convert

  // free memory
  free(id);
  free(id_pp);

  return bam_header;
}
Exemplo n.º 6
0
samfile_t *samopen(const char *fn, const char *mode, const void *aux)
{
	samfile_t *fp;
	fp = (samfile_t*)calloc(1, sizeof(samfile_t));
	if (mode[0] == 'r') { // read
		fp->type |= TYPE_READ;
		if (mode[1] == 'b') { // binary
			fp->type |= TYPE_BAM;
			fp->x.bam = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r");
			if (fp->x.bam == 0) goto open_err_ret;
			fp->header = bam_header_read(fp->x.bam);
		} else { // text
			fp->x.tamr = sam_open(fn);
			if (fp->x.tamr == 0) goto open_err_ret;
			fp->header = sam_header_read(fp->x.tamr);
			if (fp->header->n_targets == 0) { // no @SQ fields
				if (aux) { // check if aux is present
					bam_header_destroy(fp->header);
					fp->header = sam_header_read2((const char*)aux);
				}
				if (fp->header->n_targets == 0)
					fprintf(stderr, "[samopen] empty header.\n");
			} else fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets);
		}
	} else if (mode[0] == 'w') { // write
		fp->header = bam_header_dup((const bam_header_t*)aux);
		if (mode[1] == 'b') { // binary
			fp->type |= TYPE_BAM;
			fp->x.bam = strcmp(fn, "-")? bam_open(fn, "w") : bam_dopen(fileno(stdout), "w");
			if (fp->x.bam == 0) goto open_err_ret;
			bam_header_write(fp->x.bam, fp->header);
		} else { // text
			// open file
			fp->x.tamw = strcmp(fn, "-")? fopen(fn, "w") : stdout;
			if (fp->x.tamr == 0) goto open_err_ret;
			// write header
			if (strstr(mode, "h")) {
				int i;
				bam_header_t *alt;
				// parse the header text 
				alt = bam_header_init();
				alt->l_text = fp->header->l_text; alt->text = fp->header->text;
				sam_header_parse(alt);
				alt->l_text = 0; alt->text = 0;
				// check if there are @SQ lines in the header
				if (alt->n_targets) { // then write the header text without dumping ->target_{name,len}
					if (alt->n_targets != fp->header->n_targets)
						fprintf(stderr, "[samopen] inconsistent number of target sequences.\n");
					fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw);
				} else { // then dump ->target_{name,len}
					for (i = 0; i < fp->header->n_targets; ++i)
						fprintf(fp->x.tamw, "@SQ\tSN:%s\tLN:%d\n", fp->header->target_name[i], fp->header->target_len[i]);
				}
				bam_header_destroy(alt);
			}
		}
	}
	return fp;

open_err_ret:
	free(fp);
	return 0;
}