/*
 * Build the run state for addreplacerg from the parsed command line options.
 *
 * Opens the input and output files, duplicates the input header for output,
 * and works out which read group ID to apply:
 *   - if opts->rg_line is set, that line is appended to the output header
 *     text (it must not clash with an existing @RG ID);
 *   - else if opts->rg_id is set, it must already be present in the header;
 *   - otherwise the first read group ID found in the header is used.
 *
 * The freshly allocated state is stored in *state_out as soon as it exists
 * (NULL if the allocation itself failed), including on the failure paths
 * below, where it may be only partially populated.
 *
 * Returns true on success, false on failure (a message has been printed).
 */
static bool init(const parsed_opts_t* opts, state_t** state_out)
{
    state_t* retval = calloc(1, sizeof(*retval));
    *state_out = retval;
    if (retval == NULL) {
        fprintf(stderr, "[init] Out of memory allocating state struct.\n");
        return false;
    }

    // Open files
    retval->input_file = sam_open_format(opts->input_name, "r", &opts->ga.in);
    if (retval->input_file == NULL) {
        fprintf(stderr, "[init] Could not open input file: %s\n", opts->input_name);
        return false;
    }
    retval->input_header = sam_hdr_read(retval->input_file);
    if (retval->input_header == NULL) {
        fprintf(stderr, "[init] Could not read header from input file: %s\n", opts->input_name);
        return false;
    }
    retval->output_header = bam_hdr_dup(retval->input_header);
    if (retval->output_header == NULL) {
        fprintf(stderr, "[init] Out of memory duplicating header.\n");
        return false;
    }

    // A missing output name means standard output; use the same string in
    // the error message so that NULL is never handed to %s.
    const char* output_name = opts->output_name == NULL ? "-" : opts->output_name;
    retval->output_file = sam_open_format(output_name, "w", &opts->ga.out);
    if (retval->output_file == NULL) {
        print_error_errno("addreplacerg", "Could not open output file: %s\n", output_name);
        return false;
    }

    if (opts->rg_line) {
        // Append new RG line to header.
        // Check does not already exist
        if ( confirm_rg(retval->output_header, opts->rg_id) ) {
            fprintf(stderr, "[init] ID of new RG line specified conflicts with that of an existing header RG line. Overwrite not yet implemented.\n");
            return false;
        }
        retval->rg_id = strdup(opts->rg_id);
        if (retval->rg_id == NULL) {
            fprintf(stderr, "[init] Out of memory whilst writing new header.\n");
            return false;
        }

        const char* old_text = retval->output_header->text ? retval->output_header->text : "";
        // Old text + new line + '\n' + terminating NUL
        size_t new_len = strlen(old_text) + strlen(opts->rg_line) + 2;
        char* new_header = malloc(new_len);
        if (!new_header) {
            fprintf(stderr, "[init] Out of memory whilst writing new header.\n");
            return false;
        }
        snprintf(new_header, new_len, "%s%s\n", old_text, opts->rg_line);
        free(retval->output_header->text);
        retval->output_header->text = new_header;
        retval->output_header->l_text = (int)new_len - 1;
    } else if (opts->rg_id) {
        // Confirm what has been supplied exists
        if ( !confirm_rg(retval->output_header, opts->rg_id) ) {
            fprintf(stderr, "RG ID supplied does not exist in header. Supply full @RG line with -r instead?\n");
            return false;
        }
        retval->rg_id = strdup(opts->rg_id);
        if (retval->rg_id == NULL) {
            fprintf(stderr, "[init] Out of memory copying RG ID.\n");
            return false;
        }
    } else {
        if ((retval->rg_id = get_first_rgid(retval->output_header)) == NULL ) {
            fprintf(stderr, "No RG specified on command line or in existing header.\n");
            return false;
        }
    }

    switch (opts->mode) {
        case overwrite_all:
            retval->mode_func = &overwrite_all_func;
            break;
        case orphan_only:
            retval->mode_func = &orphan_only_func;
            break;
        default:
            // Without this mode_func would silently stay NULL.
            fprintf(stderr, "[init] Unknown mode requested.\n");
            return false;
    }

    return true;
}
// Set the initial state static state_t* init(parsed_opts_t* opts) { state_t* retval = calloc(sizeof(state_t), 1); if (!retval) { fprintf(pysamerr, "Out of memory"); return NULL; } retval->merged_input_file = sam_open(opts->merged_input_name, "rb"); if (!retval->merged_input_file) { fprintf(pysamerr, "Could not open input file (%s)\n", opts->merged_input_name); free(retval); return NULL; } retval->merged_input_header = sam_hdr_read(retval->merged_input_file); if (opts->unaccounted_name) { if (opts->unaccounted_header_name) { samFile* hdr_load = sam_open(opts->unaccounted_header_name, "r"); if (!hdr_load) { fprintf(pysamerr, "Could not open unaccounted header file (%s)\n", opts->unaccounted_header_name); cleanup_state(retval); return NULL; } retval->unaccounted_header = sam_hdr_read(hdr_load); sam_close(hdr_load); } else { retval->unaccounted_header = bam_hdr_dup(retval->merged_input_header); } retval->unaccounted_file = sam_open(opts->unaccounted_name, "wb"); if (retval->unaccounted_file == NULL) { fprintf(pysamerr, "Could not open unaccounted output file: %s\n", opts->unaccounted_name); cleanup_state(retval); return NULL; } } // Open output files for RGs if (!count_RG(retval->merged_input_header, &retval->output_count, &retval->rg_id)) return NULL; if (opts->verbose) fprintf(pysamerr, "@RG's found %zu\n",retval->output_count); retval->rg_output_file = (samFile**)calloc(retval->output_count, sizeof(samFile*)); retval->rg_output_header = (bam_hdr_t**)calloc(retval->output_count, sizeof(bam_hdr_t*)); retval->rg_hash = kh_init_c2i(); if (!retval->rg_output_file || !retval->rg_output_header) { fprintf(pysamerr, "Could not allocate memory for output file array. Out of memory?"); cleanup_state(retval); return NULL; } char* dirsep = strrchr(opts->merged_input_name, '/'); char* input_base_name = strdup(dirsep? 
dirsep+1 : opts->merged_input_name); if (!input_base_name) { fprintf(pysamerr, "Out of memory\n"); cleanup_state(retval); return NULL; } char* extension = strrchr(input_base_name, '.'); if (extension) *extension = '\0'; size_t i; for (i = 0; i < retval->output_count; i++) { char* output_filename = NULL; if ( ( output_filename = expand_format_string(opts->output_format_string, input_base_name, retval->rg_id[i], i) ) == NULL) { fprintf(pysamerr, "Error expanding output filename format string.\r\n"); cleanup_state(retval); free(input_base_name); return NULL; } retval->rg_output_file[i] = sam_open(output_filename, "wb"); if (retval->rg_output_file[i] == NULL) { fprintf(pysamerr, "Could not open output file: %s\r\n", output_filename); cleanup_state(retval); free(input_base_name); return NULL; } // Record index in hash int ret; khiter_t iter = kh_put_c2i(retval->rg_hash, retval->rg_id[i], &ret); kh_val(retval->rg_hash,iter) = i; // Set and edit header retval->rg_output_header[i] = bam_hdr_dup(retval->merged_input_header); if ( !filter_header_rg(retval->rg_output_header[i], retval->rg_id[i]) ) { fprintf(pysamerr, "Could not rewrite header for file: %s\r\n", output_filename); cleanup_state(retval); free(output_filename); free(input_base_name); return NULL; } free(output_filename); } free(input_base_name); return retval; }
/*! @abstract Merge multiple sorted BAM. @param is_by_qname whether to sort by query name @param out output BAM file name @param mode sam_open() mode to be used to create the final output file (overrides level settings from UNCOMP and LEVEL1 flags) @param headers name of SAM file from which to copy '@' header lines, or NULL to copy them from the first file to be merged @param n number of files to be merged @param fn names of files to be merged @param flag flags that control how the merge is undertaken @param reg region to merge @param n_threads number of threads to use (passed to htslib) @discussion Padding information may NOT correctly maintained. This function is NOT thread safe. */ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char *headers, int n, char * const *fn, int flag, const char *reg, int n_threads) { samFile *fpout, **fp; heap1_t *heap; bam_hdr_t *hout = NULL; int i, j, *RG_len = NULL; uint64_t idx = 0; char **RG = NULL; hts_itr_t **iter = NULL; bam_hdr_t **hdr = NULL; trans_tbl_t *translation_tbl = NULL; // Is there a specified pre-prepared header to use for output? 
if (headers) { samFile* fpheaders = sam_open(headers, "r"); if (fpheaders == NULL) { const char *message = strerror(errno); fprintf(pysamerr, "[bam_merge_core] cannot open '%s': %s\n", headers, message); return -1; } hout = sam_hdr_read(fpheaders); sam_close(fpheaders); } g_is_by_qname = by_qname; fp = (samFile**)calloc(n, sizeof(samFile*)); heap = (heap1_t*)calloc(n, sizeof(heap1_t)); iter = (hts_itr_t**)calloc(n, sizeof(hts_itr_t*)); hdr = (bam_hdr_t**)calloc(n, sizeof(bam_hdr_t*)); translation_tbl = (trans_tbl_t*)calloc(n, sizeof(trans_tbl_t)); // prepare RG tag from file names if (flag & MERGE_RG) { RG = (char**)calloc(n, sizeof(char*)); RG_len = (int*)calloc(n, sizeof(int)); for (i = 0; i != n; ++i) { int l = strlen(fn[i]); const char *s = fn[i]; if (l > 4 && strcmp(s + l - 4, ".bam") == 0) l -= 4; for (j = l - 1; j >= 0; --j) if (s[j] == '/') break; ++j; l -= j; RG[i] = (char*)calloc(l + 1, 1); RG_len[i] = l; strncpy(RG[i], s + j, l); } } // open and read the header from each file for (i = 0; i < n; ++i) { bam_hdr_t *hin; fp[i] = sam_open(fn[i], "r"); if (fp[i] == NULL) { int j; fprintf(pysamerr, "[bam_merge_core] fail to open file %s\n", fn[i]); for (j = 0; j < i; ++j) sam_close(fp[j]); free(fp); free(heap); // FIXME: possible memory leak return -1; } hin = sam_hdr_read(fp[i]); if (hout) trans_tbl_init(hout, hin, translation_tbl+i, flag & MERGE_COMBINE_RG, flag & MERGE_COMBINE_PG); else { // As yet, no headers to merge into... 
hout = bam_hdr_dup(hin); // ...so no need to translate header into itself trans_tbl_init(hout, hin, translation_tbl+i, true, true); } // TODO sam_itr_next() doesn't yet work for SAM files, // so for those keep the headers around for use with sam_read1() if (hts_get_format(fp[i])->format == sam) hdr[i] = hin; else { bam_hdr_destroy(hin); hdr[i] = NULL; } if ((translation_tbl+i)->lost_coord_sort && !by_qname) { fprintf(pysamerr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]); } } // Transform the header into standard form pretty_header(&hout->text,hout->l_text); // If we're only merging a specified region move our iters to start at that point if (reg) { int* rtrans = rtrans_build(n, hout->n_targets, translation_tbl); int tid, beg, end; const char *name_lim = hts_parse_reg(reg, &beg, &end); char *name = malloc(name_lim - reg + 1); memcpy(name, reg, name_lim - reg); name[name_lim - reg] = '\0'; tid = bam_name2id(hout, name); free(name); if (tid < 0) { fprintf(pysamerr, "[%s] Malformated region string or undefined reference name\n", __func__); return -1; } for (i = 0; i < n; ++i) { hts_idx_t *idx = sam_index_load(fp[i], fn[i]); // (rtrans[i*n+tid]) Look up what hout tid translates to in input tid space int mapped_tid = rtrans[i*hout->n_targets+tid]; if (mapped_tid != INT32_MIN) { iter[i] = sam_itr_queryi(idx, mapped_tid, beg, end); } else { iter[i] = sam_itr_queryi(idx, HTS_IDX_NONE, 0, 0); } hts_idx_destroy(idx); if (iter[i] == NULL) break; } free(rtrans); } else { for (i = 0; i < n; ++i) { if (hdr[i] == NULL) { iter[i] = sam_itr_queryi(NULL, HTS_IDX_REST, 0, 0); if (iter[i] == NULL) break; } else iter[i] = NULL; } } if (i < n) { fprintf(pysamerr, "[%s] Memory allocation failed\n", __func__); return -1; } // Load the first read from each file into the heap for (i = 0; i < n; ++i) { heap1_t *h = heap + i; h->i = i; h->b = bam_init1(); if ((iter[i]? 
sam_itr_next(fp[i], iter[i], h->b) : sam_read1(fp[i], hdr[i], h->b)) >= 0) { bam_translate(h->b, translation_tbl + i); h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)((int32_t)h->b->core.pos+1)<<1 | bam_is_rev(h->b); h->idx = idx++; } else { h->pos = HEAP_EMPTY; bam_destroy1(h->b); h->b = NULL; } } // Open output file and write header if ((fpout = sam_open(out, mode)) == 0) { fprintf(pysamerr, "[%s] fail to create the output file.\n", __func__); return -1; } sam_hdr_write(fpout, hout); if (!(flag & MERGE_UNCOMP)) hts_set_threads(fpout, n_threads); // Begin the actual merge ks_heapmake(heap, n, heap); while (heap->pos != HEAP_EMPTY) { bam1_t *b = heap->b; if (flag & MERGE_RG) { uint8_t *rg = bam_aux_get(b, "RG"); if (rg) bam_aux_del(b, rg); bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]); } sam_write1(fpout, hout, b); if ((j = (iter[heap->i]? sam_itr_next(fp[heap->i], iter[heap->i], b) : sam_read1(fp[heap->i], hdr[heap->i], b))) >= 0) { bam_translate(b, translation_tbl + heap->i); heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam_is_rev(b); heap->idx = idx++; } else if (j == -1) { heap->pos = HEAP_EMPTY; bam_destroy1(heap->b); heap->b = NULL; } else fprintf(pysamerr, "[bam_merge_core] '%s' is truncated. Continue anyway.\n", fn[heap->i]); ks_heapadjust(heap, 0, n, heap); } // Clean up and close if (flag & MERGE_RG) { for (i = 0; i != n; ++i) free(RG[i]); free(RG); free(RG_len); } for (i = 0; i < n; ++i) { trans_tbl_destroy(translation_tbl + i); hts_itr_destroy(iter[i]); bam_hdr_destroy(hdr[i]); sam_close(fp[i]); } bam_hdr_destroy(hout); sam_close(fpout); free(translation_tbl); free(fp); free(heap); free(iter); free(hdr); return 0; }
/* Append len bytes from src at *dst, advancing *dst. Nothing is written and
   -1 is returned if that would run past limit; returns 0 otherwise. */
static int fix_header_append(char **dst, const char *limit, const char *src, size_t len)
{
    if ((size_t)(limit - *dst) < len) return -1;
    memcpy(*dst, src, len);
    *dst += len;
    return 0;
}

/*
 * Return a copy of header 'old' adjusted for depadded references.
 *
 * Each target length is replaced by the value get_unpadded_len() reports
 * (left unchanged, with a message, if that call fails). The header text is
 * then rebuilt: in every @SQ line the LN field is rewritten from the new
 * target lengths and the M5 and UR fields are dropped; all other lines and
 * fields are copied verbatim.
 *
 * Returns the new header, or NULL on failure (message printed to stderr).
 * 'old' is never modified.
 */
bam_hdr_t * fix_header(bam_hdr_t *old, faidx_t *fai)
{
    int i = 0, unpadded_len = 0;
    bam_hdr_t *header = bam_hdr_dup(old);
    if (header == NULL) {
        fprintf(stderr, "[depad] Failed to duplicate header\n");
        return NULL;
    }

    for (i = 0; i < old->n_targets; ++i) {
        unpadded_len = get_unpadded_len(fai, old->target_name[i], old->target_len[i]);
        if (unpadded_len < 0) {
            fprintf(stderr, "[depad] ERROR getting unpadded length of '%s', padded length %i\n", old->target_name[i], old->target_len[i]);
        } else {
            header->target_len[i] = unpadded_len;
        }
    }

    /* Duplicating the header allocated new buffer for header string */
    /* After modifying the @SQ lines it will only get smaller, since */
    /* the LN entries will be the same or shorter, and we'll remove */
    /* any MD entries (MD5 checksums). */
    assert(strlen(old->text) == strlen(header->text));
    assert (0==strcmp(old->text, header->text));

    const char *text = old->text;
    char *out = header->text;               /* Reuse the allocated buffer */
    const char *limit = out + strlen(out);  /* Terminator slot; writes stop here */

    while (text[0]=='@') {
        const char *end = strchr(text, '\n');
        if (end == NULL) {
            fprintf(stderr, "[depad] Header line is not newline terminated\n");
            goto fail;
        }
        if (text[1]=='S' && text[2]=='Q' && text[3]=='\t') {
            const char *cp = text+3;
            const char *name = strstr(text, "\tSN:");
            const char *name_end;
            /* An SN: found beyond 'end' belongs to a later line */
            if (!name || name > end) {
                fprintf(stderr, "Unable to find SN: header field\n");
                goto fail;
            }
            name += 4;
            for (name_end = name; name_end != end && *name_end != '\t'; name_end++);
            size_t name_len = name_end - name;

            if (fix_header_append(&out, limit, "@SQ", 3) < 0) goto overflow;

            /* Parse the @SQ lines; cp starts on the tab after "@SQ" */
            while (cp != end) {
                if (end-cp >= 2 && strncmp(cp, "LN", 2) == 0) {
                    // Rewrite the length
                    char len_buf[32];
                    int tid;
                    for (tid = 0; tid < header->n_targets; tid++) {
                        // may want to hash this, but new header API incoming.
                        // Require a whole-name match so that e.g. "chr1"
                        // does not pick up the length of "chr10".
                        const char *tname = header->target_name[tid];
                        if (strncmp(name, tname, name_len) == 0 && tname[name_len] == '\0') {
                            int n = snprintf(len_buf, sizeof(len_buf), "LN:%d", header->target_len[tid]);
                            if (n < 0 || (size_t)n >= sizeof(len_buf)) goto overflow;
                            if (fix_header_append(&out, limit, len_buf, (size_t)n) < 0) goto overflow;
                            break;
                        }
                    }
                    while (cp != end && *cp++ != '\t');
                    if (cp != end && fix_header_append(&out, limit, "\t", 1) < 0) goto overflow;
                } else if (end-cp >= 2 &&
                           (strncmp(cp, "M5", 2) == 0 ||
                            strncmp(cp, "UR", 2) == 0)) {
                    // MD5 changed during depadding; ditch it.
                    // URLs are also invalid.
                    while (cp != end && *cp++ != '\t');
                } else {
                    // Otherwise copy this sub-field verbatim
                    // (including its trailing tab, if it has one)
                    const char *cp_start = cp;
                    while (cp != end && *cp++ != '\t');
                    if (fix_header_append(&out, limit, cp_start, cp - cp_start) < 0) goto overflow;
                }
            }

            // Add newline, replacing a trailing '\t' left behind when the
            // last field on the line was dropped
            if (out[-1] == '\t')
                out[-1] = '\n';
            else if (fix_header_append(&out, limit, "\n", 1) < 0)
                goto overflow;
        } else {
            /* Copy this line to the new header */
            if (fix_header_append(&out, limit, text, end - text + 1) < 0) goto overflow;
        }

        text = end + 1;
    }
    *out = '\0';
    assert (text[0]=='\0');

    size_t new_len = out - header->text;
    if (new_len < (size_t)header->l_text) {
        /* Give back the unused tail; if that fails the larger buffer is
           still valid, so just keep it */
        char *shrunk = realloc(header->text, new_len + 1);
        if (shrunk) header->text = shrunk;
        header->l_text = new_len;
    }
    return header;

overflow:
    fprintf(stderr, "[depad] Rewritten header does not fit in the original header buffer\n");
fail:
    bam_hdr_destroy(header);
    return NULL;
}
/*
 * Check the files are consistent and capable of being concatenated.
 * Also fills out the rg2id read-group hash and the version numbers
 * and produces a new bam_hdr_t structure with merged RG lines.
 * Note it is only a simple merge, as we lack the niceties of a proper
 * header API.
 *
 * If h is non-NULL the merged header starts as a copy of it; otherwise it
 * starts as a copy of the first file's header.
 *
 * Returns updated header on success;
 *        NULL on failure. On failure the partially built header and any
 *        file opened here are released, *vers_maj_p / *vers_min_p are left
 *        untouched, and *rg2id may already have been assigned (it is not
 *        freed here).
 */
static bam_hdr_t *cram_cat_check_hdr(int nfn, char * const *fn, const bam_hdr_t *h,
                                     khash_s2i **rg2id, int *vers_maj_p, int *vers_min_p)
{
    int i, vers_maj = -1, vers_min = -1;
    bam_hdr_t *new_h = NULL, *old = NULL;
    samFile *in = NULL;
    khash_s2i *rg2id_in = NULL;

    if (h) {
        new_h = bam_hdr_dup(h);
        if (!new_h) {
            fprintf(stderr, "[%s] ERROR: failed to duplicate header.\n", __func__);
            goto fail;
        }
        *rg2id = hash_rg(new_h);
        if (!*rg2id) {
            fprintf(stderr, "[%s] ERROR: failed to build @RG hash.\n", __func__);
            goto fail;
        }
    }

    for (i = 0; i < nfn; ++i) {
        cram_fd *in_c;
        khint_t ki;
        int new_rg = -1;

        in = sam_open(fn[i], "rc");
        if (in == 0) {
            fprintf(stderr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]);
            goto fail;
        }
        in_c = in->fp.cram;

        // Every input must share the version of the first one
        int vmaj = cram_major_vers(in_c);
        int vmin = cram_minor_vers(in_c);
        if ((vers_maj != -1 && vers_maj != vmaj) ||
            (vers_min != -1 && vers_min != vmin)) {
            fprintf(stderr, "[%s] ERROR: input files have differing version numbers.\n",
                    __func__);
            goto fail;
        }
        vers_maj = vmaj;
        vers_min = vmin;

        old = sam_hdr_read(in);
        if (!old) {
            fprintf(stderr, "[%s] ERROR: fail to read the header of file '%s'.\n", __func__, fn[i]);
            goto fail;
        }
        rg2id_in = hash_rg(old);
        if (!rg2id_in) {
            fprintf(stderr, "[%s] ERROR: failed to build @RG hash.\n", __func__);
            goto fail;
        }

        if (!new_h) {
            new_h = bam_hdr_dup(old);
            if (!new_h) {
                fprintf(stderr, "[%s] ERROR: failed to duplicate header.\n", __func__);
                goto fail;
            }
            *rg2id = hash_rg(new_h);
            if (!*rg2id) {
                fprintf(stderr, "[%s] ERROR: failed to build @RG hash.\n", __func__);
                goto fail;
            }
        }

        // Add any existing @RG entries to our global @RG hash.
        for (ki = 0; ki < rg2id_in->n_id; ki++) {
            int added;

            new_rg = hash_s2i_inc(*rg2id, rg2id_in->id[ki], rg2id_in->line[ki], &added);

            if (added) {
                // Also add to new_h
                const char *line = rg2id_in->line[ki];
                const char *line_end = line;
                // Stop after the newline, or at the terminator if there is none
                while (*line_end && *line_end++ != '\n')
                    ;
                size_t add_len = line_end - line;

                // Grow through a temporary so the old text survives a failure
                char *text = realloc(new_h->text, new_h->l_text + add_len + 1);
                if (!text) {
                    fprintf(stderr, "[%s] ERROR: out of memory extending header.\n", __func__);
                    goto fail;
                }
                memcpy(text + new_h->l_text, line, add_len);
                new_h->l_text += add_len;
                text[new_h->l_text] = '\0';
                new_h->text = text;
            }

            if (new_rg != ki && rg2id_in->n_id > 1) {
                fprintf(stderr, "[%s] ERROR: Same size @RG lists but differing order / contents\n",
                        __func__);
                goto fail;
            }
        }

        hash_s2i_free(rg2id_in);
        rg2id_in = NULL;
        bam_hdr_destroy(old);
        old = NULL;
        sam_close(in);
        in = NULL;
    }

    *vers_maj_p = vers_maj;
    *vers_min_p = vers_min;

    return new_h;

fail:
    if (rg2id_in) hash_s2i_free(rg2id_in);
    if (old) bam_hdr_destroy(old);
    if (in) sam_close(in);
    if (new_h) bam_hdr_destroy(new_h);
    return NULL;
}
/**
 * @brief Deep-copy an existing bam_hdr_t by delegating to bam_hdr_dup().
 * @param original  htslib raw BAM header pointer to copy
 * @return the pointer returned by bam_hdr_dup() for @p original
 */
bam_hdr_t* sam_header_deep_copy(bam_hdr_t* original)
{
    bam_hdr_t* duplicate = bam_hdr_dup(original);
    return duplicate;
}