Пример #1
0
/*
 * Build the run state for addreplacerg from the parsed options.
 *
 * Opens the input and output files, duplicates the input header for output,
 * then either appends opts->rg_line to the output header or resolves the
 * read-group ID to use, and finally selects the per-record mode function.
 *
 * @param opts       parsed command-line options (not modified)
 * @param state_out  receives the newly allocated state as soon as it exists
 * @return true on success; false on failure, after printing a message.
 *
 * On failure the partially initialised state is left in *state_out and is not
 * released here. NOTE(review): presumably the caller releases it - confirm.
 */
static bool init(const parsed_opts_t* opts, state_t** state_out) {
    state_t* retval = (state_t*) calloc(1, sizeof(state_t));
    if (retval == NULL) {
        fprintf(stderr, "[init] Out of memory allocating state struct.\n");
        return false;
    }
    *state_out = retval;

    // Open files
    retval->input_file = sam_open_format(opts->input_name, "r", &opts->ga.in);
    if (retval->input_file == NULL) {
        fprintf(stderr, "[init] Could not open input file: %s\n", opts->input_name);
        return false;
    }
    retval->input_header = sam_hdr_read(retval->input_file);
    if (retval->input_header == NULL) {
        // Without a header every later step would dereference NULL.
        fprintf(stderr, "[init] Could not read header for file: %s\n", opts->input_name);
        return false;
    }

    retval->output_header = bam_hdr_dup(retval->input_header);
    if (retval->output_header == NULL) {
        fprintf(stderr, "[init] Out of memory duplicating header.\n");
        return false;
    }

    // A missing output name means standard output; use the resolved name in
    // messages too so that NULL is never handed to a %s conversion.
    const char* output_name = opts->output_name == NULL ? "-" : opts->output_name;
    retval->output_file = sam_open_format(output_name, "w", &opts->ga.out);

    if (retval->output_file == NULL) {
        print_error_errno("addreplacerg", "Could not open output file: %s\n", output_name);
        return false;
    }

    if (opts->rg_line) {
        // Append new RG line to header.
        // Check does not already exist
        if ( confirm_rg(retval->output_header, opts->rg_id) ) {
            fprintf(stderr, "[init] ID of new RG line specified conflicts with that of an existing header RG line. Overwrite not yet implemented.\n");
            return false;
        }
        retval->rg_id = strdup(opts->rg_id);
        if (retval->rg_id == NULL) {
            fprintf(stderr, "[init] Out of memory whilst copying RG ID.\n");
            return false;
        }

        // An input without any header text is treated as an empty header.
        const char* old_text = retval->output_header->text ? retval->output_header->text : "";
        // Old text + new line + '\n' + terminating NUL.
        size_t new_len = strlen(old_text) + strlen(opts->rg_line) + 2;
        char* new_header = malloc(new_len);
        if (!new_header) {
            fprintf(stderr, "[init] Out of memory whilst writing new header.\n");
            return false;
        }
        snprintf(new_header, new_len, "%s%s\n", old_text, opts->rg_line);
        free(retval->output_header->text);
        retval->output_header->text = new_header;
        retval->output_header->l_text = (int)new_len - 1;  // length excludes the NUL
    } else {
        if (opts->rg_id) {
            // Confirm what has been supplied exists
            if ( !confirm_rg(retval->output_header, opts->rg_id) ) {
                fprintf(stderr, "RG ID supplied does not exist in header. Supply full @RG line with -r instead?\n");
                return false;
            }
            retval->rg_id = strdup(opts->rg_id);
            if (retval->rg_id == NULL) {
                fprintf(stderr, "[init] Out of memory whilst copying RG ID.\n");
                return false;
            }
        } else {
            // Nothing given on the command line: fall back to the header.
            if ((retval->rg_id = get_first_rgid(retval->output_header)) == NULL ) {
                fprintf(stderr, "No RG specified on command line or in existing header.\n");
                return false;
            }
        }
    }

    // Select the function applied to each record.
    switch (opts->mode) {
        case overwrite_all:
            retval->mode_func = &overwrite_all_func;
            break;
        case orphan_only:
            retval->mode_func = &orphan_only_func;
            break;
    }

    return true;
}
Пример #2
0
// Set the initial state
//
// Opens the merged input file and reads its header, optionally opens the
// "unaccounted" output (with a header taken from a separate file or copied
// from the input), then opens one output file per @RG found in the input
// header, giving each a header filtered down to that read group.
//
// Returns the new state on success, or NULL on failure after printing a
// message to pysamerr. Resources acquired before a failure are released
// through cleanup_state().
static state_t* init(parsed_opts_t* opts)
{
    state_t* retval = calloc(1, sizeof(state_t));
    if (!retval) {
        fprintf(pysamerr, "Out of memory");
        return NULL;
    }

    retval->merged_input_file = sam_open(opts->merged_input_name, "rb");
    if (!retval->merged_input_file) {
        fprintf(pysamerr, "Could not open input file (%s)\n", opts->merged_input_name);
        free(retval);
        return NULL;
    }
    retval->merged_input_header = sam_hdr_read(retval->merged_input_file);
    if (!retval->merged_input_header) {
        // Everything below needs the header; fail now instead of crashing later.
        fprintf(pysamerr, "Could not read header for file (%s)\n", opts->merged_input_name);
        cleanup_state(retval);
        return NULL;
    }

    if (opts->unaccounted_name) {
        if (opts->unaccounted_header_name) {
            samFile* hdr_load = sam_open(opts->unaccounted_header_name, "r");
            if (!hdr_load) {
                fprintf(pysamerr, "Could not open unaccounted header file (%s)\n", opts->unaccounted_header_name);
                cleanup_state(retval);
                return NULL;
            }
            retval->unaccounted_header = sam_hdr_read(hdr_load);
            sam_close(hdr_load);
        } else {
            retval->unaccounted_header = bam_hdr_dup(retval->merged_input_header);
        }
        if (!retval->unaccounted_header) {
            fprintf(pysamerr, "Could not obtain header for unaccounted output file: %s\n", opts->unaccounted_name);
            cleanup_state(retval);
            return NULL;
        }

        retval->unaccounted_file = sam_open(opts->unaccounted_name, "wb");
        if (retval->unaccounted_file == NULL) {
            fprintf(pysamerr, "Could not open unaccounted output file: %s\n", opts->unaccounted_name);
            cleanup_state(retval);
            return NULL;
        }
    }

    // Open output files for RGs
    if (!count_RG(retval->merged_input_header, &retval->output_count, &retval->rg_id)) {
        // Previously returned without releasing the state and its open files.
        cleanup_state(retval);
        return NULL;
    }
    if (opts->verbose) fprintf(pysamerr, "@RG's found %zu\n",retval->output_count);

    retval->rg_output_file = (samFile**)calloc(retval->output_count, sizeof(samFile*));
    retval->rg_output_header = (bam_hdr_t**)calloc(retval->output_count, sizeof(bam_hdr_t*));
    retval->rg_hash = kh_init_c2i();
    if (!retval->rg_output_file || !retval->rg_output_header || !retval->rg_hash) {
        fprintf(pysamerr, "Could not allocate memory for output file array. Out of memory?");
        // NOTE(review): output_count is already set while one of the arrays may
        // be NULL here - confirm cleanup_state() copes with that combination.
        cleanup_state(retval);
        return NULL;
    }

    // Base name of the input (directory and final extension stripped), used
    // when expanding the output file name format string.
    char* dirsep = strrchr(opts->merged_input_name, '/');
    char* input_base_name = strdup(dirsep? dirsep+1 : opts->merged_input_name);
    if (!input_base_name) {
        fprintf(pysamerr, "Out of memory\n");
        cleanup_state(retval);
        return NULL;
    }
    char* extension = strrchr(input_base_name, '.');
    if (extension) *extension = '\0';

    size_t i;
    for (i = 0; i < retval->output_count; i++) {
        char* output_filename = NULL;

        if ( ( output_filename = expand_format_string(opts->output_format_string, input_base_name, retval->rg_id[i], i) ) == NULL) {
            fprintf(pysamerr, "Error expanding output filename format string.\r\n");
            cleanup_state(retval);
            free(input_base_name);
            return NULL;
        }

        retval->rg_output_file[i] = sam_open(output_filename, "wb");
        if (retval->rg_output_file[i] == NULL) {
            fprintf(pysamerr, "Could not open output file: %s\r\n", output_filename);
            cleanup_state(retval);
            free(output_filename);  // was leaked on this path
            free(input_base_name);
            return NULL;
        }

        // Record index in hash
        int ret;
        khiter_t iter = kh_put_c2i(retval->rg_hash, retval->rg_id[i], &ret);
        if (ret < 0) {
            // A failed insertion leaves no valid slot to store the index in.
            fprintf(pysamerr, "Could not record read group in hash for file: %s\r\n", output_filename);
            cleanup_state(retval);
            free(output_filename);
            free(input_base_name);
            return NULL;
        }
        kh_val(retval->rg_hash,iter) = i;

        // Set and edit header
        retval->rg_output_header[i] = bam_hdr_dup(retval->merged_input_header);
        if ( !retval->rg_output_header[i] ||
             !filter_header_rg(retval->rg_output_header[i], retval->rg_id[i]) ) {
            fprintf(pysamerr, "Could not rewrite header for file: %s\r\n", output_filename);
            cleanup_state(retval);
            free(output_filename);
            free(input_base_name);
            return NULL;
        }
        free(output_filename);
    }

    free(input_base_name);

    return retval;
}
Пример #3
0
/*!
  @abstract    Merge multiple sorted BAM.
  @param  by_qname    whether to sort by query name
  @param  out         output BAM file name
  @param  mode        sam_open() mode to be used to create the final output file
                      (overrides level settings from UNCOMP and LEVEL1 flags)
  @param  headers     name of SAM file from which to copy '@' header lines,
                      or NULL to copy them from the first file to be merged
  @param  n           number of files to be merged (must be at least 1)
  @param  fn          names of files to be merged
  @param  flag        flags that control how the merge is undertaken
  @param  reg         region to merge
  @param  n_threads   number of threads to use (passed to htslib)
  @return 0 on success, -1 on failure (a message is written to pysamerr)
  @discussion Padding information may NOT be correctly maintained. This
  function is NOT thread safe.

  All resources acquired by this function are released on every exit path.
 */
int bam_merge_core2(int by_qname, const char *out, const char *mode, const char *headers, int n, char * const *fn, int flag, const char *reg, int n_threads)
{
    samFile *fpout = NULL, **fp = NULL;
    heap1_t *heap = NULL;
    bam_hdr_t *hout = NULL;
    int i, j, *RG_len = NULL;
    int n_tbl = 0;   // number of translation tables initialised so far
    int ret = -1;    // pessimistic: only set to 0 once the merge completed
    int *rtrans = NULL;
    uint64_t idx = 0;
    char **RG = NULL;
    hts_itr_t **iter = NULL;
    bam_hdr_t **hdr = NULL;
    trans_tbl_t *translation_tbl = NULL;

    if (n < 1) {
        // With no inputs there is nothing to seed the heap (or, without
        // 'headers', the output header) from.
        fprintf(pysamerr, "[%s] no input files to merge\n", __func__);
        return -1;
    }

    // Is there a specified pre-prepared header to use for output?
    if (headers) {
        samFile* fpheaders = sam_open(headers, "r");
        if (fpheaders == NULL) {
            const char *message = strerror(errno);
            fprintf(pysamerr, "[bam_merge_core] cannot open '%s': %s\n", headers, message);
            return -1;
        }
        hout = sam_hdr_read(fpheaders);
        sam_close(fpheaders);
        if (hout == NULL) {
            // Do not silently fall back to the first input's header.
            fprintf(pysamerr, "[bam_merge_core] cannot read headers from '%s'\n", headers);
            return -1;
        }
    }

    g_is_by_qname = by_qname;
    fp = (samFile**)calloc(n, sizeof(samFile*));
    heap = (heap1_t*)calloc(n, sizeof(heap1_t));
    iter = (hts_itr_t**)calloc(n, sizeof(hts_itr_t*));
    hdr = (bam_hdr_t**)calloc(n, sizeof(bam_hdr_t*));
    translation_tbl = (trans_tbl_t*)calloc(n, sizeof(trans_tbl_t));
    if (!fp || !heap || !iter || !hdr || !translation_tbl) goto mem_fail;

    // prepare RG tag from file names
    if (flag & MERGE_RG) {
        RG = (char**)calloc(n, sizeof(char*));
        RG_len = (int*)calloc(n, sizeof(int));
        if (!RG || !RG_len) goto mem_fail;
        for (i = 0; i != n; ++i) {
            int l = strlen(fn[i]);
            const char *s = fn[i];
            // Strip a trailing ".bam" and any leading directory components.
            if (l > 4 && strcmp(s + l - 4, ".bam") == 0) l -= 4;
            for (j = l - 1; j >= 0; --j) if (s[j] == '/') break;
            ++j; l -= j;
            RG[i] = (char*)calloc(l + 1, 1);
            if (!RG[i]) goto mem_fail;
            RG_len[i] = l;
            memcpy(RG[i], s + j, l);  // buffer is zero-filled, so already terminated
        }
    }
    // open and read the header from each file
    for (i = 0; i < n; ++i) {
        bam_hdr_t *hin;
        fp[i] = sam_open(fn[i], "r");
        if (fp[i] == NULL) {
            fprintf(pysamerr, "[bam_merge_core] fail to open file %s\n", fn[i]);
            goto cleanup;
        }
        hin = sam_hdr_read(fp[i]);
        if (hin == NULL) {
            fprintf(pysamerr, "[bam_merge_core] fail to read the header of file %s\n", fn[i]);
            goto cleanup;
        }
        if (hout)
            trans_tbl_init(hout, hin, translation_tbl+i, flag & MERGE_COMBINE_RG, flag & MERGE_COMBINE_PG);
        else {
            // As yet, no headers to merge into...
            hout = bam_hdr_dup(hin);
            if (hout == NULL) {
                bam_hdr_destroy(hin);
                goto mem_fail;
            }
            // ...so no need to translate header into itself
            trans_tbl_init(hout, hin, translation_tbl+i, true, true);
        }
        n_tbl = i + 1;

        // TODO sam_itr_next() doesn't yet work for SAM files,
        // so for those keep the headers around for use with sam_read1()
        if (hts_get_format(fp[i])->format == sam) hdr[i] = hin;
        else { bam_hdr_destroy(hin); hdr[i] = NULL; }

        if ((translation_tbl+i)->lost_coord_sort && !by_qname) {
            fprintf(pysamerr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]);
        }
    }

    // Transform the header into standard form
    pretty_header(&hout->text,hout->l_text);

    // If we're only merging a specified region move our iters to start at that point
    if (reg) {
        int tid, beg, end;
        const char *name_lim;
        char *name;

        rtrans = rtrans_build(n, hout->n_targets, translation_tbl);

        name_lim = hts_parse_reg(reg, &beg, &end);
        if (name_lim == NULL) {
            fprintf(pysamerr, "[%s] Malformated region string or undefined reference name\n", __func__);
            goto cleanup;
        }
        name = malloc(name_lim - reg + 1);
        if (name == NULL) goto mem_fail;
        memcpy(name, reg, name_lim - reg);
        name[name_lim - reg] = '\0';
        tid = bam_name2id(hout, name);
        free(name);
        if (tid < 0) {
            fprintf(pysamerr, "[%s] Malformated region string or undefined reference name\n", __func__);
            goto cleanup;
        }
        for (i = 0; i < n; ++i) {
            hts_idx_t *file_idx = sam_index_load(fp[i], fn[i]);
            // (rtrans[i*n+tid]) Look up what hout tid translates to in input tid space
            int mapped_tid = rtrans[i*hout->n_targets+tid];
            if (mapped_tid != INT32_MIN) {
                if (file_idx == NULL) {
                    // A positional query cannot be answered without an index.
                    fprintf(pysamerr, "[%s] failed to load index for '%s'\n", __func__, fn[i]);
                    goto cleanup;
                }
                iter[i] = sam_itr_queryi(file_idx, mapped_tid, beg, end);
            } else {
                iter[i] = sam_itr_queryi(file_idx, HTS_IDX_NONE, 0, 0);
            }
            hts_idx_destroy(file_idx);
            if (iter[i] == NULL) goto mem_fail;
        }
        free(rtrans);
        rtrans = NULL;
    } else {
        for (i = 0; i < n; ++i) {
            if (hdr[i] == NULL) {
                iter[i] = sam_itr_queryi(NULL, HTS_IDX_REST, 0, 0);
                if (iter[i] == NULL) goto mem_fail;
            }
            else iter[i] = NULL;
        }
    }

    // Load the first read from each file into the heap
    for (i = 0; i < n; ++i) {
        heap1_t *h = heap + i;
        h->i = i;
        h->b = bam_init1();
        if (h->b == NULL) goto mem_fail;
        if ((iter[i]? sam_itr_next(fp[i], iter[i], h->b) : sam_read1(fp[i], hdr[i], h->b)) >= 0) {
            bam_translate(h->b, translation_tbl + i);
            h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)((int32_t)h->b->core.pos+1)<<1 | bam_is_rev(h->b);
            h->idx = idx++;
        }
        else {
            h->pos = HEAP_EMPTY;
            bam_destroy1(h->b);
            h->b = NULL;
        }
    }

    // Open output file and write header
    if ((fpout = sam_open(out, mode)) == 0) {
        fprintf(pysamerr, "[%s] fail to create the output file.\n", __func__);
        goto cleanup;
    }
    if (sam_hdr_write(fpout, hout) < 0) {
        fprintf(pysamerr, "[%s] fail to write the header to the output file.\n", __func__);
        goto cleanup;
    }
    if (!(flag & MERGE_UNCOMP)) hts_set_threads(fpout, n_threads);

    // Begin the actual merge
    ks_heapmake(heap, n, heap);
    while (heap->pos != HEAP_EMPTY) {
        bam1_t *b = heap->b;
        if (flag & MERGE_RG) {
            uint8_t *rg = bam_aux_get(b, "RG");
            if (rg) bam_aux_del(b, rg);
            bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]);
        }
        if (sam_write1(fpout, hout, b) < 0) {
            fprintf(pysamerr, "[%s] fail to write to the output file.\n", __func__);
            goto cleanup;
        }
        if ((j = (iter[heap->i]? sam_itr_next(fp[heap->i], iter[heap->i], b) : sam_read1(fp[heap->i], hdr[heap->i], b))) >= 0) {
            bam_translate(b, translation_tbl + heap->i);
            heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam_is_rev(b);
            heap->idx = idx++;
        } else if (j == -1) {
            heap->pos = HEAP_EMPTY;
            bam_destroy1(heap->b);
            heap->b = NULL;
        } else fprintf(pysamerr, "[bam_merge_core] '%s' is truncated. Continue anyway.\n", fn[heap->i]);
        ks_heapadjust(heap, 0, n, heap);
    }

    ret = 0;
    goto cleanup;

mem_fail:
    fprintf(pysamerr, "[%s] Memory allocation failed\n", __func__);

    // Clean up and close. Shared by the success path and all failure paths,
    // so every pointer is checked before it is indexed.
cleanup:
    free(rtrans);
    if (RG) {
        for (i = 0; i != n; ++i) free(RG[i]);
    }
    free(RG); free(RG_len);
    // Only tables that went through trans_tbl_init() may be destroyed.
    for (i = 0; i < n_tbl; ++i) trans_tbl_destroy(translation_tbl + i);
    for (i = 0; i < n; ++i) {
        if (heap) bam_destroy1(heap[i].b);  // all NULL after a completed merge
        if (iter) hts_itr_destroy(iter[i]);
        if (hdr) bam_hdr_destroy(hdr[i]);
        if (fp && fp[i]) sam_close(fp[i]);
    }
    bam_hdr_destroy(hout);
    if (fpout) {
        // Closing flushes buffered output, so a failure here means data loss.
        if (sam_close(fpout) < 0 && ret == 0) {
            fprintf(pysamerr, "[%s] error closing the output file.\n", __func__);
            ret = -1;
        }
    }
    free(translation_tbl); free(fp); free(heap); free(iter); free(hdr);
    return ret;
}
Пример #4
0
/*
 * Build a copy of a header whose reference lengths are the unpadded lengths.
 *
 * The target_len array of the copy is updated from get_unpadded_len(), and
 * the header text is rewritten so that each @SQ line carries the new LN value
 * and no longer carries M5 or UR fields. All other lines are copied as is.
 *
 * @param old  header to convert (not modified)
 * @param fai  FASTA index handed to get_unpadded_len()
 * @return the new header, or NULL on failure (nothing is leaked on failure)
 */
bam_hdr_t * fix_header(bam_hdr_t *old, faidx_t *fai)
{
    int i = 0, unpadded_len = 0;
    bam_hdr_t *header = 0 ;

    header = bam_hdr_dup(old);
    if (!header) {
        fprintf(stderr, "[depad] Out of memory duplicating header\n");
        return NULL;
    }
    for (i = 0; i < old->n_targets; ++i) {
        unpadded_len = get_unpadded_len(fai, old->target_name[i], old->target_len[i]);
        if (unpadded_len < 0) {
            fprintf(stderr, "[depad] ERROR getting unpadded length of '%s', padded length %i\n", old->target_name[i], old->target_len[i]);
        } else {
            header->target_len[i] = unpadded_len;
        }
    }
    /* Duplicating the header allocated new buffer for header string */
    /* After modifying the @SQ lines it will only get smaller, since */
    /* the LN entries will be the same or shorter, and we'll remove */
    /* any MD entries (MD5 checksums). */
    assert(strlen(old->text) == strlen(header->text));
    assert (0==strcmp(old->text, header->text));
    const char *text;
    text = old->text;
    header->text[0] = '\0'; /* Reuse the allocated buffer */
    char * newtext = header->text;
    char * end=NULL;
    while (text[0]=='@') {
        end = strchr(text, '\n');
        assert(end != 0);
        if (text[1]=='S' && text[2]=='Q' && text[3]=='\t') {
            const char *cp = text+3;
            char *name = strstr(text, "\tSN:");
            char *name_end;
            /* A match beyond this line's newline belongs to a later line. */
            if (!name || name >= end) {
                fprintf(stderr, "Unable to find SN: header field\n");
                bam_hdr_destroy(header); /* do not leak the partial copy */
                return NULL;
            }
            name += 4;
            for (name_end = name; name_end != end && *name_end != '\t'; name_end++);
            size_t name_len = name_end - name;
            strcat(newtext, "@SQ");

            /* Parse the @SQ lines */
            while (cp != end) {
                if (end-cp >= 2 && strncmp(cp, "LN", 2) == 0) {
                    // Rewrite the length
                    char len_buf[100];
                    int tid;
                    for (tid = 0; tid < header->n_targets; tid++) {
                        // may want to hash this, but new header API incoming.
                        // Require the whole target name to match, so that
                        // e.g. "chr1" does not pick up the length of "chr10".
                        if (strncmp(name, header->target_name[tid], name_len) == 0 &&
                            header->target_name[tid][name_len] == '\0') {
                            snprintf(len_buf, sizeof len_buf, "LN:%d", header->target_len[tid]);
                            strcat(newtext, len_buf);
                            break;
                        }
                    }
                    while (cp != end && *cp++ != '\t');
                    if (cp != end)
                        strcat(newtext, "\t");
                } else if (end-cp >= 2 &&
                           (strncmp(cp, "M5", 2) == 0 ||
                            strncmp(cp, "UR", 2) == 0)) {
                    // MD5 changed during depadding; ditch it.
                    // URLs are also invalid.
                    while (cp != end && *cp++ != '\t');
                } else {
                    // Otherwise copy this sub-field verbatim
                    const char *cp_start = cp;
                    while (cp != end && *cp++ != '\t');
                    strncat(newtext, cp_start, cp-cp_start);
                }
            }

            // Add newline, replacing trailing '\t' if last on line was the LN:
            char *text_end = newtext + strlen(newtext);
            if (text_end[-1] == '\t')
                text_end[-1] = '\n';
            else
                *text_end++ = '\n', *text_end = '\0';
        } else {
            /* Copy this line to the new header */
            strncat(newtext, text, end - text + 1);
        }
        text = end + 1;
    }
    assert (text[0]=='\0');
    /* Check we didn't overflow the buffer */
    assert (strlen(header->text) <= strlen(old->text));
    size_t new_len = strlen(header->text);
    if (new_len < header->l_text) {
        /* Shrink the buffer to fit. If that fails the existing, larger */
        /* buffer is still valid, so just keep using it. */
        char *shrunk = realloc(header->text, new_len + 1);
        if (shrunk)
            header->text = shrunk;
        header->l_text = new_len;
    }
    return header;
}
Пример #5
0
/*
 * Check the files are consistent and capable of being concatenated.
 * Also fills out the rg2id read-group hash and the version numbers
 * and produces a new bam_hdr_t structure with merged RG lines.
 * Note it is only a simple merge, as we lack the niceties of a proper
 * header API.
 *
 * nfn / fn     number and names of the CRAM files to check
 * h            optional header to start from; when NULL the header of the
 *              first file is used instead
 * rg2id        receives the read-group hash built for the new header
 * vers_maj_p,
 * vers_min_p   receive the CRAM version shared by all inputs (only written
 *              on success)
 *
 * Returns updated header on success;
 *        NULL on failure.
 *
 * On failure the per-file resources and the partially built header are
 * released here. *rg2id is left as it is.
 * NOTE(review): confirm whether callers free *rg2id after a failure.
 */
static bam_hdr_t *cram_cat_check_hdr(int nfn, char * const *fn, const bam_hdr_t *h,
                                     khash_s2i **rg2id, int *vers_maj_p, int *vers_min_p) {
    int i, vers_maj = -1, vers_min = -1;
    bam_hdr_t *new_h = NULL;
    // Per-file resources live at function scope so that the shared failure
    // path can release whatever the current iteration holds.
    samFile *in = NULL;
    bam_hdr_t *old = NULL;
    khash_s2i *rg2id_in = NULL;

    if (h) {
        new_h = bam_hdr_dup(h);
        if (!new_h) {
            fprintf(stderr, "[%s] ERROR: out of memory duplicating header.\n", __func__);
            return NULL;
        }
        *rg2id = hash_rg(new_h);
    }

    for (i = 0; i < nfn; ++i) {
        cram_fd *in_c;
        khint_t ki;
        int new_rg = -1;

        in = sam_open(fn[i], "rc");
        if (in == 0) {
            fprintf(stderr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]);
            goto fail;
        }
        in_c = in->fp.cram;

        int vmaj = cram_major_vers(in_c);
        int vmin = cram_minor_vers(in_c);
        if ((vers_maj != -1 && vers_maj != vmaj) ||
            (vers_min != -1 && vers_min != vmin)) {
            fprintf(stderr, "[%s] ERROR: input files have differing version numbers.\n",
                    __func__);
            goto fail;
        }
        vers_maj = vmaj;
        vers_min = vmin;

        old = sam_hdr_read(in);
        if (!old) {
            fprintf(stderr, "[%s] ERROR: fail to read the header of file '%s'.\n", __func__, fn[i]);
            goto fail;
        }
        rg2id_in = hash_rg(old);
        if (!rg2id_in) {
            fprintf(stderr, "[%s] ERROR: fail to index @RG lines of file '%s'.\n", __func__, fn[i]);
            goto fail;
        }

        if (!new_h) {
            new_h = bam_hdr_dup(old);
            if (!new_h) {
                fprintf(stderr, "[%s] ERROR: out of memory duplicating header.\n", __func__);
                goto fail;
            }
            *rg2id = hash_rg(new_h);
        }
        if (!*rg2id) {
            fprintf(stderr, "[%s] ERROR: fail to index @RG lines of the output header.\n", __func__);
            goto fail;
        }

        // Add any existing @RG entries to our global @RG hash.
        for (ki = 0; ki < rg2id_in->n_id; ki++) {
            int added;

            new_rg = hash_s2i_inc(*rg2id, rg2id_in->id[ki], rg2id_in->line[ki], &added);

            if (added) {
                // Also add to new_h
                const char *line = rg2id_in->line[ki];
                const char *line_end = line;
                // Advance to just past the newline, stopping at the end of the
                // text if the last line is unterminated.
                while (*line_end && *line_end++ != '\n')
                    ;
                size_t add_len = line_end - line;
                // Grow via a temporary so the old buffer survives a failure.
                char *grown = realloc(new_h->text, new_h->l_text + add_len + 1);
                if (!grown) {
                    fprintf(stderr, "[%s] ERROR: out of memory extending header.\n", __func__);
                    goto fail;
                }
                new_h->text = grown;
                memcpy(new_h->text + new_h->l_text, line, add_len);
                new_h->l_text += add_len;
                new_h->text[new_h->l_text] = '\0';
            }

            if (new_rg != ki && rg2id_in->n_id > 1) {
                fprintf(stderr, "[%s] ERROR: Same size @RG lists but differing order / contents\n",
                        __func__);
                goto fail;
            }
        }

        hash_s2i_free(rg2id_in);
        rg2id_in = NULL;
        bam_hdr_destroy(old);
        old = NULL;
        sam_close(in);
        in = NULL;
    }

    *vers_maj_p = vers_maj;
    *vers_min_p = vers_min;

    return new_h;

fail:
    if (rg2id_in) hash_s2i_free(rg2id_in);
    if (old) bam_hdr_destroy(old);
    if (in) sam_close(in);
    if (new_h) bam_hdr_destroy(new_h);
    return NULL;
}
Пример #6
0
/**
  * @brief duplicate a bam_hdr_t by delegating to bam_hdr_dup()
  * @param original an htslib raw bam header pointer
  * @return whatever bam_hdr_dup() returns for the given header
  */
bam_hdr_t* sam_header_deep_copy(bam_hdr_t* original) {
  bam_hdr_t* duplicate = bam_hdr_dup(original);
  return duplicate;
}