/*
 * Load read-group IDs from a text file into settings->rghash.
 *
 * The file is read as whitespace-separated tokens (at most 1023 characters
 * per token, as bounded by the scan format).  The hash is created on first
 * use.  Tokens already present in the hash are ignored.
 *
 * subcmd   - sub-command name used as the prefix of error messages
 * settings - settings whose rghash receives the IDs
 * fn       - path of the file to read
 *
 * Returns 0 on success, -1 on failure (a message has been printed).
 */
static int add_read_groups_file(const char *subcmd, samview_settings_t *settings, char *fn)
{
    FILE *fp;
    char buf[1024];
    int ret = 0;

    if (settings->rghash == NULL) {
        settings->rghash = kh_init(rg);
        if (settings->rghash == NULL) {
            perror(NULL);
            return -1;
        }
    }

    fp = fopen(fn, "r");
    if (fp == NULL) {
        print_error_errno(subcmd, "failed to open \"%s\" for reading", fn);
        return -1;
    }

    while (ret != -1 && !feof(fp) && fscanf(fp, "%1023s", buf) > 0) {
        char *d = strdup(buf);
        if (d == NULL) {
            ret = -1;
            break;
        }
        kh_put(rg, settings->rghash, d, &ret);
        /* ret == 0: duplicate key; ret == -1: insertion failed.  In both
         * cases the hash did not take the pointer, so release it here. */
        if (ret <= 0) free(d);
    }

    if (ferror(fp)) ret = -1;
    if (ret == -1) {
        /* Report before fclose() so errno still describes the failure. */
        print_error_errno(subcmd, "failed to read \"%s\"", fn);
    }
    fclose(fp);
    return (ret != -1) ? 0 : -1;
}
/*
 * Copy every record from state->input_file to state->output_file, applying
 * state->mode_func to each record before it is written.  The output header
 * is written first.
 *
 * Returns true when the input was read to its end (sam_read1 returned -1)
 * and every write succeeded; false otherwise, after printing an error.
 */
static bool readgroupise(state_t* state)
{
    if (sam_hdr_write(state->output_file, state->output_header) != 0) {
        print_error_errno("addreplacerg", "[%s] Could not write header to output file", __func__);
        return false;
    }

    bam1_t* file_read = bam_init1();
    if (file_read == NULL) {
        /* Without this check a failed allocation would be handed to sam_read1. */
        print_error_errno("addreplacerg", "[%s] Could not allocate memory for read", __func__);
        return false;
    }

    int ret;
    while ((ret = sam_read1(state->input_file, state->input_header, file_read)) >= 0) {
        state->mode_func(state, file_read);

        if (sam_write1(state->output_file, state->output_header, file_read) < 0) {
            print_error_errno("addreplacerg", "[%s] Could not write read to output file", __func__);
            bam_destroy1(file_read);
            return false;
        }
    }
    bam_destroy1(file_read);

    /* -1 is the only "clean end of input" value; anything else is an error. */
    if (ret != -1) {
        print_error_errno("addreplacerg", "[%s] Error reading from input file", __func__);
        return false;
    }
    return true;
}
/*
 * Write one record with sam_write1() and report any failure.
 *
 * fname is the output file name used in the error message; when it is NULL
 * the message refers to standard output instead.  On failure *retp is set to
 * EXIT_FAILURE; on success *retp is left untouched.
 *
 * Returns the value returned by sam_write1().
 */
static inline int check_sam_write1(samFile *fp, const bam_hdr_t *h, const bam1_t *b, const char *fname, int *retp)
{
    const int written = sam_write1(fp, h, b);

    if (written < 0) {
        if (fname == NULL)
            print_error_errno("view", "writing to standard output failed");
        else
            print_error_errno("view", "writing to \"%s\" failed", fname);
        *retp = EXIT_FAILURE;
    }

    return written;
}
/*
 * Release everything held by a bam2fq state: the header, the input file,
 * the optional singleton output and the three read outputs, then the state
 * structure itself.
 *
 * Input close problems are reported through check_sam_close() and *status.
 * Returns false if closing any of the FILE outputs failed, true otherwise.
 */
static bool destroy_state(const bam2fq_opts_t *opts, bam2fq_state_t *state, int* status)
{
    bool all_closed = true;
    int idx;

    bam_hdr_destroy(state->h);
    check_sam_close("bam2fq", state->fp, opts->fn_input, "file", status);

    if (state->fpse != NULL) {
        if (fclose(state->fpse) != 0) {
            print_error_errno("bam2fq", "Error closing singleton file \"%s\"", opts->fnse);
            all_closed = false;
        }
    }

    for (idx = 0; idx < 3; idx++) {
        /* Outputs that point at stdout are shared and must not be closed. */
        if (state->fpr[idx] == stdout)
            continue;
        if (fclose(state->fpr[idx]) != 0) {
            print_error_errno("bam2fq", "Error closing r%d file \"%s\"", idx, opts->fnr[idx]);
            all_closed = false;
        }
    }

    free(state);
    return all_closed;
}
/*
 * Accept one pending connection on the listening socket s_sockfd.
 *
 * When sockets is NULL the new connection is closed immediately.  Otherwise
 * a single bool (true) is written to the client and the descriptor is handed
 * to srv_init_socket() together with the peer address.
 *
 * Returns the negated result of print_error_errno("accept") if accept()
 * fails, the result of print_error_errno("close") if close() fails,
 * EXIT_SUCCESS after a clean close, or whatever srv_init_socket() returns.
 */
static int accept_new_client(t_srv_socket sockets[], int s_sockfd)
{
    struct sockaddr_in peer_addr;
    socklen_t peer_len = sizeof(peer_addr);
    bool const is_connected = true;
    int client_fd;

    client_fd = accept(s_sockfd, (struct sockaddr *)&peer_addr, &peer_len);
    if (client_fd == SYSERR)
        return (-print_error_errno("accept"));

    if (sockets) {
        /* Tell the client it is connected, then register the socket. */
        write(client_fd, &is_connected, sizeof(bool));
        return (srv_init_socket(sockets, CLIENT, client_fd, &peer_addr));
    }

    /* No socket table to register in: drop the connection. */
    if (close(client_fd) == SYSERR)
        return (print_error_errno("close"));
    return (EXIT_SUCCESS);
}
/*
 * Check the files are consistent and capable of being concatenated.
 * Also fills out the rg2id read-group hash and the version numbers
 * and produces a new bam_hdr_t structure with merged RG lines.
 * Note it is only a simple merge, as we lack the niceties of a proper
 * header API.
 *
 * nfn, fn      - number and names of the input CRAM files
 * h            - optional header to start from (may be NULL, in which case
 *                the header of the first file is duplicated)
 * rg2id        - receives the merged read-group hash
 * vers_maj_p,
 * vers_min_p   - receive the CRAM version shared by all inputs
 *
 * Returns updated header on success;
 *         NULL on failure.  On failure every resource opened here
 *         (input file, its header, its read-group hash and the partially
 *         built header) is released.
 * NOTE(review): *rg2id is left untouched on failure - confirm the caller
 * releases it.
 */
static bam_hdr_t *cram_cat_check_hdr(int nfn, char * const *fn, const bam_hdr_t *h,
                                     khash_s2i **rg2id, int *vers_maj_p, int *vers_min_p)
{
    int i, vers_maj = -1, vers_min = -1;
    bam_hdr_t *new_h = NULL;
    /* Per-file resources live at function scope so the fail path can free them. */
    samFile *in = NULL;
    bam_hdr_t *old = NULL;
    khash_s2i *rg2id_in = NULL;

    if (h) {
        new_h = bam_hdr_dup(h);
        *rg2id = hash_rg(new_h);
    }

    for (i = 0; i < nfn; ++i) {
        cram_fd *in_c;
        khint_t ki;
        int new_rg = -1;
        int vmaj, vmin;

        in = sam_open(fn[i], "rc");
        if (in == 0) {
            print_error_errno("cat", "fail to open file '%s'", fn[i]);
            goto fail;
        }
        in_c = in->fp.cram;

        vmaj = cram_major_vers(in_c);
        vmin = cram_minor_vers(in_c);
        if ((vers_maj != -1 && vers_maj != vmaj) ||
            (vers_min != -1 && vers_min != vmin)) {
            fprintf(pysam_stderr, "[%s] ERROR: input files have differing version numbers.\n",
                    __func__);
            goto fail;
        }
        vers_maj = vmaj;
        vers_min = vmin;

        old = sam_hdr_read(in);
        if (old == NULL) {
            fprintf(pysam_stderr, "[%s] ERROR: couldn't read header for '%s'.\n",
                    __func__, fn[i]);
            goto fail;
        }
        rg2id_in = hash_rg(old);
        if (rg2id_in == NULL) {
            fprintf(pysam_stderr, "[%s] ERROR: couldn't index @RG lines of '%s'.\n",
                    __func__, fn[i]);
            goto fail;
        }

        if (!new_h) {
            new_h = bam_hdr_dup(old);
            *rg2id = hash_rg(new_h);
        }
        if (new_h == NULL || *rg2id == NULL) {
            fprintf(pysam_stderr, "[%s] ERROR: couldn't create merged header.\n", __func__);
            goto fail;
        }

        // Add any existing @RG entries to our global @RG hash.
        for (ki = 0; ki < rg2id_in->n_id; ki++) {
            int added;

            new_rg = hash_s2i_inc(*rg2id, rg2id_in->id[ki], rg2id_in->line[ki], &added);

            if (added) {
                // Also add to new_h
                const char *line = rg2id_in->line[ki];
                const char *line_end = line;
                size_t add_len;
                char *text;

                /* Advance to just past the newline, never past the NUL. */
                while (*line_end && *line_end++ != '\n')
                    ;
                add_len = (size_t)(line_end - line);

                /* Grow via a temporary so the old buffer survives a failure. */
                text = realloc(new_h->text, (size_t)new_h->l_text + add_len + 1);
                if (text == NULL) {
                    fprintf(pysam_stderr, "[%s] ERROR: out of memory extending header.\n",
                            __func__);
                    goto fail;
                }
                memcpy(text + new_h->l_text, line, add_len);
                new_h->l_text += add_len;
                text[new_h->l_text] = '\0';
                new_h->text = text;
            }

            if ((khint_t)new_rg != ki && rg2id_in->n_id > 1) {
                fprintf(pysam_stderr, "[%s] ERROR: Same size @RG lists but differing order / contents\n",
                        __func__);
                goto fail;
            }
        }

        hash_s2i_free(rg2id_in);
        rg2id_in = NULL;
        bam_hdr_destroy(old);
        old = NULL;
        sam_close(in);
        in = NULL;
    }

    *vers_maj_p = vers_maj;
    *vers_min_p = vers_min;

    return new_h;

 fail:
    if (rg2id_in) hash_s2i_free(rg2id_in);
    if (old) bam_hdr_destroy(old);
    if (in) sam_close(in);
    if (new_h) bam_hdr_destroy(new_h);
    return NULL;
}
int bam_fillmd(int argc, char *argv[]) { int c, flt_flag, tid = -2, ret, len, is_bam_out, is_uncompressed, max_nm, is_realn, capQ, baq_flag; samFile *fp = NULL, *fpout = NULL; bam_hdr_t *header = NULL; faidx_t *fai = NULL; char *ref = NULL, mode_w[8], *ref_file; bam1_t *b = NULL; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0), { NULL, 0, NULL, 0 } }; flt_flag = UPDATE_NM | UPDATE_MD; is_bam_out = is_uncompressed = is_realn = max_nm = capQ = baq_flag = 0; strcpy(mode_w, "w"); while ((c = getopt_long(argc, argv, "EqreuNhbSC:n:Ad", lopts, NULL)) >= 0) { switch (c) { case 'r': is_realn = 1; break; case 'e': flt_flag |= USE_EQUAL; break; case 'd': flt_flag |= DROP_TAG; break; case 'q': flt_flag |= BIN_QUAL; break; case 'h': flt_flag |= HASH_QNM; break; case 'N': flt_flag &= ~(UPDATE_MD|UPDATE_NM); break; case 'b': is_bam_out = 1; break; case 'u': is_uncompressed = is_bam_out = 1; break; case 'S': break; case 'n': max_nm = atoi(optarg); break; case 'C': capQ = atoi(optarg); break; case 'A': baq_flag |= 1; break; case 'E': baq_flag |= 2; break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n\n", c); /* else fall-through */ case '?': return calmd_usage(); } } if (is_bam_out) strcat(mode_w, "b"); else strcat(mode_w, "h"); if (is_uncompressed) strcat(mode_w, "0"); if (optind + (ga.reference == NULL) >= argc) return calmd_usage(); fp = sam_open_format(argv[optind], "r", &ga.in); if (fp == NULL) { print_error_errno("calmd", "Failed to open input file '%s'", argv[optind]); return 1; } header = sam_hdr_read(fp); if (header == NULL || header->n_targets == 0) { fprintf(stderr, "[bam_fillmd] input SAM does not have header. 
Abort!\n"); goto fail; } fpout = sam_open_format("-", mode_w, &ga.out); if (fpout == NULL) { print_error_errno("calmd", "Failed to open output"); goto fail; } if (sam_hdr_write(fpout, header) < 0) { print_error_errno("calmd", "Failed to write sam header"); goto fail; } ref_file = argc > optind + 1 ? argv[optind+1] : ga.reference; fai = fai_load(ref_file); if (!fai) { print_error_errno("calmd", "Failed to open reference file '%s'", ref_file); goto fail; } b = bam_init1(); if (!b) { fprintf(stderr, "[bam_fillmd] Failed to allocate bam struct\n"); goto fail; } while ((ret = sam_read1(fp, header, b)) >= 0) { if (b->core.tid >= 0) { if (tid != b->core.tid) { free(ref); ref = fai_fetch(fai, header->target_name[b->core.tid], &len); tid = b->core.tid; if (ref == 0) { // FIXME: Should this always be fatal? fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n", header->target_name[tid]); if (is_realn || capQ > 10) goto fail; // Would otherwise crash } } if (is_realn) sam_prob_realn(b, ref, len, baq_flag); if (capQ > 10) { int q = sam_cap_mapq(b, ref, len, capQ); if (b->core.qual > q) b->core.qual = q; } if (ref) bam_fillmd1_core(b, ref, len, flt_flag, max_nm); } if (sam_write1(fpout, header, b) < 0) { print_error_errno("calmd", "failed to write to output file"); goto fail; } } if (ret < -1) { fprintf(stderr, "[bam_fillmd] Error reading input.\n"); goto fail; } bam_destroy1(b); bam_hdr_destroy(header); free(ref); fai_destroy(fai); sam_close(fp); if (sam_close(fpout) < 0) { fprintf(stderr, "[bam_fillmd] error when closing output file\n"); return 1; } return 0; fail: free(ref); if (b) bam_destroy1(b); if (header) bam_hdr_destroy(header); if (fai) fai_destroy(fai); if (fp) sam_close(fp); if (fpout) sam_close(fpout); return 1; }
/*
 * Build the addreplacerg run state from the parsed options.
 *
 * Opens the input and output files, duplicates the input header for output,
 * and works out which read-group ID to apply:
 *   - opts->rg_line set: the line is appended to the output header (it is an
 *     error if opts->rg_id already exists there);
 *   - otherwise opts->rg_id set: it must already exist in the header;
 *   - otherwise the first read group found in the header is used.
 *
 * *state_out is set as soon as the state is allocated, so it is also set
 * when this function later fails.
 * NOTE(review): presumably the caller frees *state_out on failure - confirm.
 *
 * Returns true on success, false on failure (a message has been printed).
 */
static bool init(const parsed_opts_t* opts, state_t** state_out)
{
    state_t* retval = (state_t*) calloc(1, sizeof(state_t));
    if (retval == NULL) {
        fprintf(stderr, "[init] Out of memory allocating state struct.\n");
        return false;
    }
    *state_out = retval;

    // Open files
    retval->input_file = sam_open_format(opts->input_name, "r", &opts->ga.in);
    if (retval->input_file == NULL) {
        fprintf(stderr, "[init] Could not open input file: %s\n", opts->input_name);
        return false;
    }
    retval->input_header = sam_hdr_read(retval->input_file);
    if (retval->input_header == NULL) {
        fprintf(stderr, "[init] Could not read header from input file: %s\n", opts->input_name);
        return false;
    }

    retval->output_header = bam_hdr_dup(retval->input_header);
    if (retval->output_header == NULL) {
        fprintf(stderr, "[init] Out of memory duplicating header.\n");
        return false;
    }

    /* Resolve the effective name once so the error message never sees NULL. */
    const char *output_name = opts->output_name == NULL ? "-" : opts->output_name;
    retval->output_file = sam_open_format(output_name, "w", &opts->ga.out);
    if (retval->output_file == NULL) {
        print_error_errno("addreplacerg", "Could not open output file: %s", output_name);
        return false;
    }

    if (opts->rg_line) {
        // Append new RG line to header.
        // Check does not already exist
        if ( confirm_rg(retval->output_header, opts->rg_id) ) {
            fprintf(stderr, "[init] ID of new RG line specified conflicts with that of an existing header RG line. Overwrite not yet implemented.\n");
            return false;
        }
        retval->rg_id = strdup(opts->rg_id);
        if (retval->rg_id == NULL) {
            fprintf(stderr, "[init] Out of memory duplicating RG ID.\n");
            return false;
        }

        const char *old_text = retval->output_header->text ? retval->output_header->text : "";
        /* +2: one byte for the trailing newline, one for the NUL. */
        size_t new_len = strlen(old_text) + strlen(opts->rg_line) + 2;
        char* new_header = malloc(new_len);
        if (!new_header) {
            fprintf(stderr, "[init] Out of memory whilst writing new header.\n");
            return false;
        }
        snprintf(new_header, new_len, "%s%s\n", old_text, opts->rg_line);
        free(retval->output_header->text);
        retval->output_header->text = new_header;
        retval->output_header->l_text = (int)new_len - 1;
    } else {
        if (opts->rg_id) {
            // Confirm what has been supplied exists
            if ( !confirm_rg(retval->output_header, opts->rg_id) ) {
                fprintf(stderr, "RG ID supplied does not exist in header. Supply full @RG line with -r instead?\n");
                return false;
            }
            retval->rg_id = strdup(opts->rg_id);
            if (retval->rg_id == NULL) {
                fprintf(stderr, "[init] Out of memory duplicating RG ID.\n");
                return false;
            }
        } else {
            if ((retval->rg_id = get_first_rgid(retval->output_header)) == NULL ) {
                fprintf(stderr, "No RG specified on command line or in existing header.\n");
                return false;
            }
        }
    }

    switch (opts->mode) {
        case overwrite_all:
            retval->mode_func = &overwrite_all_func;
            break;
        case orphan_only:
            retval->mode_func = &orphan_only_func;
            break;
        default:
            /* mode_func would stay NULL and be called later; refuse instead. */
            fprintf(stderr, "[init] Unknown mode.\n");
            return false;
    }

    return true;
}
int bam_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outbam) { BGZF *fp, *in = NULL; uint8_t *buf = NULL; uint8_t ebuf[BGZF_EMPTY_BLOCK_SIZE]; const int es=BGZF_EMPTY_BLOCK_SIZE; int i; fp = strcmp(outbam, "-")? bgzf_open(outbam, "w") : bgzf_fdopen(fileno(pysam_stdout), "w"); if (fp == 0) { print_error_errno("cat", "fail to open output file '%s'", outbam); return -1; } if (h) { if (bam_hdr_write(fp, h) < 0) { print_error_errno("cat", "Couldn't write header"); goto fail; } } buf = (uint8_t*) malloc(BUF_SIZE); if (!buf) { fprintf(pysam_stderr, "[%s] Couldn't allocate buffer\n", __func__); goto fail; } for(i = 0; i < nfn; ++i){ bam_hdr_t *old; int len,j; in = strcmp(fn[i], "-")? bgzf_open(fn[i], "r") : bgzf_fdopen(fileno(stdin), "r"); if (in == 0) { print_error_errno("cat", "fail to open file '%s'", fn[i]); goto fail; } if (in->is_write) return -1; old = bam_hdr_read(in); if (old == NULL) { fprintf(pysam_stderr, "[%s] ERROR: couldn't read header for '%s'.\n", __func__, fn[i]); goto fail; } if (h == 0 && i == 0) { if (bam_hdr_write(fp, old) < 0) { print_error_errno("cat", "Couldn't write header"); goto fail; } } if (in->block_offset < in->block_length) { if (bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0) goto write_fail; if (bgzf_flush(fp) != 0) goto write_fail; } j=0; while ((len = bgzf_raw_read(in, buf, BUF_SIZE)) > 0) { if(len<es){ int diff=es-len; if(j==0) { fprintf(pysam_stderr, "[%s] ERROR: truncated file?: '%s'.\n", __func__, fn[i]); goto fail; } if (bgzf_raw_write(fp, ebuf, len) < 0) goto write_fail; memcpy(ebuf,ebuf+len,diff); memcpy(ebuf+diff,buf,len); } else { if(j!=0) { if (bgzf_raw_write(fp, ebuf, es) < 0) goto write_fail; } len-= es; memcpy(ebuf,buf+len,es); if (bgzf_raw_write(fp, buf, len) < 0) goto write_fail; } j=1; } /* check final gzip block */ { const uint8_t gzip1=ebuf[0]; const uint8_t gzip2=ebuf[1]; const uint32_t isize=*((uint32_t*)(ebuf+es-4)); if(((gzip1!=GZIPID1) || 
(gzip2!=GZIPID2)) || (isize!=0)) { fprintf(pysam_stderr, "[%s] WARNING: Unexpected block structure in file '%s'.", __func__, fn[i]); fprintf(pysam_stderr, " Possible output corruption.\n"); if (bgzf_raw_write(fp, ebuf, es) < 0) goto write_fail; } } bam_hdr_destroy(old); bgzf_close(in); in = NULL; } free(buf); if (bgzf_close(fp) < 0) { fprintf(pysam_stderr, "[%s] Error on closing '%s'.\n", __func__, outbam); return -1; } return 0; write_fail: fprintf(pysam_stderr, "[%s] Error writing to '%s'.\n", __func__, outbam); fail: if (in) bgzf_close(in); if (fp) bgzf_close(fp); free(buf); return -1; }
int main_depth(int argc, char *argv[]) { int i, n, tid, reg_tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0; int all = 0, status = EXIT_SUCCESS, nfiles, max_depth = -1; const bam_pileup1_t **plp; char *reg = 0; // specified region void *bed = 0; // BED data structure char *file_list = NULL, **fn = NULL; bam_hdr_t *h = NULL; // BAM header of the 1st input aux_t **data; bam_mplp_t mplp; int last_pos = -1, last_tid = -1, ret; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0), { NULL, 0, NULL, 0 } }; // parse the command line while ((n = getopt_long(argc, argv, "r:b:q:Q:l:f:am:d:", lopts, NULL)) >= 0) { switch (n) { case 'l': min_len = atoi(optarg); break; // minimum query length case 'r': reg = strdup(optarg); break; // parsing a region requires a BAM header case 'b': bed = bed_read(optarg); // BED or position list file can be parsed now if (!bed) { print_error_errno("depth", "Could not read file \"%s\"", optarg); return 1; } break; case 'q': baseQ = atoi(optarg); break; // base quality threshold case 'Q': mapQ = atoi(optarg); break; // mapping quality threshold case 'f': file_list = optarg; break; case 'a': all++; break; case 'd': case 'm': max_depth = atoi(optarg); break; // maximum coverage depth default: if (parse_sam_global_opt(n, optarg, lopts, &ga) == 0) break; /* else fall-through */ case '?': return usage(); } } if (optind == argc && !file_list) return usage(); // initialize the auxiliary data structures if (file_list) { if ( read_file_list(file_list,&nfiles,&fn) ) return 1; n = nfiles; argv = fn; optind = 0; } else n = argc - optind; // the number of BAMs on the command line data = calloc(n, sizeof(aux_t*)); // data[i] for the i-th input reg_tid = 0; beg = 0; end = INT_MAX; // set the default region for (i = 0; i < n; ++i) { int rf; data[i] = calloc(1, sizeof(aux_t)); data[i]->fp = sam_open_format(argv[optind+i], "r", &ga.in); // open BAM if (data[i]->fp == NULL) { 
print_error_errno("depth", "Could not open \"%s\"", argv[optind+i]); status = EXIT_FAILURE; goto depth_end; } rf = SAM_FLAG | SAM_RNAME | SAM_POS | SAM_MAPQ | SAM_CIGAR | SAM_SEQ; if (baseQ) rf |= SAM_QUAL; if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) { fprintf(stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); return 1; } if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) { fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); return 1; } data[i]->min_mapQ = mapQ; // set the mapQ filter data[i]->min_len = min_len; // set the qlen filter data[i]->hdr = sam_hdr_read(data[i]->fp); // read the BAM header if (data[i]->hdr == NULL) { fprintf(stderr, "Couldn't read header for \"%s\"\n", argv[optind+i]); status = EXIT_FAILURE; goto depth_end; } if (reg) { // if a region is specified hts_idx_t *idx = sam_index_load(data[i]->fp, argv[optind+i]); // load the index if (idx == NULL) { print_error("depth", "can't load index for \"%s\"", argv[optind+i]); status = EXIT_FAILURE; goto depth_end; } data[i]->iter = sam_itr_querys(idx, data[i]->hdr, reg); // set the iterator hts_idx_destroy(idx); // the index is not needed any more; free the memory if (data[i]->iter == NULL) { print_error("depth", "can't parse region \"%s\"", reg); status = EXIT_FAILURE; goto depth_end; } } } h = data[0]->hdr; // easy access to the header of the 1st BAM if (reg) { beg = data[0]->iter->beg; // and to the parsed region coordinates end = data[0]->iter->end; reg_tid = data[0]->iter->tid; } // the core multi-pileup loop mplp = bam_mplp_init(n, read_bam, (void**)data); // initialization if (0 < max_depth) bam_mplp_set_maxcnt(mplp,max_depth); // set maximum coverage depth n_plp = calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM plp = calloc(n, sizeof(bam_pileup1_t*)); // plp[i] points to the array of covering reads (internal in mplp) while ((ret=bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position if (pos 
< beg || pos >= end) continue; // out of range; skip if (tid >= h->n_targets) continue; // diff number of @SQ lines per file? if (all) { while (tid > last_tid) { if (last_tid >= 0 && !reg) { // Deal with remainder or entirety of last tid. while (++last_pos < h->target_len[last_tid]) { // Horribly inefficient, but the bed API is an obfuscated black box. if (bed && bed_overlap(bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0) continue; fputs(h->target_name[last_tid], stdout); printf("\t%d", last_pos+1); for (i = 0; i < n; i++) putchar('\t'), putchar('0'); putchar('\n'); } } last_tid++; last_pos = -1; if (all < 2) break; } // Deal with missing portion of current tid while (++last_pos < pos) { if (last_pos < beg) continue; // out of range; skip if (bed && bed_overlap(bed, h->target_name[tid], last_pos, last_pos + 1) == 0) continue; fputs(h->target_name[tid], stdout); printf("\t%d", last_pos+1); for (i = 0; i < n; i++) putchar('\t'), putchar('0'); putchar('\n'); } last_tid = tid; last_pos = pos; } if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); // a customized printf() would be faster for (i = 0; i < n; ++i) { // base level filters have to go here int j, m = 0; for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; // DON'T modfity plp[][] unless you really know if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos else if (bam_get_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality } printf("\t%d", n_plp[i] - m); // this the depth to output } putchar('\n'); } if (ret < 0) status = EXIT_FAILURE; free(n_plp); free(plp); bam_mplp_destroy(mplp); if (all) { // Handle terminating region if (last_tid < 0 && reg && all > 1) { last_tid = reg_tid; last_pos = beg-1; } while (last_tid >= 0 && last_tid < h->n_targets) { while (++last_pos < h->target_len[last_tid]) { if (last_pos >= end) break; if (bed && bed_overlap(bed, 
h->target_name[last_tid], last_pos, last_pos + 1) == 0) continue; fputs(h->target_name[last_tid], stdout); printf("\t%d", last_pos+1); for (i = 0; i < n; i++) putchar('\t'), putchar('0'); putchar('\n'); } last_tid++; last_pos = -1; if (all < 2 || reg) break; } } depth_end: for (i = 0; i < n && data[i]; ++i) { bam_hdr_destroy(data[i]->hdr); if (data[i]->fp) sam_close(data[i]->fp); hts_itr_destroy(data[i]->iter); free(data[i]); } free(data); free(reg); if (bed) bed_destroy(bed); if ( file_list ) { for (i=0; i<n; i++) free(fn[i]); free(fn); } sam_global_args_free(&ga); return status; }
int main_samview(int argc, char *argv[]) { int index; for(index = 0; index < argc; index++) { printf("The %d is %s\n",index,argv[index]); } getchar();return 0; int c, is_header = 0, is_header_only = 0, ret = 0, compress_level = -1, is_count = 0; int is_long_help = 0, n_threads = 0; int64_t count = 0; samFile *in = 0, *out = 0, *un_out=0; bam_hdr_t *header = NULL; char out_mode[5], out_un_mode[5], *out_format = ""; char *fn_in = 0, *fn_out = 0, *fn_list = 0, *q, *fn_un_out = 0; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; samview_settings_t settings = { .rghash = NULL, .min_mapQ = 0, .flag_on = 0, .flag_off = 0, .min_qlen = 0, .remove_B = 0, .subsam_seed = 0, .subsam_frac = -1., .library = NULL, .bed = NULL, }; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 'T'), { "threads", required_argument, NULL, '@' }, { NULL, 0, NULL, 0 } }; /* parse command-line options */ strcpy(out_mode, "w"); strcpy(out_un_mode, "w"); while ((c = getopt_long(argc, argv, "SbBcCt:h1Ho:O:q:f:F:ul:r:?T:R:L:s:@:m:x:U:", lopts, NULL)) >= 0) { switch (c) { case 's': if ((settings.subsam_seed = strtol(optarg, &q, 10)) != 0) { srand(settings.subsam_seed); settings.subsam_seed = rand(); } settings.subsam_frac = strtod(q, &q); break; case 'm': settings.min_qlen = atoi(optarg); break; case 'c': is_count = 1; break; case 'S': break; case 'b': out_format = "b"; break; case 'C': out_format = "c"; break; case 't': fn_list = strdup(optarg); break; case 'h': is_header = 1; break; case 'H': is_header_only = 1; break; case 'o': fn_out = strdup(optarg); break; case 'U': fn_un_out = strdup(optarg); break; case 'f': settings.flag_on |= strtol(optarg, 0, 0); break; case 'F': settings.flag_off |= strtol(optarg, 0, 0); break; case 'q': settings.min_mapQ = atoi(optarg); break; case 'u': compress_level = 0; break; case '1': compress_level = 1; break; case 'l': settings.library = strdup(optarg); break; case 'L': if ((settings.bed = bed_read(optarg)) == NULL) { print_error_errno("view", 
"Could not read file \"%s\"", optarg); ret = 1; goto view_end; } break; case 'r': if (add_read_group_single("view", &settings, optarg) != 0) { ret = 1; goto view_end; } break; case 'R': if (add_read_groups_file("view", &settings, optarg) != 0) { ret = 1; goto view_end; } break; /* REMOVED as htslib doesn't support this //case 'x': out_format = "x"; break; //case 'X': out_format = "X"; break; */ case '?': is_long_help = 1; break; case 'B': settings.remove_B = 1; break; case '@': n_threads = strtol(optarg, 0, 0); break; case 'x': { if (strlen(optarg) != 2) { fprintf(stderr, "main_samview: Error parsing -x auxiliary tags should be exactly two characters long.\n"); return usage(stderr, EXIT_FAILURE, is_long_help); } settings.remove_aux = (char**)realloc(settings.remove_aux, sizeof(char*) * (++settings.remove_aux_len)); settings.remove_aux[settings.remove_aux_len-1] = optarg; } break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) != 0) return usage(stderr, EXIT_FAILURE, is_long_help); break; } } if (compress_level >= 0 && !*out_format) out_format = "b"; if (is_header_only) is_header = 1; // File format auto-detection first if (fn_out) sam_open_mode(out_mode+1, fn_out, NULL); if (fn_un_out) sam_open_mode(out_un_mode+1, fn_un_out, NULL); // Overridden by manual -b, -C if (*out_format) out_mode[1] = out_un_mode[1] = *out_format; out_mode[2] = out_un_mode[2] = '\0'; // out_(un_)mode now 1 or 2 bytes long, followed by nul. if (compress_level >= 0) { char tmp[2]; tmp[0] = compress_level + '0'; tmp[1] = '\0'; strcat(out_mode, tmp); strcat(out_un_mode, tmp); } if (argc == optind && isatty(STDIN_FILENO)) return usage(stdout, EXIT_SUCCESS, is_long_help); // potential memory leak... fn_in = (optind < argc)? 
argv[optind] : "-"; // generate the fn_list if necessary if (fn_list == 0 && ga.reference) fn_list = samfaipath(ga.reference); // open file handlers if ((in = sam_open_format(fn_in, "r", &ga.in)) == 0) { print_error_errno("view", "failed to open \"%s\" for reading", fn_in); ret = 1; goto view_end; } if (fn_list) { if (hts_set_fai_filename(in, fn_list) != 0) { fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list); ret = 1; goto view_end; } } if ((header = sam_hdr_read(in)) == 0) { fprintf(stderr, "[main_samview] fail to read the header from \"%s\".\n", fn_in); ret = 1; goto view_end; } if (settings.rghash) { // FIXME: I do not know what "bam_header_t::n_text" is for... char *tmp; int l; tmp = drop_rg(header->text, settings.rghash, &l); free(header->text); header->text = tmp; header->l_text = l; } if (!is_count) { if ((out = sam_open_format(fn_out? fn_out : "-", out_mode, &ga.out)) == 0) { print_error_errno("view", "failed to open \"%s\" for writing", fn_out? fn_out : "standard output"); ret = 1; goto view_end; } if (fn_list) { if (hts_set_fai_filename(out, fn_list) != 0) { fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list); ret = 1; goto view_end; } } if (*out_format || is_header || out_mode[1] == 'b' || out_mode[1] == 'c' || (ga.out.format != sam && ga.out.format != unknown_format)) { if (sam_hdr_write(out, header) != 0) { fprintf(stderr, "[main_samview] failed to write the SAM header\n"); ret = 1; goto view_end; } } if (fn_un_out) { if ((un_out = sam_open_format(fn_un_out, out_un_mode, &ga.out)) == 0) { print_error_errno("view", "failed to open \"%s\" for writing", fn_un_out); ret = 1; goto view_end; } if (fn_list) { if (hts_set_fai_filename(un_out, fn_list) != 0) { fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list); ret = 1; goto view_end; } } if (*out_format || is_header || out_un_mode[1] == 'b' || out_un_mode[1] == 'c' || (ga.out.format != sam && ga.out.format != unknown_format)) { if 
(sam_hdr_write(un_out, header) != 0) { fprintf(stderr, "[main_samview] failed to write the SAM header\n"); ret = 1; goto view_end; } } } } if (n_threads > 1) { if (out) hts_set_threads(out, n_threads); } if (is_header_only) goto view_end; // no need to print alignments if (optind + 1 >= argc) { // convert/print the entire file bam1_t *b = bam_init1(); int r; while ((r = sam_read1(in, header, b)) >= 0) { // read one alignment from `in' if (!process_aln(header, b, &settings)) { if (!is_count) { if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; } count++; } else { if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; } } } if (r < -1) { fprintf(stderr, "[main_samview] truncated file.\n"); ret = 1; } bam_destroy1(b); } else { // retrieve alignments in specified regions int i; bam1_t *b; hts_idx_t *idx = sam_index_load(in, fn_in); // load index if (idx == 0) { // index is unavailable fprintf(stderr, "[main_samview] random alignment retrieval only works for indexed BAM or CRAM files.\n"); ret = 1; goto view_end; } b = bam_init1(); for (i = optind + 1; i < argc; ++i) { int result; hts_itr_t *iter = sam_itr_querys(idx, header, argv[i]); // parse a region in the format like `chr2:100-200' if (iter == NULL) { // region invalid or reference name not found int beg, end; if (hts_parse_reg(argv[i], &beg, &end)) fprintf(stderr, "[main_samview] region \"%s\" specifies an unknown reference name. Continue anyway.\n", argv[i]); else fprintf(stderr, "[main_samview] region \"%s\" could not be parsed. 
Continue anyway.\n", argv[i]); continue; } // fetch alignments while ((result = sam_itr_next(in, iter, b)) >= 0) { if (!process_aln(header, b, &settings)) { if (!is_count) { if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; } count++; } else { if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; } } } hts_itr_destroy(iter); if (result < -1) { fprintf(stderr, "[main_samview] retrieval of region \"%s\" failed due to truncated file or corrupt BAM index file\n", argv[i]); ret = 1; break; } } bam_destroy1(b); hts_idx_destroy(idx); // destroy the BAM index } view_end: if (is_count && ret == 0) printf("%" PRId64 "\n", count); // close files, free and return if (in) check_sam_close("view", in, fn_in, "standard input", &ret); if (out) check_sam_close("view", out, fn_out, "standard output", &ret); if (un_out) check_sam_close("view", un_out, fn_un_out, "file", &ret); free(fn_list); free(fn_out); free(settings.library); free(fn_un_out); sam_global_args_free(&ga); if ( header ) bam_hdr_destroy(header); if (settings.bed) bed_destroy(settings.bed); if (settings.rghash) { khint_t k; for (k = 0; k < kh_end(settings.rghash); ++k) if (kh_exist(settings.rghash, k)) free((char*)kh_key(settings.rghash, k)); kh_destroy(rg, settings.rghash); } if (settings.remove_aux_len) { free(settings.remove_aux); } return ret; } static int usage(FILE *fp, int exit_status, int is_long_help) { fprintf(fp, "\n" "Usage: samtools view [options] <in.bam>|<in.sam>|<in.cram> [region ...]\n" "\n" "Options:\n" // output options " -b output BAM\n" " -C output CRAM (requires -T)\n" " -1 use fast BAM compression (implies -b)\n" " -u uncompressed BAM output (implies -b)\n" " -h include header in SAM output\n" " -H print SAM header only (no alignments)\n" " -c print only the count of matching records\n" " -o FILE output file name [stdout]\n" " -U FILE output reads not selected by filters to FILE [null]\n" // extra input " -t FILE FILE listing reference names and lengths 
(see long help) [null]\n" // read filters " -L FILE only include reads overlapping this BED FILE [null]\n" " -r STR only include reads in read group STR [null]\n" " -R FILE only include reads with read group listed in FILE [null]\n" " -q INT only include reads with mapping quality >= INT [0]\n" " -l STR only include reads in library STR [null]\n" " -m INT only include reads with number of CIGAR operations consuming\n" " query sequence >= INT [0]\n" " -f INT only include reads with all bits set in INT set in FLAG [0]\n" " -F INT only include reads with none of the bits set in INT set in FLAG [0]\n" // read processing " -x STR read tag to strip (repeatable) [null]\n" " -B collapse the backward CIGAR operation\n" " -s FLOAT integer part sets seed of random number generator [0];\n" " rest sets fraction of templates to subsample [no subsampling]\n" // general options " -@, --threads INT\n" " number of BAM/CRAM compression threads [0]\n" " -? print long help, including note about region specification\n" " -S ignored (input format is auto-detected)\n"); sam_global_opt_help(fp, "-.O.T"); fprintf(fp, "\n"); if (is_long_help) fprintf(fp, "Notes:\n" "\n" "1. This command now auto-detects the input format (BAM/CRAM/SAM).\n" " Further control over the CRAM format can be specified by using the\n" " --output-fmt-option, e.g. to specify the number of sequences per slice\n" " and to use avoid reference based compression:\n" "\n" "\tsamtools view -C --output-fmt-option seqs_per_slice=5000 \\\n" "\t --output-fmt-option no_ref -o out.cram in.bam\n" "\n" " Options can also be specified as a comma separated list within the\n" " --output-fmt value too. For example this is equivalent to the above\n" "\n" "\tsamtools view --output-fmt cram,seqs_per_slice=5000,no_ref \\\n" "\t -o out.cram in.bam\n" "\n" "2. The file supplied with `-t' is SPACE/TAB delimited with the first\n" " two fields of each line consisting of the reference name and the\n" " corresponding sequence length. 
The `.fai' file generated by \n" " `samtools faidx' is suitable for use as this file. This may be an\n" " empty file if reads are unaligned.\n" "\n" "3. SAM->BAM conversion: samtools view -bT ref.fa in.sam.gz\n" "\n" "4. BAM->SAM conversion: samtools view -h in.bam\n" "\n" "5. A region should be presented in one of the following formats:\n" " `chr1', `chr2:1,000' and `chr3:1000-2,000'. When a region is\n" " specified, the input alignment file must be a sorted and indexed\n" " alignment (BAM/CRAM) file.\n" "\n" "6. Option `-u' is preferred over `-b' when the output is piped to\n" " another samtools command.\n" "\n"); return exit_status; }
/*
 * Allocate and populate the bam2fq run state from the parsed options.
 *
 * Opens the input file, restricts CRAM decoding to the fields that are
 * needed, opens the optional singleton output and the three read outputs
 * (an output without a file name uses stdout), and reads the input header.
 *
 * On success *state_out receives the new state and true is returned.
 * On failure a message is printed, everything opened so far is closed,
 * the state is freed, *state_out is left untouched and false is returned.
 */
static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
{
    int i;
    uint32_t rf;
    bam2fq_state_t* state = calloc(1, sizeof(bam2fq_state_t));

    if (state == NULL) {
        print_error_errno("bam2fq", "Could not allocate memory for state");
        return false;
    }

    state->flag_on = opts->flag_on;
    state->flag_off = opts->flag_off;
    state->has12 = opts->has12;
    state->use_oq = opts->use_oq;
    state->copy_tags = opts->copy_tags;
    state->filetype = opts->filetype;
    state->def_qual = opts->def_qual;

    state->fp = sam_open(opts->fn_input, "r");
    if (state->fp == NULL) {
        print_error_errno("bam2fq","Cannot read file \"%s\"", opts->fn_input);
        goto fail;
    }

    rf = SAM_QNAME | SAM_FLAG | SAM_SEQ | SAM_QUAL;
    if (opts->use_oq) rf |= SAM_AUX;
    if (hts_set_opt(state->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) {
        fprintf(stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n");
        goto fail;
    }
    if (hts_set_opt(state->fp, CRAM_OPT_DECODE_MD, 0)) {
        fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n");
        goto fail;
    }

    if (opts->fnse) {
        state->fpse = fopen(opts->fnse,"w");
        if (state->fpse == NULL) {
            print_error_errno("bam2fq", "Cannot write to singleton file \"%s\"", opts->fnse);
            goto fail;
        }
    }

    for (i = 0; i < 3; ++i) {
        if (opts->fnr[i]) {
            state->fpr[i] = fopen(opts->fnr[i], "w");
            if (state->fpr[i] == NULL) {
                print_error_errno("bam2fq", "Cannot write to r%d file \"%s\"", i, opts->fnr[i]);
                goto fail;
            }
        } else {
            state->fpr[i] = stdout;
        }
    }

    state->h = sam_hdr_read(state->fp);
    if (state->h == NULL) {
        fprintf(stderr, "Failed to read header for \"%s\"\n", opts->fn_input);
        goto fail;
    }

    *state_out = state;
    return true;

 fail:
    /* calloc zeroed the state, so any non-NULL handle here is one we opened. */
    for (i = 0; i < 3; ++i) {
        if (state->fpr[i] != NULL && state->fpr[i] != stdout)
            fclose(state->fpr[i]);
    }
    if (state->fpse) fclose(state->fpse);
    if (state->fp) sam_close(state->fp);
    free(state);
    return false;
}
// currently, this function ONLY works if each read has one hit static int bam_mating_core(samFile *in, samFile *out, int remove_reads, int proper_pair_check, int add_ct, int do_mate_scoring) { bam_hdr_t *header; bam1_t *b[2] = { NULL, NULL }; int curr, has_prev, pre_end = 0, cur_end = 0, result; kstring_t str; str.l = str.m = 0; str.s = 0; header = sam_hdr_read(in); if (header == NULL) { fprintf(stderr, "[bam_mating_core] ERROR: Couldn't read header\n"); return 1; } // Accept unknown, unsorted, or queryname sort order, but error on coordinate sorted. if ((header->l_text > 3) && (strncmp(header->text, "@HD", 3) == 0)) { char *p, *q; p = strstr(header->text, "\tSO:coordinate"); q = strchr(header->text, '\n'); // Looking for SO:coordinate within the @HD line only // (e.g. must ignore in a @CO comment line later in header) if ((p != 0) && (p < q)) { fprintf(stderr, "[bam_mating_core] ERROR: Coordinate sorted, require grouped/sorted by queryname.\n"); goto fail; } } if (sam_hdr_write(out, header) < 0) goto write_fail; b[0] = bam_init1(); b[1] = bam_init1(); curr = 0; has_prev = 0; while ((result = sam_read1(in, header, b[curr])) >= 0) { bam1_t *cur = b[curr], *pre = b[1-curr]; if (cur->core.flag & BAM_FSECONDARY) { if ( !remove_reads ) { if (sam_write1(out, header, cur) < 0) goto write_fail; } continue; // skip secondary alignments } if (cur->core.flag & BAM_FSUPPLEMENTARY) { if (sam_write1(out, header, cur) < 0) goto write_fail; continue; // pass supplementary alignments through unchanged (TODO:make them match read they came from) } if (cur->core.tid < 0 || cur->core.pos < 0) // If unmapped set the flag { cur->core.flag |= BAM_FUNMAP; } if ((cur->core.flag&BAM_FUNMAP) == 0) // If mapped calculate end { cur_end = bam_endpos(cur); // Check cur_end isn't past the end of the contig we're on, if it is set the UNMAP'd flag if (cur_end > (int)header->target_len[cur->core.tid]) cur->core.flag |= BAM_FUNMAP; } if (has_prev) { // do we have a pair of reads to examine? 
if (strcmp(bam_get_qname(cur), bam_get_qname(pre)) == 0) { // identical pair name pre->core.flag |= BAM_FPAIRED; cur->core.flag |= BAM_FPAIRED; if (sync_mate(pre, cur)) goto fail; if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP)) && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) // if safe set TLEN/ISIZE { uint32_t cur5, pre5; cur5 = (cur->core.flag&BAM_FREVERSE)? cur_end : cur->core.pos; pre5 = (pre->core.flag&BAM_FREVERSE)? pre_end : pre->core.pos; cur->core.isize = pre5 - cur5; pre->core.isize = cur5 - pre5; } else cur->core.isize = pre->core.isize = 0; if (add_ct) bam_template_cigar(pre, cur, &str); // TODO: Add code to properly check if read is in a proper pair based on ISIZE distribution if (proper_pair_check && !plausibly_properly_paired(pre,cur)) { pre->core.flag &= ~BAM_FPROPER_PAIR; cur->core.flag &= ~BAM_FPROPER_PAIR; } if (do_mate_scoring) { if ((add_mate_score(pre, cur) == -1) || (add_mate_score(cur, pre) == -1)) { fprintf(stderr, "[bam_mating_core] ERROR: unable to add mate score.\n"); goto fail; } } // Write out result if ( !remove_reads ) { if (sam_write1(out, header, pre) < 0) goto write_fail; if (sam_write1(out, header, cur) < 0) goto write_fail; } else { // If we have to remove reads make sure we do it in a way that doesn't create orphans with bad flags if(pre->core.flag&BAM_FUNMAP) cur->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR); if(cur->core.flag&BAM_FUNMAP) pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR); if(!(pre->core.flag&BAM_FUNMAP)) { if (sam_write1(out, header, pre) < 0) goto write_fail; } if(!(cur->core.flag&BAM_FUNMAP)) { if (sam_write1(out, header, cur) < 0) goto write_fail; } } has_prev = 0; } else { // unpaired? 
clear bad info and write it out if (pre->core.tid < 0 || pre->core.pos < 0 || pre->core.flag&BAM_FUNMAP) { // If unmapped pre->core.flag |= BAM_FUNMAP; pre->core.tid = -1; pre->core.pos = -1; } pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0; pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR); if ( !remove_reads || !(pre->core.flag&BAM_FUNMAP) ) { if (sam_write1(out, header, pre) < 0) goto write_fail; } } } else has_prev = 1; curr = 1 - curr; pre_end = cur_end; } if (result < -1) goto fail; if (has_prev && !remove_reads) { // If we still have a BAM in the buffer it must be unpaired bam1_t *pre = b[1-curr]; if (pre->core.tid < 0 || pre->core.pos < 0 || pre->core.flag&BAM_FUNMAP) { // If unmapped pre->core.flag |= BAM_FUNMAP; pre->core.tid = -1; pre->core.pos = -1; } pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0; pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR); if (sam_write1(out, header, pre) < 0) goto write_fail; } bam_hdr_destroy(header); bam_destroy1(b[0]); bam_destroy1(b[1]); free(str.s); return 0; write_fail: print_error_errno("fixmate", "Couldn't write to output file"); fail: bam_hdr_destroy(header); bam_destroy1(b[0]); bam_destroy1(b[1]); return 1; }
int main_cat(int argc, char *argv[]) { bam_hdr_t *h = 0; char *outfn = 0; int c, ret = 0; samFile *in; while ((c = getopt(argc, argv, "h:o:")) >= 0) { switch (c) { case 'h': { samFile *fph = sam_open(optarg, "r"); if (fph == 0) { fprintf(pysam_stderr, "[%s] ERROR: fail to read the header from '%s'.\n", __func__, argv[1]); return 1; } h = sam_hdr_read(fph); if (h == NULL) { fprintf(pysam_stderr, "[%s] ERROR: failed to read the header for '%s'.\n", __func__, argv[1]); return 1; } sam_close(fph); break; } case 'o': outfn = strdup(optarg); break; } } if (argc - optind < 1) { fprintf(pysam_stderr, "Usage: samtools cat [-h header.sam] [-o out.bam] <in1.bam> [...]\n"); return 1; } in = sam_open(argv[optind], "r"); if (!in) { print_error_errno("cat", "failed to open file '%s'", argv[optind]); return 1; } switch (hts_get_format(in)->format) { case bam: sam_close(in); if (bam_cat(argc - optind, argv + optind, h, outfn? outfn : "-") < 0) ret = 1; break; case cram: sam_close(in); if (cram_cat(argc - optind, argv + optind, h, outfn? outfn : "-") < 0) ret = 1; break; default: sam_close(in); fprintf(pysam_stderr, "[%s] ERROR: input is not BAM or CRAM\n", __func__); return 1; } free(outfn); if (h) bam_hdr_destroy(h); return ret; }
/* * CRAM files don't store the RG:Z:ID per read in the aux field. * Instead they have a numerical data series (RG) to point each read * back to the Nth @RG line in the file. This means that we may need * to edit the RG data series (if the files were produced from * "samtools split" for example). * * The encoding method is stored in the compression header. Typical * examples: * * RG => EXTERNAL {18} # Block content-id 18 holds RG values * # as a series of ITF8 encoded values * * RG => HUFFMAN {1, 255, 255, 255, 255, 255, 1, 0} * # One RG value #-1. (No RG) * * RG => HUFFMAN {1, 0, 1, 0} # One RG value #0 (always first RG) * * RG => HUFFMAN {2, 0, 1, 2, 1, 1} * # Two RG values, #0 and #1, written * # to the CORE block and possibly * # mixed with other data series. * * A single value can (but may not be) implemented as a zero bit * huffman code. In this situation we can change the meta-data in the * compression header to renumber an RG value.. */ int cram_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outcram) { samFile *out; cram_fd *out_c; int i, vers_maj, vers_min; khash_s2i *rg2id = NULL; bam_hdr_t *new_h = NULL; /* Check consistent versioning and compatible headers */ if (!(new_h = cram_cat_check_hdr(nfn, fn, h, &rg2id, &vers_maj, &vers_min))) return -1; /* Open the file with cram_vers */ char vers[100]; sprintf(vers, "%d.%d", vers_maj, vers_min); out = sam_open(outcram, "wc"); if (out == 0) { print_error_errno("cat", "fail to open output file '%s'", outcram); return -1; } out_c = out->fp.cram; cram_set_option(out_c, CRAM_OPT_VERSION, vers); //fprintf(pysam_stderr, "Creating cram vers %s\n", vers); cram_fd_set_header(out_c, sam_hdr_parse_(new_h->text, new_h->l_text)); // needed? 
if (sam_hdr_write(out, new_h) < 0) { print_error_errno("cat", "Couldn't write header"); return -1; } for (i = 0; i < nfn; ++i) { samFile *in; cram_fd *in_c; cram_container *c; bam_hdr_t *old; int new_rg = -1; in = sam_open(fn[i], "rc"); if (in == 0) { print_error_errno("cat", "fail to open file '%s'", fn[i]); return -1; } in_c = in->fp.cram; old = sam_hdr_read(in); khash_s2i *rg2id_in = hash_rg(old); // Compute RG mapping if suitable for changing. if (rg2id_in->n_id == 1) { int _; new_rg = hash_s2i_inc(rg2id, rg2id_in->id[0], NULL, &_); } else { new_rg = 0; } hash_s2i_free(rg2id_in); // Copy contains and blocks within them while ((c = cram_read_container(in_c))) { cram_block *blk; if (cram_container_is_empty(in_c)) { if (cram_write_container(out_c, c) != 0) return -1; // Container compression header if (!(blk = cram_read_block(in_c))) return -1; if (cram_write_block(out_c, blk) != 0) { cram_free_block(blk); return -1; } cram_free_block(blk); cram_free_container(c); continue; } // If we have just one RG key and new_rg != 0 then // we need to edit the compression header. IF WE CAN. if (new_rg) { int zero = 0; //fprintf(pysam_stderr, "Transcode RG %d to %d\n", 0, new_rg); cram_transcode_rg(in_c, out_c, c, 1, &zero, &new_rg); } else { int32_t num_slices; // Not switching rg so do the usual read/write loop if (cram_write_container(out_c, c) != 0) return -1; // Container compression header if (!(blk = cram_read_block(in_c))) return -1; if (cram_write_block(out_c, blk) != 0) { cram_free_block(blk); return -1; } cram_free_block(blk); // Container num_blocks can be invalid, due to a bug. // Instead we iterate in slice context instead. (void)cram_container_get_landmarks(c, &num_slices); cram_copy_slice(in_c, out_c, num_slices); } cram_free_container(c); } bam_hdr_destroy(old); sam_close(in); } sam_close(out); hash_s2i_free(rg2id); bam_hdr_destroy(new_h); return 0; }
int bam_mpileup(int argc, char *argv[]) { int c; const char *file_list = NULL; char **fn = NULL; int nfiles = 0, use_orphan = 0; mplp_conf_t mplp; memset(&mplp, 0, sizeof(mplp_conf_t)); mplp.min_baseQ = 13; mplp.capQ_thres = 0; mplp.max_depth = 250; mplp.max_indel_depth = 250; mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100; mplp.min_frac = 0.002; mplp.min_support = 1; mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_SMART_OVERLAPS; mplp.argc = argc; mplp.argv = argv; mplp.rflag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP; mplp.output_fname = NULL; static const struct option lopts[] = { {"rf", required_argument, NULL, 1}, // require flag {"ff", required_argument, NULL, 2}, // filter flag {"incl-flags", required_argument, NULL, 1}, {"excl-flags", required_argument, NULL, 2}, {"output", required_argument, NULL, 3}, {"open-prob", required_argument, NULL, 4}, {"illumina1.3+", no_argument, NULL, '6'}, {"count-orphans", no_argument, NULL, 'A'}, {"bam-list", required_argument, NULL, 'b'}, {"no-BAQ", no_argument, NULL, 'B'}, {"no-baq", no_argument, NULL, 'B'}, {"adjust-MQ", required_argument, NULL, 'C'}, {"adjust-mq", required_argument, NULL, 'C'}, {"max-depth", required_argument, NULL, 'd'}, {"redo-BAQ", no_argument, NULL, 'E'}, {"redo-baq", no_argument, NULL, 'E'}, {"fasta-ref", required_argument, NULL, 'f'}, {"exclude-RG", required_argument, NULL, 'G'}, {"exclude-rg", required_argument, NULL, 'G'}, {"positions", required_argument, NULL, 'l'}, {"region", required_argument, NULL, 'r'}, {"ignore-RG", no_argument, NULL, 'R'}, {"ignore-rg", no_argument, NULL, 'R'}, {"min-MQ", required_argument, NULL, 'q'}, {"min-mq", required_argument, NULL, 'q'}, {"min-BQ", required_argument, NULL, 'Q'}, {"min-bq", required_argument, NULL, 'Q'}, {"ignore-overlaps", no_argument, NULL, 'x'}, {"BCF", no_argument, NULL, 'g'}, {"bcf", no_argument, NULL, 'g'}, {"VCF", no_argument, NULL, 'v'}, {"vcf", no_argument, NULL, 'v'}, {"output-BP", no_argument, NULL, 'O'}, {"output-bp", 
no_argument, NULL, 'O'}, {"output-MQ", no_argument, NULL, 's'}, {"output-mq", no_argument, NULL, 's'}, {"output-tags", required_argument, NULL, 't'}, {"uncompressed", no_argument, NULL, 'u'}, {"ext-prob", required_argument, NULL, 'e'}, {"gap-frac", required_argument, NULL, 'F'}, {"tandem-qual", required_argument, NULL, 'h'}, {"skip-indels", no_argument, NULL, 'I'}, {"max-idepth", required_argument, NULL, 'L'}, {"min-ireads ", required_argument, NULL, 'm'}, {"per-sample-mF", no_argument, NULL, 'p'}, {"per-sample-mf", no_argument, NULL, 'p'}, {"platforms", required_argument, NULL, 'P'}, {NULL, 0, NULL, 0} }; while ((c = getopt_long(argc, argv, "Agf:r:l:q:Q:uRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsVvxt:",lopts,NULL)) >= 0) { switch (c) { case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break; case 1 : mplp.rflag_require = bam_str2flag(optarg); if ( mplp.rflag_require<0 ) { fprintf(stderr,"Could not parse --rf %s\n", optarg); return 1; } break; case 2 : mplp.rflag_filter = bam_str2flag(optarg); if ( mplp.rflag_filter<0 ) { fprintf(stderr,"Could not parse --ff %s\n", optarg); return 1; } break; case 3 : mplp.output_fname = optarg; break; case 4 : mplp.openQ = atoi(optarg); break; case 'f': mplp.fai = fai_load(optarg); if (mplp.fai == 0) return 1; mplp.fai_fname = optarg; break; case 'd': mplp.max_depth = atoi(optarg); break; case 'r': mplp.reg = strdup(optarg); break; case 'l': // In the original version the whole BAM was streamed which is inefficient // with few BED intervals and big BAMs. Todo: devise a heuristic to determine // best strategy, that is streaming or jumping. 
mplp.bed = bed_read(optarg); if (!mplp.bed) { print_error_errno("Could not read file \"%s\"", optarg); return 1; } break; case 'P': mplp.pl_list = strdup(optarg); break; case 'p': mplp.flag |= MPLP_PER_SAMPLE; break; case 'g': mplp.flag |= MPLP_BCF; break; case 'v': mplp.flag |= MPLP_BCF | MPLP_VCF; break; case 'u': mplp.flag |= MPLP_NO_COMP | MPLP_BCF; break; case 'B': mplp.flag &= ~MPLP_REALN; break; case 'D': mplp.fmt_flag |= B2B_FMT_DP; fprintf(stderr, "[warning] samtools mpileup option `-D` is functional, but deprecated. Please switch to `-t DP` in future.\n"); break; case 'S': mplp.fmt_flag |= B2B_FMT_SP; fprintf(stderr, "[warning] samtools mpileup option `-S` is functional, but deprecated. Please switch to `-t SP` in future.\n"); break; case 'V': mplp.fmt_flag |= B2B_FMT_DV; fprintf(stderr, "[warning] samtools mpileup option `-V` is functional, but deprecated. Please switch to `-t DV` in future.\n"); break; case 'I': mplp.flag |= MPLP_NO_INDEL; break; case 'E': mplp.flag |= MPLP_REDO_BAQ; break; case '6': mplp.flag |= MPLP_ILLUMINA13; break; case 'R': mplp.flag |= MPLP_IGNORE_RG; break; case 's': mplp.flag |= MPLP_PRINT_MAPQ; break; case 'O': mplp.flag |= MPLP_PRINT_POS; break; case 'C': mplp.capQ_thres = atoi(optarg); break; case 'q': mplp.min_mq = atoi(optarg); break; case 'Q': mplp.min_baseQ = atoi(optarg); break; case 'b': file_list = optarg; break; case 'o': { char *end; long value = strtol(optarg, &end, 10); // Distinguish between -o INT and -o FILE (a bit of a hack!) 
if (*end == '\0') mplp.openQ = value; else mplp.output_fname = optarg; } break; case 'e': mplp.extQ = atoi(optarg); break; case 'h': mplp.tandemQ = atoi(optarg); break; case 'A': use_orphan = 1; break; case 'F': mplp.min_frac = atof(optarg); break; case 'm': mplp.min_support = atoi(optarg); break; case 'L': mplp.max_indel_depth = atoi(optarg); break; case 'G': { FILE *fp_rg; char buf[1024]; mplp.rghash = khash_str2int_init(); if ((fp_rg = fopen(optarg, "r")) == 0) fprintf(stderr, "(%s) Fail to open file %s. Continue anyway.\n", __func__, optarg); while (!feof(fp_rg) && fscanf(fp_rg, "%s", buf) > 0) // this is not a good style, but forgive me... khash_str2int_inc(mplp.rghash, strdup(buf)); fclose(fp_rg); } break; case 't': mplp.fmt_flag |= parse_format_flag(optarg); break; default: fprintf(stderr,"Invalid option: '%c'\n", c); return 1; } } if ( !(mplp.flag&MPLP_REALN) && mplp.flag&MPLP_REDO_BAQ ) { fprintf(stderr,"Error: The -B option cannot be combined with -E\n"); return 1; } if (use_orphan) mplp.flag &= ~MPLP_NO_ORPHAN; if (argc == 1) { print_usage(stderr, &mplp); return 1; } int ret; if (file_list) { if ( read_file_list(file_list,&nfiles,&fn) ) return 1; ret = mpileup(&mplp,nfiles,fn); for (c=0; c<nfiles; c++) free(fn[c]); free(fn); } else ret = mpileup(&mplp, argc - optind, argv + optind); if (mplp.rghash) khash_str2int_destroy_free(mplp.rghash); free(mplp.reg); free(mplp.pl_list); if (mplp.fai) fai_destroy(mplp.fai); if (mplp.bed) bed_destroy(mplp.bed); return ret; }
int main_depth(int argc, char *argv[]) { int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0, status = EXIT_SUCCESS, nfiles; const bam_pileup1_t **plp; char *reg = 0; // specified region void *bed = 0; // BED data structure char *file_list = NULL, **fn = NULL; bam_hdr_t *h = NULL; // BAM header of the 1st input aux_t **data; bam_mplp_t mplp; // parse the command line while ((n = getopt(argc, argv, "r:b:q:Q:l:f:")) >= 0) { switch (n) { case 'l': min_len = atoi(optarg); break; // minimum query length case 'r': reg = strdup(optarg); break; // parsing a region requires a BAM header case 'b': bed = bed_read(optarg); // BED or position list file can be parsed now if (!bed) { print_error_errno("Could not read file \"%s\"", optarg); return 1; } break; case 'q': baseQ = atoi(optarg); break; // base quality threshold case 'Q': mapQ = atoi(optarg); break; // mapping quality threshold case 'f': file_list = optarg; break; } } if (optind == argc && !file_list) { fprintf(pysamerr, "\n"); fprintf(pysamerr, "Usage: samtools depth [options] in1.bam [in2.bam [...]]\n"); fprintf(pysamerr, "Options:\n"); fprintf(pysamerr, " -b <bed> list of positions or regions\n"); fprintf(pysamerr, " -f <list> list of input BAM filenames, one per line [null]\n"); fprintf(pysamerr, " -l <int> read length threshold (ignore reads shorter than <int>)\n"); fprintf(pysamerr, " -q <int> base quality threshold\n"); fprintf(pysamerr, " -Q <int> mapping quality threshold\n"); fprintf(pysamerr, " -r <chr:from-to> region\n"); fprintf(pysamerr, "\n"); return 1; } // initialize the auxiliary data structures if (file_list) { if ( read_file_list(file_list,&nfiles,&fn) ) return 1; n = nfiles; argv = fn; optind = 0; } else n = argc - optind; // the number of BAMs on the command line data = calloc(n, sizeof(aux_t*)); // data[i] for the i-th input beg = 0; end = 1<<30; // set the default region for (i = 0; i < n; ++i) { data[i] = calloc(1, sizeof(aux_t)); data[i]->fp = sam_open(argv[optind+i], "r"); 
// open BAM if (data[i]->fp == NULL) { print_error_errno("Could not open \"%s\"", argv[optind+i]); status = EXIT_FAILURE; goto depth_end; } if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS, SAM_FLAG | SAM_RNAME | SAM_POS | SAM_MAPQ | SAM_CIGAR | SAM_SEQ)) { fprintf(pysamerr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); return 1; } if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) { fprintf(pysamerr, "Failed to set CRAM_OPT_DECODE_MD value\n"); return 1; } data[i]->min_mapQ = mapQ; // set the mapQ filter data[i]->min_len = min_len; // set the qlen filter data[i]->hdr = sam_hdr_read(data[i]->fp); // read the BAM header if (reg) { // if a region is specified hts_idx_t *idx = sam_index_load(data[i]->fp, argv[optind+i]); // load the index if (idx == NULL) { print_error("can't load index for \"%s\"", argv[optind+i]); status = EXIT_FAILURE; goto depth_end; } data[i]->iter = sam_itr_querys(idx, data[i]->hdr, reg); // set the iterator hts_idx_destroy(idx); // the index is not needed any more; free the memory if (data[i]->iter == NULL) { print_error("can't parse region \"%s\"", reg); status = EXIT_FAILURE; goto depth_end; } } } h = data[0]->hdr; // easy access to the header of the 1st BAM if (reg) { beg = data[0]->iter->beg; // and to the parsed region coordinates end = data[0]->iter->end; } // the core multi-pileup loop mplp = bam_mplp_init(n, read_bam, (void**)data); // initialization n_plp = calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM plp = calloc(n, sizeof(bam_pileup1_t*)); // plp[i] points to the array of covering reads (internal in mplp) while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) { // come to the next covered position if (pos < beg || pos >= end) continue; // out of range; skip if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; // not in BED; skip fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); // a customized printf() would be faster for (i = 0; i < n; ++i) { // 
base level filters have to go here int j, m = 0; for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; // DON'T modfity plp[][] unless you really know if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos else if (bam_get_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality } printf("\t%d", n_plp[i] - m); // this the depth to output } putchar('\n'); } free(n_plp); free(plp); bam_mplp_destroy(mplp); depth_end: for (i = 0; i < n && data[i]; ++i) { bam_hdr_destroy(data[i]->hdr); if (data[i]->fp) sam_close(data[i]->fp); hts_itr_destroy(data[i]->iter); free(data[i]); } free(data); free(reg); if (bed) bed_destroy(bed); if ( file_list ) { for (i=0; i<n; i++) free(fn[i]); free(fn); } return status; }
int bam_flagstat(int argc, char *argv[]) { samFile *fp; bam_hdr_t *header; bam_flagstat_t *s; char b0[16], b1[16]; int c; enum { INPUT_FMT_OPTION = CHAR_MAX+1, }; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', '-', '@'), {NULL, 0, NULL, 0} }; while ((c = getopt_long(argc, argv, "@:", lopts, NULL)) >= 0) { switch (c) { default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ case '?': usage_exit(samtools_stderr, EXIT_FAILURE); } } if (argc != optind+1) { if (argc == optind) usage_exit(samtools_stdout, EXIT_SUCCESS); else usage_exit(samtools_stderr, EXIT_FAILURE); } fp = sam_open_format(argv[optind], "r", &ga.in); if (fp == NULL) { print_error_errno("flagstat", "Cannot open input file \"%s\"", argv[optind]); return 1; } if (ga.nthreads > 0) hts_set_threads(fp, ga.nthreads); if (hts_set_opt(fp, CRAM_OPT_REQUIRED_FIELDS, SAM_FLAG | SAM_MAPQ | SAM_RNEXT)) { fprintf(samtools_stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); return 1; } if (hts_set_opt(fp, CRAM_OPT_DECODE_MD, 0)) { fprintf(samtools_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); return 1; } header = sam_hdr_read(fp); if (header == NULL) { fprintf(samtools_stderr, "Failed to read header for \"%s\"\n", argv[optind]); return 1; } s = bam_flagstat_core(fp, header); fprintf(samtools_stdout, "%lld + %lld in total (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]); fprintf(samtools_stdout, "%lld + %lld secondary\n", s->n_secondary[0], s->n_secondary[1]); fprintf(samtools_stdout, "%lld + %lld supplementary\n", s->n_supp[0], s->n_supp[1]); fprintf(samtools_stdout, "%lld + %lld duplicates\n", s->n_dup[0], s->n_dup[1]); fprintf(samtools_stdout, "%lld + %lld mapped (%s : %s)\n", s->n_mapped[0], s->n_mapped[1], percent(b0, s->n_mapped[0], s->n_reads[0]), percent(b1, s->n_mapped[1], s->n_reads[1])); fprintf(samtools_stdout, "%lld + %lld paired in sequencing\n", 
s->n_pair_all[0], s->n_pair_all[1]); fprintf(samtools_stdout, "%lld + %lld read1\n", s->n_read1[0], s->n_read1[1]); fprintf(samtools_stdout, "%lld + %lld read2\n", s->n_read2[0], s->n_read2[1]); fprintf(samtools_stdout, "%lld + %lld properly paired (%s : %s)\n", s->n_pair_good[0], s->n_pair_good[1], percent(b0, s->n_pair_good[0], s->n_pair_all[0]), percent(b1, s->n_pair_good[1], s->n_pair_all[1])); fprintf(samtools_stdout, "%lld + %lld with itself and mate mapped\n", s->n_pair_map[0], s->n_pair_map[1]); fprintf(samtools_stdout, "%lld + %lld singletons (%s : %s)\n", s->n_sgltn[0], s->n_sgltn[1], percent(b0, s->n_sgltn[0], s->n_pair_all[0]), percent(b1, s->n_sgltn[1], s->n_pair_all[1])); fprintf(samtools_stdout, "%lld + %lld with mate mapped to a different chr\n", s->n_diffchr[0], s->n_diffchr[1]); fprintf(samtools_stdout, "%lld + %lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh[0], s->n_diffhigh[1]); free(s); bam_hdr_destroy(header); sam_close(fp); sam_global_args_free(&ga); return 0; }
int bam_mating(int argc, char *argv[]) { htsThreadPool p = {NULL, 0}; samFile *in = NULL, *out = NULL; int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1, mate_score = 0; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; char wmode[3] = {'w', 'b', 0}; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), { NULL, 0, NULL, 0 } }; // parse args if (argc == 1) { usage(stdout); return 0; } while ((c = getopt_long(argc, argv, "rpcmO:@:", lopts, NULL)) >= 0) { switch (c) { case 'r': remove_reads = 1; break; case 'p': proper_pair_check = 0; break; case 'c': add_ct = 1; break; case 'm': mate_score = 1; break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ case '?': usage(stderr); goto fail; } } if (optind+1 >= argc) { usage(stderr); goto fail; } // init if ((in = sam_open_format(argv[optind], "rb", &ga.in)) == NULL) { print_error_errno("fixmate", "cannot open input file"); goto fail; } sam_open_mode(wmode+1, argv[optind+1], NULL); if ((out = sam_open_format(argv[optind+1], wmode, &ga.out)) == NULL) { print_error_errno("fixmate", "cannot open output file"); goto fail; } if (ga.nthreads > 0) { if (!(p.pool = hts_tpool_init(ga.nthreads))) { fprintf(stderr, "Error creating thread pool\n"); goto fail; } hts_set_opt(in, HTS_OPT_THREAD_POOL, &p); hts_set_opt(out, HTS_OPT_THREAD_POOL, &p); } // run res = bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct, mate_score); // cleanup sam_close(in); if (sam_close(out) < 0) { fprintf(stderr, "[bam_mating] error while closing output file\n"); res = 1; } if (p.pool) hts_tpool_destroy(p.pool); sam_global_args_free(&ga); return res; fail: if (in) sam_close(in); if (out) sam_close(out); if (p.pool) hts_tpool_destroy(p.pool); sam_global_args_free(&ga); return 1; }