static void copy_check_alignment(const char *infname, const char *informat, const char *outfname, const char *outmode, const char *outref) { samFile *in = sam_open(infname, "r"); samFile *out = sam_open(outfname, outmode); bam1_t *aln = bam_init1(); bam_hdr_t *header = NULL; int res; if (!in) { fail("couldn't open %s", infname); goto err; } if (!out) { fail("couldn't open %s with mode %s", outfname, outmode); goto err; } if (!aln) { fail("bam_init1() failed"); goto err; } if (outref) { if (hts_set_opt(out, CRAM_OPT_REFERENCE, outref) < 0) { fail("setting reference %s for %s", outref, outfname); goto err; } } header = sam_hdr_read(in); if (!header) { fail("reading header from %s", infname); goto err; } if (sam_hdr_write(out, header) < 0) fail("writing headers to %s", outfname); while ((res = sam_read1(in, header, aln)) >= 0) { int mod4 = ((intptr_t) bam_get_cigar(aln)) % 4; if (mod4 != 0) fail("%s CIGAR not 4-byte aligned; offset is 4k+%d for \"%s\"", informat, mod4, bam_get_qname(aln)); if (sam_write1(out, header, aln) < 0) fail("writing to %s", outfname); } if (res < -1) { fail("failed to read alignment from %s", infname); } err: bam_destroy1(aln); bam_hdr_destroy(header); if (in) sam_close(in); if (out) sam_close(out); }
/*
 * Writes the output header, then streams every record from the input file
 * through state->mode_func and on to the output file.
 *
 * Returns true when the whole input was processed, false on any header
 * write, record write, allocation or read error (an error message is
 * printed in each case).
 */
static bool readgroupise(state_t* state)
{
    if (sam_hdr_write(state->output_file, state->output_header) != 0) {
        print_error_errno("addreplacerg", "[%s] Could not write header to output file", __func__);
        return false;
    }

    bam1_t* file_read = bam_init1();
    if (file_read == NULL) {
        /* Without this check a failed allocation is handed straight to sam_read1(). */
        print_error_errno("addreplacerg", "[%s] Could not allocate memory for read", __func__);
        return false;
    }

    int ret;
    while ((ret = sam_read1(state->input_file, state->input_header, file_read)) >= 0) {
        state->mode_func(state, file_read);

        if (sam_write1(state->output_file, state->output_header, file_read) < 0) {
            print_error_errno("addreplacerg", "[%s] Could not write read to output file", __func__);
            bam_destroy1(file_read);
            return false;
        }
    }
    bam_destroy1(file_read);

    /* -1 signals a clean end of file; any other negative value is an error. */
    if (ret != -1) {
        print_error_errno("addreplacerg", "[%s] Error reading from input file", __func__);
        return false;
    } else {
        return true;
    }
}
int main(int argc, char **argv) { dlib::BamHandle in = dlib::BamHandle("bed_test.bam"); dlib::ParsedBed bed = dlib::ParsedBed("bed_test.bed", in.header); bam1_t *b = bam_init1(); size_t diffs = 0; void *lh3bed = bed_read("bed_test.bed"); samFile *so = sam_open("disagreed.bam", "wb9"); sam_hdr_write(so, in.header); size_t disagrees = 0, agrees = 0; int dbr = 0, lh3r = 0; while(in.read(b) != -1) { if(b->core.flag & (BAM_FUNMAP)) continue; if((dbr = bed.bam1_test(b)) != (lh3r = bed_overlap(lh3bed, in.header->target_name[b->core.tid], b->core.pos, bam_endpos(b)))) { LOG_EXIT("dbr: %i. lh3r: %i. Contig: %s. Position: %i. endpos; %i\n", dbr, lh3r, in.header->target_name[b->core.tid], b->core.pos, bam_endpos(b)); if(++disagrees % 100 == 0) LOG_DEBUG("disagrees: %lu.\n", disagrees); sam_write1(so, in.header, b); } else { if(++agrees % 500000 == 0) LOG_DEBUG("agrees: %lu.\n", agrees); } } sam_close(so); bam_destroy1(b); bed_destroy(lh3bed); return EXIT_SUCCESS; }
void ingestHeader(std::shared_ptr<bam_hdr_t> &header) { auto version = bamql::version(); std::stringstream name; name << "bamql-chain "; for (auto chains = known_chains.begin(); chains != known_chains.end(); chains++) { if (chains->second == chain) { name << chains->first; break; } } uuid_t uuid; uuid_generate(uuid); char id_str[sizeof(uuid_t) * 2 + 1]; uuid_unparse(uuid, id_str); auto copy = bamql::appendProgramToHeader( header.get(), name.str(), std::string(id_str), version, query); if (output_file) { sam_hdr_write(output_file.get(), copy.get()); } if (next) next->ingestHeader(chain == 3 ? header : copy); }
/*
 * Reads a CRAM file and writes a new CRAM file to stdout with 'h'
 * replaced as the header.  No checks are made to the validity.
 *
 * in        Input CRAM stream, positioned at the first container to copy.
 * h         Replacement header.  When add_PG is set its text is replaced
 *           by the text including the new @PG line.
 * arg_list  Command line recorded as the CL field of the @PG line; may be
 *           NULL.
 * add_PG    Non-zero to add a "samtools" @PG line to the header.
 *
 * Returns 0 on success, -1 on failure.
 */
int cram_reheader(cram_fd *in, bam_hdr_t *h, const char *arg_list, int add_PG)
{
    htsFile *h_out = hts_open("-", "wc");
    cram_fd *out;
    cram_container *c = NULL;
    int ret = -1;

    /* Opening stdout can fail; the handle is dereferenced right below. */
    if (h_out == NULL)
        return -1;
    out = h_out->fp.cram;

    // Attempt to fill out a cram->refs[] array from @SQ headers
    cram_fd_set_header(out, sam_hdr_parse_(h->text, h->l_text));
    if (cram_fd_get_header(out) == NULL)
        goto err;   /* header text could not be parsed */

    if (add_PG) {
        if (sam_hdr_add_PG(cram_fd_get_header(out), "samtools",
                           "VN", samtools_version(),
                           arg_list ? "CL": NULL,
                           arg_list ? arg_list : NULL,
                           NULL) != 0)
            goto err;

        // Convert back to bam_hdr_t struct
        free(h->text);
        h->text = strdup(sam_hdr_str(cram_fd_get_header(out)));
        h->l_text = sam_hdr_length(cram_fd_get_header(out));
        if (!h->text)
            goto err;
    }

    if (sam_hdr_write(h_out, h) != 0)
        goto err;
    cram_set_option(out, CRAM_OPT_REFERENCE, NULL);

    /* Copy each container header followed by all of its blocks verbatim. */
    while ((c = cram_read_container(in))) {
        int32_t i, num_blocks = cram_container_get_num_blocks(c);
        if (cram_write_container(out, c) != 0)
            goto err;

        for (i = 0; i < num_blocks; i++) {
            cram_block *blk = cram_read_block(in);
            if (!blk || cram_write_block(out, blk) != 0) {
                if (blk) cram_free_block(blk);
                goto err;
            }
            cram_free_block(blk);
        }
        cram_free_container(c);
    }
    /* c is NULL here, so the cleanup below only frees a container when one
       of the error paths inside the loop was taken. */
    ret = 0;

 err:
    if (c)
        cram_free_container(c);
    if (hts_close(h_out) != 0)
        ret = -1;

    return ret;
}
/*
 * Dumps the first l records of buf to the file fn, opened with the given
 * mode, preceded by header h.  Compression threads are enabled when
 * n_threads is greater than one.  If the file cannot be opened nothing is
 * written; no errors are reported to the caller.
 */
static void write_buffer(const char *fn, const char *mode, size_t l, bam1_p *buf, const bam_hdr_t *h, int n_threads)
{
    samFile *out = sam_open(fn, mode);
    if (!out)
        return;

    sam_hdr_write(out, h);
    if (n_threads > 1)
        hts_set_threads(out, n_threads);

    size_t idx = 0;
    while (idx < l) {
        sam_write1(out, h, buf[idx]);
        ++idx;
    }

    sam_close(out);
}
samfile_t *samopen(const char *fn, const char *mode, const void *aux) { // hts_open() is really sam_open(), except for #define games samFile *hts_fp = hts_open(fn, mode); if (hts_fp == NULL) return NULL; samfile_t *fp = malloc(sizeof (samfile_t)); if (!fp) { sam_close(hts_fp); return NULL; } fp->file = hts_fp; fp->x.bam = hts_fp->fp.bgzf; if (strchr(mode, 'r')) { if (aux) { if (hts_set_fai_filename(fp->file, aux) != 0) { sam_close(hts_fp); free(fp); return NULL; } } fp->header = sam_hdr_read(fp->file); // samclose() will free this if (fp->header == NULL) { sam_close(hts_fp); free(fp); return NULL; } fp->is_write = 0; if (fp->header->n_targets == 0 && bam_verbose >= 1) fprintf(samtools_stderr, "[samopen] no @SQ lines in the header.\n"); } else { enum htsExactFormat fmt = hts_get_format(fp->file)->format; fp->header = (bam_hdr_t *)aux; // For writing, we won't free it fp->is_write = 1; if (!(fmt == text_format || fmt == sam) || strchr(mode, 'h')) { if (sam_hdr_write(fp->file, fp->header) < 0) { if (bam_verbose >= 1) fprintf(samtools_stderr, "[samopen] Couldn't write header\n"); sam_close(hts_fp); free(fp); return NULL; } } } return fp; }
/*
 * Opens hfp as a SAM/BAM/CRAM stream and copies it to standard output:
 * the header when show_headers is set, and all records when mode is
 * view_all.
 *
 * Returns 0 when hfp could not be opened as an alignment file, and 1 in
 * every other case.  The return value is deliberately NOT changed for
 * later failures: once hts_hopen() has succeeded the stream is closed here.
 * NOTE(review): callers may use the result to decide whether they still
 * own hfp -- confirm before altering the return values.
 */
static int view_sam(hFILE *hfp, const char *filename)
{
    samFile *in = hts_hopen(hfp, filename, "r");
    if (in == NULL) return 0;

    samFile *out = dup_stdout("w");
    bam_hdr_t *hdr = sam_hdr_read(in);

    /* Either of these can fail; writing with a NULL handle or header would
       dereference a NULL pointer, so skip the copy entirely in that case. */
    if (out != NULL && hdr != NULL) {
        if (show_headers) sam_hdr_write(out, hdr);

        if (mode == view_all) {
            bam1_t *b = bam_init1();
            if (b != NULL) {
                while (sam_read1(in, hdr, b) >= 0) {
                    /* Stop at the first write error instead of retrying
                       for every remaining record. */
                    if (sam_write1(out, hdr, b) < 0) break;
                }
                bam_destroy1(b);
            }
        }
    }

    if (hdr != NULL) bam_hdr_destroy(hdr);
    if (out != NULL) hts_close(out);
    hts_close(in);
    return 1;
}
void parse() { if (sam_hdr_write(out_file, replace_header) != 0) { throw new std::runtime_error("IEEEE!"); } bam1_t* file_read = bam_init1(); while (sam_read1(file_iter, file_header, file_read) >= 0) { if (file_read->core.tid != -1) { file_read->core.tid = trans[file_read->core.tid]; } if (file_read->core.mtid != -1) { file_read->core.mtid = trans[file_read->core.mtid]; } sam_write1(out_file, file_header, file_read); } // Clean up if (file_read) { bam_destroy1(file_read); } }
// Opens the producer's temporary file for SAM text output and writes the
// given header to it.
//
// Throws std::runtime_error if the header is null, the file cannot be
// opened, or the header cannot be written.
SamWriterPrivate(const std::string& filename, const PBBAM_SHARED_PTR<bam_hdr_t> rawHeader)
    : internal::FileProducer(filename)
    , file_(nullptr)
    , header_(rawHeader)
{
    if (!header_) {
        throw std::runtime_error("null header");
    }

    // Output goes to the temporary name, opened in plain write mode.
    const std::string& tempPath = TempFilename();
    file_.reset(sam_open(tempPath.c_str(), "w"));
    if (!file_) {
        throw std::runtime_error("could not open file for writing");
    }

    // The header has to be in place before any record is written.
    if (sam_hdr_write(file_.get(), header_.get()) != 0) {
        throw std::runtime_error("could not write header");
    }
}
void SamWriter::write_header() const { sam_hdr_write(m_out_file, m_header.m_header.get()); }
int main_samview(int argc, char *argv[]) { int index; for(index = 0; index < argc; index++) { printf("The %d is %s\n",index,argv[index]); } getchar();return 0; int c, is_header = 0, is_header_only = 0, ret = 0, compress_level = -1, is_count = 0; int is_long_help = 0, n_threads = 0; int64_t count = 0; samFile *in = 0, *out = 0, *un_out=0; bam_hdr_t *header = NULL; char out_mode[5], out_un_mode[5], *out_format = ""; char *fn_in = 0, *fn_out = 0, *fn_list = 0, *q, *fn_un_out = 0; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; samview_settings_t settings = { .rghash = NULL, .min_mapQ = 0, .flag_on = 0, .flag_off = 0, .min_qlen = 0, .remove_B = 0, .subsam_seed = 0, .subsam_frac = -1., .library = NULL, .bed = NULL, }; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 'T'), { "threads", required_argument, NULL, '@' }, { NULL, 0, NULL, 0 } }; /* parse command-line options */ strcpy(out_mode, "w"); strcpy(out_un_mode, "w"); while ((c = getopt_long(argc, argv, "SbBcCt:h1Ho:O:q:f:F:ul:r:?T:R:L:s:@:m:x:U:", lopts, NULL)) >= 0) { switch (c) { case 's': if ((settings.subsam_seed = strtol(optarg, &q, 10)) != 0) { srand(settings.subsam_seed); settings.subsam_seed = rand(); } settings.subsam_frac = strtod(q, &q); break; case 'm': settings.min_qlen = atoi(optarg); break; case 'c': is_count = 1; break; case 'S': break; case 'b': out_format = "b"; break; case 'C': out_format = "c"; break; case 't': fn_list = strdup(optarg); break; case 'h': is_header = 1; break; case 'H': is_header_only = 1; break; case 'o': fn_out = strdup(optarg); break; case 'U': fn_un_out = strdup(optarg); break; case 'f': settings.flag_on |= strtol(optarg, 0, 0); break; case 'F': settings.flag_off |= strtol(optarg, 0, 0); break; case 'q': settings.min_mapQ = atoi(optarg); break; case 'u': compress_level = 0; break; case '1': compress_level = 1; break; case 'l': settings.library = strdup(optarg); break; case 'L': if ((settings.bed = bed_read(optarg)) == NULL) { print_error_errno("view", 
"Could not read file \"%s\"", optarg); ret = 1; goto view_end; } break; case 'r': if (add_read_group_single("view", &settings, optarg) != 0) { ret = 1; goto view_end; } break; case 'R': if (add_read_groups_file("view", &settings, optarg) != 0) { ret = 1; goto view_end; } break; /* REMOVED as htslib doesn't support this //case 'x': out_format = "x"; break; //case 'X': out_format = "X"; break; */ case '?': is_long_help = 1; break; case 'B': settings.remove_B = 1; break; case '@': n_threads = strtol(optarg, 0, 0); break; case 'x': { if (strlen(optarg) != 2) { fprintf(stderr, "main_samview: Error parsing -x auxiliary tags should be exactly two characters long.\n"); return usage(stderr, EXIT_FAILURE, is_long_help); } settings.remove_aux = (char**)realloc(settings.remove_aux, sizeof(char*) * (++settings.remove_aux_len)); settings.remove_aux[settings.remove_aux_len-1] = optarg; } break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) != 0) return usage(stderr, EXIT_FAILURE, is_long_help); break; } } if (compress_level >= 0 && !*out_format) out_format = "b"; if (is_header_only) is_header = 1; // File format auto-detection first if (fn_out) sam_open_mode(out_mode+1, fn_out, NULL); if (fn_un_out) sam_open_mode(out_un_mode+1, fn_un_out, NULL); // Overridden by manual -b, -C if (*out_format) out_mode[1] = out_un_mode[1] = *out_format; out_mode[2] = out_un_mode[2] = '\0'; // out_(un_)mode now 1 or 2 bytes long, followed by nul. if (compress_level >= 0) { char tmp[2]; tmp[0] = compress_level + '0'; tmp[1] = '\0'; strcat(out_mode, tmp); strcat(out_un_mode, tmp); } if (argc == optind && isatty(STDIN_FILENO)) return usage(stdout, EXIT_SUCCESS, is_long_help); // potential memory leak... fn_in = (optind < argc)? 
argv[optind] : "-"; // generate the fn_list if necessary if (fn_list == 0 && ga.reference) fn_list = samfaipath(ga.reference); // open file handlers if ((in = sam_open_format(fn_in, "r", &ga.in)) == 0) { print_error_errno("view", "failed to open \"%s\" for reading", fn_in); ret = 1; goto view_end; } if (fn_list) { if (hts_set_fai_filename(in, fn_list) != 0) { fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list); ret = 1; goto view_end; } } if ((header = sam_hdr_read(in)) == 0) { fprintf(stderr, "[main_samview] fail to read the header from \"%s\".\n", fn_in); ret = 1; goto view_end; } if (settings.rghash) { // FIXME: I do not know what "bam_header_t::n_text" is for... char *tmp; int l; tmp = drop_rg(header->text, settings.rghash, &l); free(header->text); header->text = tmp; header->l_text = l; } if (!is_count) { if ((out = sam_open_format(fn_out? fn_out : "-", out_mode, &ga.out)) == 0) { print_error_errno("view", "failed to open \"%s\" for writing", fn_out? fn_out : "standard output"); ret = 1; goto view_end; } if (fn_list) { if (hts_set_fai_filename(out, fn_list) != 0) { fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list); ret = 1; goto view_end; } } if (*out_format || is_header || out_mode[1] == 'b' || out_mode[1] == 'c' || (ga.out.format != sam && ga.out.format != unknown_format)) { if (sam_hdr_write(out, header) != 0) { fprintf(stderr, "[main_samview] failed to write the SAM header\n"); ret = 1; goto view_end; } } if (fn_un_out) { if ((un_out = sam_open_format(fn_un_out, out_un_mode, &ga.out)) == 0) { print_error_errno("view", "failed to open \"%s\" for writing", fn_un_out); ret = 1; goto view_end; } if (fn_list) { if (hts_set_fai_filename(un_out, fn_list) != 0) { fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list); ret = 1; goto view_end; } } if (*out_format || is_header || out_un_mode[1] == 'b' || out_un_mode[1] == 'c' || (ga.out.format != sam && ga.out.format != unknown_format)) { if 
(sam_hdr_write(un_out, header) != 0) { fprintf(stderr, "[main_samview] failed to write the SAM header\n"); ret = 1; goto view_end; } } } } if (n_threads > 1) { if (out) hts_set_threads(out, n_threads); } if (is_header_only) goto view_end; // no need to print alignments if (optind + 1 >= argc) { // convert/print the entire file bam1_t *b = bam_init1(); int r; while ((r = sam_read1(in, header, b)) >= 0) { // read one alignment from `in' if (!process_aln(header, b, &settings)) { if (!is_count) { if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; } count++; } else { if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; } } } if (r < -1) { fprintf(stderr, "[main_samview] truncated file.\n"); ret = 1; } bam_destroy1(b); } else { // retrieve alignments in specified regions int i; bam1_t *b; hts_idx_t *idx = sam_index_load(in, fn_in); // load index if (idx == 0) { // index is unavailable fprintf(stderr, "[main_samview] random alignment retrieval only works for indexed BAM or CRAM files.\n"); ret = 1; goto view_end; } b = bam_init1(); for (i = optind + 1; i < argc; ++i) { int result; hts_itr_t *iter = sam_itr_querys(idx, header, argv[i]); // parse a region in the format like `chr2:100-200' if (iter == NULL) { // region invalid or reference name not found int beg, end; if (hts_parse_reg(argv[i], &beg, &end)) fprintf(stderr, "[main_samview] region \"%s\" specifies an unknown reference name. Continue anyway.\n", argv[i]); else fprintf(stderr, "[main_samview] region \"%s\" could not be parsed. 
Continue anyway.\n", argv[i]); continue; } // fetch alignments while ((result = sam_itr_next(in, iter, b)) >= 0) { if (!process_aln(header, b, &settings)) { if (!is_count) { if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; } count++; } else { if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; } } } hts_itr_destroy(iter); if (result < -1) { fprintf(stderr, "[main_samview] retrieval of region \"%s\" failed due to truncated file or corrupt BAM index file\n", argv[i]); ret = 1; break; } } bam_destroy1(b); hts_idx_destroy(idx); // destroy the BAM index } view_end: if (is_count && ret == 0) printf("%" PRId64 "\n", count); // close files, free and return if (in) check_sam_close("view", in, fn_in, "standard input", &ret); if (out) check_sam_close("view", out, fn_out, "standard output", &ret); if (un_out) check_sam_close("view", un_out, fn_un_out, "file", &ret); free(fn_list); free(fn_out); free(settings.library); free(fn_un_out); sam_global_args_free(&ga); if ( header ) bam_hdr_destroy(header); if (settings.bed) bed_destroy(settings.bed); if (settings.rghash) { khint_t k; for (k = 0; k < kh_end(settings.rghash); ++k) if (kh_exist(settings.rghash, k)) free((char*)kh_key(settings.rghash, k)); kh_destroy(rg, settings.rghash); } if (settings.remove_aux_len) { free(settings.remove_aux); } return ret; } static int usage(FILE *fp, int exit_status, int is_long_help) { fprintf(fp, "\n" "Usage: samtools view [options] <in.bam>|<in.sam>|<in.cram> [region ...]\n" "\n" "Options:\n" // output options " -b output BAM\n" " -C output CRAM (requires -T)\n" " -1 use fast BAM compression (implies -b)\n" " -u uncompressed BAM output (implies -b)\n" " -h include header in SAM output\n" " -H print SAM header only (no alignments)\n" " -c print only the count of matching records\n" " -o FILE output file name [stdout]\n" " -U FILE output reads not selected by filters to FILE [null]\n" // extra input " -t FILE FILE listing reference names and lengths 
(see long help) [null]\n" // read filters " -L FILE only include reads overlapping this BED FILE [null]\n" " -r STR only include reads in read group STR [null]\n" " -R FILE only include reads with read group listed in FILE [null]\n" " -q INT only include reads with mapping quality >= INT [0]\n" " -l STR only include reads in library STR [null]\n" " -m INT only include reads with number of CIGAR operations consuming\n" " query sequence >= INT [0]\n" " -f INT only include reads with all bits set in INT set in FLAG [0]\n" " -F INT only include reads with none of the bits set in INT set in FLAG [0]\n" // read processing " -x STR read tag to strip (repeatable) [null]\n" " -B collapse the backward CIGAR operation\n" " -s FLOAT integer part sets seed of random number generator [0];\n" " rest sets fraction of templates to subsample [no subsampling]\n" // general options " -@, --threads INT\n" " number of BAM/CRAM compression threads [0]\n" " -? print long help, including note about region specification\n" " -S ignored (input format is auto-detected)\n"); sam_global_opt_help(fp, "-.O.T"); fprintf(fp, "\n"); if (is_long_help) fprintf(fp, "Notes:\n" "\n" "1. This command now auto-detects the input format (BAM/CRAM/SAM).\n" " Further control over the CRAM format can be specified by using the\n" " --output-fmt-option, e.g. to specify the number of sequences per slice\n" " and to use avoid reference based compression:\n" "\n" "\tsamtools view -C --output-fmt-option seqs_per_slice=5000 \\\n" "\t --output-fmt-option no_ref -o out.cram in.bam\n" "\n" " Options can also be specified as a comma separated list within the\n" " --output-fmt value too. For example this is equivalent to the above\n" "\n" "\tsamtools view --output-fmt cram,seqs_per_slice=5000,no_ref \\\n" "\t -o out.cram in.bam\n" "\n" "2. The file supplied with `-t' is SPACE/TAB delimited with the first\n" " two fields of each line consisting of the reference name and the\n" " corresponding sequence length. 
The `.fai' file generated by \n" " `samtools faidx' is suitable for use as this file. This may be an\n" " empty file if reads are unaligned.\n" "\n" "3. SAM->BAM conversion: samtools view -bT ref.fa in.sam.gz\n" "\n" "4. BAM->SAM conversion: samtools view -h in.bam\n" "\n" "5. A region should be presented in one of the following formats:\n" " `chr1', `chr2:1,000' and `chr3:1000-2,000'. When a region is\n" " specified, the input alignment file must be a sorted and indexed\n" " alignment (BAM/CRAM) file.\n" "\n" "6. Option `-u' is preferred over `-b' when the output is piped to\n" " another samtools command.\n" "\n"); return exit_status; }
/*
 * Writes the headers of all output files, then distributes every record of
 * the merged input file to the output file registered for its RG tag in
 * state->rg_hash.  Records whose read group is not registered go to
 * state->unaccounted_file when there is one; otherwise processing stops.
 *
 * Returns true when the whole input was split successfully, false on any
 * header write, record write, allocation or read error, or when a record
 * cannot be assigned to an output file.
 */
static bool split(state_t* state)
{
    if (state->unaccounted_file && sam_hdr_write(state->unaccounted_file, state->unaccounted_header) != 0) {
        fprintf(pysamerr, "Could not write output file header\n");
        return false;
    }
    size_t i;
    for (i = 0; i < state->output_count; i++) {
        if (sam_hdr_write(state->rg_output_file[i], state->rg_output_header[i]) != 0) {
            fprintf(pysamerr, "Could not write output file header\n");
            return false;
        }
    }

    bam1_t* file_read = bam_init1();
    if (file_read == NULL) {
        fprintf(pysamerr, "Could not allocate memory for read\n");
        return false;
    }

    int r;
    while ((r = sam_read1(state->merged_input_file, state->merged_input_header, file_read)) >= 0) {
        // Get RG tag from read and look it up in hash to find file to output it to
        uint8_t* tag = bam_aux_get(file_read, "RG");
        // rg stays NULL both when the tag is absent and when it is present
        // but does not hold a string; neither can be looked up.
        char* rg = (tag != NULL) ? bam_aux2Z(tag) : NULL;
        khiter_t iter = (rg != NULL) ? kh_get_c2i(state->rg_hash, rg)
                                     : kh_end(state->rg_hash);

        // Write the read out to correct file
        if (iter != kh_end(state->rg_hash)) {
            // if found write to the appropriate untangled bam
            int out_idx = kh_val(state->rg_hash, iter);
            if (sam_write1(state->rg_output_file[out_idx], state->rg_output_header[out_idx], file_read) < 0) {
                fprintf(pysamerr, "Could not write to output file\n");
                bam_destroy1(file_read);
                return false;
            }
        } else {
            // otherwise write to the unaccounted bam if there is one or fail
            if (state->unaccounted_file == NULL) {
                if (rg) {
                    fprintf(pysamerr, "Read \"%s\" with unaccounted for tag \"%s\".\n", bam_get_qname(file_read), rg);
                } else {
                    fprintf(pysamerr, "Read \"%s\" has no RG tag.\n", bam_get_qname(file_read));
                }
                bam_destroy1(file_read);
                return false;
            }
            if (sam_write1(state->unaccounted_file, state->unaccounted_header, file_read) < 0) {
                fprintf(pysamerr, "Could not write to output file\n");
                bam_destroy1(file_read);
                return false;
            }
        }
    }
    bam_destroy1(file_read);

    // -1 is a clean end of file; anything lower means the input was
    // truncated or corrupt and must not be reported as success.
    if (r < -1) {
        fprintf(pysamerr, "Could not read input record\n");
        return false;
    }

    return true;
}
int main(int argc, char *argv[]) { samFile *in; char *fn_ref = 0; int flag = 0, c, clevel = -1, ignore_sam_err = 0; char moder[8]; bam_hdr_t *h; bam1_t *b; htsFile *out; char modew[8]; int r = 0, exit_code = 0; while ((c = getopt(argc, argv, "IbDCSl:t:")) >= 0) { switch (c) { case 'S': flag |= 1; break; case 'b': flag |= 2; break; case 'D': flag |= 4; break; case 'C': flag |= 8; break; case 'l': clevel = atoi(optarg); flag |= 2; break; case 't': fn_ref = optarg; break; case 'I': ignore_sam_err = 1; break; } } if (argc == optind) { fprintf(stderr, "Usage: samview [-bSCSI] [-l level] <in.bam>|<in.sam>|<in.cram> [region]\n"); return 1; } strcpy(moder, "r"); if (flag&4) strcat(moder, "c"); else if ((flag&1) == 0) strcat(moder, "b"); in = sam_open(argv[optind], moder); h = sam_hdr_read(in); h->ignore_sam_err = ignore_sam_err; b = bam_init1(); strcpy(modew, "w"); if (clevel >= 0 && clevel <= 9) sprintf(modew + 1, "%d", clevel); if (flag&8) strcat(modew, "c"); else if (flag&2) strcat(modew, "b"); out = hts_open("-", modew); /* CRAM output */ if (flag & 8) { // Parse input header and use for CRAM output out->fp.cram->header = sam_hdr_parse_(h->text, h->l_text); // Create CRAM references arrays if (fn_ref) cram_set_option(out->fp.cram, CRAM_OPT_REFERENCE, fn_ref); else // Attempt to fill out a cram->refs[] array from @SQ headers cram_set_option(out->fp.cram, CRAM_OPT_REFERENCE, NULL); } sam_hdr_write(out, h); if (optind + 1 < argc && !(flag&1)) { // BAM input and has a region int i; hts_idx_t *idx; if ((idx = bam_index_load(argv[optind])) == 0) { fprintf(stderr, "[E::%s] fail to load the BAM index\n", __func__); return 1; } for (i = optind + 1; i < argc; ++i) { hts_itr_t *iter; if ((iter = bam_itr_querys(idx, h, argv[i])) == 0) { fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, argv[i]); continue; } while ((r = bam_itr_next(in, iter, b)) >= 0) { if (sam_write1(out, h, b) < 0) { fprintf(stderr, "Error writing output.\n"); exit_code = 1; break; } } 
hts_itr_destroy(iter); } hts_idx_destroy(idx); } else while ((r = sam_read1(in, h, b)) >= 0) { if (sam_write1(out, h, b) < 0) { fprintf(stderr, "Error writing output.\n"); exit_code = 1; break; } } sam_close(out); if (r < -1) { fprintf(stderr, "Error parsing input.\n"); exit_code = 1; } bam_destroy1(b); bam_hdr_destroy(h); sam_close(in); return exit_code; }
int main(int argc, char *argv[]) { samFile *in; char *fn_ref = 0; int flag = 0, c, clevel = -1, ignore_sam_err = 0; char moder[8]; bam_hdr_t *h; bam1_t *b; htsFile *out; char modew[8]; int r = 0, exit_code = 0; hts_opt *in_opts = NULL, *out_opts = NULL, *last = NULL; int nreads = 0; int benchmark = 0; while ((c = getopt(argc, argv, "IbDCSl:t:i:o:N:B")) >= 0) { switch (c) { case 'S': flag |= 1; break; case 'b': flag |= 2; break; case 'D': flag |= 4; break; case 'C': flag |= 8; break; case 'B': benchmark = 1; break; case 'l': clevel = atoi(optarg); flag |= 2; break; case 't': fn_ref = optarg; break; case 'I': ignore_sam_err = 1; break; case 'i': if (add_option(&in_opts, optarg)) return 1; break; case 'o': if (add_option(&out_opts, optarg)) return 1; break; case 'N': nreads = atoi(optarg); } } if (argc == optind) { fprintf(stderr, "Usage: samview [-bSCSIB] [-N num_reads] [-l level] [-o option=value] <in.bam>|<in.sam>|<in.cram> [region]\n"); return 1; } strcpy(moder, "r"); if (flag&4) strcat(moder, "c"); else if ((flag&1) == 0) strcat(moder, "b"); in = sam_open(argv[optind], moder); if (in == NULL) { fprintf(stderr, "Error opening \"%s\"\n", argv[optind]); return EXIT_FAILURE; } h = sam_hdr_read(in); h->ignore_sam_err = ignore_sam_err; b = bam_init1(); strcpy(modew, "w"); if (clevel >= 0 && clevel <= 9) sprintf(modew + 1, "%d", clevel); if (flag&8) strcat(modew, "c"); else if (flag&2) strcat(modew, "b"); out = hts_open("-", modew); if (out == NULL) { fprintf(stderr, "Error opening standard output\n"); return EXIT_FAILURE; } /* CRAM output */ if (flag & 8) { int ret; // Parse input header and use for CRAM output out->fp.cram->header = sam_hdr_parse_(h->text, h->l_text); // Create CRAM references arrays if (fn_ref) ret = cram_set_option(out->fp.cram, CRAM_OPT_REFERENCE, fn_ref); else // Attempt to fill out a cram->refs[] array from @SQ headers ret = cram_set_option(out->fp.cram, CRAM_OPT_REFERENCE, NULL); if (ret != 0) return EXIT_FAILURE; } // Process any options; 
currently cram only. for (; in_opts; in_opts = (last=in_opts)->next, free(last)) { hts_set_opt(in, in_opts->opt, in_opts->val); if (in_opts->opt == CRAM_OPT_REFERENCE) if (hts_set_opt(out, in_opts->opt, in_opts->val) != 0) return EXIT_FAILURE; } for (; out_opts; out_opts = (last=out_opts)->next, free(last)) if (hts_set_opt(out, out_opts->opt, out_opts->val) != 0) return EXIT_FAILURE; if (!benchmark) sam_hdr_write(out, h); if (optind + 1 < argc && !(flag&1)) { // BAM input and has a region int i; hts_idx_t *idx; if ((idx = sam_index_load(in, argv[optind])) == 0) { fprintf(stderr, "[E::%s] fail to load the BAM index\n", __func__); return 1; } for (i = optind + 1; i < argc; ++i) { hts_itr_t *iter; if ((iter = sam_itr_querys(idx, h, argv[i])) == 0) { fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, argv[i]); continue; } while ((r = sam_itr_next(in, iter, b)) >= 0) { if (!benchmark && sam_write1(out, h, b) < 0) { fprintf(stderr, "Error writing output.\n"); exit_code = 1; break; } if (nreads && --nreads == 0) break; } hts_itr_destroy(iter); } hts_idx_destroy(idx); } else while ((r = sam_read1(in, h, b)) >= 0) { if (!benchmark && sam_write1(out, h, b) < 0) { fprintf(stderr, "Error writing output.\n"); exit_code = 1; break; } if (nreads && --nreads == 0) break; } if (r < -1) { fprintf(stderr, "Error parsing input.\n"); exit_code = 1; } r = sam_close(out); if (r < 0) { fprintf(stderr, "Error closing output.\n"); exit_code = 1; } bam_destroy1(b); bam_hdr_destroy(h); r = sam_close(in); if (r < 0) { fprintf(stderr, "Error closing input.\n"); exit_code = 1; } return exit_code; }
int bam_fillmd(int argc, char *argv[]) { int c, flt_flag, tid = -2, ret, len, is_bam_out, is_uncompressed, max_nm, is_realn, capQ, baq_flag; samFile *fp = NULL, *fpout = NULL; bam_hdr_t *header = NULL; faidx_t *fai = NULL; char *ref = NULL, mode_w[8], *ref_file; bam1_t *b = NULL; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0), { NULL, 0, NULL, 0 } }; flt_flag = UPDATE_NM | UPDATE_MD; is_bam_out = is_uncompressed = is_realn = max_nm = capQ = baq_flag = 0; strcpy(mode_w, "w"); while ((c = getopt_long(argc, argv, "EqreuNhbSC:n:Ad", lopts, NULL)) >= 0) { switch (c) { case 'r': is_realn = 1; break; case 'e': flt_flag |= USE_EQUAL; break; case 'd': flt_flag |= DROP_TAG; break; case 'q': flt_flag |= BIN_QUAL; break; case 'h': flt_flag |= HASH_QNM; break; case 'N': flt_flag &= ~(UPDATE_MD|UPDATE_NM); break; case 'b': is_bam_out = 1; break; case 'u': is_uncompressed = is_bam_out = 1; break; case 'S': break; case 'n': max_nm = atoi(optarg); break; case 'C': capQ = atoi(optarg); break; case 'A': baq_flag |= 1; break; case 'E': baq_flag |= 2; break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n\n", c); /* else fall-through */ case '?': return calmd_usage(); } } if (is_bam_out) strcat(mode_w, "b"); else strcat(mode_w, "h"); if (is_uncompressed) strcat(mode_w, "0"); if (optind + (ga.reference == NULL) >= argc) return calmd_usage(); fp = sam_open_format(argv[optind], "r", &ga.in); if (fp == NULL) { print_error_errno("calmd", "Failed to open input file '%s'", argv[optind]); return 1; } header = sam_hdr_read(fp); if (header == NULL || header->n_targets == 0) { fprintf(stderr, "[bam_fillmd] input SAM does not have header. 
Abort!\n"); goto fail; } fpout = sam_open_format("-", mode_w, &ga.out); if (fpout == NULL) { print_error_errno("calmd", "Failed to open output"); goto fail; } if (sam_hdr_write(fpout, header) < 0) { print_error_errno("calmd", "Failed to write sam header"); goto fail; } ref_file = argc > optind + 1 ? argv[optind+1] : ga.reference; fai = fai_load(ref_file); if (!fai) { print_error_errno("calmd", "Failed to open reference file '%s'", ref_file); goto fail; } b = bam_init1(); if (!b) { fprintf(stderr, "[bam_fillmd] Failed to allocate bam struct\n"); goto fail; } while ((ret = sam_read1(fp, header, b)) >= 0) { if (b->core.tid >= 0) { if (tid != b->core.tid) { free(ref); ref = fai_fetch(fai, header->target_name[b->core.tid], &len); tid = b->core.tid; if (ref == 0) { // FIXME: Should this always be fatal? fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n", header->target_name[tid]); if (is_realn || capQ > 10) goto fail; // Would otherwise crash } } if (is_realn) sam_prob_realn(b, ref, len, baq_flag); if (capQ > 10) { int q = sam_cap_mapq(b, ref, len, capQ); if (b->core.qual > q) b->core.qual = q; } if (ref) bam_fillmd1_core(b, ref, len, flt_flag, max_nm); } if (sam_write1(fpout, header, b) < 0) { print_error_errno("calmd", "failed to write to output file"); goto fail; } } if (ret < -1) { fprintf(stderr, "[bam_fillmd] Error reading input.\n"); goto fail; } bam_destroy1(b); bam_hdr_destroy(header); free(ref); fai_destroy(fai); sam_close(fp); if (sam_close(fpout) < 0) { fprintf(stderr, "[bam_fillmd] error when closing output file\n"); return 1; } return 0; fail: free(ref); if (b) bam_destroy1(b); if (header) bam_hdr_destroy(header); if (fai) fai_destroy(fai); if (fp) sam_close(fp); if (fpout) sam_close(fpout); return 1; }
int main_pad2unpad(int argc, char *argv[]) { samFile *in = 0, *out = 0; bam_hdr_t *h = 0, *h_fix = 0; faidx_t *fai = 0; int c, compress_level = -1, is_long_help = 0; char in_mode[5], out_mode[6], *fn_out = 0, *fn_list = 0; int ret=0; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 'T'), { NULL, 0, NULL, 0 } }; /* parse command-line options */ strcpy(in_mode, "r"); strcpy(out_mode, "w"); while ((c = getopt_long(argc, argv, "SCso:u1T:?", lopts, NULL)) >= 0) { switch (c) { case 'S': break; case 'C': hts_parse_format(&ga.out, "cram"); break; case 's': assert(compress_level == -1); hts_parse_format(&ga.out, "sam"); break; case 'o': fn_out = strdup(optarg); break; case 'u': compress_level = 0; if (ga.out.format == unknown_format) hts_parse_format(&ga.out, "bam"); break; case '1': compress_level = 1; if (ga.out.format == unknown_format) hts_parse_format(&ga.out, "bam"); break; case '?': is_long_help = 1; break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n\n", c); return usage(is_long_help); } } if (argc == optind) return usage(is_long_help); strcat(out_mode, "h"); if (compress_level >= 0) { char tmp[2]; tmp[0] = compress_level + '0'; tmp[1] = '\0'; strcat(out_mode, tmp); } // Load FASTA reference (also needed for SAM -> BAM if missing header) if (ga.reference) { fn_list = samfaipath(ga.reference); fai = fai_load(ga.reference); } // open file handlers if ((in = sam_open_format(argv[optind], in_mode, &ga.in)) == 0) { fprintf(stderr, "[depad] failed to open \"%s\" for reading.\n", argv[optind]); ret = 1; goto depad_end; } if (fn_list && hts_set_fai_filename(in, fn_list) != 0) { fprintf(stderr, "[depad] failed to load reference file \"%s\".\n", fn_list); ret = 1; goto depad_end; } if ((h = sam_hdr_read(in)) == 0) { fprintf(stderr, "[depad] failed to read the header from \"%s\".\n", argv[optind]); ret = 1; goto depad_end; } 
if (fai) { h_fix = fix_header(h, fai); } else { fprintf(stderr, "[depad] Warning - reference lengths will not be corrected without FASTA reference\n"); h_fix = h; } char wmode[2]; strcat(out_mode, sam_open_mode(wmode, fn_out, NULL)==0 ? wmode : "b"); if ((out = sam_open_format(fn_out? fn_out : "-", out_mode, &ga.out)) == 0) { fprintf(stderr, "[depad] failed to open \"%s\" for writing.\n", fn_out? fn_out : "standard output"); ret = 1; goto depad_end; } // Reference-based CRAM won't work unless we also create a new reference. // We could embed this, but for now we take the easy option. if (ga.out.format == cram) hts_set_opt(out, CRAM_OPT_NO_REF, 1); if (sam_hdr_write(out, h_fix) != 0) { fprintf(stderr, "[depad] failed to write header.\n"); ret = 1; goto depad_end; } // Do the depad ret = bam_pad2unpad(in, out, h, fai); depad_end: // close files, free and return if (fai) fai_destroy(fai); if (h) bam_hdr_destroy(h); sam_close(in); sam_close(out); free(fn_list); free(fn_out); return ret; }
/* * CRAM files don't store the RG:Z:ID per read in the aux field. * Instead they have a numerical data series (RG) to point each read * back to the Nth @RG line in the file. This means that we may need * to edit the RG data series (if the files were produced from * "samtools split" for example). * * The encoding method is stored in the compression header. Typical * examples: * * RG => EXTERNAL {18} # Block content-id 18 holds RG values * # as a series of ITF8 encoded values * * RG => HUFFMAN {1, 255, 255, 255, 255, 255, 1, 0} * # One RG value #-1. (No RG) * * RG => HUFFMAN {1, 0, 1, 0} # One RG value #0 (always first RG) * * RG => HUFFMAN {2, 0, 1, 2, 1, 1} * # Two RG values, #0 and #1, written * # to the CORE block and possibly * # mixed with other data series. * * A single value can (but may not be) implemented as a zero bit * huffman code. In this situation we can change the meta-data in the * compression header to renumber an RG value.. */ int cram_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outcram) { samFile *out; cram_fd *out_c; int i, vers_maj, vers_min; khash_s2i *rg2id = NULL; bam_hdr_t *new_h = NULL; /* Check consistent versioning and compatible headers */ if (!(new_h = cram_cat_check_hdr(nfn, fn, h, &rg2id, &vers_maj, &vers_min))) return -1; /* Open the file with cram_vers */ char vers[100]; sprintf(vers, "%d.%d", vers_maj, vers_min); out = sam_open(outcram, "wc"); if (out == 0) { fprintf(stderr, "[%s] ERROR: fail to open output file '%s'.\n", __func__, outcram); return 1; } out_c = out->fp.cram; cram_set_option(out_c, CRAM_OPT_VERSION, vers); //fprintf(stderr, "Creating cram vers %s\n", vers); cram_fd_set_header(out_c, sam_hdr_parse_(new_h->text, new_h->l_text)); // needed? 
sam_hdr_write(out, new_h); for (i = 0; i < nfn; ++i) { samFile *in; cram_fd *in_c; cram_container *c; bam_hdr_t *old; int new_rg = -1; in = sam_open(fn[i], "rc"); if (in == 0) { fprintf(stderr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]); return -1; } in_c = in->fp.cram; old = sam_hdr_read(in); khash_s2i *rg2id_in = hash_rg(old); // Compute RG mapping if suitable for changing. if (rg2id_in->n_id == 1) { int _; new_rg = hash_s2i_inc(rg2id, rg2id_in->id[0], NULL, &_); } else { new_rg = 0; } hash_s2i_free(rg2id_in); // Copy contains and blocks within them while ((c = cram_read_container(in_c))) { cram_block *blk; if (cram_container_is_empty(in_c)) { if (cram_write_container(out_c, c) != 0) return -1; // Container compression header if (!(blk = cram_read_block(in_c))) return -1; if (cram_write_block(out_c, blk) != 0) { cram_free_block(blk); return -1; } cram_free_block(blk); cram_free_container(c); continue; } // If we have just one RG key and new_rg != 0 then // we need to edit the compression header. IF WE CAN. if (new_rg) { int zero = 0; //fprintf(stderr, "Transcode RG %d to %d\n", 0, new_rg); cram_transcode_rg(in_c, out_c, c, 1, &zero, &new_rg); } else { int32_t num_slices; // Not switching rg so do the usual read/write loop if (cram_write_container(out_c, c) != 0) return -1; // Container compression header if (!(blk = cram_read_block(in_c))) return -1; if (cram_write_block(out_c, blk) != 0) { cram_free_block(blk); return -1; } cram_free_block(blk); // Container num_blocks can be invalid, due to a bug. // Instead we iterate in slice context instead. (void)cram_container_get_landmarks(c, &num_slices); cram_copy_slice(in_c, out_c, num_slices); } cram_free_container(c); } bam_hdr_destroy(old); sam_close(in); } sam_close(out); hash_s2i_free(rg2id); bam_hdr_destroy(new_h); return 0; }
int main(int argc, char **argv) { if (argc < 4) errx(1, "usage\t:%s <bam> <split out> <discord out> (optional #threads)", argv[0]); char *bam_file_name = argv[1]; char *split_file_name = argv[2]; char *disc_file_name = argv[3]; int threads = 2; if (argc == 5) { threads = atoi(argv[4]); } samFile *disc = sam_open(disc_file_name, "wb"); samFile *split = sam_open(split_file_name, "wb"); samFile *in = sam_open(bam_file_name, "rb"); if(in == NULL) errx(1, "Unable to open BAM/SAM file."); // TODO: handle cram. if (threads > 1) { bgzf_mt(in->fp.bgzf, threads, 256); } hts_idx_t *idx = sam_index_load(in, bam_file_name); if(idx == NULL) errx(1,"Unable to open BAM/SAM index."); bam_hdr_t *hdr = sam_hdr_read(in); int r = sam_hdr_write(disc, hdr); r = sam_hdr_write(split, hdr); bam1_t *aln = bam_init1(); int ret; while(ret = sam_read1(in, hdr, aln) >= 0) { if (((aln->core.flag) & 1294) == 0) r = sam_write1(disc, hdr, aln); uint8_t *sa = bam_aux_get(aln, "SA"); if (sa != 0) { char *sa_tag = strdup(bam_aux2Z(sa)); if ( count_tags(sa_tag) == 1) { char *chrm, strand, *cigar; uint32_t pos; split_sa_tag(sa_tag, &chrm, &pos, &strand, &cigar); struct line sa, al; calcOffsets(cigar, pos, strand, &sa); sa.chrm = chrm; sa.strand = strand; calcAlnOffsets(bam_get_cigar(aln), aln->core.n_cigar, aln->core.pos, bam_is_rev(aln) ? '-' : '+', &al); al.chrm = hdr->target_name[aln->core.tid]; al.strand = bam_is_rev(aln) ? 
'-' : '+'; struct line *left = &al, *right = &sa; if (left->SQO > right->SQO) { left = &sa; right = &al; } int overlap = MAX(1 + MIN(left->EQO, right->EQO) - MAX(left->SQO, right->SQO), 0); int alen1 = 1 + left->EQO - left->SQO; int alen2 = 1 + right->EQO - right->SQO; int mno = MIN(alen1-overlap, alen2-overlap); if (mno < MIN_NON_OVERLAP) continue; if ( (strcmp(left->chrm, right->chrm) == 0) && (left->strand == right->strand) ) { int leftDiag, rightDiag, insSize; if (left->strand == '-') { leftDiag = left->rapos - left->sclip; rightDiag = (right->rapos + right->raLen) - (right->sclip + right->qaLen); insSize = rightDiag - leftDiag; } else { leftDiag = (left->rapos + left->raLen) - (left->sclip + left->qaLen); rightDiag = right->rapos - right->sclip; insSize = leftDiag - rightDiag; } int desert = right->SQO - left->EQO - 1; if ((abs(insSize) < MIN_INDEL_SIZE) || ((desert > 0) && ( (desert - (int)MAX(0, insSize)) > MAX_UNMAPPED_BASES))) continue; } char *qname = bam_get_qname(aln); if ((aln->core.flag & 64) == 64) qname[0] = 'A'; else qname[0] = 'B'; r = sam_write1(split, hdr, aln); } free(sa_tag); } } bam_destroy1(aln); hts_idx_destroy(idx); bam_hdr_destroy(hdr); sam_close(in); sam_close(disc); sam_close(split); if(ret < -1) { errx(1, "lumpy_filter: error reading bam: %s\n", bam_file_name); } }
// currently, this function ONLY works if each read has one hit static void bam_mating_core(samFile* in, samFile* out, int remove_reads, int proper_pair_check, int add_ct) { bam_hdr_t *header; bam1_t *b[2]; int curr, has_prev, pre_end = 0, cur_end = 0; kstring_t str; str.l = str.m = 0; str.s = 0; header = sam_hdr_read(in); if (header == NULL) { fprintf(stderr, "[bam_mating_core] ERROR: Couldn't read header\n"); exit(1); } // Accept unknown, unsorted, or queryname sort order, but error on coordinate sorted. if ((header->l_text > 3) && (strncmp(header->text, "@HD", 3) == 0)) { char *p, *q; p = strstr(header->text, "\tSO:coordinate"); q = strchr(header->text, '\n'); // Looking for SO:coordinate within the @HD line only // (e.g. must ignore in a @CO comment line later in header) if ((p != 0) && (p < q)) { fprintf(stderr, "[bam_mating_core] ERROR: Coordinate sorted, require grouped/sorted by queryname.\n"); exit(1); } } sam_hdr_write(out, header); b[0] = bam_init1(); b[1] = bam_init1(); curr = 0; has_prev = 0; while (sam_read1(in, header, b[curr]) >= 0) { bam1_t *cur = b[curr], *pre = b[1-curr]; if (cur->core.flag & BAM_FSECONDARY) { if ( !remove_reads ) sam_write1(out, header, cur); continue; // skip secondary alignments } if (cur->core.flag & BAM_FSUPPLEMENTARY) { sam_write1(out, header, cur); continue; // pass supplementary alignments through unchanged (TODO:make them match read they came from) } if (cur->core.tid < 0 || cur->core.pos < 0) // If unmapped set the flag { cur->core.flag |= BAM_FUNMAP; } if ((cur->core.flag&BAM_FUNMAP) == 0) // If mapped calculate end { cur_end = bam_endpos(cur); // Check cur_end isn't past the end of the contig we're on, if it is set the UNMAP'd flag if (cur_end > (int)header->target_len[cur->core.tid]) cur->core.flag |= BAM_FUNMAP; } if (has_prev) { // do we have a pair of reads to examine? 
if (strcmp(bam_get_qname(cur), bam_get_qname(pre)) == 0) { // identical pair name pre->core.flag |= BAM_FPAIRED; cur->core.flag |= BAM_FPAIRED; sync_mate(pre, cur); if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP)) && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) // if safe set TLEN/ISIZE { uint32_t cur5, pre5; cur5 = (cur->core.flag&BAM_FREVERSE)? cur_end : cur->core.pos; pre5 = (pre->core.flag&BAM_FREVERSE)? pre_end : pre->core.pos; cur->core.isize = pre5 - cur5; pre->core.isize = cur5 - pre5; } else cur->core.isize = pre->core.isize = 0; if (add_ct) bam_template_cigar(pre, cur, &str); // TODO: Add code to properly check if read is in a proper pair based on ISIZE distribution if (proper_pair_check && !plausibly_properly_paired(pre,cur)) { pre->core.flag &= ~BAM_FPROPER_PAIR; cur->core.flag &= ~BAM_FPROPER_PAIR; } // Write out result if ( !remove_reads ) { sam_write1(out, header, pre); sam_write1(out, header, cur); } else { // If we have to remove reads make sure we do it in a way that doesn't create orphans with bad flags if(pre->core.flag&BAM_FUNMAP) cur->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR); if(cur->core.flag&BAM_FUNMAP) pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR); if(!(pre->core.flag&BAM_FUNMAP)) sam_write1(out, header, pre); if(!(cur->core.flag&BAM_FUNMAP)) sam_write1(out, header, cur); } has_prev = 0; } else { // unpaired? 
clear bad info and write it out if (pre->core.tid < 0 || pre->core.pos < 0 || pre->core.flag&BAM_FUNMAP) { // If unmapped pre->core.flag |= BAM_FUNMAP; pre->core.tid = -1; pre->core.pos = -1; } pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0; pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR); if ( !remove_reads || !(pre->core.flag&BAM_FUNMAP) ) sam_write1(out, header, pre); } } else has_prev = 1; curr = 1 - curr; pre_end = cur_end; } if (has_prev && !remove_reads) { // If we still have a BAM in the buffer it must be unpaired bam1_t *pre = b[1-curr]; if (pre->core.tid < 0 || pre->core.pos < 0 || pre->core.flag&BAM_FUNMAP) { // If unmapped pre->core.flag |= BAM_FUNMAP; pre->core.tid = -1; pre->core.pos = -1; } pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0; pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR); sam_write1(out, header, pre); } bam_hdr_destroy(header); bam_destroy1(b[0]); bam_destroy1(b[1]); free(str.s); }
int do_grep() { #ifdef DEBUGa printf("[!]do_grep\n"); #endif BamInfo_t *pbam; kh_cstr_t BamID; khiter_t ki, bami; kstring_t ks1 = { 0, 0, NULL }; kstring_t ks2 = { 0, 0, NULL }; kstring_t ks3 = { 0, 0, NULL }; kstring_t kstr = { 0, 0, NULL }; //ksprintf(kstr, "%s/%s_grep/", myConfig.WorkDir, myConfig.ProjectID); //const char *filePrefix = strdup(ks_str(kstr)); //kputs(myConfig.WorkDir,kstr); samFile *in; bam_hdr_t *h; hts_idx_t *idx; bam1_t *b, *d, *d2, *bR1, *bR2, *bR3; bR1 = bam_init1(); bR2 = bam_init1(); bR3 = bam_init1(); htsFile *out; //hts_opt *in_opts = NULL, *out_opts = NULL; int r = 0, exit_code = 0; kvec_t(bam1_t) R1, R2, RV; pierCluster_t *pierCluster; //samdat_t tmp_samdat; #ifdef DEBUGa kstr.l = 0; ksprintf(&kstr, "%s/%s_grep/Greped.dump", myConfig.WorkDir, myConfig.ProjectID); FILE *fsdump = fopen(ks_str(&kstr),"w"); #endif kstr.l = 0; ksprintf(&kstr, "%s/%s_grep/Greped.ini", myConfig.WorkDir, myConfig.ProjectID); FILE *fs = fopen(ks_str(&kstr),"w"); uint32_t blockid = 0; for (bami = kh_begin(bamNFOp); bami != kh_end(bamNFOp); ++bami) { //printf(">[%d]:\n",bami); if (kh_exist(bamNFOp, bami)) { kv_init(R1); kv_init(R2); kv_init(RV); //tmp_samdat = (const samdat_t){ 0 }; //memset(&tmp_samdat,0,sizeof(samdat_t)); //printf("-[%d]:\n",bami); BamID = kh_key(bamNFOp, bami); pbam = &kh_value(bamNFOp, bami); fprintf(stderr, "%u [%s]=%s\t%u %u\n",bami,BamID,pbam->fileName,pbam->insertSize,pbam->SD); in = sam_open(pbam->fileName, "r"); if (in == NULL) { fprintf(stderr, "[x]Error opening \"%s\"\n", pbam->fileName); return EXIT_FAILURE; } h = sam_hdr_read(in); kstr.l = 0; ksprintf(&kstr, "%s/%s_grep/%s.bam", myConfig.WorkDir, myConfig.ProjectID, BamID); out = hts_open(ks_str(&kstr), "wb"); if (out == NULL) { fprintf(stderr, "[x]Error opening [%s]\n",ks_str(&kstr)); return EXIT_FAILURE; } if (sam_hdr_write(out, h) < 0) { fprintf(stderr, "[!]Error writing output header.\n"); exit_code = 1; } int8_t *ChrIsHum; if (h == NULL) { fprintf(stderr, "[x]Couldn't read 
header for \"%s\"\n", pbam->fileName); return EXIT_FAILURE; } else { ChrIsHum = (int8_t *) malloc(h->n_targets * sizeof(int8_t)); for (int32_t i=0; i < h->n_targets; ++i) { //ChrIsHum[i] = -1; ki = kh_get(chrNFO, chrNFOp, h->target_name[i]); if (ki == kh_end(chrNFOp)) { errx(4,"[x]Cannot find ChrID for [%s] !",h->target_name[i]); } else { ChrInfo_t * tmp = &kh_value(chrNFOp, ki); ChrIsHum[i] = tmp->isHum; //printf(">>> %d Chr:%s %d\n",i,h->target_name[i],ChrIsHum[i]); } } } h->ignore_sam_err = 0; b = bam_init1(); d = bam_init1(); d2 = bam_init1(); if ((idx = sam_index_load(in, pbam->fileName)) == 0) { fprintf(stderr, "[E::%s] fail to load the BAM index\n", __func__); return 1; } pierCluster = sam_plp_init(); while ((r = sam_read1(in, h, b)) >= 0) { int8_t flag = false; const bam1_core_t *c = &b->core; if (c->qual < myConfig.minBamQual) { continue; } if (c->n_cigar) { uint32_t *cigar = bam_get_cigar(b); int i = c->n_cigar; --i; if ( (bam_cigar_opchr(cigar[0])=='S' && bam_cigar_oplen(cigar[0]) >= myConfig.minGrepSlen) || (bam_cigar_opchr(cigar[i])=='S' && bam_cigar_oplen(cigar[i]) >= myConfig.minGrepSlen) ) { flag = true; } /* We only need /\d+S/ on both terminal, NOT inside. for (int i = 0; i < c->n_cigar; ++i) { if (bam_cigar_opchr(cigar[i])=='S') { // soft clipping if ( bam_cigar_oplen(cigar[i]) >= myConfig.minGrepSlen ) { flag = true; } } } */ } if (flag && ChrIsHum[c->tid]) { // Now, skip Virus items. //bam_copy1(bR1, b); flag = 0; // recycle //int enoughMapQ = 0; //kstring_t ks = { 0, 0, NULL }; /*if (sam_format1(h, b, &ks1) < 0) { fprintf(stderr, "Error writing output.\n"); exit_code = 1; break; } else */if ((c->mtid == c->tid && ChrIsHum[c->tid]) || (ChrIsHum[c->tid] ^ ChrIsHum[c->mtid])) { // Only grep those mapped on same Human ChrID, or diff species/一方在病毒的情况. 
//printf(">[%s]\n",ks_str(&ks1)); flag |= 1; //tmp_samdat.b = bam_dup1(b); //kv_push(samdat_t,R1,tmp_samdat); /*if (checkMapQ(ChrIsHum, b, true)) { ++enoughMapQ; }*/ } if (getPairedSam(in, idx, b, d) != 0) { flag &= ~1; continue; } else { flag |= 2; /*if (checkMapQ(ChrIsHum, d, false)) { ++enoughMapQ; }*/ if (c->flag & BAM_FSECONDARY) { if (getPairedSam(in, idx, d, d2) == 0) { //sam_format1(h, d2, &ks3); flag |= 4; /*if (checkMapQ(ChrIsHum, d2, false)) { ++enoughMapQ; }*/ } } } /* 对于 BAM_FSECONDARY(256) 的 Read,跳两次 与 读 SA 项,效果一样。 >[sf95_Ref_48245009_48245108_48245208_Vir_-_2000_2044_R_100_90 353 chr2 13996555 0 50S40M chr18 48245109 0ACACAACAATGTTCCGGAGACTCTAAGGCCTCCCGATACAGAGCAGAGGCCACACACACACACACCATGGAATACTATTCAGCCAAAAAA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC NM:i:0 MD:Z:40 AS:i:40 XS:i:40 RG:Z:Fsimout_mB SA:Z:rgi|59585|emb|X04615.1|,2000,-,40S46M4S,60,0; YC:Z:CT YD:Z:f] -[sf95_Ref_48245009_48245108_48245208_Vir_-_2000_2044_R_100_90 177 chr18 48245109 9 40S50M gi|59585|emb|X04615.1|2000 0 GTTCCGGAGACTCTAAGGCCTCCCGATACAGAGCAGAGGCCACACACACACACACCATGGAATACTATTCAGCCAAAAAAAGGAATTCAA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC NM:i:0 MD:Z:50 AS:i:50 XS:i:46 RG:Z:Fsimout_mB SA:Z:rgi|59585|emb|X04615.1|,2000,+,50S40M,9,0; YC:Z:GA YD:Z:f] +[sf95_Ref_48245009_48245108_48245208_Vir_-_2000_2044_R_100_90 113 gi|59585|emb|X04615.1| 2000 60 40S46M4S chr18 48245109 0 TTTTTTGGCTGAATAGTATTCCATGGTGTGTGTGTGTGTGGCCTCTGCTCTGTATCGGGAGGCCTTAGAGTCTCCGGAACATTGTTGTGT CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC NM:i:0 MD:Z:46 AS:i:46 XS:i:27 RG:Z:Fsimout_mB SA:Z:fchr2,13996555,+,50S40M,0,0; YC:Z:CT YD:Z:r] */ /*if (sam_format1(h, d, &ks2) < 0) { fprintf(stderr, "Error writing output.\n"); exit_code = 1; break; }*/ if (((flag & 3) == 3) /*&& enoughMapQ >= myConfig.samples*/) { /*printf(">%d[%s]\n",checkMapQ(ChrIsHum, b, true),ks_str(&ks1)); 
printf("-%d[%s]\n",checkMapQ(ChrIsHum, d, false),ks_str(&ks2)); if (flag & 4) { printf("+%d[%s]\n",checkMapQ(ChrIsHum, d2, false),ks_str(&ks3)); } printf("<--%d\n",enoughMapQ);*/ bam_aux_append(b, "Zd", 'Z', 2, (uint8_t*)"H"); if (sam_plp_push(ChrIsHum, pierCluster, b) == 0) { //printf("--HumRange=%s:%d-%d\n", h->target_name[(pierCluster->HumanRange).tid], (pierCluster->HumanRange).pos, (pierCluster->HumanRange).endpos); if ((!ChrIsHum[(d->core).tid]) && (flag & 2)) { bam_aux_append(d, "Zd", 'Z', 2, (uint8_t*)"V"); sam_plp_push(ChrIsHum, pierCluster, d); } if ((!ChrIsHum[(d2->core).tid]) && (flag & 4)) { bam_aux_append(d2, "Zd", 'Z', 2, (uint8_t*)"V"); sam_plp_push(ChrIsHum, pierCluster, d2); } } else { ++blockid; //print #ifdef DEBUGa fprintf(fsdump,"[%u %s]\nHumRange=%s:%d-%d\n", blockid, BamID, h->target_name[(pierCluster->HumanRange).tid], (pierCluster->HumanRange).pos, (pierCluster->HumanRange).endpos); fprintf(fsdump,"VirRange=%s:%d-%d\n", h->target_name[(pierCluster->VirusRange).tid], (pierCluster->VirusRange).pos, (pierCluster->VirusRange).endpos); #endif fprintf(fs,"[%u]\nBamID=%s\nHumRange=%s:%d-%d\n",blockid, BamID, h->target_name[(pierCluster->HumanRange).tid], (pierCluster->HumanRange).pos, (pierCluster->HumanRange).endpos); if ( (pierCluster->VirusRange).pos == 0 && (pierCluster->VirusRange).endpos == 0 ) { fprintf(fs,"VirRange=NA\n"); } else { fprintf(fs,"VirRange=%s:%d-%d\n", h->target_name[(pierCluster->VirusRange).tid], (pierCluster->VirusRange).pos, (pierCluster->VirusRange).endpos); } for (size_t i=0; i<kv_size(pierCluster->Reads);++i) { bam1_t *bi = kv_A(pierCluster->Reads, i); bam_aux_append(bi, "Zc", 'i', sizeof(uint32_t), (uint8_t*)&blockid); #ifdef DEBUGa if (sam_format1(h, bi, &ks1) < 0) { fprintf(stderr, "Error writing output.\n"); exit_code = 1; break; } else { fprintf(fsdump,"%s\n",ks1.s); } #endif if (sam_write1(out, h, bi) < 0) { fprintf(stderr, "[x]Error writing output.\n"); exit_code = 1; break; } } #ifdef DEBUGa 
fprintf(fsdump,"\n"); printf("HumRange=%s:%d-%d\n", h->target_name[(pierCluster->HumanRange).tid], (pierCluster->HumanRange).pos, (pierCluster->HumanRange).endpos); //fflush(fs); #endif sam_plp_dectroy(pierCluster); pierCluster = sam_plp_init(); } } } //char *qname = bam_get_qname(b); } r = sam_close(out); // stdout can only be closed once if (r < 0) { fprintf(stderr, "Error closing output.\n"); exit_code = 1; } hts_idx_destroy(idx); bam_destroy1(b); bam_destroy1(d); bam_destroy1(d2); bam_hdr_destroy(h); r = sam_close(in); free(ChrIsHum); #ifdef DEBUGa //fflush(NULL); //pressAnyKey(); #endif sam_plp_dectroy(pierCluster); //printf("<[%d]:\n",bami); } } #ifdef DEBUGa fclose(fsdump); #endif fclose(fs); getPairedSam(NULL, NULL, NULL, NULL); // sam_close(fp2); //printf("---[%d]---\n",exit_code); bam_destroy1(bR1); bam_destroy1(bR2); bam_destroy1(bR3); ks_release(&ks1); ks_release(&ks2); ks_release(&ks3); ks_release(&kstr); //free((char*)filePrefix); return exit_code; }
int main_samview(int argc, char *argv[]) { samFile *in; char *fn_ref = 0; int flag = 0, c, clevel = -1, ignore_sam_err = 0; char moder[8]; bam_hdr_t *h; bam1_t *b; while ((c = getopt(argc, argv, "IbSl:t:")) >= 0) { switch (c) { case 'S': flag |= 1; break; case 'b': flag |= 2; break; case 'l': clevel = atoi(optarg); flag |= 2; break; case 't': fn_ref = optarg; break; case 'I': ignore_sam_err = 1; break; } } if (argc == optind) { fprintf(stderr, "Usage: samview [-bSI] [-l level] <in.bam>|<in.sam> [region]\n"); return 1; } strcpy(moder, "r"); if ((flag&1) == 0) strcat(moder, "b"); in = sam_open(argv[optind], moder, fn_ref); h = sam_hdr_read(in); h->ignore_sam_err = ignore_sam_err; b = bam_init1(); if ((flag&4) == 0) { // SAM/BAM output htsFile *out; char modew[8]; strcpy(modew, "w"); if (clevel >= 0 && clevel <= 9) sprintf(modew + 1, "%d", clevel); if (flag&2) strcat(modew, "b"); out = hts_open("-", modew, 0); sam_hdr_write(out, h); if (optind + 1 < argc && !(flag&1)) { // BAM input and has a region int i; hts_idx_t *idx; if ((idx = bam_index_load(argv[optind])) == 0) { fprintf(stderr, "[E::%s] fail to load the BAM index\n", __func__); return 1; } for (i = optind + 1; i < argc; ++i) { hts_itr_t *iter; if ((iter = bam_itr_querys(idx, h, argv[i])) == 0) { fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, argv[i]); continue; } while (bam_itr_next((BGZF*)in->fp, iter, b) >= 0) sam_write1(out, h, b); hts_itr_destroy(iter); } hts_idx_destroy(idx); } else while (sam_read1(in, h, b) >= 0) sam_write1(out, h, b); sam_close(out); } bam_destroy1(b); bam_hdr_destroy(h); sam_close(in); return 0; }
int main(int argc, char **argv) { htsFile *in = NULL; htsFile *out = NULL; char *in_name = "-"; char *out_name = "-"; char *ref_name = NULL; char *ref_seq = NULL; char modew[8] = "w"; faidx_t *fai = NULL; bam_hdr_t *hdr = NULL; bam1_t *rec = NULL; int c, res, last_ref = -1, ref_len = 0; int adjust = 0, extended = 0, recalc = 0, flags = 0; while ((c = getopt(argc, argv, "aef:hi:o:r")) >= 0) { switch (c) { case 'a': adjust = 1; break; case 'e': extended = 1; break; case 'f': ref_name = optarg; break; case 'h': usage(argv[0]); return EXIT_SUCCESS; case 'i': in_name = optarg; break; case 'o': out_name = optarg; break; case 'r': recalc = 1; break; default: usage(argv[0]); return EXIT_FAILURE; } } if (!ref_name) { usage(argv[0]); return EXIT_FAILURE; } flags = (adjust ? 1 : 0) | (extended ? 2 : 0) | (recalc ? 4 : 0); fai = fai_load(ref_name); if (!fai) { fprintf(stderr, "Couldn't load reference %s\n", ref_name); goto fail; } rec = bam_init1(); if (!rec) { perror(NULL); goto fail; } in = hts_open(in_name, "r"); if (!in) { fprintf(stderr, "Couldn't open %s : %s\n", in_name, strerror(errno)); goto fail; } hdr = sam_hdr_read(in); if (!hdr) { fprintf(stderr, "Couldn't read header for %s\n", in_name); goto fail; } out = hts_open(out_name, modew); if (!out) { fprintf(stderr, "Couldn't open %s : %s\n", out_name, strerror(errno)); goto fail; } if (sam_hdr_write(out, hdr) < 0) { fprintf(stderr, "Couldn't write header to %s : %s\n", out_name, strerror(errno)); goto fail; } while ((res = sam_read1(in, hdr, rec)) >= 0) { if (rec->core.tid >= hdr->n_targets) { fprintf(stderr, "Invalid BAM reference id %d\n", rec->core.tid); goto fail; } if (last_ref != rec->core.tid && rec->core.tid >= 0) { free(ref_seq); ref_seq = faidx_fetch_seq(fai, hdr->target_name[rec->core.tid], 0, INT_MAX, &ref_len); if (!ref_seq) { fprintf(stderr, "Couldn't get reference %s\n", hdr->target_name[rec->core.tid]); goto fail; } last_ref = rec->core.tid; } if (rec->core.tid >= 0) { res = sam_prob_realn(rec, 
ref_seq, ref_len, flags); if (res <= -4) { fprintf(stderr, "Error running sam_prob_realn : %s\n", strerror(errno)); goto fail; } } if (sam_write1(out, hdr, rec) < 0) { fprintf(stderr, "Error writing to %s\n", out_name); goto fail; } } res = hts_close(in); in = NULL; if (res < 0) { fprintf(stderr, "Error closing %s\n", in_name); goto fail; } res = hts_close(out); out = NULL; if (res < 0) { fprintf(stderr, "Error closing %s\n", out_name); goto fail; } bam_hdr_destroy(hdr); bam_destroy1(rec); free(ref_seq); fai_destroy(fai); return EXIT_SUCCESS; fail: if (hdr) bam_hdr_destroy(hdr); if (rec) bam_destroy1(rec); if (in) hts_close(in); if (out) hts_close(out); free(ref_seq); fai_destroy(fai); return EXIT_FAILURE; }
/*! @abstract Merge multiple sorted BAM. @param is_by_qname whether to sort by query name @param out output BAM file name @param mode sam_open() mode to be used to create the final output file (overrides level settings from UNCOMP and LEVEL1 flags) @param headers name of SAM file from which to copy '@' header lines, or NULL to copy them from the first file to be merged @param n number of files to be merged @param fn names of files to be merged @param flag flags that control how the merge is undertaken @param reg region to merge @param n_threads number of threads to use (passed to htslib) @discussion Padding information may NOT correctly maintained. This function is NOT thread safe. */ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char *headers, int n, char * const *fn, int flag, const char *reg, int n_threads) { samFile *fpout, **fp; heap1_t *heap; bam_hdr_t *hout = NULL; int i, j, *RG_len = NULL; uint64_t idx = 0; char **RG = NULL; hts_itr_t **iter = NULL; bam_hdr_t **hdr = NULL; trans_tbl_t *translation_tbl = NULL; // Is there a specified pre-prepared header to use for output? 
if (headers) { samFile* fpheaders = sam_open(headers, "r"); if (fpheaders == NULL) { const char *message = strerror(errno); fprintf(pysamerr, "[bam_merge_core] cannot open '%s': %s\n", headers, message); return -1; } hout = sam_hdr_read(fpheaders); sam_close(fpheaders); } g_is_by_qname = by_qname; fp = (samFile**)calloc(n, sizeof(samFile*)); heap = (heap1_t*)calloc(n, sizeof(heap1_t)); iter = (hts_itr_t**)calloc(n, sizeof(hts_itr_t*)); hdr = (bam_hdr_t**)calloc(n, sizeof(bam_hdr_t*)); translation_tbl = (trans_tbl_t*)calloc(n, sizeof(trans_tbl_t)); // prepare RG tag from file names if (flag & MERGE_RG) { RG = (char**)calloc(n, sizeof(char*)); RG_len = (int*)calloc(n, sizeof(int)); for (i = 0; i != n; ++i) { int l = strlen(fn[i]); const char *s = fn[i]; if (l > 4 && strcmp(s + l - 4, ".bam") == 0) l -= 4; for (j = l - 1; j >= 0; --j) if (s[j] == '/') break; ++j; l -= j; RG[i] = (char*)calloc(l + 1, 1); RG_len[i] = l; strncpy(RG[i], s + j, l); } } // open and read the header from each file for (i = 0; i < n; ++i) { bam_hdr_t *hin; fp[i] = sam_open(fn[i], "r"); if (fp[i] == NULL) { int j; fprintf(pysamerr, "[bam_merge_core] fail to open file %s\n", fn[i]); for (j = 0; j < i; ++j) sam_close(fp[j]); free(fp); free(heap); // FIXME: possible memory leak return -1; } hin = sam_hdr_read(fp[i]); if (hout) trans_tbl_init(hout, hin, translation_tbl+i, flag & MERGE_COMBINE_RG, flag & MERGE_COMBINE_PG); else { // As yet, no headers to merge into... 
hout = bam_hdr_dup(hin); // ...so no need to translate header into itself trans_tbl_init(hout, hin, translation_tbl+i, true, true); } // TODO sam_itr_next() doesn't yet work for SAM files, // so for those keep the headers around for use with sam_read1() if (hts_get_format(fp[i])->format == sam) hdr[i] = hin; else { bam_hdr_destroy(hin); hdr[i] = NULL; } if ((translation_tbl+i)->lost_coord_sort && !by_qname) { fprintf(pysamerr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]); } } // Transform the header into standard form pretty_header(&hout->text,hout->l_text); // If we're only merging a specified region move our iters to start at that point if (reg) { int* rtrans = rtrans_build(n, hout->n_targets, translation_tbl); int tid, beg, end; const char *name_lim = hts_parse_reg(reg, &beg, &end); char *name = malloc(name_lim - reg + 1); memcpy(name, reg, name_lim - reg); name[name_lim - reg] = '\0'; tid = bam_name2id(hout, name); free(name); if (tid < 0) { fprintf(pysamerr, "[%s] Malformated region string or undefined reference name\n", __func__); return -1; } for (i = 0; i < n; ++i) { hts_idx_t *idx = sam_index_load(fp[i], fn[i]); // (rtrans[i*n+tid]) Look up what hout tid translates to in input tid space int mapped_tid = rtrans[i*hout->n_targets+tid]; if (mapped_tid != INT32_MIN) { iter[i] = sam_itr_queryi(idx, mapped_tid, beg, end); } else { iter[i] = sam_itr_queryi(idx, HTS_IDX_NONE, 0, 0); } hts_idx_destroy(idx); if (iter[i] == NULL) break; } free(rtrans); } else { for (i = 0; i < n; ++i) { if (hdr[i] == NULL) { iter[i] = sam_itr_queryi(NULL, HTS_IDX_REST, 0, 0); if (iter[i] == NULL) break; } else iter[i] = NULL; } } if (i < n) { fprintf(pysamerr, "[%s] Memory allocation failed\n", __func__); return -1; } // Load the first read from each file into the heap for (i = 0; i < n; ++i) { heap1_t *h = heap + i; h->i = i; h->b = bam_init1(); if ((iter[i]? 
sam_itr_next(fp[i], iter[i], h->b) : sam_read1(fp[i], hdr[i], h->b)) >= 0) { bam_translate(h->b, translation_tbl + i); h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)((int32_t)h->b->core.pos+1)<<1 | bam_is_rev(h->b); h->idx = idx++; } else { h->pos = HEAP_EMPTY; bam_destroy1(h->b); h->b = NULL; } } // Open output file and write header if ((fpout = sam_open(out, mode)) == 0) { fprintf(pysamerr, "[%s] fail to create the output file.\n", __func__); return -1; } sam_hdr_write(fpout, hout); if (!(flag & MERGE_UNCOMP)) hts_set_threads(fpout, n_threads); // Begin the actual merge ks_heapmake(heap, n, heap); while (heap->pos != HEAP_EMPTY) { bam1_t *b = heap->b; if (flag & MERGE_RG) { uint8_t *rg = bam_aux_get(b, "RG"); if (rg) bam_aux_del(b, rg); bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]); } sam_write1(fpout, hout, b); if ((j = (iter[heap->i]? sam_itr_next(fp[heap->i], iter[heap->i], b) : sam_read1(fp[heap->i], hdr[heap->i], b))) >= 0) { bam_translate(b, translation_tbl + heap->i); heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam_is_rev(b); heap->idx = idx++; } else if (j == -1) { heap->pos = HEAP_EMPTY; bam_destroy1(heap->b); heap->b = NULL; } else fprintf(pysamerr, "[bam_merge_core] '%s' is truncated. Continue anyway.\n", fn[heap->i]); ks_heapadjust(heap, 0, n, heap); } // Clean up and close if (flag & MERGE_RG) { for (i = 0; i != n; ++i) free(RG[i]); free(RG); free(RG_len); } for (i = 0; i < n; ++i) { trans_tbl_destroy(translation_tbl + i); hts_itr_destroy(iter[i]); bam_hdr_destroy(hdr[i]); sam_close(fp[i]); } bam_hdr_destroy(hout); sam_close(fpout); free(translation_tbl); free(fp); free(heap); free(iter); free(hdr); return 0; }
/****************************************************************************** * * The main worker node function. * * int thread_id: the thread_id * char *fastq1: FIFO from which bowtie2 can get read1 * char *fastq2: FIFO from which bowtie2 can get read2 (if it exists) * *******************************************************************************/ void herd_worker_node(int thread_id, char *fastq1, char *fastq2) { int cmd_length = 1, max_qname = 0, status, strand; char *cmd, *last_qname = calloc(1, sizeof(char)); MPI_Header *packed_header; MPI_read *packed_read = calloc(1, sizeof(MPI_read)); bam_hdr_t *header; bam1_t *read1 = bam_init1(); bam1_t *read2 = bam_init1(); samFile *fp; #ifdef DEBUG MPI_Status stat; int current_p_size = 100; htsFile *of; bam_hdr_t *debug_header = bam_hdr_init(); bam1_t *debug_read = bam_init1(); global_header = bam_hdr_init(); void *p = calloc(100,1); char *oname = NULL; #else int i = 0; #endif time_t t0, t1; int swapped = 0; assert(last_qname); assert(packed_read); //Which strand should we be aligning to? if(config.directional) { strand = (thread_id-1) % 2; } else { strand = (thread_id-1) % 4; } packed_read->size = 0; packed_read->packed = NULL; //construct the bowtie2 command cmd_length += (int) strlen("bowtie2 -q --reorder") + 1; cmd_length += (int) strlen(config.bowtie2_options) + 1; cmd_length += (int) strlen("--norc -x") + 1; cmd_length += (int) strlen(config.genome_dir) + strlen("bisulfite_genome/CT_conversion/BS_CT") + 1; cmd_length += (int) 2*(strlen("-1 ") + strlen(fastq1)) + 3; if(config.paired) cmd_length += (int) strlen(fastq2); //This is likely unneeded. 
#ifdef DEBUG oname = malloc(sizeof(char) *(1+strlen(config.odir)+strlen(config.basename)+strlen("_X.bam"))); assert(oname); sprintf(oname, "%s%s_%i.bam", config.odir, config.basename, thread_id); if(!config.quiet) fprintf(stderr, "Writing output to %s\n", oname); of = sam_open(oname, "wb"); free(oname); #endif cmd = (char *) malloc(sizeof(char) * cmd_length); assert(cmd); if(strand == 0) { //OT Read#1 C->T, Read#2 G->A, Genome C->T only the + strand if(config.paired) { sprintf(cmd, "bowtie2 -q --reorder %s --norc -x %sbisulfite_genome/CT_conversion/BS_CT -1 %s -2 %s", config.bowtie2_options, config.genome_dir, fastq1, fastq2); } else { sprintf(cmd, "bowtie2 -q --reorder %s --norc -x %sbisulfite_genome/CT_conversion/BS_CT -U %s", config.bowtie2_options, config.genome_dir, fastq1); } } else if(strand == 1) { //OB Read#1 C->T, Read#2 G->A, Genome G->A only the - strand if(config.paired) { sprintf(cmd, "bowtie2 -q --reorder %s --nofw -x %sbisulfite_genome/GA_conversion/BS_GA -1 %s -2 %s", config.bowtie2_options, config.genome_dir, fastq1, fastq2); } else { sprintf(cmd, "bowtie2 -q --reorder %s --nofw -x %sbisulfite_genome/GA_conversion/BS_GA -U %s", config.bowtie2_options, config.genome_dir, fastq1); } } else if(strand == 2) { //CTOT Read#1 G->A, Read#2 C->T, Genome C->T, only the - strand if(config.paired) { sprintf(cmd, "bowtie2 -q --reorder %s --nofw -x %sbisulfite_genome/CT_conversion/BS_CT -1 %s -2 %s", config.bowtie2_options, config.genome_dir, fastq1, fastq2); } else { sprintf(cmd, "bowtie2 -q --reorder %s --nofw -x %sbisulfite_genome/CT_conversion/BS_CT -U %s", config.bowtie2_options, config.genome_dir, fastq1); } } else if(strand == 3) { //CTOB Read#1 G->A, Read#2 C->T, Genome G->A, only the + strand if(config.paired) { sprintf(cmd, "bowtie2 -q --reorder %s --norc -x %sbisulfite_genome/GA_conversion/BS_GA -1 %s -2 %s", config.bowtie2_options, config.genome_dir, fastq1, fastq2); } else { sprintf(cmd, "bowtie2 -q --reorder %s --norc -x 
%sbisulfite_genome/GA_conversion/BS_GA -U %s", config.bowtie2_options, config.genome_dir, fastq1); } } else { fprintf(stderr, "Oh shit, got strand %i!\n", strand); return; } //Start the process if(!config.quiet) fprintf(stderr, "Node %i executing: %s\n", thread_id, cmd); fflush(stderr); fp = sam_popen(cmd); header = sam_hdr_read(fp); #ifdef DEBUG sam_hdr_write(of, header); #endif #ifndef DEBUG packed_header = pack_header(header); if(thread_id == 1) { //Send the header MPI_Send((void *) &(packed_header->size), 1, MPI_INT, 0, 1, MPI_COMM_WORLD); status = MPI_Send((void *) packed_header->packed, packed_header->size, MPI_BYTE, 0, 2, MPI_COMM_WORLD); if(status != MPI_SUCCESS) { fprintf(stderr, "MPI_Send returned %i\n", status); fflush(stderr); } } #else packed_header = pack_header(header); void *tmp_pointer = malloc(packed_header->size); assert(tmp_pointer); MPI_Request request; MPI_Isend((void *) packed_header->packed, packed_header->size, MPI_BYTE, 0, 2, MPI_COMM_WORLD, &request); status = MPI_Recv(tmp_pointer, packed_header->size, MPI_BYTE, 0, 2, MPI_COMM_WORLD, &stat); if(status != MPI_SUCCESS) fprintf(stderr, "We seem to have not been able to send the message to ourselves!\n"); MPI_Wait(&request, &stat); unpack_header(debug_header, tmp_pointer); global_header = debug_header; free(tmp_pointer); #endif t0 = time(NULL); if(!config.quiet) fprintf(stderr, "Node %i began sending reads @%s", thread_id, ctime(&t0)); fflush(stderr); while(sam_read1(fp, header, read1) >= 0) { #ifdef DEBUG sam_write1(of, global_header, read1); #endif if(strcmp(bam_get_qname(read1), last_qname) == 0) { //Multimapper if(config.paired) { sam_read1(fp, header, read2); #ifdef DEBUG sam_write1(of, global_header, read2); #endif } continue; } else { if(read1->core.l_qname > max_qname) { max_qname = read1->core.l_qname + 10; last_qname = realloc(last_qname, sizeof(char) * max_qname); assert(last_qname); } strcpy(last_qname, bam_get_qname(read1)); } //Are paired-end reads in the wrong order? 
swapped = 0; if(config.paired) { if(read1->core.flag & BAM_FREAD2) { swapped = 1; sam_read1(fp, header, read2); packed_read = pack_read(read2, packed_read); #ifndef DEBUG MPI_Send((void *) packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD); #else sam_write1(of, global_header, read2); if(packed_read->size > current_p_size) { p = realloc(p, packed_read->size); assert(p); } MPI_Isend((void *) packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD, &request); status = MPI_Recv(p, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD, &stat); MPI_Wait(&request, &stat); debug_read = unpack_read(debug_read, p); #endif } } //Send the read packed_read = pack_read(read1, packed_read); #ifndef DEBUG MPI_Send((void *) packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD); #else if(packed_read->size > current_p_size) { p = realloc(p, packed_read->size); assert(p); } MPI_Isend(packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD, &request); status = MPI_Recv(p, packed_header->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD, &stat); MPI_Wait(&request, &stat); #endif //Deal with paired-end reads if(config.paired && !swapped) { sam_read1(fp, header, read2); packed_read = pack_read(read2, packed_read); #ifndef DEBUG MPI_Send((void *) packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD); #else sam_write1(of, global_header, read2); if(packed_read->size > current_p_size) { p = realloc(p, packed_read->size); assert(p); } MPI_Isend((void *) packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD, &request); status = MPI_Recv(p, packed_header->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD, &stat); MPI_Wait(&request, &stat); debug_read = unpack_read(debug_read, p); #endif } #ifndef DEBUG i++; #endif } t1 = time(NULL); if(!config.quiet) fprintf(stderr, "Node %i finished sending reads @%s\t(%f sec elapsed)\n", thread_id, ctime(&t1), difftime(t1, t0)); fflush(stderr); //Notify the master node packed_read->size 
= 0; #ifndef DEBUG void *A = malloc(1); assert(A); MPI_Send(A, 1, MPI_BYTE, 0, 5, MPI_COMM_WORLD); free(A); #endif //Close things up bam_hdr_destroy(header); bam_destroy1(read1); bam_destroy1(read2); free(cmd); if(packed_read->packed != NULL) free(packed_read->packed); free(packed_read); if(packed_header->packed != NULL) free(packed_header->packed); free(packed_header); free(last_qname); sam_pclose(fp); //Remove the FIFO(s) unlink(fastq1); if(config.paired) unlink(fastq2); #ifdef DEBUG sam_close(of); bam_hdr_destroy(debug_header); bam_destroy1(debug_read); free(p); #endif if(!config.quiet) fprintf(stderr, "Exiting worker node %i\n", thread_id); fflush(stderr); };