int bam_merge(int argc, char *argv[]) { #ifndef _PBGZF_USE int c, is_by_qname = 0, flag = 0, ret = 0, n_threads = 0, level = -1; #else int c, is_by_qname = 0, flag = 0, ret = 0, level = -1; #endif char *fn_headers = NULL, *reg = 0; while ((c = getopt(argc, argv, "h:nru1R:f@:l:")) >= 0) { switch (c) { case 'r': flag |= MERGE_RG; break; case 'f': flag |= MERGE_FORCE; break; case 'h': fn_headers = strdup(optarg); break; case 'n': is_by_qname = 1; break; case '1': flag |= MERGE_LEVEL1; break; case 'u': flag |= MERGE_UNCOMP; break; case 'R': reg = strdup(optarg); break; case 'l': level = atoi(optarg); break; #ifndef _PBGZF_USE case '@': n_threads = atoi(optarg); break; #endif } } if (optind + 2 >= argc) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: samtools merge [-nr] [-h inh.sam] <out.bam> <in1.bam> <in2.bam> [...]\n\n"); fprintf(stderr, "Options: -n sort by read names\n"); fprintf(stderr, " -r attach RG tag (inferred from file names)\n"); fprintf(stderr, " -u uncompressed BAM output\n"); fprintf(stderr, " -f overwrite the output BAM if exist\n"); fprintf(stderr, " -1 compress level 1\n"); fprintf(stderr, " -l INT compression level, from 0 to 9 [-1]\n"); #ifndef _PBGZF_USE fprintf(stderr, " -@ INT number of BAM compression threads [0]\n"); #endif fprintf(stderr, " -R STR merge file in the specified region STR [all]\n"); fprintf(stderr, " -h FILE copy the header in FILE to <out.bam> [in1.bam]\n\n"); fprintf(stderr, "Note: Samtools' merge does not reconstruct the @RG dictionary in the header. Users\n"); fprintf(stderr, " must provide the correct header with -h, or uses Picard which properly maintains\n"); fprintf(stderr, " the header dictionary in merging.\n\n"); return 1; } if (!(flag & MERGE_FORCE) && strcmp(argv[optind], "-")) { FILE *fp = fopen(argv[optind], "rb"); if (fp != NULL) { fclose(fp); fprintf(stderr, "[%s] File '%s' exists. Please apply '-f' to overwrite. 
Abort.\n", __func__, argv[optind]); return 1; } } #ifndef _PBGZF_USE if (bam_merge_core2(is_by_qname, argv[optind], fn_headers, argc - optind - 1, argv + optind + 1, flag, reg, n_threads, level) < 0) ret = 1; #else if (bam_merge_core2(is_by_qname, argv[optind], fn_headers, argc - optind - 1, argv + optind + 1, flag, reg, level) < 0) ret = 1; #endif free(reg); free(fn_headers); return ret; }
/*
 * Compatibility wrapper around bam_merge_core2().
 *
 * Forwards every argument unchanged and supplies fixed values for the
 * trailing parameters: a compression level of -1 and, when _PBGZF_USE is
 * not defined, a thread count of 0.
 *
 * Returns whatever bam_merge_core2() returns.
 */
int bam_merge_core(int by_qname, const char *out, const char *headers, int n, char * const *fn, int flag, const char *reg)
{
	int status;

#ifdef _PBGZF_USE
	status = bam_merge_core2(by_qname, out, headers, n, fn, flag, reg, -1);
#else
	status = bam_merge_core2(by_qname, out, headers, n, fn, flag, reg, 0, -1);
#endif
	return status;
}
/*
 * Compatibility wrapper around bam_merge_core2() that derives the output
 * mode string from the merge flags.
 *
 * The mode is "wb0" when MERGE_UNCOMP is set, otherwise "wb1" when
 * MERGE_LEVEL1 is set, otherwise plain "wb". MERGE_UNCOMP takes precedence
 * when both flags are present. The final argument to bam_merge_core2() is
 * always 0.
 *
 * Returns whatever bam_merge_core2() returns.
 */
int bam_merge_core(int by_qname, const char *out, const char *headers, int n, char * const *fn, int flag, const char *reg)
{
	const char *mode;

	if (flag & MERGE_UNCOMP)
		mode = "wb0";
	else if (flag & MERGE_LEVEL1)
		mode = "wb1";
	else
		mode = "wb";

	return bam_merge_core2(by_qname, out, mode, headers, n, fn, flag, reg, 0);
}
/*! @abstract Sort an unsorted BAM file based on the chromosome order and the leftmost position of an alignment @param is_by_qname whether to sort by query name @param fn name of the file to be sorted @param prefix prefix of the output and the temporary files; upon sucessess, prefix.bam will be written. @param max_mem approxiate maximum memory (very inaccurate) @discussion It may create multiple temporary subalignment files and then merge them by calling bam_merge_core(). This function is NOT thread safe. */ void bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, size_t _max_mem, int is_stdout, int n_threads, int level, int sort_type) { int ret, i, n_files = 0; size_t mem, max_k, k, max_mem; bam_header_t *header; bamFile fp; bam1_t *b, **buf; char *fnout = 0; if (n_threads < 2) n_threads = 1; g_is_by_qname = is_by_qname; max_k = k = 0; mem = 0; max_mem = _max_mem * n_threads; buf = 0; fp = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r"); if (fp == 0) { fprintf(stderr, "[bam_sort_core] fail to open file %s\n", fn); return; } header = bam_header_read(fp); if (is_by_qname) change_SO(header, "queryname"); else change_SO(header, "coordinate"); // write sub files for (;;) { if (k == max_k) { size_t old_max = max_k; max_k = max_k? max_k<<1 : 0x10000; buf = realloc(buf, max_k * sizeof(void*)); memset(buf + old_max, 0, sizeof(void*) * (max_k - old_max)); } if (buf[k] == 0) buf[k] = (bam1_t*)calloc(1, sizeof(bam1_t)); b = buf[k]; if ((ret = bam_read1(fp, b)) < 0) break; if (b->data_len < b->m_data>>2) { // shrink b->m_data = b->data_len; kroundup32(b->m_data); b->data = realloc(b->data, b->m_data); } mem += sizeof(bam1_t) + b->m_data + sizeof(void*) + sizeof(void*); // two sizeof(void*) for the data allocated to pointer arrays ++k; if (mem >= max_mem) { n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads, sort_type); mem = k = 0; } } if (ret != -1) fprintf(stderr, "[bam_sort_core] truncated file. 
Continue anyway.\n"); // output file name fnout = calloc(strlen(prefix) + 20, 1); if (is_stdout) sprintf(fnout, "-"); else sprintf(fnout, "%s.bam", prefix); // write the final output if (n_files == 0) { // a single block char mode[8]; strcpy(mode, "w"); if (level >= 0) sprintf(mode + 1, "%d", level < 9? level : 9); sort_aux_core(k, buf, sort_type); #ifndef _PBGZF_USE write_buffer(fnout, mode, k, buf, header, n_threads); #else write_buffer(fnout, mode, k, buf, header); #endif } else { // then merge char **fns; n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads, sort_type); fprintf(stderr, "[bam_sort_core] merging from %d files...\n", n_files); fns = (char**)calloc(n_files, sizeof(char*)); for (i = 0; i < n_files; ++i) { fns[i] = (char*)calloc(strlen(prefix) + 20, 1); sprintf(fns[i], "%s.%.4d.bam", prefix, i); } #ifndef _PBGZF_USE bam_merge_core2(is_by_qname, fnout, 0, n_files, fns, 0, 0, n_threads, level); #else bam_merge_core2(is_by_qname, fnout, 0, n_files, fns, 0, 0, level); #endif for (i = 0; i < n_files; ++i) { unlink(fns[i]); free(fns[i]); } free(fns); } free(fnout); // free for (k = 0; k < max_k; ++k) { if (!buf[k]) continue; free(buf[k]->data); free(buf[k]); } free(buf); bam_header_destroy(header); bam_close(fp); }
int bam_merge(int argc, char *argv[]) { int c, is_by_qname = 0, flag = 0, ret = 0, n_threads = 0, level = -1; char *fn_headers = NULL, *reg = NULL, mode[12]; long random_seed = (long)time(NULL); char** fn = NULL; int fn_size = 0; if (argc == 1) { merge_usage(stdout); return 0; } while ((c = getopt(argc, argv, "h:nru1R:f@:l:cps:b:")) >= 0) { switch (c) { case 'r': flag |= MERGE_RG; break; case 'f': flag |= MERGE_FORCE; break; case 'h': fn_headers = strdup(optarg); break; case 'n': is_by_qname = 1; break; case '1': flag |= MERGE_LEVEL1; level = 1; break; case 'u': flag |= MERGE_UNCOMP; level = 0; break; case 'R': reg = strdup(optarg); break; case 'l': level = atoi(optarg); break; case '@': n_threads = atoi(optarg); break; case 'c': flag |= MERGE_COMBINE_RG; break; case 'p': flag |= MERGE_COMBINE_PG; break; case 's': random_seed = atol(optarg); break; case 'b': { // load the list of files to read int nfiles; char **fn_read = hts_readlines(optarg, &nfiles); if (fn_read) { // Append to end of array fn = realloc(fn, (fn_size+nfiles) * sizeof(char*)); if (fn == NULL) { ret = 1; goto end; } memcpy(fn+fn_size, fn_read, nfiles * sizeof(char*)); fn_size += nfiles; } else { fprintf(pysamerr, "[%s] Invalid file list \"%s\"\n", __func__, optarg); ret = 1; } break; } } } if ( argc - optind < 1 ) { fprintf(pysamerr, "You must at least specify the output file.\n"); merge_usage(pysamerr); return 1; } srand48(random_seed); if (!(flag & MERGE_FORCE) && strcmp(argv[optind], "-")) { FILE *fp = fopen(argv[optind], "rb"); if (fp != NULL) { fclose(fp); fprintf(pysamerr, "[%s] File '%s' exists. Please apply '-f' to overwrite. 
Abort.\n", __func__, argv[optind]); return 1; } } int nargcfiles = argc - (optind+1); if (nargcfiles > 0) { // Add argc files to end of array fn = realloc(fn, (fn_size+nargcfiles) * sizeof(char*)); if (fn == NULL) { ret = 1; goto end; } memcpy(fn+fn_size, argv + (optind+1), nargcfiles * sizeof(char*)); } if (fn_size+nargcfiles < 2) { fprintf(pysamerr, "You must specify at least 2 input files.\n"); merge_usage(pysamerr); return 1; } strcpy(mode, "wb"); if (level >= 0) sprintf(strchr(mode, '\0'), "%d", level < 9? level : 9); if (bam_merge_core2(is_by_qname, argv[optind], mode, fn_headers, fn_size+nargcfiles, fn, flag, reg, n_threads) < 0) ret = 1; end: if (fn_size > 0) { int i; for (i=0; i<fn_size; i++) free(fn[i]); free(fn); } free(reg); free(fn_headers); return ret; }
/*! @abstract Sort an unsorted BAM file based on the chromosome order and the leftmost position of an alignment @param is_by_qname whether to sort by query name @param fn name of the file to be sorted @param prefix prefix of the temporary files (prefix.NNNN.bam are written) @param fnout name of the final output file to be written @param modeout sam_open() mode to be used to create the final output file @param max_mem approxiate maximum memory (very inaccurate) @return 0 for successful sorting, negative on errors @discussion It may create multiple temporary subalignment files and then merge them by calling bam_merge_core(). This function is NOT thread safe. */ int bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, const char *fnout, const char *modeout, size_t _max_mem, int n_threads) { int ret, i, n_files = 0; size_t mem, max_k, k, max_mem; bam_hdr_t *header; samFile *fp; bam1_t *b, **buf; if (n_threads < 2) n_threads = 1; g_is_by_qname = is_by_qname; max_k = k = 0; mem = 0; max_mem = _max_mem * n_threads; buf = NULL; fp = sam_open(fn, "r"); if (fp == NULL) { fprintf(pysamerr, "[bam_sort_core] fail to open file %s\n", fn); return -1; } header = sam_hdr_read(fp); if (is_by_qname) change_SO(header, "queryname"); else change_SO(header, "coordinate"); // write sub files for (;;) { if (k == max_k) { size_t kk, old_max = max_k; max_k = max_k? 
max_k<<1 : 0x10000; buf = (bam1_t**)realloc(buf, max_k * sizeof(bam1_t*)); for (kk = old_max; kk < max_k; ++kk) buf[kk] = NULL; } if (buf[k] == NULL) buf[k] = bam_init1(); b = buf[k]; if ((ret = sam_read1(fp, header, b)) < 0) break; if (b->l_data < b->m_data>>2) { // shrink b->m_data = b->l_data; kroundup32(b->m_data); b->data = (uint8_t*)realloc(b->data, b->m_data); } mem += sizeof(bam1_t) + b->m_data + sizeof(void*) + sizeof(void*); // two sizeof(void*) for the data allocated to pointer arrays ++k; if (mem >= max_mem) { n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads); mem = k = 0; } } if (ret != -1) fprintf(pysamerr, "[bam_sort_core] truncated file. Continue anyway.\n"); // write the final output if (n_files == 0) { // a single block ks_mergesort(sort, k, buf, 0); write_buffer(fnout, modeout, k, buf, header, n_threads); } else { // then merge char **fns; n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads); fprintf(pysamerr, "[bam_sort_core] merging from %d files...\n", n_files); fns = (char**)calloc(n_files, sizeof(char*)); for (i = 0; i < n_files; ++i) { fns[i] = (char*)calloc(strlen(prefix) + 20, 1); sprintf(fns[i], "%s.%.4d.bam", prefix, i); } if (bam_merge_core2(is_by_qname, fnout, modeout, NULL, n_files, fns, MERGE_COMBINE_RG|MERGE_COMBINE_PG, NULL, n_threads) < 0) { // Propagate bam_merge_core2() failure; it has already emitted a // message explaining the failure, so no further message is needed. return -1; } for (i = 0; i < n_files; ++i) { unlink(fns[i]); free(fns[i]); } free(fns); } // free for (k = 0; k < max_k; ++k) bam_destroy1(buf[k]); free(buf); bam_hdr_destroy(header); sam_close(fp); return 0; }
/*
 * Compatibility wrapper around bam_merge_core2().
 *
 * Forwards every argument unchanged and appends two fixed trailing
 * arguments, 0 and -1 (presumably the thread count and the compression
 * level, matching the other nine-argument calls in this file).
 *
 * Returns whatever bam_merge_core2() returns.
 */
int bam_merge_core(int by_qname, const char *out, const char *headers, int n, char * const *fn, int flag, const char *reg)
{
	const int trailing_zero = 0;
	const int trailing_minus_one = -1;

	return bam_merge_core2(by_qname, out, headers, n, fn, flag, reg,
	                       trailing_zero, trailing_minus_one);
}