int main(int argc, char**argv) { // test state const int NUM_TESTS = 6; int verbose = 0; int success = 0; int failure = 0; int getopt_char; while ((getopt_char = getopt(argc, argv, "v")) != -1) { switch (getopt_char) { case 'v': ++verbose; break; default: break; } } bam1_t* b; // Setup stderr redirect kstring_t res = { 0, 0, NULL }; FILE* orig_stderr = fdopen(dup(STDERR_FILENO), "a"); // Save stderr char* tempfname = (optind < argc)? argv[optind] : "test_bam_translate.tmp"; FILE* check = NULL; // setup if (verbose) printf("BEGIN test 1\n"); // TID test trans_tbl_t tbl1; setup_test_1(&b,&tbl1); if (verbose > 1) { printf("b\n"); dump_read(b); } if (verbose) printf("RUN test 1\n"); // test xfreopen(tempfname, "w", stderr); // Redirect stderr to pipe bam_translate(b, &tbl1); fclose(stderr); if (verbose) printf("END RUN test 1\n"); if (verbose > 1) { printf("b\n"); dump_read(b); } // check result check = fopen(tempfname, "r"); res.l = 0; if (kgetline(&res, (kgets_func *)fgets, check) < 0 && (feof(check) || res.l == 0) ) { ++success; } else { ++failure; if (verbose) printf("FAIL test 1\n"); } fclose(check); // teardown bam_destroy1(b); trans_tbl_destroy(&tbl1); if (verbose) printf("END test 1\n"); // setup if (verbose) printf("BEGIN test 2\n"); // RG exists and translate test trans_tbl_t tbl2; setup_test_2(&b,&tbl2); if (verbose > 1) { printf("b\n"); dump_read(b); } if (verbose) printf("RUN test 2\n"); // test xfreopen(tempfname, "w", stderr); // Redirect stderr to pipe bam_translate(b, &tbl2); fclose(stderr); if (verbose) printf("END RUN test 2\n"); if (verbose > 1) { printf("b\n"); dump_read(b); } // check result check = fopen(tempfname, "r"); res.l = 0; if (kgetline(&res, (kgets_func *)fgets, check) < 0 && (feof(check) || res.l == 0) ) { ++success; } else { ++failure; if (verbose) printf("FAIL test 2\n"); } fclose(check); // teardown bam_destroy1(b); trans_tbl_destroy(&tbl2); if (verbose) printf("END test 2\n"); if (verbose) printf("BEGIN test 3\n"); // PG exists and translate test // setup trans_tbl_t tbl3; setup_test_3(&b,&tbl3); if (verbose > 1) { printf("b\n"); dump_read(b); } if (verbose) printf("RUN test 3\n"); // test xfreopen(tempfname, "w", stderr); // Redirect stderr to pipe bam_translate(b, &tbl3); fclose(stderr); if (verbose) printf("END RUN test 3\n"); if (verbose > 1) { printf("b\n"); dump_read(b); } // check result check = fopen(tempfname, "r"); res.l = 0; if (kgetline(&res, (kgets_func *)fgets, check) < 0 && (feof(check) || res.l == 0)) { ++success; } else { ++failure; if (verbose) printf("FAIL test 3\n"); } fclose(check); // teardown bam_destroy1(b); trans_tbl_destroy(&tbl3); if (verbose) printf("END test 3\n"); if (verbose) printf("BEGIN test 4\n"); // RG test non-existent // setup trans_tbl_t tbl4; setup_test_4(&b,&tbl4); if (verbose > 1) { printf("b\n"); dump_read(b); } if (verbose) printf("RUN test 4\n"); // test xfreopen(tempfname, "w", stderr); // Redirect stderr to pipe bam_translate(b, &tbl4); fclose(stderr); if (verbose) printf("END RUN test 4\n"); if (verbose > 1) { printf("b\n"); dump_read(b); } // check result check = fopen(tempfname, "r"); res.l = 0; if (kgetline(&res, (kgets_func *)fgets, check) >= 0 && strcmp("[bam_translate] RG tag \"rg4hello\" on read \"123456789\" encountered with no corresponding entry in header, tag lost. Unknown tags are only reported once per input file for each tag ID.",res.s) == 0) { ++success; } else { ++failure; if (verbose) printf("FAIL test 4\n"); } fclose(check); // teardown bam_destroy1(b); trans_tbl_destroy(&tbl4); if (verbose) printf("END test 4\n"); if (verbose) printf("BEGIN test 5\n"); // PG test non-existent // setup trans_tbl_t tbl5; setup_test_5(&b,&tbl5); if (verbose > 1) { printf("b\n"); dump_read(b); printf("RUN test 5\n"); } // test xfreopen(tempfname, "w", stderr); // Redirect stderr to pipe bam_translate(b, &tbl5); fclose(stderr); if (verbose) printf("END RUN test 5\n"); if (verbose > 1) { printf("b\n"); dump_read(b); } // check result check = fopen(tempfname, "r"); res.l = 0; if (kgetline(&res, (kgets_func *)fgets, check) >= 0 && strcmp("[bam_translate] PG tag \"pg5hello\" on read \"123456789\" encountered with no corresponding entry in header, tag lost. Unknown tags are only reported once per input file for each tag ID.",res.s) == 0) { ++success; } else { ++failure; if (verbose) printf("FAIL test 5\n"); } fclose(check); // teardown bam_destroy1(b); trans_tbl_destroy(&tbl5); if (verbose) printf("END test 5\n"); if (verbose) printf("BEGIN test 6\n"); // RG and PG exists and translate test // setup trans_tbl_t tbl6; setup_test_6(&b,&tbl6); if (verbose > 1) { printf("b\n"); dump_read(b); } if (verbose) printf("RUN test 6\n"); // test xfreopen(tempfname, "w", stderr); // Redirect stderr to pipe bam_translate(b, &tbl6); fclose(stderr); if (verbose) printf("END RUN test 6\n"); if (verbose > 1) { printf("b\n"); dump_read(b); } // check result check = fopen(tempfname, "r"); res.l = 0; if (kgetline(&res, (kgets_func *)fgets, check) < 0 && (feof(check) || res.l == 0) ) { ++success; } else { ++failure; if (verbose) printf("FAIL test 6\n"); } fclose(check); // teardown bam_destroy1(b); trans_tbl_destroy(&tbl6); if (verbose) printf("END test 6\n"); // Cleanup free(res.s); remove(tempfname); if (failure > 0) fprintf(orig_stderr, "%d failures %d successes\n", failure, success); fclose(orig_stderr); return (success == NUM_TESTS)? EXIT_SUCCESS : EXIT_FAILURE; }
/*! @abstract Merge multiple sorted BAM. @param is_by_qname whether to sort by query name @param out output BAM file name @param mode sam_open() mode to be used to create the final output file (overrides level settings from UNCOMP and LEVEL1 flags) @param headers name of SAM file from which to copy '@' header lines, or NULL to copy them from the first file to be merged @param n number of files to be merged @param fn names of files to be merged @param flag flags that control how the merge is undertaken @param reg region to merge @param n_threads number of threads to use (passed to htslib) @discussion Padding information may NOT correctly maintained. This function is NOT thread safe. */ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char *headers, int n, char * const *fn, int flag, const char *reg, int n_threads) { samFile *fpout, **fp; heap1_t *heap; bam_hdr_t *hout = NULL; int i, j, *RG_len = NULL; uint64_t idx = 0; char **RG = NULL; hts_itr_t **iter = NULL; bam_hdr_t **hdr = NULL; trans_tbl_t *translation_tbl = NULL; // Is there a specified pre-prepared header to use for output? if (headers) { samFile* fpheaders = sam_open(headers, "r"); if (fpheaders == NULL) { const char *message = strerror(errno); fprintf(pysamerr, "[bam_merge_core] cannot open '%s': %s\n", headers, message); return -1; } hout = sam_hdr_read(fpheaders); sam_close(fpheaders); } g_is_by_qname = by_qname; fp = (samFile**)calloc(n, sizeof(samFile*)); heap = (heap1_t*)calloc(n, sizeof(heap1_t)); iter = (hts_itr_t**)calloc(n, sizeof(hts_itr_t*)); hdr = (bam_hdr_t**)calloc(n, sizeof(bam_hdr_t*)); translation_tbl = (trans_tbl_t*)calloc(n, sizeof(trans_tbl_t)); // prepare RG tag from file names if (flag & MERGE_RG) { RG = (char**)calloc(n, sizeof(char*)); RG_len = (int*)calloc(n, sizeof(int)); for (i = 0; i != n; ++i) { int l = strlen(fn[i]); const char *s = fn[i]; if (l > 4 && strcmp(s + l - 4, ".bam") == 0) l -= 4; for (j = l - 1; j >= 0; --j) if (s[j] == '/') break; ++j; l -= j; RG[i] = (char*)calloc(l + 1, 1); RG_len[i] = l; strncpy(RG[i], s + j, l); } } // open and read the header from each file for (i = 0; i < n; ++i) { bam_hdr_t *hin; fp[i] = sam_open(fn[i], "r"); if (fp[i] == NULL) { int j; fprintf(pysamerr, "[bam_merge_core] fail to open file %s\n", fn[i]); for (j = 0; j < i; ++j) sam_close(fp[j]); free(fp); free(heap); // FIXME: possible memory leak return -1; } hin = sam_hdr_read(fp[i]); if (hout) trans_tbl_init(hout, hin, translation_tbl+i, flag & MERGE_COMBINE_RG, flag & MERGE_COMBINE_PG); else { // As yet, no headers to merge into... hout = bam_hdr_dup(hin); // ...so no need to translate header into itself trans_tbl_init(hout, hin, translation_tbl+i, true, true); } // TODO sam_itr_next() doesn't yet work for SAM files, // so for those keep the headers around for use with sam_read1() if (hts_get_format(fp[i])->format == sam) hdr[i] = hin; else { bam_hdr_destroy(hin); hdr[i] = NULL; } if ((translation_tbl+i)->lost_coord_sort && !by_qname) { fprintf(pysamerr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]); } } // Transform the header into standard form pretty_header(&hout->text,hout->l_text); // If we're only merging a specified region move our iters to start at that point if (reg) { int* rtrans = rtrans_build(n, hout->n_targets, translation_tbl); int tid, beg, end; const char *name_lim = hts_parse_reg(reg, &beg, &end); char *name = malloc(name_lim - reg + 1); memcpy(name, reg, name_lim - reg); name[name_lim - reg] = '\0'; tid = bam_name2id(hout, name); free(name); if (tid < 0) { fprintf(pysamerr, "[%s] Malformated region string or undefined reference name\n", __func__); return -1; } for (i = 0; i < n; ++i) { hts_idx_t *idx = sam_index_load(fp[i], fn[i]); // (rtrans[i*n+tid]) Look up what hout tid translates to in input tid space int mapped_tid = rtrans[i*hout->n_targets+tid]; if (mapped_tid != INT32_MIN) { iter[i] = sam_itr_queryi(idx, mapped_tid, beg, end); } else { iter[i] = sam_itr_queryi(idx, HTS_IDX_NONE, 0, 0); } hts_idx_destroy(idx); if (iter[i] == NULL) break; } free(rtrans); } else { for (i = 0; i < n; ++i) { if (hdr[i] == NULL) { iter[i] = sam_itr_queryi(NULL, HTS_IDX_REST, 0, 0); if (iter[i] == NULL) break; } else iter[i] = NULL; } } if (i < n) { fprintf(pysamerr, "[%s] Memory allocation failed\n", __func__); return -1; } // Load the first read from each file into the heap for (i = 0; i < n; ++i) { heap1_t *h = heap + i; h->i = i; h->b = bam_init1(); if ((iter[i]? sam_itr_next(fp[i], iter[i], h->b) : sam_read1(fp[i], hdr[i], h->b)) >= 0) { bam_translate(h->b, translation_tbl + i); h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)((int32_t)h->b->core.pos+1)<<1 | bam_is_rev(h->b); h->idx = idx++; } else { h->pos = HEAP_EMPTY; bam_destroy1(h->b); h->b = NULL; } } // Open output file and write header if ((fpout = sam_open(out, mode)) == 0) { fprintf(pysamerr, "[%s] fail to create the output file.\n", __func__); return -1; } sam_hdr_write(fpout, hout); if (!(flag & MERGE_UNCOMP)) hts_set_threads(fpout, n_threads); // Begin the actual merge ks_heapmake(heap, n, heap); while (heap->pos != HEAP_EMPTY) { bam1_t *b = heap->b; if (flag & MERGE_RG) { uint8_t *rg = bam_aux_get(b, "RG"); if (rg) bam_aux_del(b, rg); bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]); } sam_write1(fpout, hout, b); if ((j = (iter[heap->i]? sam_itr_next(fp[heap->i], iter[heap->i], b) : sam_read1(fp[heap->i], hdr[heap->i], b))) >= 0) { bam_translate(b, translation_tbl + heap->i); heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam_is_rev(b); heap->idx = idx++; } else if (j == -1) { heap->pos = HEAP_EMPTY; bam_destroy1(heap->b); heap->b = NULL; } else fprintf(pysamerr, "[bam_merge_core] '%s' is truncated. Continue anyway.\n", fn[heap->i]); ks_heapadjust(heap, 0, n, heap); } // Clean up and close if (flag & MERGE_RG) { for (i = 0; i != n; ++i) free(RG[i]); free(RG); free(RG_len); } for (i = 0; i < n; ++i) { trans_tbl_destroy(translation_tbl + i); hts_itr_destroy(iter[i]); bam_hdr_destroy(hdr[i]); sam_close(fp[i]); } bam_hdr_destroy(hout); sam_close(fpout); free(translation_tbl); free(fp); free(heap); free(iter); free(hdr); return 0; }