int main_reheader(int argc, char *argv[]) { bam_header_t *h; BGZF *in; if (argc != 3) { fprintf(stderr, "Usage: samtools reheader <in.header.sam> <in.bam>\n"); return 1; } { // read the header tamFile fph = sam_open(argv[1]); if (fph == 0) { fprintf(stderr, "[%s] fail to read the header from %s.\n", __func__, argv[1]); return 1; } h = sam_header_read(fph); sam_close(fph); } in = strcmp(argv[2], "-")? bam_open(argv[2], "r") : bam_dopen(fileno(stdin), "r"); if (in == 0) { fprintf(stderr, "[%s] fail to open file %s.\n", __func__, argv[2]); return 1; } bam_reheader(in, h, fileno(stdout)); bgzf_close(in); return 0; }
samfile_t* samopen_in(quip_reader_t reader, void* reader_data, bool binary, void* aux) { samfile_t *fp; fp = (samfile_t*)calloc(1, sizeof(samfile_t)); fp->type |= TYPE_READ; if (binary) { // binary fp->type |= TYPE_BAM; fp->x.bam = bam_open_in(reader, reader_data); if (fp->x.bam == 0) goto open_err_ret; fp->header = bam_header_read(fp->x.bam); } else { // text fp->x.tamr = sam_open_in(reader, reader_data); if (fp->x.tamr == 0) goto open_err_ret; fp->header = sam_header_read(fp->x.tamr); if (fp->header->n_targets == 0) { // no @SQ fields if (aux) { // check if aux is present bam_header_t *textheader = fp->header; fp->header = sam_header_read2((const char*)aux); if (fp->header == 0) goto open_err_ret; append_header_text(fp->header, textheader->text, textheader->l_text); bam_header_destroy(textheader); } if (fp->header->n_targets == 0 && bam_verbose >= 1) fprintf(stderr, "[samopen] no @SQ lines in the header.\n"); } else if (bam_verbose >= 2) fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets); } return fp; open_err_ret: free(fp); return 0; }
/*static*/ bam_header_t * SAM::update_header_from_list(bam_header_t *header, names_list_t & list) { Temporary_File samfile; samfile.close_file(); samfile_t * sf = samopen(samfile.get_filename().c_str(),"wh",header); samclose(sf); Temporary_File tempfile; ofstream &output = tempfile.get_stream(); ifstream input(samfile.get_filename().c_str()); string temp; while (not input.eof()) { getline(input,temp); if ((temp.size() >= 3) and (temp[0] != '@' or temp[1] != 'S' or temp[2] != 'Q')) output << temp << '\n'; } for (names_list_t::iterator iter = list.begin(); iter != list.end(); iter++) output << "@SQ\tSN:" << iter->first << "\tLN:" << iter->second << '\n'; tempfile.close_file(); tamFile fp = sam_open(tempfile.get_filename().c_str()); bam_header_t * newheader = sam_header_read(fp); sam_close(fp); return newheader; }
int main_cat(int argc, char *argv[]) { bam_header_t *h = 0; char *outfn = 0; int c, ret; while ((c = getopt(argc, argv, "h:o:")) >= 0) { switch (c) { case 'h': { tamFile fph = sam_open(optarg); if (fph == 0) { fprintf(stderr, "[%s] ERROR: fail to read the header from '%s'.\n", __func__, argv[1]); return 1; } h = sam_header_read(fph); sam_close(fph); break; } case 'o': outfn = strdup(optarg); break; } } if (argc - optind < 2) { fprintf(stderr, "Usage: samtools cat [-h header.sam] [-o out.bam] <in1.bam> <in2.bam> [...]\n"); return 1; } ret = bam_cat(argc - optind, argv + optind, h, outfn? outfn : "-"); free(outfn); return ret; }
void convert_sam_to_bam(char* sam_input, char* bam_input) { bam1_t* bam_p = bam_init1(); LOG_DEBUG("CONVERT-START: sam to bam\n"); //open SAM file for read if (time_flag) { start_timer(t1_convert); } tamFile sam_fd = sam_open(sam_input); //open BAM file for write bam_file_t* bam_file_p = bam_fopen_mode(bam_input, NULL, "w"); //read header from SAM file bam_header_t* bam_header_p = sam_header_read(sam_fd); //write header to BAM file bam_header_write(bam_file_p->bam_fd, bam_header_p); //write alignments to BAM file while (sam_read1(sam_fd, bam_header_p, bam_p) > 0) { bam_write1(bam_file_p->bam_fd, bam_p); num_alignments++; } //close BAM and SAM files, free bam alignment and bam file object bam_fclose(bam_file_p); sam_close(sam_fd); bam_header_destroy(bam_header_p); bam_destroy1(bam_p); if (time_flag) { stop_timer(t1_convert, t2_convert, convert_time); } //number_of_batchs = 1, convention value for statistics (not real batch) number_of_batchs = 1; }
samfile_t *samopen(const char *fn, const char *mode, const void *aux) { samfile_t *fp; fp = (samfile_t*)calloc(1, sizeof(samfile_t)); if (strchr(mode, 'r')) { // read fp->type |= TYPE_READ; if (strchr(mode, 'b')) { // binary fp->type |= TYPE_BAM; fp->x.bam = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r"); if (fp->x.bam == 0) goto open_err_ret; fp->header = bam_header_read(fp->x.bam); } else { // text fp->x.tamr = sam_open(fn); if (fp->x.tamr == 0) goto open_err_ret; fp->header = sam_header_read(fp->x.tamr); if (fp->header->n_targets == 0) { // no @SQ fields if (aux) { // check if aux is present bam_header_t *textheader = fp->header; fp->header = sam_header_read2((const char*)aux); if (fp->header == 0) goto open_err_ret; append_header_text(fp->header, textheader->text, textheader->l_text); bam_header_destroy(textheader); } if (fp->header->n_targets == 0 && bam_verbose >= 1) fprintf(stderr, "[samopen] no @SQ lines in the header.\n"); } //else if (bam_verbose >= 2) fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets); } } else if (strchr(mode, 'w')) { // write fp->header = bam_header_dup((const bam_header_t*)aux); if (strchr(mode, 'b')) { // binary char bmode[3]; int i, compress_level = -1; for (i = 0; mode[i]; ++i) if (mode[i] >= '0' && mode[i] <= '9') break; if (mode[i]) compress_level = mode[i] - '0'; if (strchr(mode, 'u')) compress_level = 0; bmode[0] = 'w'; bmode[1] = compress_level < 0? 0 : compress_level + '0'; bmode[2] = 0; fp->type |= TYPE_BAM; fp->x.bam = strcmp(fn, "-")? bam_open(fn, bmode) : bam_dopen(fileno(stdout), bmode); if (fp->x.bam == 0) goto open_err_ret; bam_header_write(fp->x.bam, fp->header); } else { // text // open file fp->x.tamw = strcmp(fn, "-")? fopen(fn, "w") : stdout; if (fp->x.tamw == 0) goto open_err_ret; if (strchr(mode, 'X')) fp->type |= BAM_OFSTR<<2; else if (strchr(mode, 'x')) fp->type |= BAM_OFHEX<<2; else fp->type |= BAM_OFDEC<<2; // write header if (strchr(mode, 'h')) { int i; bam_header_t *alt; // parse the header text alt = bam_header_init(); alt->l_text = fp->header->l_text; alt->text = fp->header->text; sam_header_parse(alt); alt->l_text = 0; alt->text = 0; // check if there are @SQ lines in the header fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw); // FIXME: better to skip the trailing NULL if (alt->n_targets) { // then write the header text without dumping ->target_{name,len} if (alt->n_targets != fp->header->n_targets && bam_verbose >= 1) fprintf(stderr, "[samopen] inconsistent number of target sequences. Output the text header.\n"); } else { // then dump ->target_{name,len} for (i = 0; i < fp->header->n_targets; ++i) fprintf(fp->x.tamw, "@SQ\tSN:%s\tLN:%d\n", fp->header->target_name[i], fp->header->target_len[i]); } bam_header_destroy(alt); } } } return fp; open_err_ret: free(fp); return 0; }
int bam_cat(int nfn, char * const *fn, const bam_header_t *h, const char* outbam) { BGZF *fp; FILE* fp_file; uint8_t *buf; uint8_t ebuf[BGZF_EMPTY_BLOCK_SIZE]; const int es=BGZF_EMPTY_BLOCK_SIZE; int i; fp = strcmp(outbam, "-")? bgzf_open(outbam, "w") : bgzf_fdopen(_fileno(stdout), "w"); if (fp == 0) { fprintf(stderr, "[%s] ERROR: fail to open output file '%s'.\n", __FUNCTION__, outbam); return 1; } if (h) bam_header_write(fp, h); buf = (uint8_t*) malloc(BUF_SIZE); for(i = 0; i < nfn; ++i){ BGZF *in; bam_header_t *old; int len,j; in = strcmp(fn[i], "-")? bam_open(fn[i], "r") : bam_dopen(_fileno(stdin), "r"); if (in == 0) { fprintf(stderr, "[%s] ERROR: fail to open file '%s'.\n", __FUNCTION__, fn[i]); return -1; } if (in->open_mode != 'r') return -1; old = bam_header_read(in); if (h == 0 && i == 0) bam_header_write(fp, old); if (in->block_offset < in->block_length) { bgzf_write(fp, (uint8_t*)in->uncompressed_block + in->block_offset, in->block_length - in->block_offset); bgzf_flush(fp); } j=0; #ifdef _USE_KNETFILE fp_file=fp->x.fpw; while ((len = knet_read(in->x.fpr, buf, BUF_SIZE)) > 0) { #else fp_file=fp->file; while (!feof(in->file) && (len = fread(buf, 1, BUF_SIZE, in->file)) > 0) { #endif if(len<es){ int diff=es-len; if(j==0) { fprintf(stderr, "[%s] ERROR: truncated file?: '%s'.\n", __FUNCTION__, fn[i]); return -1; } fwrite(ebuf, 1, len, fp_file); memcpy(ebuf,ebuf+len,diff); memcpy(ebuf+diff,buf,len); } else { if(j!=0) fwrite(ebuf, 1, es, fp_file); len-= es; memcpy(ebuf,buf+len,es); fwrite(buf, 1, len, fp_file); } j=1; } /* check final gzip block */ { const uint8_t gzip1=ebuf[0]; const uint8_t gzip2=ebuf[1]; const uint32_t isize=*((uint32_t*)(ebuf+es-4)); if(((gzip1!=GZIPID1) || (gzip2!=GZIPID2)) || (isize!=0)) { fprintf(stderr, "[%s] WARNING: Unexpected block structure in file '%s'.", __FUNCTION__, fn[i]); fprintf(stderr, " Possible output corruption.\n"); fwrite(ebuf, 1, es, fp_file); } } bam_header_destroy(old); bgzf_close(in); } free(buf); bgzf_close(fp); return 0; } int main_cat(int argc, char *argv[]) { bam_header_t *h = 0; char *outfn = 0; int c, ret; while ((c = getopt(argc, argv, "h:o:")) >= 0) { switch (c) { case 'h': { tamFile fph = sam_open(optarg); if (fph == 0) { fprintf(stderr, "[%s] ERROR: fail to read the header from '%s'.\n", __FUNCTION__, argv[1]); return 1; } h = sam_header_read(fph); sam_close(fph); break; } case 'o': outfn = strdup(optarg); break; } } if (argc - optind < 2) { fprintf(stderr, "Usage: samtools cat [-h header.sam] [-o out.bam] <in1.bam> <in2.bam> [...]\n"); return 1; } ret = bam_cat(argc - optind, argv + optind, h, outfn? outfn : "-"); free(outfn); return ret; }
int bam_merge_core2(int by_qname, const char *out, const char *headers, int n, char * const *fn, int flag, const char *reg, int level) #endif { bamFile fpout, *fp; heap1_t *heap; bam_header_t *hout = 0; bam_header_t *hheaders = NULL; int i, j, *RG_len = 0; uint64_t idx = 0; char **RG = 0, mode[8]; bam_iter_t *iter = 0; if (headers) { tamFile fpheaders = sam_open(headers); if (fpheaders == 0) { const char *message = strerror(errno); fprintf(stderr, "[bam_merge_core] cannot open '%s': %s\n", headers, message); return -1; } hheaders = sam_header_read(fpheaders); sam_close(fpheaders); } g_is_by_qname = by_qname; fp = (bamFile*)calloc(n, sizeof(bamFile)); heap = (heap1_t*)calloc(n, sizeof(heap1_t)); iter = (bam_iter_t*)calloc(n, sizeof(bam_iter_t)); // prepare RG tag if (flag & MERGE_RG) { RG = (char**)calloc(n, sizeof(void*)); RG_len = (int*)calloc(n, sizeof(int)); for (i = 0; i != n; ++i) { int l = strlen(fn[i]); const char *s = fn[i]; if (l > 4 && strcmp(s + l - 4, ".bam") == 0) l -= 4; for (j = l - 1; j >= 0; --j) if (s[j] == '/') break; ++j; l -= j; RG[i] = calloc(l + 1, 1); RG_len[i] = l; strncpy(RG[i], s + j, l); } } // read the first for (i = 0; i != n; ++i) { bam_header_t *hin; fp[i] = bam_open(fn[i], "r"); if (fp[i] == 0) { int j; fprintf(stderr, "[bam_merge_core] fail to open file %s\n", fn[i]); for (j = 0; j < i; ++j) bam_close(fp[j]); free(fp); free(heap); // FIXME: possible memory leak return -1; } hin = bam_header_read(fp[i]); if (i == 0) { // the first BAM hout = hin; } else { // validate multiple baf int min_n_targets = hout->n_targets; if (hin->n_targets < min_n_targets) min_n_targets = hin->n_targets; for (j = 0; j < min_n_targets; ++j) if (strcmp(hout->target_name[j], hin->target_name[j]) != 0) { fprintf(stderr, "[bam_merge_core] different target sequence name: '%s' != '%s' in file '%s'\n", hout->target_name[j], hin->target_name[j], fn[i]); return -1; } // If this input file has additional target reference sequences, // add them to the headers to be output if (hin->n_targets > hout->n_targets) { swap_header_targets(hout, hin); // FIXME Possibly we should also create @SQ text headers // for the newly added reference sequences } bam_header_destroy(hin); } } if (hheaders) { // If the text headers to be swapped in include any @SQ headers, // check that they are consistent with the existing binary list // of reference information. if (hheaders->n_targets > 0) { if (hout->n_targets != hheaders->n_targets) { fprintf(stderr, "[bam_merge_core] number of @SQ headers in '%s' differs from number of target sequences\n", headers); if (!reg) return -1; } for (j = 0; j < hout->n_targets; ++j) if (strcmp(hout->target_name[j], hheaders->target_name[j]) != 0) { fprintf(stderr, "[bam_merge_core] @SQ header '%s' in '%s' differs from target sequence\n", hheaders->target_name[j], headers); if (!reg) return -1; } } swap_header_text(hout, hheaders); bam_header_destroy(hheaders); } if (reg) { int tid, beg, end; if (bam_parse_region(hout, reg, &tid, &beg, &end) < 0) { fprintf(stderr, "[%s] Malformated region string or undefined reference name\n", __func__); return -1; } for (i = 0; i < n; ++i) { bam_index_t *idx; idx = bam_index_load(fn[i]); iter[i] = bam_iter_query(idx, tid, beg, end); bam_index_destroy(idx); } } for (i = 0; i < n; ++i) { heap1_t *h = heap + i; h->i = i; h->b = (bam1_t*)calloc(1, sizeof(bam1_t)); if (bam_iter_read(fp[i], iter[i], h->b) >= 0) { h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)((int32_t)h->b->core.pos+1)<<1 | bam1_strand(h->b); h->idx = idx++; } else h->pos = HEAP_EMPTY; } if (flag & MERGE_UNCOMP) level = 0; else if (flag & MERGE_LEVEL1) level = 1; strcpy(mode, "w"); if (level >= 0) sprintf(mode + 1, "%d", level < 9? level : 9); if ((fpout = strcmp(out, "-")? bam_open(out, "w") : bam_dopen(fileno(stdout), "w")) == 0) { fprintf(stderr, "[%s] fail to create the output file.\n", __func__); return -1; } bam_header_write(fpout, hout); bam_header_destroy(hout); #ifndef _PBGZF_USE if (!(flag & MERGE_UNCOMP)) bgzf_mt(fpout, n_threads, 256); #endif ks_heapmake(heap, n, heap); while (heap->pos != HEAP_EMPTY) { bam1_t *b = heap->b; if (flag & MERGE_RG) { uint8_t *rg = bam_aux_get(b, "RG"); if (rg) bam_aux_del(b, rg); bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]); } bam_write1_core(fpout, &b->core, b->data_len, b->data); if ((j = bam_iter_read(fp[heap->i], iter[heap->i], b)) >= 0) { heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam1_strand(b); heap->idx = idx++; } else if (j == -1) { heap->pos = HEAP_EMPTY; free(heap->b->data); free(heap->b); heap->b = 0; } else fprintf(stderr, "[bam_merge_core] '%s' is truncated. Continue anyway.\n", fn[heap->i]); ks_heapadjust(heap, 0, n, heap); } if (flag & MERGE_RG) { for (i = 0; i != n; ++i) free(RG[i]); free(RG); free(RG_len); } for (i = 0; i != n; ++i) { bam_iter_destroy(iter[i]); bam_close(fp[i]); } bam_close(fpout); free(fp); free(heap); free(iter); return 0; }
samfile_t *samopen(const char *fn, const char *mode, const void *aux) { samfile_t *fp; fp = (samfile_t*)calloc(1, sizeof(samfile_t)); if (mode[0] == 'r') { // read fp->type |= TYPE_READ; if (mode[1] == 'b') { // binary fp->type |= TYPE_BAM; fp->x.bam = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r"); if (fp->x.bam == 0) goto open_err_ret; fp->header = bam_header_read(fp->x.bam); } else { // text fp->x.tamr = sam_open(fn); if (fp->x.tamr == 0) goto open_err_ret; fp->header = sam_header_read(fp->x.tamr); if (fp->header->n_targets == 0) { // no @SQ fields if (aux) { // check if aux is present bam_header_destroy(fp->header); fp->header = sam_header_read2((const char*)aux); } if (fp->header->n_targets == 0) fprintf(stderr, "[samopen] empty header.\n"); } else fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets); } } else if (mode[0] == 'w') { // write fp->header = bam_header_dup((const bam_header_t*)aux); if (mode[1] == 'b') { // binary fp->type |= TYPE_BAM; fp->x.bam = strcmp(fn, "-")? bam_open(fn, "w") : bam_dopen(fileno(stdout), "w"); if (fp->x.bam == 0) goto open_err_ret; bam_header_write(fp->x.bam, fp->header); } else { // text // open file fp->x.tamw = strcmp(fn, "-")? fopen(fn, "w") : stdout; if (fp->x.tamr == 0) goto open_err_ret; // write header if (strstr(mode, "h")) { int i; bam_header_t *alt; // parse the header text alt = bam_header_init(); alt->l_text = fp->header->l_text; alt->text = fp->header->text; sam_header_parse(alt); alt->l_text = 0; alt->text = 0; // check if there are @SQ lines in the header if (alt->n_targets) { // then write the header text without dumping ->target_{name,len} if (alt->n_targets != fp->header->n_targets) fprintf(stderr, "[samopen] inconsistent number of target sequences.\n"); fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw); } else { // then dump ->target_{name,len} for (i = 0; i < fp->header->n_targets; ++i) fprintf(fp->x.tamw, "@SQ\tSN:%s\tLN:%d\n", fp->header->target_name[i], fp->header->target_len[i]); } bam_header_destroy(alt); } } } return fp; open_err_ret: free(fp); return 0; }