// currently, this function ONLY works if each read has one hit void bam_mating_core(bamFile in, bamFile out) { bam_header_t *header; bam1_t *b[2]; int curr, has_prev, pre_end = 0, cur_end; kstring_t str; str.l = str.m = 0; str.s = 0; header = bam_header_read(in); bam_header_write(out, header); b[0] = bam_init1(); b[1] = bam_init1(); curr = 0; has_prev = 0; while (bam_read1(in, b[curr]) >= 0) { bam1_t *cur = b[curr], *pre = b[1-curr]; if (cur->core.tid < 0) continue; cur_end = bam_calend(&cur->core, bam1_cigar(cur)); if (cur_end > (int)header->target_len[cur->core.tid]) cur->core.flag |= BAM_FUNMAP; if (cur->core.flag & BAM_FSECONDARY) continue; // skip secondary alignments if (has_prev) { if (strcmp(bam1_qname(cur), bam1_qname(pre)) == 0) { // identical pair name cur->core.mtid = pre->core.tid; cur->core.mpos = pre->core.pos; pre->core.mtid = cur->core.tid; pre->core.mpos = cur->core.pos; if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP)) && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) // set TLEN/ISIZE { uint32_t cur5, pre5; cur5 = (cur->core.flag&BAM_FREVERSE)? cur_end : cur->core.pos; pre5 = (pre->core.flag&BAM_FREVERSE)? pre_end : pre->core.pos; cur->core.isize = pre5 - cur5; pre->core.isize = cur5 - pre5; } else cur->core.isize = pre->core.isize = 0; if (pre->core.flag&BAM_FREVERSE) cur->core.flag |= BAM_FMREVERSE; else cur->core.flag &= ~BAM_FMREVERSE; if (cur->core.flag&BAM_FREVERSE) pre->core.flag |= BAM_FMREVERSE; else pre->core.flag &= ~BAM_FMREVERSE; if (cur->core.flag & BAM_FUNMAP) { pre->core.flag |= BAM_FMUNMAP; pre->core.flag &= ~BAM_FPROPER_PAIR; } if (pre->core.flag & BAM_FUNMAP) { cur->core.flag |= BAM_FMUNMAP; cur->core.flag &= ~BAM_FPROPER_PAIR; } bam_template_cigar(pre, cur, &str); bam_write1(out, pre); bam_write1(out, cur); has_prev = 0; } else { // unpaired or singleton pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0; if (pre->core.flag & BAM_FPAIRED) { pre->core.flag |= BAM_FMUNMAP; pre->core.flag &= ~BAM_FMREVERSE & ~BAM_FPROPER_PAIR; } bam_write1(out, pre); } } else has_prev = 1; curr = 1 - curr; pre_end = cur_end; } if (has_prev) bam_write1(out, b[1-curr]); bam_header_destroy(header); bam_destroy1(b[0]); bam_destroy1(b[1]); free(str.s); }
int bam_reheader(BGZF *in, const bam_header_t *h, int fd) { BGZF *fp; bam_header_t *old; int len; uint8_t *buf; if (in->open_mode != 'r') return -1; buf = malloc(BUF_SIZE); old = bam_header_read(in); fp = bgzf_dopen(fd, "w"); bam_header_write(fp, h); if (in->block_offset < in->block_length) { bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset); bgzf_flush(fp); } #ifdef _USE_KNETFILE while ((len = knet_read(in->fp, buf, BUF_SIZE)) > 0) fwrite(buf, 1, len, fp->fp); #else while (!feof(in->fp) && (len = fread(buf, 1, BUF_SIZE, in->fp)) > 0) fwrite(buf, 1, len, fp->fp); #endif free(buf); fp->block_offset = in->block_offset = 0; bgzf_close(fp); return 0; }
samfile_t* samopen_in(quip_reader_t reader, void* reader_data, bool binary, void* aux) { samfile_t *fp; fp = (samfile_t*)calloc(1, sizeof(samfile_t)); fp->type |= TYPE_READ; if (binary) { // binary fp->type |= TYPE_BAM; fp->x.bam = bam_open_in(reader, reader_data); if (fp->x.bam == 0) goto open_err_ret; fp->header = bam_header_read(fp->x.bam); } else { // text fp->x.tamr = sam_open_in(reader, reader_data); if (fp->x.tamr == 0) goto open_err_ret; fp->header = sam_header_read(fp->x.tamr); if (fp->header->n_targets == 0) { // no @SQ fields if (aux) { // check if aux is present bam_header_t *textheader = fp->header; fp->header = sam_header_read2((const char*)aux); if (fp->header == 0) goto open_err_ret; append_header_text(fp->header, textheader->text, textheader->l_text); bam_header_destroy(textheader); } if (fp->header->n_targets == 0 && bam_verbose >= 1) fprintf(stderr, "[samopen] no @SQ lines in the header.\n"); } else if (bam_verbose >= 2) fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets); } return fp; open_err_ret: free(fp); return 0; }
int bam_flagstat(int argc, char *argv[]) { bamFile fp; bam_header_t *header; bam_flagstat_t *s; if (argc == optind) { fprintf(pysamerr, "Usage: samtools flagstat <in.bam>\n"); return 1; } fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r"); assert(fp); header = bam_header_read(fp); s = bam_flagstat_core(fp); printf("%lld + %lld in total (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]); printf("%lld + %lld duplicates\n", s->n_dup[0], s->n_dup[1]); printf("%lld + %lld mapped (%.2f%%:%.2f%%)\n", s->n_mapped[0], s->n_mapped[1], (float)s->n_mapped[0] / s->n_reads[0] * 100.0, (float)s->n_mapped[1] / s->n_reads[1] * 100.0); printf("%lld + %lld paired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]); printf("%lld + %lld read1\n", s->n_read1[0], s->n_read1[1]); printf("%lld + %lld read2\n", s->n_read2[0], s->n_read2[1]); printf("%lld + %lld properly paired (%.2f%%:%.2f%%)\n", s->n_pair_good[0], s->n_pair_good[1], (float)s->n_pair_good[0] / s->n_pair_all[0] * 100.0, (float)s->n_pair_good[1] / s->n_pair_all[1] * 100.0); printf("%lld + %lld with itself and mate mapped\n", s->n_pair_map[0], s->n_pair_map[1]); printf("%lld + %lld singletons (%.2f%%:%.2f%%)\n", s->n_sgltn[0], s->n_sgltn[1], (float)s->n_sgltn[0] / s->n_pair_all[0] * 100.0, (float)s->n_sgltn[1] / s->n_pair_all[1] * 100.0); printf("%lld + %lld with mate mapped to a different chr\n", s->n_diffchr[0], s->n_diffchr[1]); printf("%lld + %lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh[0], s->n_diffhigh[1]); free(s); bam_header_destroy(header); bam_close(fp); return 0; }
// read the header of a bam file SR_BamHeader* SR_BamInStreamLoadHeader(SR_BamInStream* pBamInStream) { bam_header_t* pOrigHeader = bam_header_read(pBamInStream->fpBamInput); if (pOrigHeader == NULL) return NULL; SR_BamHeader* pBamHeader = SR_BamHeaderAlloc(); pBamHeader->pOrigHeader = pOrigHeader; pBamHeader->pMD5s = (const char**) calloc(pOrigHeader->n_targets, sizeof(char*)); if (pBamHeader->pMD5s == NULL) SR_ErrQuit("ERROR: Not enough memory for md5 string"); unsigned int numMD5 = 0; for (const char* md5Pos = pOrigHeader->text; numMD5 <= pOrigHeader->n_targets && (md5Pos = strstr(md5Pos, "M5:")) != NULL; ++numMD5, ++md5Pos) { pBamHeader->pMD5s[numMD5] = md5Pos + 3; } if (numMD5 != pOrigHeader->n_targets) { free(pBamHeader->pMD5s); pBamHeader->pMD5s = NULL; if (numMD5 != 0) SR_ErrMsg("WARNING: Number of MD5 string is not consistent with number of chromosomes."); } return pBamHeader; }
/* check match between reference and bam files. prints an error * message and return non-zero on mismatch */ int checkref(char *fasta_file, char *bam_file) { int i = -1; bam_header_t *header; faidx_t *fai; char *ref; int ref_len = -1; bamFile bam_fp; if (! file_exists(fasta_file)) { LOG_FATAL("Fsata file %s does not exist. Exiting...\n", fasta_file); return 1; } if (0 != strcmp(bam_file, "-") && ! file_exists(bam_file)) { LOG_FATAL("BAM file %s does not exist. Exiting...\n", bam_file); return 1; } bam_fp = strcmp(bam_file, "-") == 0 ? bam_dopen(fileno(stdin), "r") : bam_open(bam_file, "r"); header = bam_header_read(bam_fp); if (!header) { LOG_FATAL("Failed to read BAM header from %s\n", bam_file); return 1; } fai = fai_load(fasta_file); if (!fai) { LOG_FATAL("Failed to fasta index for %s\n", fasta_file); return 1; } for (i=0; i < header->n_targets; i++) { LOG_DEBUG("BAM header target %d of %d: name=%s len=%d\n", i+1, header->n_targets, header->target_name[i], header->target_len[i]); ref = faidx_fetch_seq(fai, header->target_name[i], 0, 0x7fffffff, &ref_len); if (NULL == ref) { LOG_FATAL("Failed to fetch sequence %s from fasta file\n", header->target_name[i]); return -1; } if (header->target_len[i] != ref_len) { LOG_FATAL("Sequence length mismatch for sequence %s (%dbp in fasta; %dbp in bam)\n", header->target_name[i], header->target_len[i], ref_len); return -1; } free(ref); } fai_destroy(fai); bam_header_destroy(header); bam_close(bam_fp); return 0; }
int bam_pad2unpad(bamFile in, bamFile out) { bam_header_t *h; bam1_t *b; kstring_t r, q; uint32_t *cigar2 = 0; int n2 = 0, m2 = 0, *posmap = 0; h = bam_header_read(in); bam_header_write(out, h); b = bam_init1(); r.l = r.m = q.l = q.m = 0; r.s = q.s = 0; while (bam_read1(in, b) >= 0) { uint32_t *cigar = bam1_cigar(b); n2 = 0; if (b->core.pos == 0 && b->core.tid >= 0 && strcmp(bam1_qname(b), h->target_name[b->core.tid]) == 0) { int i, k; unpad_seq(b, &r); write_cigar(cigar2, n2, m2, bam_cigar_gen(b->core.l_qseq, BAM_CMATCH)); replace_cigar(b, n2, cigar2); posmap = realloc(posmap, r.m * sizeof(int)); for (i = k = 0; i < r.l; ++i) { posmap[i] = k; // note that a read should NOT start at a padding if (r.s[i]) ++k; } } else { int i, k, op; unpad_seq(b, &q); if (bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP) write_cigar(cigar2, n2, m2, cigar[0]); for (i = 0, k = b->core.pos; i < q.l; ++i, ++k) q.s[i] = q.s[i]? (r.s[k]? BAM_CMATCH : BAM_CINS) : (r.s[k]? BAM_CDEL : BAM_CPAD); for (i = k = 1, op = q.s[0]; i < q.l; ++i) { if (op != q.s[i]) { write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op)); op = q.s[i]; k = 1; } else ++k; } write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op)); if (bam_cigar_op(cigar[b->core.n_cigar-1]) == BAM_CSOFT_CLIP) write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-1]); for (i = 2; i < n2; ++i) if (bam_cigar_op(cigar2[i]) == BAM_CMATCH && bam_cigar_op(cigar2[i-1]) == BAM_CPAD && bam_cigar_op(cigar2[i-2]) == BAM_CMATCH) cigar2[i] += cigar2[i-2], cigar2[i-2] = cigar2[i-1] = 0; for (i = k = 0; i < n2; ++i) if (cigar2[i]) cigar2[k++] = cigar2[i]; n2 = k; replace_cigar(b, n2, cigar2); b->core.pos = posmap[b->core.pos]; } bam_write1(out, b); } free(r.s); free(q.s); free(posmap); bam_destroy1(b); bam_header_destroy(h); return 0; }
bwa_seqio_t *bwa_bam_open(const char *fn, int which) { bwa_seqio_t *bs; bam_header_t *h; bs = (bwa_seqio_t*) calloc(1, sizeof(bwa_seqio_t)); bs->is_bam = 1; bs->which = which; bs->fp = bam_open(fn, "r"); h = bam_header_read(bs->fp); bam_header_destroy(h); return bs; }
bam_header_t* bam_header_new(int specie, int assembly, char* file_path) { bamFile bam_header_file; bam_header_t* bam_header_p; if ((specie == HUMAN) && (assembly == NCBI37)) { bam_header_file = bam_open(file_path, "r"); bam_header_p = bam_header_read(bam_header_file); bam_close(bam_header_file); } return bam_header_p; }
int main(int argc, char** argv) { if(argc < 3) { printf("No input nor output files provided"); return -1; } bamFile in = bam_open(argv[1], "r"); bam_header_t* header; if (in == NULL) { printf("opening input file failed"); return -1; } bam1_t* b = bam_init1(); bamFile out = bam_open(argv[2], "w"); if (out == NULL) { printf("opening input file failed"); return -1; } header = bam_header_read(in); if(bam_header_write(out, header) < 0) { printf("writing header failed"); } long nextPrunedId; if(!scanf ("%lu", &nextPrunedId)) { printf("warning: no ids provided"); return -1; } long id = 0; while (bam_read1(in, b) >= 0) { // write BAM back if (nextPrunedId != id++) { bam_write1(out, b); } else { // fprintf(stderr, "pruning: id: %lu, pos: %d, length: %d\n", nextPrunedId, b->core.pos, b->core.l_qseq); if(!scanf ("%lu", &nextPrunedId)) { break; } } } // closing all resources bam_header_destroy(header); bam_close(in); bam_close(out); bam_destroy1(b); return 0; }
bwa_seqio_t *bwa_bam_open(const char *fn, int which) { bwa_seqio_t *bs; bam_header_t *h; bs = (bwa_seqio_t*)calloc(1, sizeof(bwa_seqio_t)); bs->is_bam = 1; bs->which = which; bs->fp = bam_open(fn, "r"); if (0 == bs->fp) err_fatal_simple("Couldn't open bam file"); h = bam_header_read(bs->fp); bam_header_destroy(h); return bs; }
// Construct a map linking tid's to sequence names (chromosome names). This code // assumes all input bam files have identical set of sequence (identical both in // name and order). std::map<int, std::string> get_sequence_name_dictionary(ControlState& state) { std::map<int, std::string> dict; std::vector<bam_info>::iterator bam_info_iter; for (bam_info_iter = state.bams_to_parse.begin(); bam_info_iter != state.bams_to_parse.end(); ++bam_info_iter) { bamFile fp = bam_open((*bam_info_iter).BamFile.c_str(), "r"); bam_header_t *header = bam_header_read(fp); bam_init_header_hash(header); for (int tid = 0; tid < header->n_targets; tid++) { dict.insert(std::make_pair(tid, header->target_name[tid])); } break; // Skip other bam files, the sequences should be identical. } return dict; }
static int load_discordant_reads(MEI_data& mei_data, std::vector<bam_info>& bam_sources, const std::string& chr_name, const SearchWindow& window, UserDefinedSettings* userSettings) { // Loop over associated bam files. for (size_t i = 0; i < bam_sources.size(); i++) { // Locate file. bam_info source = bam_sources.at(i); LOG_DEBUG(*logStream << time_log() << "Loading discordant reads from " << source.BamFile << std::endl); // Setup link to bamfile, its index and header. bamFile fp = bam_open(source.BamFile.c_str(), "r"); bam_index_t *idx = bam_index_load(source.BamFile.c_str()); if (idx == NULL) { LOG_WARN(*logStream << time_log() << "Failed to load index for " << source.BamFile.c_str() << std::endl); LOG_WARN(*logStream << "Skipping window: " << chr_name << ", " << window.getStart() << "--" << window.getEnd() << " for BAM-file: " << source.BamFile.c_str() << std::endl); continue; } bam_header_t *header = bam_header_read(fp); bam_init_header_hash(header); int tid = bam_get_tid(header, chr_name.c_str()); if (tid < 0) { LOG_WARN(*logStream << time_log() << "Could not find sequence in alignment file: '" << chr_name << "'" << std::endl); LOG_WARN(*logStream << "Skipping window: " << chr_name << ", " << window.getStart() << "--" << window.getEnd() << " for BAM-file: " << source.BamFile.c_str() << std::endl); continue; } mei_data.sample_names = get_sample_dictionary(header); // Save insert size of current bamfile in data object provided for callback function. // Note: the insert size should ideally be separate from the MEI_data object, tried to do // this using a std::pair object, which did not work. Suggestions are welcome here. mei_data.current_insert_size = source.InsertSize; mei_data.current_chr_name = chr_name; // Set up environment variable for callback function. std::pair<MEI_data*, UserDefinedSettings*> env = std::make_pair(&mei_data, userSettings); // Load discordant reads into mei_data. bam_fetch(fp, idx, tid, window.getStart(), window.getEnd(), &env, fetch_disc_read_callback); bam_index_destroy(idx); } return 0; }
bwa_seqio_t *bwa_bam_open(const char *fn, int which, char **saif, gap_opt_t *o0, bam_header_t **hh) { int c, b=0; bwa_seqio_t *bs; bam_header_t *h; bs = (bwa_seqio_t*)calloc(1, sizeof(bwa_seqio_t)); bs->is_bam = 1; bs->which = which; bs->fp = (fn[0]!='-' || fn[1]) ? bam_open(fn, "r") : bam_dopen(0, "r") ; h = bam_header_read(bs->fp); if(hh) *hh=h; else bam_header_destroy(h); if( saif ) for(c=0; c!=3; ++c) { gap_opt_t opt; if( saif[c] ) { bs->sai[c] = xopen(saif[c], "r"); if( 1 > fread(&opt, sizeof(gap_opt_t), 1, bs->sai[c]) ) { fclose(bs->sai[c]); bs->sai[c] = 0; } opt.n_threads=o0->n_threads; if(o0) { if(b) { opt.mode=o0->mode; if( memcmp(o0, &opt, sizeof(gap_opt_t)) ) { fprintf( stderr, "[bwa_bam_open] options from sai file \"%s\" conflict with others.\n", saif[c] ) ; exit(1); } fprintf( stderr, "[bwa_bam_open] options from sai file \"%s\" match.\n", saif[c] ) ; } else { fprintf( stderr, "[bwa_bam_open] recovered options from sai file \"%s\".\n", saif[c] ) ; memcpy(o0, &opt, sizeof(gap_opt_t)); b=1; } } } } return bs; }
int bam_reheader(BGZF *in, const bam_header_t *h, int fd) { BGZF *fp; bam_header_t *old; ssize_t len; uint8_t *buf; if (in->is_write) return -1; buf = malloc(BUF_SIZE); old = bam_header_read(in); fp = bgzf_fdopen(fd, "w"); bam_header_write(fp, h); if (in->block_offset < in->block_length) { bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset); bgzf_flush(fp); } while ((len = bgzf_raw_read(in, buf, BUF_SIZE)) > 0) bgzf_raw_write(fp, buf, len); free(buf); fp->block_offset = in->block_offset = 0; bgzf_close(fp); return 0; }
void OpenBamFile(BamReaderData * data, char * filename) { // Allocate space data->data = (mplp_aux_t *) calloc(1, sizeof(mplp_aux_t)); // read the header and initialize data if (strcmp(filename, "-")) data->data->fp = bam_open(filename, "r"); else data->data->fp = bam_dopen(fileno(stdin), "r"); data->data->conf = data->conf; data->data->h = bam_header_read(data->data->fp); // Load index data->idx = bam_index_load(filename); if (data->idx == 0) { fprintf(stderr, "[%s] fail to load index for input.\n", __func__); exit(1); } // Start reading data->ref_tid = -1; seekRegion(data); }
bwa_seqio_t *bwa_bam_open(const char *fn, int which) { bwa_seqio_t *bs; #ifndef USE_HTSLIB bam_header_t *h; #endif bs = (bwa_seqio_t*)calloc(1, sizeof(bwa_seqio_t)); bs->is_bam = 1; bs->which = which; #ifdef USE_HTSLIB bs->fp = sam_open(fn, "rb"); #else bs->fp = bam_open(fn, "r"); #endif if (0 == bs->fp) err_fatal_simple("Couldn't open bam file"); #ifdef USE_HTSLIB bs->h = sam_hdr_read(bs->fp); #else h = bam_header_read(bs->fp); bam_header_destroy(h); #endif return bs; }
static int mpileup(mplp_conf_t *conf, int n, char **fn) { extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list); extern void bcf_call_del_rghash(void *rghash); mplp_aux_t **data; int i, tid, pos, *n_plp, tid0 = -1, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid = -1, max_depth, max_indel_depth; const bam_pileup1_t **plp; bam_mplp_t iter; bam_header_t *h = 0; char *ref; void *rghash = 0; bcf_callaux_t *bca = 0; bcf_callret1_t *bcr = 0; bcf_call_t bc; bcf_t *bp = 0; bcf_hdr_t *bh = 0; bam_sample_t *sm = 0; kstring_t buf; mplp_pileup_t gplp; memset(&gplp, 0, sizeof(mplp_pileup_t)); memset(&buf, 0, sizeof(kstring_t)); memset(&bc, 0, sizeof(bcf_call_t)); data = calloc(n, sizeof(void*)); plp = calloc(n, sizeof(void*)); n_plp = calloc(n, sizeof(int*)); sm = bam_smpl_init(); // read the header and initialize data for (i = 0; i < n; ++i) { bam_header_t *h_tmp; data[i] = calloc(1, sizeof(mplp_aux_t)); data[i]->fp = strcmp(fn[i], "-") == 0? bam_dopen(fileno(stdin), "r") : bam_open(fn[i], "r"); data[i]->conf = conf; h_tmp = bam_header_read(data[i]->fp); data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text); rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list); if (conf->reg) { int beg, end; bam_index_t *idx; idx = bam_index_load(fn[i]); if (idx == 0) { fprintf(stderr, "[%s] fail to load index for %d-th input.\n", __func__, i+1); exit(1); } if (bam_parse_region(h_tmp, conf->reg, &tid, &beg, &end) < 0) { fprintf(stderr, "[%s] malformatted region or wrong seqname for %d-th input.\n", __func__, i+1); exit(1); } if (i == 0) tid0 = tid, beg0 = beg, end0 = end; data[i]->iter = bam_iter_query(idx, tid, beg, end); bam_index_destroy(idx); } if (i == 0) h = h_tmp; else { // FIXME: to check consistency bam_header_destroy(h_tmp); } } gplp.n = sm->n; gplp.n_plp = calloc(sm->n, sizeof(int)); gplp.m_plp = calloc(sm->n, sizeof(int)); gplp.plp = calloc(sm->n, sizeof(void*)); fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n); // write the VCF header if (conf->flag & MPLP_GLF) { kstring_t s; bh = calloc(1, sizeof(bcf_hdr_t)); s.l = s.m = 0; s.s = 0; bp = bcf_open("-", (conf->flag&MPLP_NO_COMP)? "wu" : "w"); for (i = 0; i < h->n_targets; ++i) { kputs(h->target_name[i], &s); kputc('\0', &s); } bh->l_nm = s.l; bh->name = malloc(s.l); memcpy(bh->name, s.s, s.l); s.l = 0; for (i = 0; i < sm->n; ++i) { kputs(sm->smpl[i], &s); kputc('\0', &s); } bh->l_smpl = s.l; bh->sname = malloc(s.l); memcpy(bh->sname, s.s, s.l); bh->txt = malloc(strlen(BAM_VERSION) + 64); bh->l_txt = 1 + sprintf(bh->txt, "##samtoolsVersion=%s\n", BAM_VERSION); free(s.s); bcf_hdr_sync(bh); bcf_hdr_write(bp, bh); bca = bcf_call_init(-1., conf->min_baseQ); bcr = calloc(sm->n, sizeof(bcf_callret1_t)); bca->rghash = rghash; bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ; bca->min_frac = conf->min_frac; bca->min_support = conf->min_support; } if (tid0 >= 0 && conf->fai) { // region is set ref = faidx_fetch_seq(conf->fai, h->target_name[tid0], 0, 0x7fffffff, &ref_len); ref_tid = tid0; for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid0; } else ref_tid = -1, ref = 0; iter = bam_mplp_init(n, mplp_func, (void**)data); max_depth = conf->max_depth; if (max_depth * sm->n > 1<<20) fprintf(stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__); if (max_depth * sm->n < 8000) { max_depth = 8000 / sm->n; fprintf(stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth); } max_indel_depth = conf->max_indel_depth * sm->n; bam_mplp_set_maxcnt(iter, max_depth); while (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) { if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue; if (tid != ref_tid) { free(ref); ref = 0; if (conf->fai) ref = faidx_fetch_seq(conf->fai, h->target_name[tid], 0, 0x7fffffff, &ref_len); for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid; ref_tid = tid; } if (conf->flag & MPLP_GLF) { int total_depth, _ref0, ref16; bcf1_t *b = calloc(1, sizeof(bcf1_t)); for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i]; group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG); _ref0 = (ref && pos < ref_len)? ref[pos] : 'N'; ref16 = bam_nt16_table[_ref0]; for (i = 0; i < gplp.n; ++i) bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i); bcf_call_combine(gplp.n, bcr, ref16, &bc); bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0, (conf->flag&MPLP_FMT_SP), 0, 0); bcf_write(bp, bh, b); bcf_destroy(b); // call indels if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) { for (i = 0; i < gplp.n; ++i) bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i); if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) { b = calloc(1, sizeof(bcf1_t)); bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0, (conf->flag&MPLP_FMT_SP), bca, ref); bcf_write(bp, bh, b); bcf_destroy(b); } } } else { printf("%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N'); for (i = 0; i < n; ++i) { int j; printf("\t%d\t", n_plp[i]); if (n_plp[i] == 0) { printf("*\t*"); // FIXME: printf() is very slow... if (conf->flag & MPLP_PRINT_POS) printf("\t*"); } else { for (j = 0; j < n_plp[i]; ++j) pileup_seq(plp[i] + j, pos, ref_len, ref); putchar('\t'); for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; int c = bam1_qual(p->b)[p->qpos] + 33; if (c > 126) c = 126; putchar(c); } if (conf->flag & MPLP_PRINT_MAPQ) { putchar('\t'); for (j = 0; j < n_plp[i]; ++j) { int c = plp[i][j].b->core.qual + 33; if (c > 126) c = 126; putchar(c); } } if (conf->flag & MPLP_PRINT_POS) { putchar('\t'); for (j = 0; j < n_plp[i]; ++j) { if (j > 0) putchar(','); printf("%d", plp[i][j].qpos + 1); // FIXME: printf() is very slow... } } } } putchar('\n'); } } bcf_close(bp); bam_smpl_destroy(sm); free(buf.s); for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]); free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp); bcf_call_del_rghash(rghash); bcf_hdr_destroy(bh); bcf_call_destroy(bca); free(bc.PL); free(bcr); bam_mplp_destroy(iter); bam_header_destroy(h); for (i = 0; i < n; ++i) { bam_close(data[i]->fp); if (data[i]->iter) bam_iter_destroy(data[i]->iter); free(data[i]); } free(data); free(plp); free(ref); free(n_plp); return 0; }
samfile_t *samopen(const char *fn, const char *mode, const void *aux) { samfile_t *fp; fp = (samfile_t*)calloc(1, sizeof(samfile_t)); if (strchr(mode, 'r')) { // read fp->type |= TYPE_READ; if (strchr(mode, 'b')) { // binary fp->type |= TYPE_BAM; fp->x.bam = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r"); if (fp->x.bam == 0) goto open_err_ret; fp->header = bam_header_read(fp->x.bam); } else { // text fp->x.tamr = sam_open(fn); if (fp->x.tamr == 0) goto open_err_ret; fp->header = sam_header_read(fp->x.tamr); if (fp->header->n_targets == 0) { // no @SQ fields if (aux) { // check if aux is present bam_header_t *textheader = fp->header; fp->header = sam_header_read2((const char*)aux); if (fp->header == 0) goto open_err_ret; append_header_text(fp->header, textheader->text, textheader->l_text); bam_header_destroy(textheader); } if (fp->header->n_targets == 0 && bam_verbose >= 1) fprintf(stderr, "[samopen] no @SQ lines in the header.\n"); } //else if (bam_verbose >= 2) fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets); } } else if (strchr(mode, 'w')) { // write fp->header = bam_header_dup((const bam_header_t*)aux); if (strchr(mode, 'b')) { // binary char bmode[3]; int i, compress_level = -1; for (i = 0; mode[i]; ++i) if (mode[i] >= '0' && mode[i] <= '9') break; if (mode[i]) compress_level = mode[i] - '0'; if (strchr(mode, 'u')) compress_level = 0; bmode[0] = 'w'; bmode[1] = compress_level < 0? 0 : compress_level + '0'; bmode[2] = 0; fp->type |= TYPE_BAM; fp->x.bam = strcmp(fn, "-")? bam_open(fn, bmode) : bam_dopen(fileno(stdout), bmode); if (fp->x.bam == 0) goto open_err_ret; bam_header_write(fp->x.bam, fp->header); } else { // text // open file fp->x.tamw = strcmp(fn, "-")? fopen(fn, "w") : stdout; if (fp->x.tamw == 0) goto open_err_ret; if (strchr(mode, 'X')) fp->type |= BAM_OFSTR<<2; else if (strchr(mode, 'x')) fp->type |= BAM_OFHEX<<2; else fp->type |= BAM_OFDEC<<2; // write header if (strchr(mode, 'h')) { int i; bam_header_t *alt; // parse the header text alt = bam_header_init(); alt->l_text = fp->header->l_text; alt->text = fp->header->text; sam_header_parse(alt); alt->l_text = 0; alt->text = 0; // check if there are @SQ lines in the header fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw); // FIXME: better to skip the trailing NULL if (alt->n_targets) { // then write the header text without dumping ->target_{name,len} if (alt->n_targets != fp->header->n_targets && bam_verbose >= 1) fprintf(stderr, "[samopen] inconsistent number of target sequences. Output the text header.\n"); } else { // then dump ->target_{name,len} for (i = 0; i < fp->header->n_targets; ++i) fprintf(fp->x.tamw, "@SQ\tSN:%s\tLN:%d\n", fp->header->target_name[i], fp->header->target_len[i]); } bam_header_destroy(alt); } } } return fp; open_err_ret: free(fp); return 0; }
int main(int argc, char *argv[]) { bamFile in; sqlite3 * db; sqlite3_stmt * stmt; char * sErrMsg = NULL; char * tail = 0; int nRetCode; char sSQL [BUFFER_SIZE] = "\0"; char database[BUFFER_SIZE]; clock_t startClock,startClock2; if (argc != 2) { fprintf(stderr, "Usage: bamRindex <in.bam>\n"); return 1; } // Open file and exit if error //in = strcmp(argv[1], "-")? bam_open(argv[1], "rb") : bam_dopen(fileno(stdin), "rb"); //fprintf(stderr,"Options ok\n"); in = bam_open(argv[1], "rb"); if (in == 0 ) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[1]); return 1; } //fprintf(stderr,"BAM opened\n"); assert(strcpy(database,argv[1])!=NULL); assert(strcat(database,".ridx")!=NULL); remove(database); // *********** // Read header bam_header_t *header; header = bam_header_read(in); // sorted by name? // Should not rely on the value in SO bam1_t *aln=bam_init1(); unsigned long num_alns=0; /*********************************************/ /* Open the Database and create the Schema */ // TODO: check the errors sqlite3_open(database, &db); sqlite3_exec(db, TABLE, NULL, NULL, &sErrMsg); // create the table SQLITE_CHECK_ERROR(); startClock = clock(); sqlite3_exec(db, "PRAGMA synchronous = 0;", NULL, NULL, &sErrMsg); SQLITE_CHECK_ERROR(); sqlite3_exec(db, "PRAGMA journal_mode = OFF;", NULL, NULL, &sErrMsg); SQLITE_CHECK_ERROR(); // Use up to 8GB of memory sqlite3_exec(db, "PRAGMA cache_size = -8000000;", NULL, NULL, &sErrMsg); SQLITE_CHECK_ERROR(); sqlite3_exec(db, "BEGIN TRANSACTION;", NULL, NULL, &sErrMsg); SQLITE_CHECK_ERROR(); while(bam_read1(in,aln)>=0) { // read alignment //aln->core.tid < 0 ? uint8_t *nh = bam_aux_get(aln, "NH"); uint8_t *nm = bam_aux_get(aln, "NM"); uint8_t *xs = bam_aux_get(aln, "XS"); BOOLEAN isPrimary; BOOLEAN isMapped; BOOLEAN notMapped; BOOLEAN isDuplicate; BOOLEAN isNotPassingQualityControls; BOOLEAN isPaired; BOOLEAN isSecondMateRead,isProperPair; //secondary alignment notMapped=(aln->core.flag & BAM_FUNMAP) ? TRUE: FALSE; //notMapped=((aln->core.flag & BAM_FUNMAP) || (aln->core.mtid ==0)) ? TRUE: FALSE; isMapped=!notMapped; isPrimary= (aln->core.flag & BAM_FSECONDARY) ? FALSE:TRUE; isProperPair=(aln->core.flag & BAM_FPROPER_PAIR) ? TRUE:FALSE; isPaired=(aln->core.flag & BAM_FPAIRED ) ? TRUE:FALSE; isSecondMateRead=(aln->core.flag & BAM_FREAD2 ) ? TRUE: FALSE; isNotPassingQualityControls=(aln->core.flag & BAM_FQCFAIL ) ? TRUE:FALSE; isDuplicate=(aln->core.flag & BAM_FDUP) ? TRUE: FALSE; BOOLEAN isSpliced=FALSE; BOOLEAN hasSimpleCigar=TRUE; int nSpliced=0; int i; if (aln->core.n_cigar != 0) { for (i = 0; i < aln->core.n_cigar; ++i) { char l="MIDNSHP=X"[bam1_cigar(aln)[i]&BAM_CIGAR_MASK]; //fprintf(stderr,"%c",l); if ( l == 'N' ) { isSpliced=TRUE; hasSimpleCigar=FALSE;++nSpliced;} if ( l != 'M' && l!='=' ) { hasSimpleCigar=FALSE;} } } //fprintf(stderr,"read %ld\n",num_alns); // isDuplicate,isNotPassingQualityControls, // isSpliced,isPAired,isPrimary,hasSimpleCigar,isSecondMateRead,isProperPair,nh,nm,qual/mapq,xs sprintf(sSQL,"INSERT into bam_index values (%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,'%c')", isDuplicate,isNotPassingQualityControls, nSpliced,isPaired,isPrimary,isMapped,hasSimpleCigar,isSecondMateRead,isProperPair, (nh==0?0:bam_aux2i(nh)),(nm==0?0:bam_aux2i(nm)), aln->core.qual, (xs==0?' ':(bam_aux2A(xs)==0?' ':bam_aux2A(xs)))); sqlite3_exec(db, sSQL, NULL, NULL, &sErrMsg); SQLITE_CHECK_ERROR(); ++num_alns; PRINT_ALNS_PROCESSED(num_alns); } bam_close(in); sqlite3_exec(db, "END TRANSACTION;", NULL, NULL, &sErrMsg); SQLITE_CHECK_ERROR(); printf("\nImported %d records in %4.2f seconds\n", num_alns, ( (double) (clock() - startClock))/CLOCKS_PER_SEC); // Create the indexes startClock2 = clock(); // generating the indexes does not pay off //sqlite3_exec(db, INDEXES, NULL, NULL, &sErrMsg); //printf("Indexed %d records in %4.2f seconds\n", num_alns, ( (double) (clock() - startClock2))/CLOCKS_PER_SEC); printf("Total time: %4.2f seconds\n", ((double)(clock() - startClock))/CLOCKS_PER_SEC); sqlite3_close(db); return 0; }
int main_depth(int argc, char *argv[]) #endif { int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0; const bam_pileup1_t **plp; char *reg = 0; // specified region void *bed = 0; // BED data structure bam_header_t *h = 0; // BAM header of the 1st input aux_t **data; bam_mplp_t mplp; // parse the command line while ((n = getopt(argc, argv, "r:b:q:Q:")) >= 0) { switch (n) { case 'r': reg = strdup(optarg); break; // parsing a region requires a BAM header case 'b': bed = bed_read(optarg); break; // BED or position list file can be parsed now case 'q': baseQ = atoi(optarg); break; // base quality threshold case 'Q': mapQ = atoi(optarg); break; // mapping quality threshold } } if (optind == argc) { fprintf(stderr, "Usage: bam2depth [-r reg] [-q baseQthres] [-Q mapQthres] [-b in.bed] <in1.bam> [...]\n"); return 1; } // initialize the auxiliary data structures n = argc - optind; // the number of BAMs on the command line data = (aux_t **) calloc(n, sizeof(void*)); // data[i] for the i-th input beg = 0; end = 1<<30; tid = -1; // set the default region for (i = 0; i < n; ++i) { bam_header_t *htmp; data[i] = (aux_t *) calloc(1, sizeof(aux_t)); data[i]->fp = bam_open(argv[optind+i], "r"); // open BAM data[i]->min_mapQ = mapQ; // set the mapQ filter htmp = bam_header_read(data[i]->fp); // read the BAM header if (i == 0) { h = htmp; // keep the header of the 1st BAM if (reg) bam_parse_region(h, reg, &tid, &beg, &end); // also parse the region } else bam_header_destroy(htmp); // if not the 1st BAM, trash the header if (tid >= 0) { // if a region is specified and parsed successfully bam_index_t *idx = bam_index_load(argv[optind+i]); // load the index data[i]->iter = bam_iter_query(idx, tid, beg, end); // set the iterator bam_index_destroy(idx); // the index is not needed any more; phase out of the memory } } // the core multi-pileup loop mplp = bam_mplp_init(n, read_bam, (void**)data); // initialization n_plp = (int*) calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM plp = (bam_pileup1_t **) calloc(n, sizeof(void*)); // plp[i] points to the array of covering reads (internal in mplp) while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) { // come to the next covered position if (pos < beg || pos >= end) continue; // out of range; skip if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; // not in BED; skip fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); // a customized printf() would be faster for (i = 0; i < n; ++i) { // base level filters have to go here int j, m = 0; for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; // DON'T modfity plp[][] unless you really know if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos else if (bam1_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality } printf("\t%d", n_plp[i] - m); // this the depth to output } putchar('\n'); } free(n_plp); free(plp); bam_mplp_destroy(mplp); bam_header_destroy(h); for (i = 0; i < n; ++i) { bam_close(data[i]->fp); if (data[i]->iter) bam_iter_destroy(data[i]->iter); free(data[i]); } free(data); free(reg); if (bed) bed_destroy(bed); return 0; }
int main_depth(int argc, char *argv[]) #endif { int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0, nfiles; const bam_pileup1_t **plp; char *reg = 0; // specified region void *bed = 0; // BED data structure char *file_list = NULL, **fn = NULL; bam_header_t *h = 0; // BAM header of the 1st input aux_t **data; bam_mplp_t mplp; // parse the command line while ((n = getopt(argc, argv, "r:b:q:Q:l:f:")) >= 0) { switch (n) { case 'l': min_len = atoi(optarg); break; // minimum query length case 'r': reg = strdup(optarg); break; // parsing a region requires a BAM header case 'b': bed = bed_read(optarg); break; // BED or position list file can be parsed now case 'q': baseQ = atoi(optarg); break; // base quality threshold case 'Q': mapQ = atoi(optarg); break; // mapping quality threshold case 'f': file_list = optarg; break; } } if (optind == argc && !file_list) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: samtools depth [options] in1.bam [in2.bam [...]]\n"); fprintf(stderr, "Options:\n"); fprintf(stderr, " -b <bed> list of positions or regions\n"); fprintf(stderr, " -f <list> list of input BAM filenames, one per line [null]\n"); fprintf(stderr, " -l <int> minQLen\n"); fprintf(stderr, " -q <int> base quality threshold\n"); fprintf(stderr, " -Q <int> mapping quality threshold\n"); fprintf(stderr, " -r <chr:from-to> region\n"); fprintf(stderr, "\n"); return 1; } // initialize the auxiliary data structures if (file_list) { if ( read_file_list(file_list,&nfiles,&fn) ) return 1; n = nfiles; argv = fn; optind = 0; } else n = argc - optind; // the number of BAMs on the command line data = calloc(n, sizeof(void*)); // data[i] for the i-th input beg = 0; end = 1<<30; tid = -1; // set the default region for (i = 0; i < n; ++i) { bam_header_t *htmp; data[i] = calloc(1, sizeof(aux_t)); data[i]->fp = bam_open(argv[optind+i], "r"); // open BAM data[i]->min_mapQ = mapQ; // set the mapQ filter data[i]->min_len = min_len; // set the qlen filter htmp = bam_header_read(data[i]->fp); // read the BAM header if (i == 0) { h = htmp; // keep the header of the 1st BAM if (reg) bam_parse_region(h, reg, &tid, &beg, &end); // also parse the region } else bam_header_destroy(htmp); // if not the 1st BAM, trash the header if (tid >= 0) { // if a region is specified and parsed successfully bam_index_t *idx = bam_index_load(argv[optind+i]); // load the index data[i]->iter = bam_iter_query(idx, tid, beg, end); // set the iterator bam_index_destroy(idx); // the index is not needed any more; phase out of the memory } } // the core multi-pileup loop mplp = bam_mplp_init(n, read_bam, (void**)data); // initialization bam_mplp_set_maxcnt(mplp,2147483647); // set max_depth to int max n_plp = calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM plp = calloc(n, sizeof(void*)); // plp[i] points to the array of covering reads (internal in mplp) while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) { // come to the next covered position if (pos < beg || pos >= end) continue; // out of range; skip if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; // not in BED; skip fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); // a customized printf() would be faster for (i = 0; i < n; ++i) { // base level filters have to go here int j, m = 0; for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; // DON'T modfity plp[][] unless you really know if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos else if (bam1_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality } printf("\t%d", n_plp[i] - m); // this the depth to output } putchar('\n'); } free(n_plp); free(plp); bam_mplp_destroy(mplp); bam_header_destroy(h); for (i = 0; i < n; ++i) { bam_close(data[i]->fp); if (data[i]->iter) bam_iter_destroy(data[i]->iter); free(data[i]); } free(data); free(reg); if (bed) bed_destroy(bed); if ( file_list ) { for (i=0; i<n; i++) free(fn[i]); free(fn); } return 0; }
int bam_cat(int nfn, char * const *fn, const bam_header_t *h, const char* outbam) { BGZF *fp; FILE* fp_file; uint8_t *buf; uint8_t ebuf[BGZF_EMPTY_BLOCK_SIZE]; const int es=BGZF_EMPTY_BLOCK_SIZE; int i; fp = strcmp(outbam, "-")? bgzf_open(outbam, "w") : bgzf_fdopen(_fileno(stdout), "w"); if (fp == 0) { fprintf(stderr, "[%s] ERROR: fail to open output file '%s'.\n", __FUNCTION__, outbam); return 1; } if (h) bam_header_write(fp, h); buf = (uint8_t*) malloc(BUF_SIZE); for(i = 0; i < nfn; ++i){ BGZF *in; bam_header_t *old; int len,j; in = strcmp(fn[i], "-")? bam_open(fn[i], "r") : bam_dopen(_fileno(stdin), "r"); if (in == 0) { fprintf(stderr, "[%s] ERROR: fail to open file '%s'.\n", __FUNCTION__, fn[i]); return -1; } if (in->open_mode != 'r') return -1; old = bam_header_read(in); if (h == 0 && i == 0) bam_header_write(fp, old); if (in->block_offset < in->block_length) { bgzf_write(fp, (uint8_t*)in->uncompressed_block + in->block_offset, in->block_length - in->block_offset); bgzf_flush(fp); } j=0; #ifdef _USE_KNETFILE fp_file=fp->x.fpw; while ((len = knet_read(in->x.fpr, buf, BUF_SIZE)) > 0) { #else fp_file=fp->file; while (!feof(in->file) && (len = fread(buf, 1, BUF_SIZE, in->file)) > 0) { #endif if(len<es){ int diff=es-len; if(j==0) { fprintf(stderr, "[%s] ERROR: truncated file?: '%s'.\n", __FUNCTION__, fn[i]); return -1; } fwrite(ebuf, 1, len, fp_file); memcpy(ebuf,ebuf+len,diff); memcpy(ebuf+diff,buf,len); } else { if(j!=0) fwrite(ebuf, 1, es, fp_file); len-= es; memcpy(ebuf,buf+len,es); fwrite(buf, 1, len, fp_file); } j=1; } /* check final gzip block */ { const uint8_t gzip1=ebuf[0]; const uint8_t gzip2=ebuf[1]; const uint32_t isize=*((uint32_t*)(ebuf+es-4)); if(((gzip1!=GZIPID1) || (gzip2!=GZIPID2)) || (isize!=0)) { fprintf(stderr, "[%s] WARNING: Unexpected block structure in file '%s'.", __FUNCTION__, fn[i]); fprintf(stderr, " Possible output corruption.\n"); fwrite(ebuf, 1, es, fp_file); } } bam_header_destroy(old); bgzf_close(in); } free(buf); bgzf_close(fp); return 0; } int main_cat(int argc, char *argv[]) { bam_header_t *h = 0; char *outfn = 0; int c, ret; while ((c = getopt(argc, argv, "h:o:")) >= 0) { switch (c) { case 'h': { tamFile fph = sam_open(optarg); if (fph == 0) { fprintf(stderr, "[%s] ERROR: fail to read the header from '%s'.\n", __FUNCTION__, argv[1]); return 1; } h = sam_header_read(fph); sam_close(fph); break; } case 'o': outfn = strdup(optarg); break; } } if (argc - optind < 2) { fprintf(stderr, "Usage: samtools cat [-h header.sam] [-o out.bam] <in1.bam> <in2.bam> [...]\n"); return 1; } ret = bam_cat(argc - optind, argv + optind, h, outfn? outfn : "-"); free(outfn); return ret; }
int bam_merge_core2(int by_qname, const char *out, const char *headers, int n, char * const *fn, int flag, const char *reg, int level) #endif { bamFile fpout, *fp; heap1_t *heap; bam_header_t *hout = 0; bam_header_t *hheaders = NULL; int i, j, *RG_len = 0; uint64_t idx = 0; char **RG = 0, mode[8]; bam_iter_t *iter = 0; if (headers) { tamFile fpheaders = sam_open(headers); if (fpheaders == 0) { const char *message = strerror(errno); fprintf(stderr, "[bam_merge_core] cannot open '%s': %s\n", headers, message); return -1; } hheaders = sam_header_read(fpheaders); sam_close(fpheaders); } g_is_by_qname = by_qname; fp = (bamFile*)calloc(n, sizeof(bamFile)); heap = (heap1_t*)calloc(n, sizeof(heap1_t)); iter = (bam_iter_t*)calloc(n, sizeof(bam_iter_t)); // prepare RG tag if (flag & MERGE_RG) { RG = (char**)calloc(n, sizeof(void*)); RG_len = (int*)calloc(n, sizeof(int)); for (i = 0; i != n; ++i) { int l = strlen(fn[i]); const char *s = fn[i]; if (l > 4 && strcmp(s + l - 4, ".bam") == 0) l -= 4; for (j = l - 1; j >= 0; --j) if (s[j] == '/') break; ++j; l -= j; RG[i] = calloc(l + 1, 1); RG_len[i] = l; strncpy(RG[i], s + j, l); } } // read the first for (i = 0; i != n; ++i) { bam_header_t *hin; fp[i] = bam_open(fn[i], "r"); if (fp[i] == 0) { int j; fprintf(stderr, "[bam_merge_core] fail to open file %s\n", fn[i]); for (j = 0; j < i; ++j) bam_close(fp[j]); free(fp); free(heap); // FIXME: possible memory leak return -1; } hin = bam_header_read(fp[i]); if (i == 0) { // the first BAM hout = hin; } else { // validate multiple baf int min_n_targets = hout->n_targets; if (hin->n_targets < min_n_targets) min_n_targets = hin->n_targets; for (j = 0; j < min_n_targets; ++j) if (strcmp(hout->target_name[j], hin->target_name[j]) != 0) { fprintf(stderr, "[bam_merge_core] different target sequence name: '%s' != '%s' in file '%s'\n", hout->target_name[j], hin->target_name[j], fn[i]); return -1; } // If this input file has additional target reference sequences, // add them to the headers to be output if (hin->n_targets > hout->n_targets) { swap_header_targets(hout, hin); // FIXME Possibly we should also create @SQ text headers // for the newly added reference sequences } bam_header_destroy(hin); } } if (hheaders) { // If the text headers to be swapped in include any @SQ headers, // check that they are consistent with the existing binary list // of reference information. if (hheaders->n_targets > 0) { if (hout->n_targets != hheaders->n_targets) { fprintf(stderr, "[bam_merge_core] number of @SQ headers in '%s' differs from number of target sequences\n", headers); if (!reg) return -1; } for (j = 0; j < hout->n_targets; ++j) if (strcmp(hout->target_name[j], hheaders->target_name[j]) != 0) { fprintf(stderr, "[bam_merge_core] @SQ header '%s' in '%s' differs from target sequence\n", hheaders->target_name[j], headers); if (!reg) return -1; } } swap_header_text(hout, hheaders); bam_header_destroy(hheaders); } if (reg) { int tid, beg, end; if (bam_parse_region(hout, reg, &tid, &beg, &end) < 0) { fprintf(stderr, "[%s] Malformated region string or undefined reference name\n", __func__); return -1; } for (i = 0; i < n; ++i) { bam_index_t *idx; idx = bam_index_load(fn[i]); iter[i] = bam_iter_query(idx, tid, beg, end); bam_index_destroy(idx); } } for (i = 0; i < n; ++i) { heap1_t *h = heap + i; h->i = i; h->b = (bam1_t*)calloc(1, sizeof(bam1_t)); if (bam_iter_read(fp[i], iter[i], h->b) >= 0) { h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)((int32_t)h->b->core.pos+1)<<1 | bam1_strand(h->b); h->idx = idx++; } else h->pos = HEAP_EMPTY; } if (flag & MERGE_UNCOMP) level = 0; else if (flag & MERGE_LEVEL1) level = 1; strcpy(mode, "w"); if (level >= 0) sprintf(mode + 1, "%d", level < 9? level : 9); if ((fpout = strcmp(out, "-")? bam_open(out, "w") : bam_dopen(fileno(stdout), "w")) == 0) { fprintf(stderr, "[%s] fail to create the output file.\n", __func__); return -1; } bam_header_write(fpout, hout); bam_header_destroy(hout); #ifndef _PBGZF_USE if (!(flag & MERGE_UNCOMP)) bgzf_mt(fpout, n_threads, 256); #endif ks_heapmake(heap, n, heap); while (heap->pos != HEAP_EMPTY) { bam1_t *b = heap->b; if (flag & MERGE_RG) { uint8_t *rg = bam_aux_get(b, "RG"); if (rg) bam_aux_del(b, rg); bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]); } bam_write1_core(fpout, &b->core, b->data_len, b->data); if ((j = bam_iter_read(fp[heap->i], iter[heap->i], b)) >= 0) { heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam1_strand(b); heap->idx = idx++; } else if (j == -1) { heap->pos = HEAP_EMPTY; free(heap->b->data); free(heap->b); heap->b = 0; } else fprintf(stderr, "[bam_merge_core] '%s' is truncated. Continue anyway.\n", fn[heap->i]); ks_heapadjust(heap, 0, n, heap); } if (flag & MERGE_RG) { for (i = 0; i != n; ++i) free(RG[i]); free(RG); free(RG_len); } for (i = 0; i != n; ++i) { bam_iter_destroy(iter[i]); bam_close(fp[i]); } bam_close(fpout); free(fp); free(heap); free(iter); return 0; }
/*! @abstract Sort an unsorted BAM file based on the chromosome order and the leftmost position of an alignment @param is_by_qname whether to sort by query name @param fn name of the file to be sorted @param prefix prefix of the output and the temporary files; upon sucessess, prefix.bam will be written. @param max_mem approxiate maximum memory (very inaccurate) @discussion It may create multiple temporary subalignment files and then merge them by calling bam_merge_core(). This function is NOT thread safe. */ void bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, size_t _max_mem, int is_stdout, int n_threads, int level, int sort_type) { int ret, i, n_files = 0; size_t mem, max_k, k, max_mem; bam_header_t *header; bamFile fp; bam1_t *b, **buf; char *fnout = 0; if (n_threads < 2) n_threads = 1; g_is_by_qname = is_by_qname; max_k = k = 0; mem = 0; max_mem = _max_mem * n_threads; buf = 0; fp = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r"); if (fp == 0) { fprintf(stderr, "[bam_sort_core] fail to open file %s\n", fn); return; } header = bam_header_read(fp); if (is_by_qname) change_SO(header, "queryname"); else change_SO(header, "coordinate"); // write sub files for (;;) { if (k == max_k) { size_t old_max = max_k; max_k = max_k? max_k<<1 : 0x10000; buf = realloc(buf, max_k * sizeof(void*)); memset(buf + old_max, 0, sizeof(void*) * (max_k - old_max)); } if (buf[k] == 0) buf[k] = (bam1_t*)calloc(1, sizeof(bam1_t)); b = buf[k]; if ((ret = bam_read1(fp, b)) < 0) break; if (b->data_len < b->m_data>>2) { // shrink b->m_data = b->data_len; kroundup32(b->m_data); b->data = realloc(b->data, b->m_data); } mem += sizeof(bam1_t) + b->m_data + sizeof(void*) + sizeof(void*); // two sizeof(void*) for the data allocated to pointer arrays ++k; if (mem >= max_mem) { n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads, sort_type); mem = k = 0; } } if (ret != -1) fprintf(stderr, "[bam_sort_core] truncated file. Continue anyway.\n"); // output file name fnout = calloc(strlen(prefix) + 20, 1); if (is_stdout) sprintf(fnout, "-"); else sprintf(fnout, "%s.bam", prefix); // write the final output if (n_files == 0) { // a single block char mode[8]; strcpy(mode, "w"); if (level >= 0) sprintf(mode + 1, "%d", level < 9? level : 9); sort_aux_core(k, buf, sort_type); #ifndef _PBGZF_USE write_buffer(fnout, mode, k, buf, header, n_threads); #else write_buffer(fnout, mode, k, buf, header); #endif } else { // then merge char **fns; n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads, sort_type); fprintf(stderr, "[bam_sort_core] merging from %d files...\n", n_files); fns = (char**)calloc(n_files, sizeof(char*)); for (i = 0; i < n_files; ++i) { fns[i] = (char*)calloc(strlen(prefix) + 20, 1); sprintf(fns[i], "%s.%.4d.bam", prefix, i); } #ifndef _PBGZF_USE bam_merge_core2(is_by_qname, fnout, 0, n_files, fns, 0, 0, n_threads, level); #else bam_merge_core2(is_by_qname, fnout, 0, n_files, fns, 0, 0, level); #endif for (i = 0; i < n_files; ++i) { unlink(fns[i]); free(fns[i]); } free(fns); } free(fnout); // free for (k = 0; k < max_k; ++k) { if (!buf[k]) continue; free(buf[k]->data); free(buf[k]); } free(buf); bam_header_destroy(header); bam_close(fp); }
samfile_t *samopen(const char *fn, const char *mode, const void *aux) { samfile_t *fp; fp = (samfile_t*)calloc(1, sizeof(samfile_t)); if (mode[0] == 'r') { // read fp->type |= TYPE_READ; if (mode[1] == 'b') { // binary fp->type |= TYPE_BAM; fp->x.bam = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r"); if (fp->x.bam == 0) goto open_err_ret; fp->header = bam_header_read(fp->x.bam); } else { // text fp->x.tamr = sam_open(fn); if (fp->x.tamr == 0) goto open_err_ret; fp->header = sam_header_read(fp->x.tamr); if (fp->header->n_targets == 0) { // no @SQ fields if (aux) { // check if aux is present bam_header_destroy(fp->header); fp->header = sam_header_read2((const char*)aux); } if (fp->header->n_targets == 0) fprintf(stderr, "[samopen] empty header.\n"); } else fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets); } } else if (mode[0] == 'w') { // write fp->header = bam_header_dup((const bam_header_t*)aux); if (mode[1] == 'b') { // binary fp->type |= TYPE_BAM; fp->x.bam = strcmp(fn, "-")? bam_open(fn, "w") : bam_dopen(fileno(stdout), "w"); if (fp->x.bam == 0) goto open_err_ret; bam_header_write(fp->x.bam, fp->header); } else { // text // open file fp->x.tamw = strcmp(fn, "-")? fopen(fn, "w") : stdout; if (fp->x.tamr == 0) goto open_err_ret; // write header if (strstr(mode, "h")) { int i; bam_header_t *alt; // parse the header text alt = bam_header_init(); alt->l_text = fp->header->l_text; alt->text = fp->header->text; sam_header_parse(alt); alt->l_text = 0; alt->text = 0; // check if there are @SQ lines in the header if (alt->n_targets) { // then write the header text without dumping ->target_{name,len} if (alt->n_targets != fp->header->n_targets) fprintf(stderr, "[samopen] inconsistent number of target sequences.\n"); fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw); } else { // then dump ->target_{name,len} for (i = 0; i < fp->header->n_targets; ++i) fprintf(fp->x.tamw, "@SQ\tSN:%s\tLN:%d\n", fp->header->target_name[i], fp->header->target_len[i]); } bam_header_destroy(alt); } } } return fp; open_err_ret: free(fp); return 0; }
/*! @abstract Sort an unsorted BAM file based on the chromosome order and the leftmost position of an alignment @param is_by_qname whether to sort by query name @param fn name of the file to be sorted @param prefix prefix of the output and the temporary files; upon sucessess, prefix.bam will be written. @param max_mem approxiate maximum memory (very inaccurate) @discussion It may create multiple temporary subalignment files and then merge them by calling bam_merge_core(). This function is NOT thread safe. */ void bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, size_t max_mem, int is_stdout) { int n, ret, k, i; size_t mem; bam_header_t *header; bamFile fp; bam1_t *b, **buf; g_is_by_qname = is_by_qname; n = k = 0; mem = 0; fp = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r"); if (fp == 0) { fprintf(stderr, "[bam_sort_core] fail to open file %s\n", fn); return; } header = bam_header_read(fp); if (is_by_qname) change_SO(header, "queryname"); else change_SO(header, "coordinate"); buf = (bam1_t**)calloc(max_mem / BAM_CORE_SIZE, sizeof(bam1_t*)); // write sub files for (;;) { if (buf[k] == 0) buf[k] = (bam1_t*)calloc(1, sizeof(bam1_t)); b = buf[k]; if ((ret = bam_read1(fp, b)) < 0) break; mem += ret; ++k; if (mem >= max_mem) { sort_blocks(n++, k, buf, prefix, header, 0); mem = 0; k = 0; } } if (ret != -1) fprintf(stderr, "[bam_sort_core] truncated file. Continue anyway.\n"); if (n == 0) sort_blocks(-1, k, buf, prefix, header, is_stdout); else { // then merge char **fns, *fnout; fprintf(stderr, "[bam_sort_core] merging from %d files...\n", n+1); sort_blocks(n++, k, buf, prefix, header, 0); fnout = (char*)calloc(strlen(prefix) + 20, 1); if (is_stdout) sprintf(fnout, "-"); else sprintf(fnout, "%s.bam", prefix); fns = (char**)calloc(n, sizeof(char*)); for (i = 0; i < n; ++i) { fns[i] = (char*)calloc(strlen(prefix) + 20, 1); sprintf(fns[i], "%s.%.4d.bam", prefix, i); } bam_merge_core(is_by_qname, fnout, 0, n, fns, 0, 0); free(fnout); for (i = 0; i < n; ++i) { unlink(fns[i]); free(fns[i]); } free(fns); } for (k = 0; (size_t)k < max_mem / BAM_CORE_SIZE; ++k) { if (buf[k]) { free(buf[k]->data); free(buf[k]); } } free(buf); bam_header_destroy(header); bam_close(fp); }
static int mpileup(mplp_conf_t *conf, int n, char **fn) { extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list); extern void bcf_call_del_rghash(void *rghash); mplp_aux_t **data; int i, tid, pos, *n_plp, tid0 = -1, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid = -1, max_depth, max_indel_depth; const bam_pileup1_t **plp; bam_mplp_t iter; bam_header_t *h = 0; char *ref; void *rghash = 0; bcf_callaux_t *bca = 0; bcf_callret1_t *bcr = 0; bcf_call_t bc; bcf_t *bp = 0; bcf_hdr_t *bh = 0; bam_sample_t *sm = 0; kstring_t buf; mplp_pileup_t gplp; memset(&gplp, 0, sizeof(mplp_pileup_t)); memset(&buf, 0, sizeof(kstring_t)); memset(&bc, 0, sizeof(bcf_call_t)); data = calloc(n, sizeof(void*)); plp = calloc(n, sizeof(void*)); n_plp = calloc(n, sizeof(int*)); sm = bam_smpl_init(); // read the header and initialize data for (i = 0; i < n; ++i) { bam_header_t *h_tmp; data[i] = calloc(1, sizeof(mplp_aux_t)); data[i]->fp = strcmp(fn[i], "-") == 0? bam_dopen(fileno(stdin), "r") : bam_open(fn[i], "r"); data[i]->conf = conf; h_tmp = bam_header_read(data[i]->fp); data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text); rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list); if (conf->reg) { int beg, end; bam_index_t *idx; idx = bam_index_load(fn[i]); if (idx == 0) { fprintf(stderr, "[%s] fail to load index for %d-th input.\n", __func__, i+1); exit(1); } if (bam_parse_region(h_tmp, conf->reg, &tid, &beg, &end) < 0) { fprintf(stderr, "[%s] malformatted region or wrong seqname for %d-th input.\n", __func__, i+1); exit(1); } if (i == 0) tid0 = tid, beg0 = beg, end0 = end; data[i]->iter = bam_iter_query(idx, tid, beg, end); bam_index_destroy(idx); } if (i == 0) h = h_tmp; else { // FIXME: to check consistency bam_header_destroy(h_tmp); } } gplp.n = sm->n; gplp.n_plp = calloc(sm->n, sizeof(int)); gplp.m_plp = calloc(sm->n, sizeof(int)); gplp.plp = calloc(sm->n, sizeof(void*)); fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n); // write the VCF header if (conf->flag & MPLP_GLF) { kstring_t s; bh = calloc(1, sizeof(bcf_hdr_t)); s.l = s.m = 0; s.s = 0; bp = bcf_open("-", (conf->flag&MPLP_NO_COMP)? "wu" : "w"); for (i = 0; i < h->n_targets; ++i) { kputs(h->target_name[i], &s); kputc('\0', &s); } bh->l_nm = s.l; bh->name = malloc(s.l); memcpy(bh->name, s.s, s.l); s.l = 0; for (i = 0; i < sm->n; ++i) { kputs(sm->smpl[i], &s); kputc('\0', &s); } bh->l_smpl = s.l; bh->sname = malloc(s.l); memcpy(bh->sname, s.s, s.l); bh->txt = malloc(strlen(BAM_VERSION) + 64); bh->l_txt = 1 + sprintf(bh->txt, "##samtoolsVersion=%s\n", BAM_VERSION); free(s.s); bcf_hdr_sync(bh); bcf_hdr_write(bp, bh); bca = bcf_call_init(-1., conf->min_baseQ); bcr = calloc(sm->n, sizeof(bcf_callret1_t)); bca->rghash = rghash; bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ; bca->min_frac = conf->min_frac; bca->min_support = conf->min_support; } if (tid0 >= 0 && conf->fai) { // region is set ref = faidx_fetch_seq(conf->fai, h->target_name[tid0], 0, 0x7fffffff, &ref_len); ref_tid = tid0; for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid0; } else ref_tid = -1, ref = 0; iter = bam_mplp_init(n, mplp_func, (void**)data); max_depth = conf->max_depth; if (max_depth * sm->n > 1<<20) fprintf(stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__); if (max_depth * sm->n < 8000) { max_depth = 8000 / sm->n; fprintf(stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth); } max_indel_depth = conf->max_indel_depth * sm->n; bam_mplp_set_maxcnt(iter, max_depth); int storeSize = 100; int delStore[2][100] = {{0},{0}}; typedef char * mstring; while (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) { if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue; if (tid != ref_tid) { free(ref); ref = 0; if (conf->fai) ref = faidx_fetch_seq(conf->fai, h->target_name[tid], 0, 0x7fffffff, &ref_len); for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid; ref_tid = tid; } if (conf->flag & MPLP_GLF) { int total_depth, _ref0, ref16; bcf1_t *b = calloc(1, sizeof(bcf1_t)); for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i]; group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG); _ref0 = (ref && pos < ref_len)? ref[pos] : 'N'; ref16 = bam_nt16_table[_ref0]; for (i = 0; i < gplp.n; ++i) bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i); bcf_call_combine(gplp.n, bcr, ref16, &bc); bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0, (conf->flag&MPLP_FMT_SP), 0, 0); bcf_write(bp, bh, b); bcf_destroy(b); // call indels if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) { for (i = 0; i < gplp.n; ++i) bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i); if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) { b = calloc(1, sizeof(bcf1_t)); bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0, (conf->flag&MPLP_FMT_SP), bca, ref); bcf_write(bp, bh, b); bcf_destroy(b); } } } else { printf("%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N'); for (i = 0; i < n; ++i) { int j; printf("\t%d\t", n_plp[i]); if (n_plp[i] == 0) { printf("*\t*"); // FIXME: printf() is very slow... if (conf->flag & MPLP_PRINT_POS) printf("\t*"); } else { //MDW start //for each position in the pileup column int charLen = 16; int countChars[ charLen ][2]; int countiChars[ charLen ][2]; int countGap[2]={0,0}; //double qvTotal=0; int numStruck=0; int numGood=0; int tti; int ttj; mstring insAllele[100]; int insAlleleCnt[100]; int sf=0; int flag=0; //typedef char * string; char insStr0[10000]; int iCnt0=0; char insStr1[10000]; int iCnt1=0; char delStr0[10000]; int dCnt0=0; char delStr1[10000]; int dCnt1=0; float qposP[10000]; int qposCnt=0; //initialize with zeros for(tti=0;tti<charLen;tti++){ countChars[tti][0]=0; countChars[tti][1]=0; } // define repeat length here; look back up to 10 prior positions // start one position away. int replC=0; // for(tti=1;tti<=15;tti++){ // check for greater than zero if(toupper(ref[pos-1])==toupper(ref[pos-tti])){ replC++; }else{ // breaks the chain at first non identical to current position not strict homopolymer break; } } int reprC=0; // for(tti=1;tti<=15;tti++){ // check for greater than zero if(toupper(ref[pos+1])==toupper(ref[pos+tti])){ reprC++; }else{ // breaks the chain at first non identical to current position not strict homopolymer break; } } int repT = replC; if(replC < reprC){ repT=reprC; } for (j = 0; j < n_plp[i]; ++j){ const bam_pileup1_t *p = plp[i] + j; /* SAME LOGIC AS pileup_seq() */ if(p->is_refskip){ // never count intron gaps in numStruck continue; } if(p->is_del){ // skip deletion gap, after first position which is the first aligned char continue; } if( p->b->core.qual < conf->min_mqToCount || // mapping quality conf->maxrepC < (repT) || // max homopolymer run, this will not (!p->is_del && bam1_qual(p->b)[p->qpos] < conf->min_baseQ) || // base quality for matches p->alignedQPosBeg <= (conf->trimEnd ) || p->alignedQPosEnd <= (conf->trimEnd ) || // trimEnd is 1-based p->zf == 1 || // fusion tag p->ih > conf->maxIH || // max hit index (p->nmd > conf->maxNM) || // max mismatch (conf->flagFilter == 1 && !(p->b->core.flag&BAM_FPROPER_PAIR)) || // optionally keep only proper pairs (conf->flagFilter == 2 && p->b->core.flag&BAM_FSECONDARY) || // optionally strike secondary (conf->flagFilter == 3 && p->b->core.flag&BAM_FDUP) || // optionally strike dup (conf->flagFilter == 4 && (p->b->core.flag&BAM_FDUP || p->b->core.flag&BAM_FSECONDARY)) || // optionally strike secondary or dup (conf->flagFilter == 5 && (p->b->core.flag&BAM_FDUP || p->b->core.flag&BAM_FSECONDARY || p->b->core.flag&BAM_FQCFAIL || !(p->b->core.flag&BAM_FPROPER_PAIR) )) // optionally strike secondary, dup and QCfail ){ numStruck++; continue; } //printf("repT=%d: %d %c %c %c %c \n",repT,p->indel,ref[pos],ref[pos-1],ref[pos-2],ref[pos-3]); if(!p->is_del && p->indel==0){ countChars[ bam1_seqi(bam1_seq(p->b), p->qpos) ][ bam1_strand(p->b) ] ++; numGood++; }else if(p->is_refskip){ countGap[ bam1_strand(p->b) ]++; } if(p->indel<0){ numGood++; if(bam1_strand(p->b) ==0){ for(tti=1;tti<= -p->indel; tti++) { // current spot, starting at 0 in store, because indel<0 refers to next position delStr0[dCnt0] = ref[pos+tti]; dCnt0++; } delStr0[dCnt0] = ','; dCnt0++; }else{ for(tti=1;tti<= -p->indel; tti++) { // current spot, starting at 0 in store, because indel<0 refers to next position delStr1[dCnt1] = ref[pos+tti]; dCnt1++; } delStr1[dCnt1] = ','; dCnt1++; } }else if(p->indel>0){ numGood++; if(bam1_strand(p->b) ==0){ for(tti=1;tti<= p->indel; tti++) { // current spot, starting at 0 in store, because indel<0 refers to next position insStr0[iCnt0] = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + tti)]; iCnt0++; } insStr0[iCnt0] = ','; iCnt0++; }else{ for(tti=1;tti<= p->indel; tti++) { // current spot, starting at 0 in store, because indel<0 refers to next position insStr1[iCnt1] = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + tti)]; iCnt1++; } insStr1[iCnt1] = ','; iCnt1++; } } //calculate position of variant within aligned read - no soft clips if( toupper(ref[pos]) != toupper(bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]) || p->indel>0 || p->indel<0 ){ //distance to end; calculate distance to end of aligned read. removes soft clips. int distToEnd = (p->alignedQPosBeg < p->alignedQPosEnd) ? p->alignedQPosBeg : p->alignedQPosEnd; qposP[qposCnt] = distToEnd; qposCnt++; // printf("id=%s, pos=%d",bam1_qname(p->b),distToEnd); } } // //print A,C,G,T, by +/- printf("\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d", countChars[1][0],countChars[1][1], countChars[2][0],countChars[2][1], countChars[4][0],countChars[4][1], countChars[8][0],countChars[8][1], countChars[7][0],countChars[7][1]); putchar('\t'); for(tti=0;tti<dCnt0;tti++){ putchar(delStr0[tti]); } putchar('\t'); for(tti=0;tti<dCnt1;tti++){ putchar(delStr1[tti]); } putchar('\t'); for(tti=0;tti<iCnt0;tti++){ putchar(insStr0[tti]); } putchar('\t'); for(tti=0;tti<iCnt1;tti++){ putchar(insStr1[tti]); } printf("\t%d\t%d",numGood,numStruck); // get non-ref qpos variation float medqpos = -1; float medAbsDev = -1; if(qposCnt>0){ medqpos = median(qposCnt,qposP); float absDev[qposCnt]; for(tti=0;tti<qposCnt;tti++){ absDev[tti] = abs(medqpos - qposP[tti]); } medAbsDev = median(qposCnt-1,absDev); } printf("\t%f",medAbsDev); ///END MDW } } putchar('\n'); } } bcf_close(bp); bam_smpl_destroy(sm); free(buf.s); for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]); free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp); bcf_call_del_rghash(rghash); bcf_hdr_destroy(bh); bcf_call_destroy(bca); free(bc.PL); free(bcr); bam_mplp_destroy(iter); bam_header_destroy(h); for (i = 0; i < n; ++i) { bam_close(data[i]->fp); if (data[i]->iter) bam_iter_destroy(data[i]->iter); free(data[i]); } free(data); free(plp); free(ref); free(n_plp); return 0; }
int main(int argc, char *argv[]) { hashtable ht=new_hashtable(HASHSIZE); bamFile in,in2; bamFile out; if (argc != 3) { fprintf(stderr, "Usage: bam_fix_NH <in.bam> <out.bam>\n"); return 1; } // Open file and exit if error //in = strcmp(argv[1], "-")? bam_open(argv[1], "rb") : bam_dopen(fileno(stdin), "rb"); in = bam_open(argv[1], "rb"); out = strcmp(argv[2], "-")? bam_open(argv[2], "w") : bam_dopen(fileno(stdout), "w"); if (in == 0 ) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[1]); return 1; } if (out == 0) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[2]); return 1; } unsigned long num_alns=0; int ref; // *********** // Copy header bam_header_t *header; header = bam_header_read(in); bam_header_write(out,header); // sorted by name? // Should not rely on the value in SO bam1_t *aln=bam_init1(); bam1_t *prev=bam_init1(); printf("Hashing...\n");flush(stdout); while(bam_read1(in,aln)>=0) { // read alignment if (aln->core.tid < 0) continue;//ignore unaligned reads ++num_alns; new_read_aln(ht,bam1_qname(aln)); } bam_close(in); printf("Hashing complete (%lu alignments)\n",num_alns); printf("Memory used in the hash: %ld MB\n",index_mem/1024/1024); flush(stdout); // reopen in2 = bam_open(argv[1], "rb"); if (in2 == 0 ) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[1]); return 1; } header = bam_header_read(in2); while(bam_read1(in2,aln)>=0) { // read alignment if (aln->core.tid < 0) continue;//ignore unaligned reads ++num_alns; READ_ALN *r=get_read_aln(ht,bam1_qname(aln)); //assert(r!=NULL); // update the NH field uint8_t *old_nh = bam_aux_get(aln, "NH"); uint8_t nh=r->ctr; if (old_nh) { if (nh!=bam_aux2i(old_nh)) { fprintf(stderr,"warning: value mismatch! replacing>%s %d->%d\n",bam1_qname(aln),bam_aux2i(old_nh),nh); } bam_aux_del(aln, old_nh); bam_aux_append(aln, "NH", 'i', 4, (uint8_t*)&nh); } if (!old_nh) { // add NH bam_aux_append(aln, "NH", 'i', 4, (uint8_t*)&nh); #ifdef DEBUG printf("!>%s %d\n",bam1_qname(aln),bam_aux2i(old_nh)); #endif } // in->header // Also fix the XS:A tag // BAM_FREAD1 // BAM_FREAD2 // BAM_FREVERSE the read is mapped to the reverse strand //bam1_cigar(b) //BAM_CREF_SKIP 3 CIGAR skip on the reference (e.g. spliced alignment) //BAM_FREVERSE 16 the read is mapped to the reverse strand if (aln->core.flag & BAM_FSECONDARY) continue; // skip secondary alignments if (aln->core.flag & ! BAM_FPAIRED) continue; // not paired if (aln->core.flag & ! BAM_FPROPER_PAIR) continue; // not a proper pair if (aln->core.flag & ! BAM_FMUNMAP) continue; // the mate is mapped if (aln->core.flag & BAM_FSECONDARY) continue; // secundary read if (aln->core.flag & BAM_FREAD2) continue; // only count each pair once // core.strand == 0 (f/+) 1 r/- // flag // bam1_qname(b) bam_write1(out,aln); } // bam_destroy1(aln); bam_close(in2); bam_close(out); return 0; /* uint8_t *old_nm = bam_aux_get(b, "NM"); 90 if (c->flag & BAM_FUNMAP) return; 91 if (old_nm) old_nm_i = bam_aux2i(old_nm); 92 if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); 93 else if (nm != old_nm_i) { 94 fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam1_qname(b), old_nm_i, nm); 95 bam_aux_del(b, old_nm); 96 bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); 97 } */ }
int main(int argc, char *argv[]) { short out2stdout=0; hashtable ht=new_hashtable(HASHSIZE); bamFile in,in2; bamFile out; int paired;//1 if not paired or pair read 1, 2 otherwise index_mem=sizeof(hashtable)*sizeof(hashnode**)*HASHSIZE*2; if (argc != 3) { fprintf(stderr, "Usage: bam_fix_NH <in.bam> <out.bam or - for stdout>\n"); return 1; } // Open file and exit if error in = bam_open(argv[1], "rb"); out2stdout = strcmp(argv[2], "-")? 0 : 1; out = strcmp(argv[2], "-")? bam_open(argv[2], "w") : bam_dopen(fileno(stdout), "w"); if (in == 0 ) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[1]); return 1; } if (out == 0) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[2]); return 1; } unsigned long num_alns=0; int ref; // *********** // Copy header bam_header_t *header; header = bam_header_read(in); bam_header_write(out,header); // sorted by name? // Should not rely on the value in SO bam1_t *aln=bam_init1(); bam1_t *prev=bam_init1(); if (!out2stdout) { fprintf(stderr,"bam_fix_NH version %s\n",VERSION); fprintf(stderr,"Processing %s\n",argv[1]); fprintf(stderr,"Hashing...\n");fflush(stderr); } while(bam_read1(in,aln)>=0) { // read alignment if (aln->core.tid < 0) continue;//ignore unaligned reads if (aln->core.flag & BAM_FUNMAP) continue; if (aln->core.flag & BAM_FREAD2) paired=2; else paired=1; ++num_alns; new_read_aln(ht,fix_read_name(bam1_qname(aln),paired)); if(!out2stdout) PRINT_ALNS_PROCESSED(num_alns); } bam_close(in); if(!out2stdout) { fprintf(stderr,"%s%lu\n",BACKLINE,num_alns); fprintf(stderr,"Hashing complete (%lu alignments)\n",num_alns); fprintf(stderr,"Memory used: %ld MB\n",index_mem/1024/1024); fprintf(stderr,"Updating entries with NH and printing BAM...\n"); fflush(stderr); } // reopen in2 = bam_open(argv[1], "rb"); if (in2 == 0 ) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[1]); return 1; } header = bam_header_read(in2); num_alns=0; while(bam_read1(in2,aln)>=0) { // read alignment paired=1; if (aln->core.tid < 0) continue;//ignore unaligned reads if (aln->core.flag & BAM_FUNMAP) continue; if (aln->core.flag & BAM_FREAD2) paired=2; ++num_alns; READ_ALN *r=get_read_aln(ht,fix_read_name(bam1_qname(aln),paired)); assert(r!=NULL); // update the NH field uint8_t *old_nh = bam_aux_get(aln, "NH"); int32_t nh=r->ctr; if (old_nh) { if (nh!=bam_aux2i(old_nh)) { fprintf(stderr,"warning: value mismatch! replacing>%s %d->%d\n",bam1_qname(aln),bam_aux2i(old_nh),nh); } bam_aux_del(aln, old_nh); bam_aux_append(aln, "NH", 'i', 4, (uint8_t*)&nh); #ifdef DEBUG // printf("!>%s %d\n",bam1_qname(aln),r->ctr); #endif } if (!old_nh) { // add NH bam_aux_append(aln, "NH", 'i', 4, (uint8_t*)&nh); #ifdef DEBUG fprintf(stderr,"!>%s %d\n",bam1_qname(aln),bam_aux2i(old_nh)); #endif } bam_write1(out,aln); if(!out2stdout) PRINT_ALNS_PROCESSED(num_alns); } // bam_destroy1(aln); bam_close(in2); bam_close(out); if(!out2stdout) { fprintf(stderr,"%s%lu\n",BACKLINE,num_alns); fprintf(stderr,"Done.\n"); } return 0; }