void bamFetchAlreadyOpen(samfile_t *samfile, bam_index_t *idx, char *bamFileName,
			 char *position, bam_fetch_f callbackFunc, void *callbackData)
/* Run callbackFunc (with callbackData) over the alignments of an already-open bam
 * file that fall inside position, using the already-loaded index idx.
 * bamFileName is not used to open anything; it only appears in the warning
 * printed when bam_fetch reports a failure. */
{
int tid, regionStart, regionEnd;
int status = bam_parse_region(samfile->header, position, &tid, &regionStart, &regionEnd);
if (status != 0)
    {
    /* Only retry when there is a leading "chr" that can be dropped. */
    if (!startsWith("chr", position))
	return;
    status = bam_parse_region(samfile->header, position + strlen("chr"),
			      &tid, &regionStart, &regionEnd);
    if (status != 0)
	/* The bam file does not cover this chromosome; that is not an error. */
	return;
    }
status = bam_fetch(samfile->x.bam, idx, tid, regionStart, regionEnd, callbackData, callbackFunc);
if (status == 0)
    return;
warn("bam_fetch(%s, %s (chromId=%d) failed (%d)", bamFileName, position, tid, status);
}
void bam_streamer:: set_new_region(const char* region) { int ref,beg,end; bam_parse_region(_bfp->header, region, &ref, &beg, &end); // parse the region set_new_region(ref,beg,end); _region=region; }
// Build the region string "<chrName>:<lpos>-<rpos>" and let bam_parse_region
// fill region.tid / region.lpos / region.rpos from it, using the header of
// the open input `in`.
//
// Always returns 0, as before; the status reported by bam_parse_region is not
// propagated to the caller.
int MyBamWrap::myPassRegion(region_t & region,string& chrName, uint32_t& lpos, uint32_t& rpos)
{
    // A string grows as needed, so an arbitrarily long chromosome name can no
    // longer overrun a fixed-size stack buffer.
    string spec(chrName);
    char numBuf[32] = "";   // large enough for any 32-bit unsigned decimal

    spec += ':';
    sprintf(&numBuf[0], "%u", lpos);
    spec += numBuf;

    spec += '-';
    sprintf(&numBuf[0], "%u", rpos);
    spec += numBuf;

    bam_parse_region(in->header, spec.c_str(), &region.tid, &region.lpos, &region.rpos);
    return 0;
}
/// Open an alignment file for streaming, optionally restricted to a region.
///
/// \param filename  path handed to samopen(); must be non-null and non-empty,
///                  otherwise a blt_exception is thrown
/// \param region    region string, or nullptr to read the whole file
///
/// If the file cannot be opened an error is written to log_os and the process
/// exits with EXIT_FAILURE.
bam_streamer::
bam_streamer(
    const char* filename,
    const char* region)
    : _is_record_set(false),
      _bfp(nullptr),
      _hidx(nullptr),
      _hitr(nullptr),
      _record_no(0),
      // member initialisers run before the constructor body, so a null
      // filename must not reach _stream_name ahead of the checks below
      _stream_name((nullptr != filename) ? filename : ""),
      _is_region(false)
{
    assert(nullptr != filename);
    // the null test is repeated here so builds with assertions disabled do
    // not dereference a null pointer
    if ((nullptr == filename) || ('\0' == *filename))
    {
        throw blt_exception("Can't initialize bam_streamer with empty filename\n");
    }

    _bfp = samopen(filename, "rb", 0);

    if (nullptr == _bfp)
    {
        log_os << "ERROR: Failed to open SAM/BAM/CRAM file: " << filename << "\n";
        exit(EXIT_FAILURE);
    }

    if (nullptr == region)
    {
        // read the whole BAM file:

        if (_bfp->header->n_targets)
        {
            // parse a fake region so that header->hash is created
            std::string fake_region(target_id_to_name(0));
            fake_region += ":1-1";
            int ref,beg,end;
            bam_parse_region(_bfp->header, fake_region.c_str(), &ref, &beg, &end);
        }
        return;
    }

    // read a specific region of the bam file:
    set_new_region(region);
}
// Position the reader described by `data`: when a region string is configured
// a BAM iterator restricted to that region is created, otherwise the iterator
// is left NULL. In both cases a single-input pileup iterator is then built.
// Exits with status 1 if the configured region cannot be parsed.
void seekRegion(BamReaderData * data) {
	if (!data->conf->reg) {
		// No region requested: general BAM iterator
		data->data->iter = NULL;
	} else {
		// Region requested: translate it to (target id, begin, end)
		int refId, from, to;
		int status = bam_parse_region(data->data->h, data->conf->reg, &refId, &from, &to);

		if (status < 0) {
			fprintf(stderr, "[%s] malformatted region or wrong seqname for input.\n", __func__);
			exit(1);
		}

		data->data->iter = bam_iter_query(data->idx, refId, from, to);
		data->ref_tid = refId;
	}

	// Pileup iterator over the one input
	data->iter = bam_mplp_init(1, mplp_func, (void**) &data->data);
}
/*
 * Read alignments from `ifn` and pass each one to `func` together with the
 * input handle, the (optional) output handle and the caller's `data`.
 *
 *   ifn   input file, opened with samopen(..., "rb", ...)
 *   ofn   output file opened with mode "wb" and the input header, or NULL
 *   reg   region string; when non-NULL the index of `ifn` is loaded and only
 *         that region is visited, otherwise the whole file is read
 *   data  opaque pointer forwarded to `func`
 *   func  per-alignment callback
 *
 * Returns the last read status, which is -1 on a normal end of input. Any
 * failure (file cannot be opened, missing index, unknown reference name,
 * read status other than -1) prints a message and exits with status 1.
 */
int sam_fetch(char *ifn, char *ofn, char *reg, void *data, sam_fetch_f func) {
	int ret = 0;
	samfile_t *in = samopen(ifn, "rb", 0);
	samfile_t *out = 0;

	/* in->header is needed below, so a failed open must stop here */
	if (in == 0) {
		fprintf(stderr, "[%s:%d] Failed to open input file \"%s\".\n", __func__, __LINE__, ifn);
		exit(1);
	}
	if (ofn) {
		out = samopen(ofn, "wb", in->header);
		/* without this check every alignment would be handed to func with a
		 * NULL output, exactly as if no output file had been requested */
		if (out == 0) {
			fprintf(stderr, "[%s:%d] Failed to open output file \"%s\".\n", __func__, __LINE__, ofn);
			exit(1);
		}
	}

	if (reg) {
		bam_index_t *idx = bam_index_load(ifn);
		if (idx == 0) {
			fprintf(stderr, "[%s:%d] Random alignment retrieval only works for indexed BAM files.\n",
					__func__, __LINE__);
			exit(1);
		}

		/* tid starts negative so the test below also catches a parser that
		 * returns before writing it */
		int tid = -1, beg = 0, end = 0;
		bam_parse_region(in->header, reg, &tid, &beg, &end);
		if (tid < 0) {
			fprintf(stderr, "[%s:%d] Region \"%s\" specifies an unknown reference name. \n",
					__func__, __LINE__, reg);
			exit(1);
		}

		bam_iter_t iter;
		bam1_t *b = bam_init1();
		iter = bam_iter_query(idx, tid, beg, end);
		while ((ret = bam_iter_read(in->x.bam, iter, b)) >= 0) func(b, in, out, data);
		bam_iter_destroy(iter);
		bam_destroy1(b);
		bam_index_destroy(idx);
	} else {
		bam1_t *b = bam_init1();
		while ((ret = samread(in, b)) >= 0) func(b, in, out, data);
		bam_destroy1(b);
	}
	if (out) samclose(out);
	samclose(in);

	if (ret != -1) { /* truncated is -2 */
		fprintf(stderr, "[%s:%d] Alignment retrieval failed due to truncated file\n",
				__func__, __LINE__);
		exit(1);
	}

	return ret;
}
int main(int argc, char *argv[]) { tmpstruct_t tmp; if (argc == 1) { fprintf(stderr, "Usage: calDepth <in.bam> [region]\n"); return 1; } tmp.beg = 0; tmp.end = 0x7fffffff; tmp.in = samopen(argv[1], "rb", 0); if (tmp.in == 0) { fprintf(stderr, "Fail to open BAM file %s\n", argv[1]); return 1; } if (argc == 2) { // if a region is not specified sampileup(tmp.in, -1, pileup_func, &tmp); } else { int ref; bam_index_t *idx; bam_plbuf_t *buf; idx = bam_index_load(argv[1]); // load BAM index if (idx == 0) { fprintf(stderr, "BAM indexing file is not available.\n"); return 1; } bam_parse_region(tmp.in->header, argv[2], &ref, &tmp.beg, &tmp.end); // parse the region if (ref < 0) { fprintf(stderr, "Invalid region %s\n", argv[2]); return 1; } buf = bam_plbuf_init(pileup_func, &tmp); // initialize pileup bam_fetch(tmp.in->x.bam, idx, ref, tmp.beg, tmp.end, buf, fetch_func); bam_plbuf_push(0, buf); // finalize pileup bam_index_destroy(idx); bam_plbuf_destroy(buf); } samclose(tmp.in); return 0; }
/// Open a SAM/BAM file for streaming, optionally restricted to a region.
///
/// \param filename  path handed to samopen(); must be non-null and non-empty
/// \param region    region string, or NULL to read the whole file
///
/// A null filename or a file that cannot be opened is reported on log_os and
/// terminates the process with EXIT_FAILURE.
bam_streamer::
bam_streamer(const char* filename,
             const char* region)
    : _is_record_set(false),
      _bfp(NULL),
      _bidx(NULL),
      _biter(NULL),
      _record_no(0),
      // member initialisers run before the constructor body, so a null
      // filename must not reach _stream_name ahead of the checks below
      _stream_name((NULL != filename) ? filename : ""),
      _is_region(false)
{
    assert(NULL != filename);
    if (NULL == filename)
    {
        // reached only when assertions are compiled out
        log_os << "ERROR: bam_streamer initialized with null filename\n";
        exit(EXIT_FAILURE);
    }
    assert('\0' != *filename);

    _bfp = samopen(filename, "rb", 0);

    if (NULL == _bfp)
    {
        log_os << "ERROR: Failed to open SAM/BAM file: " << filename << "\n";
        exit(EXIT_FAILURE);
    }

    if (NULL == region)
    {
        // read the whole BAM file:

        if (_bfp->header->n_targets)
        {
            // parse a fake region so that header->hash is created
            std::string fake_region(target_id_to_name(0));
            fake_region += ":1-1";
            int ref,beg,end;
            bam_parse_region(_bfp->header, fake_region.c_str(), &ref, &beg, &end);
        }
        return;
    }

    // read a specific region of the bam file:
    set_new_region(region);
}
// Multi-sample pileup over the n BAM inputs named in fn ("-" means stdin).
//
// For every covered position the function either writes BCF records to
// standard output (when conf->flag has MPLP_GLF) or prints one text pileup
// line. When conf->reg is set each input is restricted to that region through
// its index; when conf->bed is set positions outside the BED data are skipped.
//
// Returns 0. Unrecoverable input problems (input cannot be opened, header
// cannot be read, index missing, region cannot be parsed) print a message and
// exit with status 1.
static int mpileup(mplp_conf_t *conf, int n, char **fn)
{
	extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list);
	extern void bcf_call_del_rghash(void *rghash);
	mplp_aux_t **data;
	int i, tid, pos, *n_plp, tid0 = -1, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid = -1, max_depth, max_indel_depth;
	const bam_pileup1_t **plp;
	bam_mplp_t iter;
	bam_header_t *h = 0;
	char *ref;
	void *rghash = 0;

	bcf_callaux_t *bca = 0;
	bcf_callret1_t *bcr = 0;
	bcf_call_t bc;
	bcf_t *bp = 0;
	bcf_hdr_t *bh = 0;

	bam_sample_t *sm = 0;
	kstring_t buf;
	mplp_pileup_t gplp;

	memset(&gplp, 0, sizeof(mplp_pileup_t));
	memset(&buf, 0, sizeof(kstring_t));
	memset(&bc, 0, sizeof(bcf_call_t));
	data = calloc(n, sizeof(void*));
	plp = calloc(n, sizeof(void*));
	n_plp = calloc(n, sizeof(int)); // one int per input (element type of n_plp)
	sm = bam_smpl_init();

	// read the header and initialize data
	for (i = 0; i < n; ++i) {
		bam_header_t *h_tmp;
		data[i] = calloc(1, sizeof(mplp_aux_t));
		data[i]->fp = strcmp(fn[i], "-") == 0? bam_dopen(fileno(stdin), "r") : bam_open(fn[i], "r");
		if (data[i]->fp == 0) { // the handle is read from immediately below
			fprintf(stderr, "[%s] fail to open %d-th input.\n", __func__, i+1);
			exit(1);
		}
		data[i]->conf = conf;
		h_tmp = bam_header_read(data[i]->fp);
		if (h_tmp == 0) { // h_tmp->text is dereferenced immediately below
			fprintf(stderr, "[%s] fail to read the header of %d-th input.\n", __func__, i+1);
			exit(1);
		}
		data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet
		bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text);
		rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list);
		if (conf->reg) {
			int beg, end;
			bam_index_t *idx;
			idx = bam_index_load(fn[i]);
			if (idx == 0) {
				fprintf(stderr, "[%s] fail to load index for %d-th input.\n", __func__, i+1);
				exit(1);
			}
			if (bam_parse_region(h_tmp, conf->reg, &tid, &beg, &end) < 0) {
				fprintf(stderr, "[%s] malformatted region or wrong seqname for %d-th input.\n", __func__, i+1);
				exit(1);
			}
			// the region of the first input is the one used to filter positions later
			if (i == 0) tid0 = tid, beg0 = beg, end0 = end;
			data[i]->iter = bam_iter_query(idx, tid, beg, end);
			bam_index_destroy(idx);
		}
		if (i == 0) h = h_tmp;
		else {
			// FIXME: to check consistency
			bam_header_destroy(h_tmp);
		}
	}
	gplp.n = sm->n;
	gplp.n_plp = calloc(sm->n, sizeof(int));
	gplp.m_plp = calloc(sm->n, sizeof(int));
	gplp.plp = calloc(sm->n, sizeof(void*));

	fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n);
	// write the VCF header
	if (conf->flag & MPLP_GLF) {
		kstring_t s;
		bh = calloc(1, sizeof(bcf_hdr_t));
		s.l = s.m = 0; s.s = 0;
		bp = bcf_open("-", (conf->flag&MPLP_NO_COMP)? "wu" : "w");
		// target names, each followed by a NUL
		for (i = 0; i < h->n_targets; ++i) {
			kputs(h->target_name[i], &s);
			kputc('\0', &s);
		}
		bh->l_nm = s.l;
		bh->name = malloc(s.l);
		memcpy(bh->name, s.s, s.l);
		s.l = 0;
		// sample names, each followed by a NUL
		for (i = 0; i < sm->n; ++i) {
			kputs(sm->smpl[i], &s); kputc('\0', &s);
		}
		bh->l_smpl = s.l;
		bh->sname = malloc(s.l);
		memcpy(bh->sname, s.s, s.l);
		bh->txt = malloc(strlen(BAM_VERSION) + 64);
		bh->l_txt = 1 + sprintf(bh->txt, "##samtoolsVersion=%s\n", BAM_VERSION);
		free(s.s);
		bcf_hdr_sync(bh);
		bcf_hdr_write(bp, bh);
		bca = bcf_call_init(-1., conf->min_baseQ);
		bcr = calloc(sm->n, sizeof(bcf_callret1_t));
		bca->rghash = rghash;
		bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ;
		bca->min_frac = conf->min_frac;
		bca->min_support = conf->min_support;
	}
	if (tid0 >= 0 && conf->fai) { // region is set
		ref = faidx_fetch_seq(conf->fai, h->target_name[tid0], 0, 0x7fffffff, &ref_len);
		ref_tid = tid0;
		for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid0;
	} else ref_tid = -1, ref = 0;
	iter = bam_mplp_init(n, mplp_func, (void**)data);
	max_depth = conf->max_depth;
	if (max_depth * sm->n > 1<<20)
		fprintf(stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__);
	if (max_depth * sm->n < 8000) {
		max_depth = 8000 / sm->n;
		fprintf(stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth);
	}
	max_indel_depth = conf->max_indel_depth * sm->n;
	bam_mplp_set_maxcnt(iter, max_depth);
	while (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) {
		if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested
		if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue;
		if (tid != ref_tid) {
			// moved to another target: swap in its reference sequence (if a FASTA index is available)
			free(ref); ref = 0;
			if (conf->fai) ref = faidx_fetch_seq(conf->fai, h->target_name[tid], 0, 0x7fffffff, &ref_len);
			for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid;
			ref_tid = tid;
		}
		if (conf->flag & MPLP_GLF) {
			int total_depth, _ref0, ref16;
			bcf1_t *b = calloc(1, sizeof(bcf1_t));
			for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i];
			group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG);
			_ref0 = (ref && pos < ref_len)? ref[pos] : 'N';
			ref16 = bam_nt16_table[_ref0];
			for (i = 0; i < gplp.n; ++i)
				bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i);
			bcf_call_combine(gplp.n, bcr, ref16, &bc);
			bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0,
						 (conf->flag&MPLP_FMT_SP), 0, 0);
			bcf_write(bp, bh, b);
			bcf_destroy(b);
			// call indels
			if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) {
				for (i = 0; i < gplp.n; ++i)
					bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i);
				if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) {
					b = calloc(1, sizeof(bcf1_t));
					bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0,
								 (conf->flag&MPLP_FMT_SP), bca, ref);
					bcf_write(bp, bh, b);
					bcf_destroy(b);
				}
			}
		} else {
			printf("%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N');
			for (i = 0; i < n; ++i) {
				int j;
				printf("\t%d\t", n_plp[i]);
				if (n_plp[i] == 0) {
					printf("*\t*"); // FIXME: printf() is very slow...
					if (conf->flag & MPLP_PRINT_POS) printf("\t*");
				} else {
					for (j = 0; j < n_plp[i]; ++j)
						pileup_seq(plp[i] + j, pos, ref_len, ref);
					putchar('\t');
					// base qualities, offset by 33 and capped at 126
					for (j = 0; j < n_plp[i]; ++j) {
						const bam_pileup1_t *p = plp[i] + j;
						int c = bam1_qual(p->b)[p->qpos] + 33;
						if (c > 126) c = 126;
						putchar(c);
					}
					if (conf->flag & MPLP_PRINT_MAPQ) {
						putchar('\t');
						// mapping qualities, same offset and cap
						for (j = 0; j < n_plp[i]; ++j) {
							int c = plp[i][j].b->core.qual + 33;
							if (c > 126) c = 126;
							putchar(c);
						}
					}
					if (conf->flag & MPLP_PRINT_POS) {
						putchar('\t');
						for (j = 0; j < n_plp[i]; ++j) {
							if (j > 0) putchar(',');
							printf("%d", plp[i][j].qpos + 1); // FIXME: printf() is very slow...
						}
					}
				}
			}
			putchar('\n');
		}
	}

	bcf_close(bp);
	bam_smpl_destroy(sm); free(buf.s);
	for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]);
	free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp);
	bcf_call_del_rghash(rghash);
	bcf_hdr_destroy(bh); bcf_call_destroy(bca); free(bc.PL); free(bcr);
	bam_mplp_destroy(iter);
	bam_header_destroy(h);
	for (i = 0; i < n; ++i) {
		bam_close(data[i]->fp);
		if (data[i]->iter) bam_iter_destroy(data[i]->iter);
		free(data[i]);
	}
	free(data); free(plp); free(ref); free(n_plp);
	return 0;
}
int main_depth(int argc, char *argv[]) #endif { int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0; const bam_pileup1_t **plp; char *reg = 0; // specified region void *bed = 0; // BED data structure bam_header_t *h = 0; // BAM header of the 1st input aux_t **data; bam_mplp_t mplp; // parse the command line while ((n = getopt(argc, argv, "r:b:q:Q:")) >= 0) { switch (n) { case 'r': reg = strdup(optarg); break; // parsing a region requires a BAM header case 'b': bed = bed_read(optarg); break; // BED or position list file can be parsed now case 'q': baseQ = atoi(optarg); break; // base quality threshold case 'Q': mapQ = atoi(optarg); break; // mapping quality threshold } } if (optind == argc) { fprintf(stderr, "Usage: bam2depth [-r reg] [-q baseQthres] [-Q mapQthres] [-b in.bed] <in1.bam> [...]\n"); return 1; } // initialize the auxiliary data structures n = argc - optind; // the number of BAMs on the command line data = (aux_t **) calloc(n, sizeof(void*)); // data[i] for the i-th input beg = 0; end = 1<<30; tid = -1; // set the default region for (i = 0; i < n; ++i) { bam_header_t *htmp; data[i] = (aux_t *) calloc(1, sizeof(aux_t)); data[i]->fp = bam_open(argv[optind+i], "r"); // open BAM data[i]->min_mapQ = mapQ; // set the mapQ filter htmp = bam_header_read(data[i]->fp); // read the BAM header if (i == 0) { h = htmp; // keep the header of the 1st BAM if (reg) bam_parse_region(h, reg, &tid, &beg, &end); // also parse the region } else bam_header_destroy(htmp); // if not the 1st BAM, trash the header if (tid >= 0) { // if a region is specified and parsed successfully bam_index_t *idx = bam_index_load(argv[optind+i]); // load the index data[i]->iter = bam_iter_query(idx, tid, beg, end); // set the iterator bam_index_destroy(idx); // the index is not needed any more; phase out of the memory } } // the core multi-pileup loop mplp = bam_mplp_init(n, read_bam, (void**)data); // initialization n_plp = (int*) calloc(n, sizeof(int)); // n_plp[i] is the number of 
covering reads from the i-th BAM plp = (bam_pileup1_t **) calloc(n, sizeof(void*)); // plp[i] points to the array of covering reads (internal in mplp) while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) { // come to the next covered position if (pos < beg || pos >= end) continue; // out of range; skip if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; // not in BED; skip fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); // a customized printf() would be faster for (i = 0; i < n; ++i) { // base level filters have to go here int j, m = 0; for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; // DON'T modfity plp[][] unless you really know if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos else if (bam1_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality } printf("\t%d", n_plp[i] - m); // this the depth to output } putchar('\n'); } free(n_plp); free(plp); bam_mplp_destroy(mplp); bam_header_destroy(h); for (i = 0; i < n; ++i) { bam_close(data[i]->fp); if (data[i]->iter) bam_iter_destroy(data[i]->iter); free(data[i]); } free(data); free(reg); if (bed) bed_destroy(bed); return 0; }
int main_depth(int argc, char *argv[]) #endif { int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0, nfiles; const bam_pileup1_t **plp; char *reg = 0; // specified region void *bed = 0; // BED data structure char *file_list = NULL, **fn = NULL; bam_header_t *h = 0; // BAM header of the 1st input aux_t **data; bam_mplp_t mplp; // parse the command line while ((n = getopt(argc, argv, "r:b:q:Q:l:f:")) >= 0) { switch (n) { case 'l': min_len = atoi(optarg); break; // minimum query length case 'r': reg = strdup(optarg); break; // parsing a region requires a BAM header case 'b': bed = bed_read(optarg); break; // BED or position list file can be parsed now case 'q': baseQ = atoi(optarg); break; // base quality threshold case 'Q': mapQ = atoi(optarg); break; // mapping quality threshold case 'f': file_list = optarg; break; } } if (optind == argc && !file_list) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: samtools depth [options] in1.bam [in2.bam [...]]\n"); fprintf(stderr, "Options:\n"); fprintf(stderr, " -b <bed> list of positions or regions\n"); fprintf(stderr, " -f <list> list of input BAM filenames, one per line [null]\n"); fprintf(stderr, " -l <int> minQLen\n"); fprintf(stderr, " -q <int> base quality threshold\n"); fprintf(stderr, " -Q <int> mapping quality threshold\n"); fprintf(stderr, " -r <chr:from-to> region\n"); fprintf(stderr, "\n"); return 1; } // initialize the auxiliary data structures if (file_list) { if ( read_file_list(file_list,&nfiles,&fn) ) return 1; n = nfiles; argv = fn; optind = 0; } else n = argc - optind; // the number of BAMs on the command line data = calloc(n, sizeof(void*)); // data[i] for the i-th input beg = 0; end = 1<<30; tid = -1; // set the default region for (i = 0; i < n; ++i) { bam_header_t *htmp; data[i] = calloc(1, sizeof(aux_t)); data[i]->fp = bam_open(argv[optind+i], "r"); // open BAM data[i]->min_mapQ = mapQ; // set the mapQ filter data[i]->min_len = min_len; // set the qlen filter htmp = 
bam_header_read(data[i]->fp); // read the BAM header if (i == 0) { h = htmp; // keep the header of the 1st BAM if (reg) bam_parse_region(h, reg, &tid, &beg, &end); // also parse the region } else bam_header_destroy(htmp); // if not the 1st BAM, trash the header if (tid >= 0) { // if a region is specified and parsed successfully bam_index_t *idx = bam_index_load(argv[optind+i]); // load the index data[i]->iter = bam_iter_query(idx, tid, beg, end); // set the iterator bam_index_destroy(idx); // the index is not needed any more; phase out of the memory } } // the core multi-pileup loop mplp = bam_mplp_init(n, read_bam, (void**)data); // initialization bam_mplp_set_maxcnt(mplp,2147483647); // set max_depth to int max n_plp = calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM plp = calloc(n, sizeof(void*)); // plp[i] points to the array of covering reads (internal in mplp) while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) { // come to the next covered position if (pos < beg || pos >= end) continue; // out of range; skip if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; // not in BED; skip fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); // a customized printf() would be faster for (i = 0; i < n; ++i) { // base level filters have to go here int j, m = 0; for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; // DON'T modfity plp[][] unless you really know if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos else if (bam1_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality } printf("\t%d", n_plp[i] - m); // this the depth to output } putchar('\n'); } free(n_plp); free(plp); bam_mplp_destroy(mplp); bam_header_destroy(h); for (i = 0; i < n; ++i) { bam_close(data[i]->fp); if (data[i]->iter) bam_iter_destroy(data[i]->iter); free(data[i]); } free(data); free(reg); if (bed) bed_destroy(bed); if ( file_list ) { for (i=0; i<n; i++) free(fn[i]); free(fn); } return 
0; }
int bam_merge_core2(int by_qname, const char *out, const char *headers, int n, char * const *fn, int flag, const char *reg, int level) #endif { bamFile fpout, *fp; heap1_t *heap; bam_header_t *hout = 0; bam_header_t *hheaders = NULL; int i, j, *RG_len = 0; uint64_t idx = 0; char **RG = 0, mode[8]; bam_iter_t *iter = 0; if (headers) { tamFile fpheaders = sam_open(headers); if (fpheaders == 0) { const char *message = strerror(errno); fprintf(stderr, "[bam_merge_core] cannot open '%s': %s\n", headers, message); return -1; } hheaders = sam_header_read(fpheaders); sam_close(fpheaders); } g_is_by_qname = by_qname; fp = (bamFile*)calloc(n, sizeof(bamFile)); heap = (heap1_t*)calloc(n, sizeof(heap1_t)); iter = (bam_iter_t*)calloc(n, sizeof(bam_iter_t)); // prepare RG tag if (flag & MERGE_RG) { RG = (char**)calloc(n, sizeof(void*)); RG_len = (int*)calloc(n, sizeof(int)); for (i = 0; i != n; ++i) { int l = strlen(fn[i]); const char *s = fn[i]; if (l > 4 && strcmp(s + l - 4, ".bam") == 0) l -= 4; for (j = l - 1; j >= 0; --j) if (s[j] == '/') break; ++j; l -= j; RG[i] = calloc(l + 1, 1); RG_len[i] = l; strncpy(RG[i], s + j, l); } } // read the first for (i = 0; i != n; ++i) { bam_header_t *hin; fp[i] = bam_open(fn[i], "r"); if (fp[i] == 0) { int j; fprintf(stderr, "[bam_merge_core] fail to open file %s\n", fn[i]); for (j = 0; j < i; ++j) bam_close(fp[j]); free(fp); free(heap); // FIXME: possible memory leak return -1; } hin = bam_header_read(fp[i]); if (i == 0) { // the first BAM hout = hin; } else { // validate multiple baf int min_n_targets = hout->n_targets; if (hin->n_targets < min_n_targets) min_n_targets = hin->n_targets; for (j = 0; j < min_n_targets; ++j) if (strcmp(hout->target_name[j], hin->target_name[j]) != 0) { fprintf(stderr, "[bam_merge_core] different target sequence name: '%s' != '%s' in file '%s'\n", hout->target_name[j], hin->target_name[j], fn[i]); return -1; } // If this input file has additional target reference sequences, // add them to the headers to be 
output if (hin->n_targets > hout->n_targets) { swap_header_targets(hout, hin); // FIXME Possibly we should also create @SQ text headers // for the newly added reference sequences } bam_header_destroy(hin); } } if (hheaders) { // If the text headers to be swapped in include any @SQ headers, // check that they are consistent with the existing binary list // of reference information. if (hheaders->n_targets > 0) { if (hout->n_targets != hheaders->n_targets) { fprintf(stderr, "[bam_merge_core] number of @SQ headers in '%s' differs from number of target sequences\n", headers); if (!reg) return -1; } for (j = 0; j < hout->n_targets; ++j) if (strcmp(hout->target_name[j], hheaders->target_name[j]) != 0) { fprintf(stderr, "[bam_merge_core] @SQ header '%s' in '%s' differs from target sequence\n", hheaders->target_name[j], headers); if (!reg) return -1; } } swap_header_text(hout, hheaders); bam_header_destroy(hheaders); } if (reg) { int tid, beg, end; if (bam_parse_region(hout, reg, &tid, &beg, &end) < 0) { fprintf(stderr, "[%s] Malformated region string or undefined reference name\n", __func__); return -1; } for (i = 0; i < n; ++i) { bam_index_t *idx; idx = bam_index_load(fn[i]); iter[i] = bam_iter_query(idx, tid, beg, end); bam_index_destroy(idx); } } for (i = 0; i < n; ++i) { heap1_t *h = heap + i; h->i = i; h->b = (bam1_t*)calloc(1, sizeof(bam1_t)); if (bam_iter_read(fp[i], iter[i], h->b) >= 0) { h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)((int32_t)h->b->core.pos+1)<<1 | bam1_strand(h->b); h->idx = idx++; } else h->pos = HEAP_EMPTY; } if (flag & MERGE_UNCOMP) level = 0; else if (flag & MERGE_LEVEL1) level = 1; strcpy(mode, "w"); if (level >= 0) sprintf(mode + 1, "%d", level < 9? level : 9); if ((fpout = strcmp(out, "-")? 
bam_open(out, "w") : bam_dopen(fileno(stdout), "w")) == 0) { fprintf(stderr, "[%s] fail to create the output file.\n", __func__); return -1; } bam_header_write(fpout, hout); bam_header_destroy(hout); #ifndef _PBGZF_USE if (!(flag & MERGE_UNCOMP)) bgzf_mt(fpout, n_threads, 256); #endif ks_heapmake(heap, n, heap); while (heap->pos != HEAP_EMPTY) { bam1_t *b = heap->b; if (flag & MERGE_RG) { uint8_t *rg = bam_aux_get(b, "RG"); if (rg) bam_aux_del(b, rg); bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]); } bam_write1_core(fpout, &b->core, b->data_len, b->data); if ((j = bam_iter_read(fp[heap->i], iter[heap->i], b)) >= 0) { heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam1_strand(b); heap->idx = idx++; } else if (j == -1) { heap->pos = HEAP_EMPTY; free(heap->b->data); free(heap->b); heap->b = 0; } else fprintf(stderr, "[bam_merge_core] '%s' is truncated. Continue anyway.\n", fn[heap->i]); ks_heapadjust(heap, 0, n, heap); } if (flag & MERGE_RG) { for (i = 0; i != n; ++i) free(RG[i]); free(RG); free(RG_len); } for (i = 0; i != n; ++i) { bam_iter_destroy(iter[i]); bam_close(fp[i]); } bam_close(fpout); free(fp); free(heap); free(iter); return 0; }
BamX::BamX(pars & Params1) // optional constructor { // parameters Params=Params1; Nread=0; Npair=0; Nproper=0; Nout=0; LFlow=INT_MIN; LFhigh=INT_MAX; region.limit=false; IlluminizeBam=0; outFragTailBam=false; outInterChromBam=false; outUniqueMultipleBam=false; outUniquePartialBam=false; outUniqueUnmappedBam=false; outAllPairsBam=false; outReadPairPosBam=false; //output file //samfile_t *fp; bam_header_t *bam_header; string s = Params.getInput(); BamUtil bam1(s); Bam = bam1; string filename=extractfilename(s); // parameters string fragPosFile = Params.getString("ReadPairPosFile"); string r = Params.getString("ChromRegion"); int maxReads = Params.getInt("MaxReads"); Qmin = Params.getInt("Qmin"); LRmin = Params.getInt("MinReadLength"); maxmismatchPC=Params.getDouble("FractionMaxMisMatches"); FragLengthWindow=Params.getInt("FragmentLengthWindow"); int cmd_MateMode=Params.getInt("ReadPairSenseConfig"); string ReferenceFastaFile=Params.getString("ReferenceFastaFile"); FragmentTailPercent=Params.getDouble("FragmentTailPercent"); IlluminizeBam=Params.getInt("Illuminize")>0; outputDirectory=Params.getString("OutputDirectory"); int minLR=Params.getInt("MinReadLength"); int SplitBracketMin=Params.getInt("SplitBracketMin"); int SplitBaseQmin=Params.getInt("SplitBaseQmin"); string StatFile=Params.getString("StatFile"); if (StatFile.size()>0) { hists H1(StatFile); hist HLF=H1.h["LF"]; hist HLR=H1.h["LR"]; Params.setHist("LF",HLF); Params.setHist("LR",HLR); H1.h.clear(); // free some memory if (FragmentTailPercent>0) { LFlow=int(HLF.p2xTrim(FragmentTailPercent/100.)); LFhigh=int(HLF.p2xTrim(1-FragmentTailPercent/100.)); } } int dbg = Params.getInt("Dbg"); time(&tprev); if (ReferenceFastaFile.size()>0) { FastaObj RF1(ReferenceFastaFile, ""); Reference=RF1; RF1.seq.clear(); // free some memory } bam_header= Bam.fp->header; string bamheadertext = bam_header->text; ReadGroup = extractBamTag(bamheadertext,"@RG"); outAllPairsBam=(r.size()>0); if (!outAllPairsBam) { 
outFragTailBam=true; //FragmentTailPercent>=0; outInterChromBam=true; outUniqueMultipleBam=true; outUniquePartialBam=true; outUniqueUnmappedBam=true; } // output Bams outputBam.clear(); /* // test BamHeaderContainer vector<BamHeaderContainer> x; string sv=SpannerVersion; string q="@PG\tID:FragmentTail\tPN:SpannerX\tVN"+sv+"\tCL:"+Params.getCmdLine(); while (true) { string outfile=outputDirectory+"/"+filename+".fragtail.bam"; q=q+"\n@PG\tID:FragmentTail\tPN:SpannerX\tVN"+sv+"\tCL:"+Params.getCmdLine(); BamHeaderContainer x1( bam_header, q); x.push_back(x1); bam_header_t* h1=x[x.size()-1].header(); cout<< h1->text << endl; } cout << x.size() << endl; */ samfile_t *fpFT=0; samfile_t *fpIC=0; samfile_t *fpUM=0; samfile_t *fpUP=0; samfile_t *fpUZ=0; samfile_t *fpAP=0; samfile_t *fpWP=0; //region if (r.size()>0) { int r1,r2,r3; C_region r0(r); region=r0; string bamRegion=region.region; size_t k=bamRegion.find("chr"); if (k!=string::npos) { bamRegion=bamRegion.substr(3); } if ( bam_parse_region(bam_header, bamRegion.c_str(), &r1, &r2, &r3)==0) { region.limit=true; region.anchor=r1; region.start=r2; region.end=r3; } else { cerr << "region not found\t" << r << endl; exit(111); } } //fragPosFile if (fragPosFile.size()>0) { FragmentPosFileObj fp(fragPosFile); if (fp.fragmentPosList.size()>0) { FragPos=fp; } else { cerr << "Read Pair Pos file not found\t" << fragPosFile << endl; exit(112); } outFragTailBam=false; outInterChromBam=false; outUniqueMultipleBam=false; outUniquePartialBam=false; outUniqueUnmappedBam=false; outReadPairPosBam=true; } if (outAllPairsBam) { string outfile=outputDirectory+"/"+filename+"."+r+".bam"; string sv=SpannerVersion; string q="@PG\tID:Region\tPN:SpannerX\tVN"+sv+"\tCL:"+Params.getCmdLine(); outputBam["AP"]=BamHeaderContainer(bam_header,q); bam_header_t* h1=outputBam["AP"].header(); if ((fpAP = samopen(outfile.c_str(), "wb", h1)) == 0) { fprintf(stderr, "samopen: Fail to open output BAM file %s\n", filename.c_str()); exit(160); } } if 
(outFragTailBam) { string outfile=outputDirectory+"/"+filename+".fragtail.bam"; string sv=SpannerVersion; string q="@PG\tID:FragmentTail\tPN:SpannerX\tVN"+sv+"\tCL:"+Params.getCmdLine(); outputBam["FT"]=BamHeaderContainer(bam_header,q); bam_header_t* h1=outputBam["FT"].header(); if ((fpFT = samopen(outfile.c_str(), "wb", h1)) == 0) { fprintf(stderr, "samopen: Fail to open output BAM file %s\n", filename.c_str()); exit(161); } } if (outInterChromBam) { string outfile=outputDirectory+"/"+filename+".interchrom.bam"; string sv=SpannerVersion; string q="@PG\tID:InterChromPairs\tPN:SpannerX\tVN"+sv+"\tCL:"+Params.getCmdLine(); outputBam["IC"]=BamHeaderContainer(bam_header,q); bam_header_t* h1=outputBam["IC"].header(); if ((fpIC = samopen(outfile.c_str(), "wb", h1)) == 0) { fprintf(stderr, "samopen: Fail to open output BAM file %s\n", filename.c_str()); exit(162); } } if (outUniqueMultipleBam) { string outfile=outputDirectory+"/"+filename+".uMult.bam"; string sv=SpannerVersion; string q="@PG\tID:uniqMultiplyMappedPairs\tPN:SpannerX\tVN"+sv+"\tCL:"+Params.getCmdLine(); outputBam["UM"]=BamHeaderContainer(bam_header,q); bam_header_t* h1=outputBam["IUM"].header(); if ((fpUM = samopen(outfile.c_str(), "wb", h1)) == 0) { fprintf(stderr, "samopen: Fail to open output BAM file %s\n", filename.c_str()); exit(163); } } if (outUniquePartialBam) { string outfile=outputDirectory+"/"+filename+".uPart.bam"; string sv=SpannerVersion; string q="@PG\tID:uniqPartiallyMappedPairs\tPN:SpannerX\tVN"+sv+"\tCL:"+Params.getCmdLine(); outputBam["UP"]=BamHeaderContainer(bam_header,q); bam_header_t* h1=outputBam["UP"].header(); if ((fpUP = samopen(outfile.c_str(), "wb", h1)) == 0) { fprintf(stderr, "samopen: Fail to open output BAM file %s\n", filename.c_str()); exit(164); } } if (outUniqueUnmappedBam) { string outfile=outputDirectory+"/"+filename+".uUnmapped.bam"; string sv=SpannerVersion; string q="@PG\tID:uniqUnMappedPairs\tPN:SpannerX\tVN"+sv+"\tCL:"+Params.getCmdLine(); 
outputBam["UZ"]=BamHeaderContainer(bam_header,q); bam_header_t* h1=outputBam["UZ"].header(); if ((fpUZ = samopen(outfile.c_str(), "wb", h1)) == 0) { fprintf(stderr, "samopen: Fail to open output BAM file %s\n", filename.c_str()); exit(165); } } if (outReadPairPosBam) { string outfile=outputDirectory+"/"+filename+".weirdpairs.bam"; string sv=SpannerVersion; string q="@PG\tID:weirdpairs\tPN:SpannerX\tVN"+sv+"\tCL:"+Params.getCmdLine(); outputBam["WP"]=BamHeaderContainer(bam_header,q); bam_header_t* h1=outputBam["WP"].header(); if ((fpWP = samopen(outfile.c_str(), "wb", h1)) == 0) { fprintf(stderr, "samopen: Fail to open output BAM file %s\n", filename.c_str()); exit(165); } } cout << ReadGroup << endl << endl; //extractMateMode(); if (cmd_MateMode>=0) MateMode=cmd_MateMode; BamContainerPair bampair; bool more = true; while (more) { bampair=Bam.getNextBamPair(); // skip if neither end within region more=(bampair.BamEnd.size()>1); Npair++; if (Npair>=maxReads) break; // if ( (dbg!=0)&&(elapsedtime()>float(dbg))) { time(&tprev); cout << " pairs:" << Npair << "\toutput:" << Nout; cout << "\tchr:" << bampair.BamEnd[0].b.core.tid+1; cout << "\tpos:" << bampair.BamEnd[0].b.core.pos; cout << endl; } if (!more) continue; if (region.limit) { bool overlap = false; for (int e=0; e<=1; e++) { int a1=bampair.BamEnd[e].b.core.tid; int p1=bampair.BamEnd[e].b.core.pos; int p2=p1+bampair.BamEnd[e].len; overlap=region.overlap(a1,p1,p2); if (overlap) break; } if (!overlap) continue; } bampair.Illuminize(IlluminizeBam); bampair.calcFragmentLengths(); more=(bampair.BamEnd[1].packeddata.size()>1); //if (bampair.BamEnd[0].b.core.tid==bampair.BamEnd[1].b.core.tid) // cout<< bampair << endl; bool bothmap = ((bampair.BamEnd[0].b.core.flag&BAM_FUNMAP)==0)&&((bampair.BamEnd[0].b.core.flag&BAM_FMUNMAP)==0); if (outAllPairsBam) { Nout++; int s1=samwrite(fpAP, &(bampair.BamEnd[0].b)); int s2=samwrite(fpAP, &(bampair.BamEnd[1].b)); if ((s1*s2)>0) { continue; } else { cerr << "bad write to pairs.bam" 
<< endl; exit(150); } } if (outReadPairPosBam) { int ichr1=bampair.BamEnd[0].b.core.tid+1; int istd1=bampair.BamEnd[0].sense=='+'? 0: 1; int ista1=bampair.BamEnd[0].b.core.pos+1; int iq1=bampair.BamEnd[0].q; int ichr2=bampair.BamEnd[1].b.core.tid+1; int istd2=bampair.BamEnd[1].sense=='+'? 0: 1; int ista2=bampair.BamEnd[1].b.core.pos+1; int iq2=bampair.BamEnd[1].q; FragmentPosObj fp1(0,ichr1,istd1,ista1,0,ichr2,istd2,ista2,0,iq1, iq2,0); /* if ((fp1.chr1==10)&&(fp1.start1>=89687801)&&(fp1.end1<=89700722)) { cout << "read "<< fp1 << endl; } */ if (FragPos.find(fp1)) { Nout++; int s1=samwrite(fpWP, &(bampair.BamEnd[0].b)); int s2=samwrite(fpWP, &(bampair.BamEnd[1].b)); if ((s1*s2)>0) { continue; } else { cerr << "bad write to weirdpairs.bam" << endl; exit(156); } } } bool ok[2]; for (int e=0; e<2; e++) { uint8_t* bq=bam1_qual(&(bampair.BamEnd[e].b)); int LR=bampair.BamEnd[0].b.core.l_qseq; double bok=0; for (int ib=0; ib<LR; ib++) { if (bq[ib]>SplitBaseQmin) { bok++; } } ok[e]=(bok>LRmin); } if (! 
(ok[0]&ok[1]) ) continue; if ( (outFragTailBam) & ((bampair.BamEnd[0].q>=Qmin)|(bampair.BamEnd[1].q>=Qmin)) ) { bool FT=(bampair.FragmentLength>LFhigh)|((bampair.FragmentLength<LFlow)&(bampair.FragmentLength>INT_MIN))&bothmap; if (FT && (fpFT!=0)) { Nout++; int s1=samwrite(fpFT, &(bampair.BamEnd[0].b)); int s2=samwrite(fpFT, &(bampair.BamEnd[1].b)); //if (outputBam["FT"].write(&(bampair.BamEnd[0].b),&(bampair.BamEnd[1].b))) { if ((s1*s2)>0) { continue; } else { cerr << "bad write to fragtail.bam" << endl; exit(151); } } } if ((outInterChromBam) & ((bampair.BamEnd[0].q>=Qmin)&(bampair.BamEnd[1].q>=Qmin))) { bool IC=(bampair.BamEnd[0].b.core.tid!=bampair.BamEnd[1].b.core.tid)&&bothmap; if (IC && (fpIC!=0)) { Nout++; int s1=samwrite(fpIC, &(bampair.BamEnd[0].b)); int s2=samwrite(fpIC, &(bampair.BamEnd[1].b)); if ((s1*s2)>0) { continue; } else { cerr << "bad write to interchrom.bam" << endl; exit(152); } } } if ((outUniqueMultipleBam) & ((bampair.BamEnd[0].q>=Qmin)|(bampair.BamEnd[1].q>=Qmin))){ int im=bampair.BamEnd[0].nmap>1? 0: 1; int iu=bampair.BamEnd[0].q>=Qmin? 
0: 1; bool UM=(bampair.BamEnd[iu].nmap>1)&&(iu!=im)&&bothmap; if (UM && (fpUM!=0)) { Nout++; int s1=samwrite(fpUM, &(bampair.BamEnd[0].b)); int s2=samwrite(fpUM, &(bampair.BamEnd[1].b)); if ((s1*s2)>0) { continue; } else { cerr << "bad write to uMult.bam" << endl; exit(153); } } } if ( (outUniquePartialBam) && ((bampair.BamEnd[0].q>=Qmin)|(bampair.BamEnd[1].q>=Qmin)) && bothmap) { int c0=bampair.BamEnd[0].clip[0]+bampair.BamEnd[0].clip[1]; int LR=bampair.BamEnd[0].b.core.l_qseq; bool split0=((LR-c0)>SplitBracketMin)&(c0>SplitBracketMin); int ib0=0; if ((split0)&(bampair.BamEnd[0].clip[0]>SplitBracketMin)) { ib0=bampair.BamEnd[0].clip[0]; } else if ((split0)&(bampair.BamEnd[0].clip[1]>SplitBracketMin) ) { ib0=LR-bampair.BamEnd[0].clip[1]; } split0=split0&(ib0>0); if (split0) { uint8_t* bq=bam1_qual(&(bampair.BamEnd[0].b)); for (int ib=(ib0-SplitBracketMin); ib<(ib0+SplitBracketMin); ib++) { if (bq[ib]<SplitBaseQmin) { split0=false; break; } } } int c1=bampair.BamEnd[1].clip[0]+bampair.BamEnd[1].clip[1]; LR=bampair.BamEnd[1].b.core.l_qseq; bool split1=((LR-c0)>SplitBracketMin)&(c1>SplitBracketMin);; int ib1=0; if ((split1)&(bampair.BamEnd[1].clip[0]>SplitBracketMin)) { ib1=bampair.BamEnd[1].clip[0]; } else if ((split1)&(bampair.BamEnd[1].clip[1]>SplitBracketMin) ) { ib1=LR-bampair.BamEnd[1].clip[1]; } split1=split1&(ib1>0); if (split1) { uint8_t* bq=bam1_qual(&(bampair.BamEnd[1].b)); for (int ib=(ib1-SplitBracketMin); ib<(ib1+SplitBracketMin); ib++) { if (bq[ib]<SplitBaseQmin) { split1=false; break; } } } bool UP=(split0|split1)&((c1+c0)>minLR); if (UP && (fpUP!=0)) { Nout++; int s1=samwrite(fpUP, &(bampair.BamEnd[0].b)); int s2=samwrite(fpUP, &(bampair.BamEnd[1].b)); if ((s1*s2)>0) { continue; } else { cerr << "bad write to uPart.bam" << endl; exit(154); } } } if ( (outUniqueUnmappedBam) & ((bampair.BamEnd[0].q>=Qmin)|(bampair.BamEnd[1].q>=Qmin)) ) { bool z0=((bampair.BamEnd[0].b.core.flag&BAM_FUNMAP)>0); bool z1=((bampair.BamEnd[1].b.core.flag&BAM_FUNMAP)>0); 
uint8_t* bq=bam1_qual(&(bampair.BamEnd[0].b)); for (int nb,ib=0; ib<bampair.BamEnd[0].b.core.l_qseq; ib++) { if (bq[ib]<SplitBaseQmin) { nb++; } } bool UZ=(z0|z1)&(!(z1&z0)); if (UZ && (fpUZ!=0)) { Nout++; int s1=samwrite(fpUZ, &(bampair.BamEnd[0].b)); int s2=samwrite(fpUZ, &(bampair.BamEnd[1].b)); if ((s1*s2)>0) { continue; } else { cerr << "bad write to uUnmapped.bam" << endl; exit(155); } } } //cout<< bampair.Orientation << "\t"<< bampair.FragmentLength << "\t" <<bampair.BamEnd[1].b.core.pos << endl; } if (outReadPairPosBam) { samclose(fpWP); } else { if (outAllPairsBam) { samclose(fpAP); } else { samclose(fpFT); samclose(fpIC); samclose(fpUP); samclose(fpUM); samclose(fpUZ); } } /* for (ioutputBam=outputBam.begin(); ioutputBam!=outputBam.end(); ioutputBam++) { (*ioutputBam).second.close(); } if (FragmentTailPercent>0) outputBam["FT"].close(); */ samclose(Bam.fp); }
/* Capacity (in elements) of the per-column scratch buffers used by the text output. */
enum { MPLP_SCRATCH_CAP = 10000 };

/* Append one character to a scratch buffer of MPLP_SCRATCH_CAP bytes.
 * Once the buffer is full further characters are dropped, so a very deep
 * pileup column can truncate the printed allele list but can never write
 * past the end of the array. */
static void mplp_scratch_putc(char *dst, int *len, char c)
{
	if (*len < MPLP_SCRATCH_CAP) dst[(*len)++] = c;
}

/* Reference base at index `at`, or 'N' when no reference sequence is loaded
 * or `at` lies outside [0, ref_len). */
static char mplp_ref_base(const char *ref, int ref_len, int at)
{
	return (ref && at >= 0 && at < ref_len)? ref[at] : 'N';
}

/* Length of the run of bases equal (ignoring case) to ref[anchor], walking
 * from `anchor` in direction `step` (+1 or -1) over at most 15 bases. The
 * anchor itself is the first base of the run. Returns 0 when there is no
 * reference or the anchor is outside it; the walk stops at either end of
 * the reference instead of reading beyond it. */
static int mplp_flank_run(const char *ref, int ref_len, int anchor, int step)
{
	int k, run = 0;
	if (ref == 0 || anchor < 0 || anchor >= ref_len) return 0;
	for (k = 0; k < 15; ++k) {
		int at = anchor + k * step;
		if (at < 0 || at >= ref_len) break;
		if (toupper((unsigned char)ref[at]) != toupper((unsigned char)ref[anchor])) break; // first non-identical base ends the run
		++run;
	}
	return run;
}

/*
 * Multi-sample pileup driver.
 *
 * conf  options: region (reg), reference index (fai), BED filter (bed),
 *       output flags and the read-filter thresholds used by the text output
 * n     number of input files
 * fn    input file names; "-" means read BAM from stdin
 *
 * Opens every input, reads its header (the first header is the one used for
 * target names), optionally restricts each input to conf->reg through its
 * index, then walks all pileup columns. With MPLP_GLF set, genotype
 * likelihoods are written as BCF to stdout; otherwise one text line per
 * position is printed holding, for each input, per-strand base counts, the
 * deleted and inserted sequences per strand, the number of counted and
 * struck reads, and the median absolute deviation of the read-end distance
 * of non-reference observations.
 *
 * Exits the process on an unopenable input, unreadable header, missing
 * index or unparsable region. Returns 0 otherwise.
 */
static int mpileup(mplp_conf_t *conf, int n, char **fn)
{
	extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list);
	extern void bcf_call_del_rghash(void *rghash);
	mplp_aux_t **data;
	/* ref_len starts at 0 so it is never read uninitialised when no reference is loaded */
	int i, tid, pos, *n_plp, tid0 = -1, beg0 = 0, end0 = 1u<<29, ref_len = 0, ref_tid = -1, max_depth, max_indel_depth;
	const bam_pileup1_t **plp;
	bam_mplp_t iter;
	bam_header_t *h = 0;
	char *ref;
	void *rghash = 0;
	bcf_callaux_t *bca = 0;
	bcf_callret1_t *bcr = 0;
	bcf_call_t bc;
	bcf_t *bp = 0;
	bcf_hdr_t *bh = 0;
	bam_sample_t *sm = 0;
	kstring_t buf;
	mplp_pileup_t gplp;

	memset(&gplp, 0, sizeof(mplp_pileup_t));
	memset(&buf, 0, sizeof(kstring_t));
	memset(&bc, 0, sizeof(bcf_call_t));
	data = calloc(n, sizeof(void*));
	plp = calloc(n, sizeof(void*));
	n_plp = calloc(n, sizeof(int)); // one depth counter per input (was sized with sizeof(int*))
	sm = bam_smpl_init();

	// read the header and initialize data
	for (i = 0; i < n; ++i) {
		bam_header_t *h_tmp;
		data[i] = calloc(1, sizeof(mplp_aux_t));
		data[i]->fp = strcmp(fn[i], "-") == 0? bam_dopen(fileno(stdin), "r") : bam_open(fn[i], "r");
		if (data[i]->fp == 0) {
			fprintf(stderr, "[%s] fail to open %d-th input.\n", __func__, i+1);
			exit(1);
		}
		data[i]->conf = conf;
		h_tmp = bam_header_read(data[i]->fp);
		if (h_tmp == 0) {
			fprintf(stderr, "[%s] fail to read the header of %d-th input.\n", __func__, i+1);
			exit(1);
		}
		data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet
		bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text);
		rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list);
		if (conf->reg) {
			int beg, end;
			bam_index_t *idx;
			idx = bam_index_load(fn[i]);
			if (idx == 0) {
				fprintf(stderr, "[%s] fail to load index for %d-th input.\n", __func__, i+1);
				exit(1);
			}
			if (bam_parse_region(h_tmp, conf->reg, &tid, &beg, &end) < 0) {
				fprintf(stderr, "[%s] malformatted region or wrong seqname for %d-th input.\n", __func__, i+1);
				exit(1);
			}
			if (i == 0) tid0 = tid, beg0 = beg, end0 = end;
			data[i]->iter = bam_iter_query(idx, tid, beg, end);
			bam_index_destroy(idx);
		}
		if (i == 0) h = h_tmp;
		else {
			// FIXME: to check consistency
			bam_header_destroy(h_tmp);
		}
	}
	gplp.n = sm->n;
	gplp.n_plp = calloc(sm->n, sizeof(int));
	gplp.m_plp = calloc(sm->n, sizeof(int));
	gplp.plp = calloc(sm->n, sizeof(void*));

	fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n);
	// write the VCF header
	if (conf->flag & MPLP_GLF) {
		kstring_t s;
		bh = calloc(1, sizeof(bcf_hdr_t));
		s.l = s.m = 0; s.s = 0;
		bp = bcf_open("-", (conf->flag&MPLP_NO_COMP)? "wu" : "w");
		for (i = 0; i < h->n_targets; ++i) {
			kputs(h->target_name[i], &s);
			kputc('\0', &s);
		}
		bh->l_nm = s.l;
		bh->name = malloc(s.l);
		memcpy(bh->name, s.s, s.l);
		s.l = 0;
		for (i = 0; i < sm->n; ++i) {
			kputs(sm->smpl[i], &s);
			kputc('\0', &s);
		}
		bh->l_smpl = s.l;
		bh->sname = malloc(s.l);
		memcpy(bh->sname, s.s, s.l);
		bh->txt = malloc(strlen(BAM_VERSION) + 64);
		bh->l_txt = 1 + sprintf(bh->txt, "##samtoolsVersion=%s\n", BAM_VERSION);
		free(s.s);
		bcf_hdr_sync(bh);
		bcf_hdr_write(bp, bh);
		bca = bcf_call_init(-1., conf->min_baseQ);
		bcr = calloc(sm->n, sizeof(bcf_callret1_t));
		bca->rghash = rghash;
		bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ;
		bca->min_frac = conf->min_frac;
		bca->min_support = conf->min_support;
	}
	if (tid0 >= 0 && conf->fai) { // region is set
		ref = faidx_fetch_seq(conf->fai, h->target_name[tid0], 0, 0x7fffffff, &ref_len);
		ref_tid = tid0;
		for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid0;
	} else ref_tid = -1, ref = 0;
	iter = bam_mplp_init(n, mplp_func, (void**)data);
	max_depth = conf->max_depth;
	if (max_depth * sm->n > 1<<20)
		fprintf(stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__);
	if (max_depth * sm->n < 8000) {
		max_depth = 8000 / sm->n;
		fprintf(stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth);
	}
	max_indel_depth = conf->max_indel_depth * sm->n;
	bam_mplp_set_maxcnt(iter, max_depth);

	while (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) {
		if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested
		if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue;
		if (tid != ref_tid) {
			// moved to another target: swap in its reference sequence (if any)
			free(ref); ref = 0;
			if (conf->fai) ref = faidx_fetch_seq(conf->fai, h->target_name[tid], 0, 0x7fffffff, &ref_len);
			for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid;
			ref_tid = tid;
		}
		if (conf->flag & MPLP_GLF) {
			int total_depth, _ref0, ref16;
			bcf1_t *b = calloc(1, sizeof(bcf1_t));
			for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i];
			group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG);
			_ref0 = (ref && pos < ref_len)? ref[pos] : 'N';
			ref16 = bam_nt16_table[_ref0];
			for (i = 0; i < gplp.n; ++i)
				bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i);
			bcf_call_combine(gplp.n, bcr, ref16, &bc);
			bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0,
						 (conf->flag&MPLP_FMT_SP), 0, 0);
			bcf_write(bp, bh, b);
			bcf_destroy(b);
			// call indels
			if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) {
				for (i = 0; i < gplp.n; ++i)
					bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i);
				if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) {
					b = calloc(1, sizeof(bcf1_t));
					bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0,
								 (conf->flag&MPLP_FMT_SP), bca, ref);
					bcf_write(bp, bh, b);
					bcf_destroy(b);
				}
			}
		} else {
			printf("%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N');
			for (i = 0; i < n; ++i) {
				int j;
				printf("\t%d\t", n_plp[i]);
				if (n_plp[i] == 0) {
					printf("*\t*"); // FIXME: printf() is very slow...
					if (conf->flag & MPLP_PRINT_POS) printf("\t*");
				} else {
					//MDW start
					int countChars[16][2]; // [nt16 code][strand] counts of bases without an adjacent indel
					int numStruck = 0;     // reads rejected by the filters below
					int numGood = 0;       // reads that contributed a base or an indel
					int tti;
					char insStr0[MPLP_SCRATCH_CAP]; int iCnt0 = 0; // inserted sequences, strand 0, comma-terminated
					char insStr1[MPLP_SCRATCH_CAP]; int iCnt1 = 0; // inserted sequences, strand 1
					char delStr0[MPLP_SCRATCH_CAP]; int dCnt0 = 0; // deleted reference sequences, strand 0
					char delStr1[MPLP_SCRATCH_CAP]; int dCnt1 = 0; // deleted reference sequences, strand 1
					float qposP[MPLP_SCRATCH_CAP]; int qposCnt = 0; // read-end distances of non-reference observations
					int replC, reprC, repT;
					float medqpos = -1;
					float medAbsDev = -1;

					//initialize with zeros
					memset(countChars, 0, sizeof(countChars));

					// Repeat length around the column: the run of identical bases
					// starting one position to the left (walking left) and one
					// position to the right (walking right), up to 15 bases each.
					// Not a strict homopolymer test: the base at pos itself is not compared.
					replC = mplp_flank_run(ref, ref_len, pos - 1, -1);
					reprC = mplp_flank_run(ref, ref_len, pos + 1, +1);
					repT = replC < reprC? reprC : replC;

					//for each position in the pileup column
					for (j = 0; j < n_plp[i]; ++j){
						const bam_pileup1_t *p = plp[i] + j;
						char *dstr, *istr;
						int *dlen, *ilen;
						/* SAME LOGIC AS pileup_seq() */
						if(p->is_refskip){ // never count intron gaps in numStruck
							continue;
						}
						if(p->is_del){ // skip deletion gap, after first position which is the first aligned char
							continue;
						}
						// NOTE(review): alignedQPosBeg/alignedQPosEnd, zf, ih and nmd are not part of
						// stock samtools' bam_pileup1_t; their meaning is taken from the inline
						// comments below - confirm against the struct definition.
						if( p->b->core.qual < conf->min_mqToCount || // mapping quality
							conf->maxrepC < (repT) || // max homopolymer run
							(!p->is_del && bam1_qual(p->b)[p->qpos] < conf->min_baseQ) || // base quality for matches
							p->alignedQPosBeg <= (conf->trimEnd ) || p->alignedQPosEnd <= (conf->trimEnd ) || // trimEnd is 1-based
							p->zf == 1 || // fusion tag
							p->ih > conf->maxIH || // max hit index
							(p->nmd > conf->maxNM) || // max mismatch
							(conf->flagFilter == 1 && !(p->b->core.flag&BAM_FPROPER_PAIR)) || // optionally keep only proper pairs
							(conf->flagFilter == 2 && (p->b->core.flag&BAM_FSECONDARY)) || // optionally strike secondary
							(conf->flagFilter == 3 && (p->b->core.flag&BAM_FDUP)) || // optionally strike dup
							(conf->flagFilter == 4 && (p->b->core.flag&BAM_FDUP || p->b->core.flag&BAM_FSECONDARY)) || // optionally strike secondary or dup
							(conf->flagFilter == 5 && (p->b->core.flag&BAM_FDUP || p->b->core.flag&BAM_FSECONDARY || p->b->core.flag&BAM_FQCFAIL || !(p->b->core.flag&BAM_FPROPER_PAIR) )) // optionally strike secondary, dup and QCfail
							){
							numStruck++;
							continue;
						}

						// Deletion and ref-skip entries were skipped above, so only the
						// indel field decides how this read is counted.
						if(p->indel==0){
							countChars[ bam1_seqi(bam1_seq(p->b), p->qpos) ][ bam1_strand(p->b) ] ++;
							numGood++;
						}

						// pick the per-strand output buffers once
						if(bam1_strand(p->b) ==0){
							dstr = delStr0; dlen = &dCnt0; istr = insStr0; ilen = &iCnt0;
						}else{
							dstr = delStr1; dlen = &dCnt1; istr = insStr1; ilen = &iCnt1;
						}

						if(p->indel<0){
							numGood++;
							// indel<0 refers to the next position: record the deleted reference bases
							for(tti=1;tti<= -p->indel; tti++) {
								mplp_scratch_putc(dstr, dlen, mplp_ref_base(ref, ref_len, pos+tti));
							}
							mplp_scratch_putc(dstr, dlen, ',');
						}else if(p->indel>0){
							numGood++;
							// record the inserted read bases that follow the current query position
							for(tti=1;tti<= p->indel; tti++) {
								mplp_scratch_putc(istr, ilen, bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + tti)]);
							}
							mplp_scratch_putc(istr, ilen, ',');
						}

						//calculate position of variant within aligned read - no soft clips
						if( toupper((unsigned char)mplp_ref_base(ref, ref_len, pos)) != toupper((unsigned char)bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]) || p->indel>0 || p->indel<0 ){
							//distance to end; calculate distance to end of aligned read. removes soft clips.
							int distToEnd = (p->alignedQPosBeg < p->alignedQPosEnd) ? p->alignedQPosBeg : p->alignedQPosEnd;
							if (qposCnt < MPLP_SCRATCH_CAP) qposP[qposCnt++] = distToEnd;
						}
					}

					//print A,C,G,T, by +/-
					// NOTE(review): the fifth pair uses nt16 index 7; 'N' is code 15 in
					// bam_nt16_rev_table ("=ACMGRSVTWYHKDBN"). Left unchanged - confirm intent.
					printf("\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d",
						   countChars[1][0],countChars[1][1],
						   countChars[2][0],countChars[2][1],
						   countChars[4][0],countChars[4][1],
						   countChars[8][0],countChars[8][1],
						   countChars[7][0],countChars[7][1]);

					putchar('\t'); fwrite(delStr0, 1, dCnt0, stdout);
					putchar('\t'); fwrite(delStr1, 1, dCnt1, stdout);
					putchar('\t'); fwrite(insStr0, 1, iCnt0, stdout);
					putchar('\t'); fwrite(insStr1, 1, iCnt1, stdout);

					printf("\t%d\t%d",numGood,numStruck);

					// get non-ref qpos variation: median absolute deviation of the read-end distances
					if(qposCnt>0){
						medqpos = median(qposCnt,qposP);
						// qposP is not needed afterwards, so the deviations overwrite it in place.
						// The absolute value is taken in floating point: integer abs() would
						// truncate a x.5 median difference.
						for(tti=0;tti<qposCnt;tti++){
							float dev = medqpos - qposP[tti];
							qposP[tti] = dev < 0 ? -dev : dev;
						}
						// all qposCnt deviations take part (the count used to be qposCnt-1)
						medAbsDev = median(qposCnt,qposP);
					}
					printf("\t%f",medAbsDev);
					///END MDW
				}
			}
			putchar('\n');
		}
	}

	// release everything; the BCF-side handles are 0 here when MPLP_GLF was not set
	bcf_close(bp);
	bam_smpl_destroy(sm); free(buf.s);
	for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]);
	free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp);
	bcf_call_del_rghash(rghash);
	bcf_hdr_destroy(bh); bcf_call_destroy(bca); free(bc.PL); free(bcr);
	bam_mplp_destroy(iter);
	bam_header_destroy(h);
	for (i = 0; i < n; ++i) {
		bam_close(data[i]->fp);
		if (data[i]->iter) bam_iter_destroy(data[i]->iter);
		free(data[i]);
	}
	free(data); free(plp); free(ref); free(n_plp);
	return 0;
}
int main_ld(int argc, char *argv[]) { int chr; //! chromosome identifier int beg; //! beginning coordinate for analysis int end; //! end coordinate for analysis int ref; //! ref long num_windows; //! number of windows std::string msg; //! string for error message bam_plbuf_t *buf; //! pileup buffer ldData t; // parse the command line options std::string region = t.parseCommandLine(argc, argv); // check input BAM file for errors t.checkBAM(); // initialize the sample data structure t.bam_smpl_init(); // add samples t.bam_smpl_add(); // initialize error model t.em = errmod_init(1.0-0.83); // parse genomic region int k = bam_parse_region(t.h, region, &chr, &beg, &end); if (k < 0) { msg = "Bad genome coordinates: " + region; fatal_error(msg, __FILE__, __LINE__, 0); } // fetch reference sequence t.ref_base = faidx_fetch_seq(t.fai_file, t.h->target_name[chr], 0, 0x7fffffff, &(t.len)); // calculate the number of windows if (t.flag & BAM_WINDOW) num_windows = ((end-beg)-1)/t.win_size; else { t.win_size = (end-beg); num_windows = 1; } // iterate through all windows along specified genomic region for (long cw=0; cw < num_windows; cw++) { // construct genome coordinate string std::string scaffold_name(t.h->target_name[chr]); std::ostringstream winc(scaffold_name); winc.seekp(0, std::ios::end); winc << ":" << beg+(cw*t.win_size)+1 << "-" << ((cw+1)*t.win_size)+(beg-1); std::string winCoord = winc.str(); // initialize number of sites to zero t.num_sites = 0; // parse the BAM file and check if region is retrieved from the reference if (t.flag & BAM_WINDOW) { k = bam_parse_region(t.h, winCoord, &ref, &(t.beg), &(t.end)); if (k < 0) { msg = "Bad window coordinates " + winCoord; fatal_error(msg, __FILE__, __LINE__, 0); } } else { ref = chr; t.beg = beg; t.end = end; if (ref < 0) { msg = "Bad scaffold name: " + region; fatal_error(msg, __FILE__, __LINE__, 0); } } // initialize nucdiv variables t.init_ld(); // create population assignments t.assign_pops(); // initialize pileup buf = 
bam_plbuf_init(make_ld, &t); // fetch region from bam file if ((bam_fetch(t.bam_in->x.bam, t.idx, ref, t.beg, t.end, buf, fetch_func)) < 0) { msg = "Failed to retrieve region " + region + " due to corrupted BAM index file"; fatal_error(msg, __FILE__, __LINE__, 0); } // finalize pileup bam_plbuf_push(0, buf); // calculate linkage disequilibrium statistics ld_func fp[3] = {&ldData::calc_zns, &ldData::calc_omegamax, &ldData::calc_wall}; (t.*fp[t.output])(); // print results to stdout t.print_ld(chr); // take out the garbage t.destroy_ld(); bam_plbuf_destroy(buf); } // end of window interation errmod_destroy(t.em); samclose(t.bam_in); bam_index_destroy(t.idx); t.bam_smpl_destroy(); free(t.ref_base); return 0; }
int main(int argc,char* argv[]) { time_t timestamp, current; int i,j,k; int a,n; char *pc; FILE *input_file; FILE *output_file; FILE* log_file=stderr; bamFile bam_input; bam_header_t *header; bam1_t* b; bam1_core_t *c; char cps_file_name[MAXFILEBUFFLENGTH]=""; char bam_file_name[MAXFILEBUFFLENGTH]=""; char out_file_name[MAXFILEBUFFLENGTH]=""; char log_file_name[MAXFILEBUFFLENGTH]=""; char buff[MAXFILEBUFFLENGTH]; char chr[MAXFILEBUFFLENGTH]; int beg, beg_prev, end, pos, offset; int ref_id, ref_id_prev, label; int s, side; int read_type, mapped_strand; char ch; int limit_counts = 0; int* contig_count[2]; int* contig_index[2]; splice_site** contig_sites[2]; long int n_reads[N_READ_TYPES][2]; long int n_total_reads = 0; long int n_skipped_reads = 0; int max_intron_length=0; int min_intron_length=0; int ignore_gene_labels = 0; int stranded = 1; int rev_compl[2] = {1,0}; int other_end, the_end, donor_id, acceptor_id; int *cigar; int flagged = 0; int margin = 4; /** reading input from the command line **/ timestamp = time(NULL); if(argc==1) { fprintf(stderr, "BAM2SSJ is the utility for fast counting reads covering splice junctions\nCommand line use:\n"); fprintf(stderr, "%s -cps <cps_file> -bam <bam_file> [-out <out_file>] [-log <log_file>] [-maxlen <max_intron_length>] [-minlen <min_intron_length>] [-margin <length>] ",argv[0]); fprintf(stderr, "[-v suppress verbose output] [-read1 0/1] [-read2 0/1] [-g ignore gene labels] [-u unstranded] [-f count reads flagged 0x800 only]\ntype %s -h for more info\n",argv[0]); exit(1); } for(i=1;i<argc;i++) { pc = argv[i]; if(*pc == '-') { if(strcmp(pc+1,"cps") == 0) sscanf(argv[++i], "%s", &cps_file_name[0]); if(strcmp(pc+1,"bam") == 0) sscanf(argv[++i], "%s", &bam_file_name[0]); if(strcmp(pc+1,"out") == 0) sscanf(argv[++i], "%s", &out_file_name[0]); if(strcmp(pc+1,"log") == 0) sscanf(argv[++i], "%s", &log_file_name[0]); if(strcmp(pc+1,"read1") == 0) sscanf(argv[++i], "%i", &rev_compl[0]); if(strcmp(pc+1,"read2") == 0) 
sscanf(argv[++i], "%i", &rev_compl[1]); if(strcmp(pc+1,"lim") == 0) sscanf(argv[++i], "%i", &limit_counts); if(strcmp(pc+1,"minlen") == 0) sscanf(argv[++i], "%i", &min_intron_length); if(strcmp(pc+1,"maxlen") == 0) sscanf(argv[++i], "%i", &max_intron_length); if(strcmp(pc+1,"margin") == 0) sscanf(argv[++i], "%i", &margin); if(strcmp(pc+1,"v") == 0) verbose = 0; if(strcmp(pc+1,"g") == 0) ignore_gene_labels = 1; if(strcmp(pc+1,"u") == 0) stranded = 0; if(strcmp(pc+1,"f") == 0) flagged = 1; if(strcmp(pc+1,"h") ==0 ) { fprintf(stderr, "Input: (1) sorted BAM file\n"); fprintf(stderr, "\t(2) CPS (chromosome-position-strand) tab-delimited file sorted by position (chr1 100 + etc)\n\n"); fprintf(stderr, "\tIn order to get CPS file from gtf, use the utility gtf2cps.sh\n"); fprintf(stderr, "\tImportant: CPS must be sorted by position ONLY!\n\n"); fprintf(stderr, "\tIf the 4th column contains (a numeric) gene label then only splice junctions within the same gene will be considered (unless the '-g' option is active)\n"); fprintf(stderr, "\tThe utility to generate CPS with gene labels is gtf2cps_with_gene_id.sh (or update the script accordingly if you are using genome other than human)\n\n"); fprintf(stderr, "Options:\n"); fprintf(stderr, "\t-maxlen <upper limit on intron length>; 0 = no limit (default=%i)\n",max_intron_length); fprintf(stderr, "\t-minlen <lower limit on intron length>; 0 = no limit (default=%i)\n",min_intron_length); fprintf(stderr, "\t-margin <length> minimum number of flanking nucleotides in the read in order to support SJ or cover EB, (default=%i)\n",margin); fprintf(stderr, "\t-read1 0/1, reverse complement read1 no/yes (default=%i)\n",rev_compl[0]); fprintf(stderr, "\t-read2 0/1, reverse complement read2 no/yes (default=%i)\n",rev_compl[1]); fprintf(stderr, "\t-g ignore gene labels (column 4 of cps), default=%s\n", ignore_gene_labels ? "ON" : "OFF"); fprintf(stderr, "\t-u ignore strand (all reads map to the correct strand), default=%s\n", stranded ? 
"OFF" : "ON"); fprintf(stderr, "\t-f count only reads that are flagged 0x800 (uniquely mapped reads), default=%s\n", flagged ? "ON" : "OFF"); fprintf(stderr, "Output: tab-delimited (default=stdout)\n"); fprintf(stderr, "\tColumn 1 is splice_junction_id\n"); fprintf(stderr, "\tColumns 2-6 are counts of 53, 5X, X3, 50, and 03 reads for the correct (annotated) strand\n"); fprintf(stderr, "\tColumns 7-11 are similar counts for the incorrect (opposite to annotated) strand\n"); fprintf(stderr, "Descriptive read statistics are reported to stderr\n"); exit(1); } } } if(log_file_name[0]==0) { log_file = stderr; } else { log_file = fopen(log_file_name,"w"); if(log_file == NULL) log_file = stderr; } if(bam_file_name[0]==0) { fprintf(log_file,"Bam not specified, exiting\n"); exit(1); } if(cps_file_name[0]==0) { fprintf(log_file,"Input not specified, exiting\n"); exit(1); } if(out_file_name[0]==0) { fprintf(log_file,"[Warning: output set to stdout]\n"); output_file = stdout; } else { output_file = fopen(out_file_name,"w"); if(output_file == NULL) { fprintf(log_file,"[Warning: output set to stdout]\n"); output_file = stdout; } } if(max_intron_length>0) { if(verbose) fprintf(log_file,"[Warning: set max intron length=%i]\n",max_intron_length); } if(ignore_gene_labels) { if(verbose) fprintf(log_file,"[Warning: ignoring gene labels (column 4)]\n"); } if(flagged) { if(verbose) fprintf(log_file,"[Warning: only look at reads flagged 0x800]\n"); } if(margin>0) { if(verbose) fprintf(log_file,"[Warning: read margin set to %i]\n", margin); } if(verbose) { for(s = 0; s < 2; s++) if(rev_compl[s]) fprintf(log_file,"[Warning: take reverse complement of read %i]\n", s+1); fprintf(log_file,"[Warning: stranded = %s]\n", stranded ? 
"TRUE" : "FALSE (always correct strand)"); if(ignore_gene_labels) fprintf(log_file,"[Warning: ignore gene labels (column 4)]\n"); } for(i = 0; i < N_READ_TYPES; i++) for(s = 0; s < 2; s++) n_reads[i][s] = 0; /** initatializing BAM and header **/ bam_input = bam_open(bam_file_name, "r"); header = bam_header_read(bam_input); if(bam_input == NULL || header == NULL) { fprintf(log_file,"BAM can't be opened or contains no header, exiting\n"); exit(1); } /** reading input from CPS **/ input_file = fopen(cps_file_name, "r"); if(input_file == NULL) { fprintf(log_file,"CPS can't be opened, exiting\n"); exit(1); } /** populating gene structure arrays **/ for(s = 0; s < 2; s++) { contig_count[s] = (int*) malloc(sizeof(int) * (header->n_targets + ARRAY_MARGIN)); contig_index[s] = (int*) malloc(sizeof(int) * (header->n_targets + ARRAY_MARGIN)); contig_sites[s] = (splice_site**) malloc(sizeof(splice_site*) * (header->n_targets + ARRAY_MARGIN)); if(contig_count[s] == NULL || contig_sites[s] == NULL || contig_index[s] == NULL) { fprintf(log_file, "Not enought memory, exiting\n"); exit(1); } } for(s = 0; s < 2; s++) for(i=0; i < header->n_targets; i++) contig_count[s][i] = contig_index[s][i] = 0; if(verbose) fprintf(log_file, "Reading %s pass1", cps_file_name); while(fgets(buff, MAXFILEBUFFLENGTH, input_file)) { sscanf(buff, "%s %*i %c", &chr[0], &ch); bam_parse_region(header, chr, &i, &beg, &end); s = (ch == '+' ? 
0 : 1); if(i < header->n_targets && i>=0) contig_count[s][i]++; } for(s = 0; s < 2; s++) { for(i = 0;i < header->n_targets; i++) { contig_sites[s][i] = (splice_site*) malloc(sizeof(splice_site) * (contig_count[s][i] + ARRAY_MARGIN)); if(contig_sites[s][i] == NULL) { fprintf(log_file, "Not enought memory, exiting\n"); exit(1); } } } if(verbose) fprintf(log_file, "\n"); if(verbose) fprintf(log_file, "Reading %s pass2",cps_file_name); fseek(input_file, 0, SEEK_SET); while(fgets(buff, MAXFILEBUFFLENGTH, input_file)) { sscanf(buff, "%s %i %c %i", &chr[0], &pos, &ch, &label); bam_parse_region(header, chr, &i, &beg, &end); s = (ch == '+' ? 0 : 1); if(i < header->n_targets && i>=0) { if(contig_index[s][i]>0) { if(pos < contig_sites[s][i][contig_index[s][i]-1].pos) { fprintf(log_file, "Splice sites weren't sorted, exiting\n"); exit(1); } } contig_sites[s][i][contig_index[s][i]].pos = pos; contig_sites[s][i][contig_index[s][i]].label = ignore_gene_labels ? 0 : label; for(side = 0; side < 2; side++) { contig_sites[s][i][contig_index[s][i]].count00[side] = 0; contig_sites[s][i][contig_index[s][i]].count5X[side] = 0; contig_sites[s][i][contig_index[s][i]].countX3[side] = 0; contig_sites[s][i][contig_index[s][i]].junctions = NULL; } contig_index[s][i]++; } } if(verbose) fprintf(log_file, "\n"); for(s = 0; s < 2; s++) for(i = 0;i < header->n_targets; i++) contig_index[s][i] = 0; /** analysis starts here **/ b = bam_init1(); k = 0; ref_id_prev = -1; beg_prev = -1; while(bam_read1(bam_input, b)>=0) { c = &b->core; ref_id = c->tid; if(ref_id<0) continue; if(flagged && ((c->flag & 0x800) == 0)) { n_skipped_reads++; continue; } if(stranded && ((c->flag & BAM_FREAD1) && (c->flag & BAM_FREAD2) || !(c->flag & BAM_FREAD1) && !(c->flag & BAM_FREAD2))) { n_skipped_reads++; continue; } cigar = bam1_cigar(b); if(ref_id != ref_id_prev && ref_id_prev >= 0) { if(contig_index[0][ref_id_prev] + contig_index[1][ref_id_prev] < contig_count[0][ref_id_prev] + contig_count[1][ref_id_prev]) { 
if(log_file==stderr) progressbar(1, 1, header->target_name[ref_id_prev], verbose); } beg_prev = -1; } /*if(ref_id < ref_id_prev) { fprintf(log_file,"BAM file wasn't sorted, exiting\n"); exit(1); }*/ ref_id_prev = ref_id; beg = c->pos + 1; if(beg < beg_prev) { fprintf(log_file,"BAM file wasn't sorted, exiting\n"); exit(1); } beg_prev = beg; s = ((c->flag & BAM_FREVERSE)>0); mapped_strand = (c->flag & BAM_FREAD1) ? (s + rev_compl[0]) & 1 : (s + rev_compl[1]) & 1; the_end = bam_calend(c, cigar); for(s = 0; s < 1 + stranded; s++) { end = beg; side = (s == mapped_strand) ? 0 : 1; side *= stranded; // keep reading until the currect site is on the same chromosome downstream of the read while(contig_sites[s][ref_id][contig_index[s][ref_id]].pos < beg && contig_index[s][ref_id] < contig_count[s][ref_id]) { contig_index[s][ref_id]++; if(log_file==stderr) progressbar(contig_index[0][ref_id]+contig_index[1][ref_id], contig_count[0][ref_id]+contig_count[1][ref_id], header->target_name[ref_id], verbose); } read_type = RT_OTHER; if(contig_index[s][ref_id]<contig_count[s][ref_id]) { // check if the read is a split read and find its other end read_type = RT_GENOME; for(i = 0; i < c->n_cigar; i++) { offset = cigar[i] >> 4; switch(cigar[i] & 0x0F) { case BAM_CMATCH: end += offset; // match to the reference break; case BAM_CINS: end += 0; // insertion to the reference, pointer stays unchanged break; case BAM_CDEL: end += offset; // deletion from the reference (technically the same as 'N') pointer moves break; case BAM_CREF_SKIP: other_end = end + offset; donor_id = acceptor_id = -INFTY; if(end - beg < margin) break; if(the_end - other_end < margin) break; for(j = contig_index[s][ref_id]; contig_sites[s][ref_id][j].pos <= other_end && j < contig_count[s][ref_id];j++) { if(contig_sites[s][ref_id][j].pos - end < min_intron_length && min_intron_length > 0) continue; if(contig_sites[s][ref_id][j].pos - end > max_intron_length && max_intron_length > 0) break; 
if(contig_sites[s][ref_id][j].label == contig_sites[s][ref_id][contig_index[s][ref_id]].label) { if(contig_sites[s][ref_id][j].pos == end - 1) donor_id = j; if(contig_sites[s][ref_id][j].pos == other_end) acceptor_id = j; } } if(donor_id>0 && acceptor_id>0) { update_count(&contig_sites[s][ref_id][donor_id].junctions, acceptor_id, side); contig_sites[s][ref_id][donor_id].count5X[side]++; contig_sites[s][ref_id][acceptor_id].countX3[side]++; read_type = RT_KJUNCT; } else { read_type = RT_UJUNCT; } end = other_end; break; case BAM_CSOFT_CLIP: case BAM_CHARD_CLIP: case BAM_CPAD: break; default: read_type = RT_OTHER; } } if(read_type == RT_GENOME) { for(j=contig_index[s][ref_id]; beg + margin <= contig_sites[s][ref_id][j].pos && contig_sites[s][ref_id][j].pos < end - margin && j<contig_count[s][ref_id]; j++) { contig_sites[s][ref_id][j].count00[side]++; read_type = RT_OVRLAP; k++; } } } n_reads[read_type][side]++; } n_total_reads++; if(k>limit_counts && limit_counts>0) break; }