bool YTranscriptFetcher::fetchBAMTranscripts(const char* filename, const char *refName, unsigned int start, unsigned int end, std::vector<YTranscript*> *transcripts,std::set<std::string> *transcriptNames) { //Open the region in the bam file fetch_data_t data; fetch_data_t *d = &data; d->beg = start-1-buffer; d->end = end+buffer; d->transcripts = transcripts; d->requestedTranscripts = transcriptNames; d->in = samopen(filename, "rb", 0); if (d->in == 0) { fprintf(stderr, "Failed to open BAM file %s\n", filename); return 0; } bam_index_t *idx; idx = bam_index_load(filename); // load BAM index if (idx == 0) { fprintf(stderr, "BAM indexing file is not available.\n"); return 0; } bam_init_header_hash(d->in->header); d->tid = bam_get_tid(d->in->header, refName); if(d->tid == -1) { fprintf(stderr, "Reference id %s not found in BAM file",refName); return 0; } bam_fetch(d->in->x.bam, idx, d->tid, d->beg, d->end, d, fetch_func); bam_index_destroy(idx); samclose(d->in); return 1; }
/*
 * Fill table T with the reads of BAM file `reads_fn` that overlap the
 * intervals in `is` and lie on the same strand as the interval.
 *
 * The table is created with one slot per target sequence of the BAM header and
 * receives a private copy of every target name. Failure to open the BAM file
 * or its index is reported through failf().
 */
void hash_reads( table* T, const char* reads_fn, interval_stack* is )
{
    samfile_t* bam_in = samopen( reads_fn, "rb", NULL );
    if( bam_in == NULL ) {
        failf( "Can't open bam file '%s'.", reads_fn );
    }

    bam_index_t* bam_idx = bam_index_load( reads_fn );
    if( bam_idx == NULL ) {
        failf( "Can't open bam index '%s.bai'.", reads_fn );
    }

    bam_init_header_hash( bam_in->header );

    /* set up the table and copy the sequence names out of the header */
    table_create( T, bam_in->header->n_targets );
    T->seq_names = (char**)malloc( sizeof(char*) * bam_in->header->n_targets );
    for( size_t t = 0; t < bam_in->header->n_targets; t++ ) {
        T->seq_names[t] = strdup(bam_in->header->target_name[t]);
    }

    log_puts( LOG_MSG, "hashing reads ... \n" );
    log_indent();

    bam1_t* aln = bam_init1();

    for( interval_stack::iterator ival = is->begin(); ival != is->end(); ival++ ) {
        /* intervals on sequences unknown to the BAM header are skipped */
        const int tid = bam_get_tid( bam_in->header, ival->seqname );
        if( tid >= 0 ) {
            bam_iter_t cursor = bam_iter_query( bam_idx, tid, ival->start, ival->end );

            while( bam_iter_read( bam_in->x.bam, cursor, aln ) >= 0 ) {
                /* only reads on the interval's strand are counted */
                if( bam1_strand(aln) == ival->strand ) {
                    table_inc( T, aln );
                }
            }

            bam_iter_destroy(cursor);
        }
    }

    bam_destroy1(aln);

    log_unindent();
    log_printf( LOG_MSG, "done. (%zu unique reads hashed)\n", T->m );

    bam_index_destroy(bam_idx);
    samclose(bam_in);
}
static int load_discordant_reads(MEI_data& mei_data, std::vector<bam_info>& bam_sources, const std::string& chr_name, const SearchWindow& window, UserDefinedSettings* userSettings) { // Loop over associated bam files. for (size_t i = 0; i < bam_sources.size(); i++) { // Locate file. bam_info source = bam_sources.at(i); LOG_DEBUG(*logStream << time_log() << "Loading discordant reads from " << source.BamFile << std::endl); // Setup link to bamfile, its index and header. bamFile fp = bam_open(source.BamFile.c_str(), "r"); bam_index_t *idx = bam_index_load(source.BamFile.c_str()); if (idx == NULL) { LOG_WARN(*logStream << time_log() << "Failed to load index for " << source.BamFile.c_str() << std::endl); LOG_WARN(*logStream << "Skipping window: " << chr_name << ", " << window.getStart() << "--" << window.getEnd() << " for BAM-file: " << source.BamFile.c_str() << std::endl); continue; } bam_header_t *header = bam_header_read(fp); bam_init_header_hash(header); int tid = bam_get_tid(header, chr_name.c_str()); if (tid < 0) { LOG_WARN(*logStream << time_log() << "Could not find sequence in alignment file: '" << chr_name << "'" << std::endl); LOG_WARN(*logStream << "Skipping window: " << chr_name << ", " << window.getStart() << "--" << window.getEnd() << " for BAM-file: " << source.BamFile.c_str() << std::endl); continue; } mei_data.sample_names = get_sample_dictionary(header); // Save insert size of current bamfile in data object provided for callback function. // Note: the insert size should ideally be separate from the MEI_data object, tried to do // this using a std::pair object, which did not work. Suggestions are welcome here. mei_data.current_insert_size = source.InsertSize; mei_data.current_chr_name = chr_name; // Set up environment variable for callback function. std::pair<MEI_data*, UserDefinedSettings*> env = std::make_pair(&mei_data, userSettings); // Load discordant reads into mei_data. 
bam_fetch(fp, idx, tid, window.getStart(), window.getEnd(), &env, fetch_disc_read_callback); bam_index_destroy(idx); } return 0; }
/* Parse an alignment error profile from the stats file at path into
 * alnerrprof.
 *
 * Each non-comment line holds four whitespace-separated fields: target name,
 * position, error proportion and count. Lines starting with '#' are ignored.
 * The position is decremented by one before it is used as an index. The
 * target name is resolved to a tid through bam_header; lines naming a target
 * that is missing from the header are reported and skipped.
 *
 * Will return non-0 on error (file cannot be opened, a line cannot be parsed,
 * a position is smaller than 1, or memory cannot be grown). The parsed error
 * profile is written to alnerrprof; its values are allocated here and should
 * be freed with free_alnerrprof. If the file cannot be opened, alnerrprof is
 * left empty (num_targets 0, NULL arrays).
 *
 * After parsing, each per-target array is shrunk to the highest position seen
 * for that target; targets without data get a NULL array of length 0.
 */
int parse_alnerrprof_statsfile(alnerrprof_t *alnerrprof, const char *path, bam_header_t *bam_header)
{
     char line[BUF_SIZE];
     int i;
     int *max_obs_pos;
     const int default_read_len = 250;
     int free_bam_header_hash = 0;
     int rc;
     FILE *in = fopen(path, "r");

     if (NULL == in) {
          LOG_ERROR("Couldn't open error profile file %s\n", path);
          /* leave a well-defined, empty profile behind */
          alnerrprof->num_targets = 0;
          alnerrprof->prop_len = NULL;
          alnerrprof->props = NULL;
          return 1;
     }

     /* needed for finding tid from tname */
     if (bam_header->hash == 0) {
          bam_init_header_hash(bam_header);
          free_bam_header_hash = 1;
     }

     max_obs_pos = calloc(bam_header->n_targets, sizeof(int));

     alnerrprof->num_targets = bam_header->n_targets;
     alnerrprof->prop_len = calloc(alnerrprof->num_targets, sizeof(int));
     alnerrprof->props = calloc(alnerrprof->num_targets, sizeof(double *));
     for (i=0; i<alnerrprof->num_targets; i++) {
          alnerrprof->prop_len[i] = default_read_len;/* default alloc here and realloc later */
          alnerrprof->props[i] = calloc(alnerrprof->prop_len[i], sizeof(double));
     }
     i=-1; /* make sure value is not reused by accident; triggers clang warning though */

     while (NULL != fgets(line, BUF_SIZE, in)) {
          int pos = -1;
          char tname[BUF_SIZE];
          double prop = -1;
          unsigned long int count = -1;
          int tid = -1;

          if (line[0]=='#') {
               continue;
          }

          if (4 != sscanf(line, "%s\t%d\t%lg\t%lu\n", tname, &pos, &prop, &count)) {
               LOG_ERROR("Couldn't parse line %s\n", line);
               rc = 1;
               goto free_and_exit;
          }

          assert(prop>=0.0 && prop<=1.0);

          pos = pos - 1;
          /* a position below 1 would index in front of the array */
          if (pos < 0) {
               LOG_ERROR("Invalid position in line %s\n", line);
               rc = 1;
               goto free_and_exit;
          }
          assert(pos<MAX_READ_LEN);

          tid = bam_get_tid(bam_header, tname);
          if (-1 == tid) {
               LOG_ERROR("Target name '%s' found in error profile doesn't match any of the sequences in BAM header. Skipping and trying to continue...\n", tname);
               continue;
          }
          assert(tid<alnerrprof->num_targets);

          /* for later downsizing */
          if (pos+1 > max_obs_pos[tid]) {
               max_obs_pos[tid] = pos+1;
          }

          /* upsize if necessary */
          while (pos >= alnerrprof->prop_len[tid]) {
               const int old_len = alnerrprof->prop_len[tid];
               const int new_len = 2 * old_len;
               double *grown;
               int j;

               LOG_DEBUG("upsizing pos+1=%d alnerrprof->prop_len[tid=%d]=%d\n\n", pos+1, tid, alnerrprof->prop_len[tid]);
               grown = realloc(alnerrprof->props[tid], new_len * sizeof(double));
               if (NULL == grown) {
                    /* the old buffer is still owned by alnerrprof */
                    LOG_ERROR("Couldn't allocate memory while parsing line %s\n", line);
                    rc = 1;
                    goto free_and_exit;
               }
               /* realloc does not zero the new tail; keep it consistent with
                * the calloc'ed part so unseen positions read as 0 */
               for (j = old_len; j < new_len; j++) {
                    grown[j] = 0.0;
               }
               alnerrprof->props[tid] = grown;
               alnerrprof->prop_len[tid] = new_len;
          }
          alnerrprof->props[tid][pos] = prop;
     }

     /* downsize */
     for (i=0; i<alnerrprof->num_targets; i++) {
          if (max_obs_pos[i]) {
               double *shrunk;

               LOG_DEBUG("downsizing alnerrprof->prop_len[tid=%d] to max %d\n", i, max_obs_pos[i]);
               shrunk = realloc(alnerrprof->props[i], max_obs_pos[i] * sizeof(double));
               if (NULL != shrunk) {
                    alnerrprof->props[i] = shrunk;
               } /* else: shrinking failed, the larger buffer stays valid */
          } else {
               free(alnerrprof->props[i]);/* no data for this tid: free */
               alnerrprof->props[i] = NULL;/* don't leave a dangling pointer behind */
          }
          alnerrprof->prop_len[i] = max_obs_pos[i];
     }

#if 0
     {/* fixme report */
          for (i=0; i<alnerrprof->num_targets; i++) {
               int j;
               fprintf(stderr, "tid=%d len=%d: ", i, alnerrprof->prop_len[i]);
               for (j=0; j<alnerrprof->prop_len[i]; j++) {
                    fprintf(stderr, " %d:%g ", j, alnerrprof->props[i][j]);
               }
               fprintf(stderr, "\n");
               fprintf(stderr, "median for tid %d: %g for size %d\n",
                       i, dbl_median(alnerrprof->props[i], alnerrprof->prop_len[i]),
                       alnerrprof->prop_len[i]);
          }
     }
#endif

     rc = 0;

free_and_exit:

     free(max_obs_pos);

     free_bam_header_hash = 0; /* FIXME segfaults often for unknown reason */
     if (free_bam_header_hash) {
          bam_destroy_header_hash(bam_header);
     }
     fclose(in);

     return rc;
}
/** * Collects sufficient statistics from read for variants to be genotyped. * * The VCF records in the buffer must never occur before */ void BCFSingleGenotypingBufferedReader::process_read(bam_hdr_t *h, bam1_t *s) { //wrap bam1_t in AugmentBAMRecord as.initialize(h, s); uint32_t tid = bam_get_tid(s); uint32_t beg1 = as.beg1; uint32_t end1 = as.end1; //collect statistics for variant records that are in the buffer and overlap with the read GenotypingRecord* g; for (std::list<GenotypingRecord*>::iterator i=buffer.begin(); i!=buffer.end(); ++i) { g = *i; // std::cerr << g->pos1 << " " << g->beg1 << " " << g->end1 << " "; //same chromosome if (tid==g->rid) { if (end1 < g->beg1) { //can terminate return; } else if (beg1 > g->end1) { //this should not occur if the buffer was flushed before invoking process read continue; } //else if (beg1 <= g->beg1 && g->end1 <= end1) else if (beg1 <= g->pos1 && g->pos1 <= end1) { // collect_sufficient_statistics(*i, as); } else { //other types of overlap, just ignore } // std::cerr << "\n"; } //prior chromosome else if (tid<g->rid) { //this should not occur if the buffer was flushed before invoking process read return; } //latter chromosome else if (tid>g->rid) { //in case if the buffer has a VCF record later in the list which matches it continue; } } //you will only reach here if a read occurs after or overlaps the last record in the buffer //adding new VCF records and collecting statistics if necessary bcf1_t *v = bcf_init(); while (odr->read(v)) { int32_t vtype = vm->classify_variant(odr->hdr, v, variant); g = create_genotyping_record(odr->hdr, v, 2, variant); buffer.push_back(g); if (tid==g->rid) { //if (end1>=g->beg1 && pos1<=g->end1) if (beg1 <= g->pos1 && g->pos1 <= end1) { // collect_sufficient_statistics(g, as); } } //VCF record occurs after the read if (tid < g->rid || end1 < g->beg1) { return; } else { v = bcf_init(); } } //this means end of file bcf_destroy(v); }
/**
 * Removes records from the front of the buffer.
 *
 * With flush_all set, every remaining record of the reference genotyping file
 * is read into the buffer first and the whole buffer is then emptied.
 * Otherwise records are removed only while they lie before read s: on an
 * earlier chromosome, or on the same chromosome with end1 before the read's
 * position.
 *
 * The genotype_and_print call is currently commented out, so removed records
 * are deleted without being printed.
 */
void BCFSingleGenotypingBufferedReader::flush(bam_hdr_t *h, bam1_t *s, bool flush_all)
{
    if (!flush_all)
    {
        //partial flush: stop at the first record the read has not passed yet
        uint32_t tid = bam_get_tid(s);

        while (!buffer.empty())
        {
            GenotypingRecord* rec = buffer.front();

            bool passed;
            if (tid == rec->rid)
            {
                passed = bam_get_pos1(s) > rec->end1;
            }
            else
            {
                passed = tid > rec->rid;
            }

            if (!passed)
            {
                return;
            }

            delete rec;
            buffer.pop_front();
        }

        return;
    }

    //full flush: read all the remaining records from the reference genotyping file
    bcf1_t *v = bcf_init();
    while (odr->read(v))
    {
        (void) vm->classify_variant(odr->hdr, v, variant);
        buffer.push_back(create_genotyping_record(odr->hdr, v, 2, variant));
        v = bcf_init();
    }
    bcf_destroy(v);

    //then drop everything that is buffered
    while (!buffer.empty())
    {
        GenotypingRecord* rec = buffer.front();
        delete rec;
        buffer.pop_front();
    }
}
int main_bedcov(int argc, char *argv[]) { extern void bam_init_header_hash(bam_header_t*); gzFile fp; kstring_t str; kstream_t *ks; bam_index_t **idx; bam_header_t *h = 0; aux_t **aux; int *n_plp, dret, i, n, c, min_mapQ = 0; int64_t *cnt; const bam_pileup1_t **plp; while ((c = getopt(argc, argv, "Q:")) >= 0) { switch (c) { case 'Q': min_mapQ = atoi(optarg); break; } } if (optind + 2 > argc) { fprintf(stderr, "Usage: samtools bedcov <in.bed> <in1.bam> [...]\n"); return 1; } memset(&str, 0, sizeof(kstring_t)); n = argc - optind - 1; aux = calloc(n, sizeof(aux_t*)); idx = calloc(n, sizeof(bam_index_t*)); for (i = 0; i < n; ++i) { aux[i] = calloc(1, sizeof(aux_t)); aux[i]->min_mapQ = min_mapQ; aux[i]->fp = bam_open(argv[i+optind+1], "r"); idx[i] = bam_index_load(argv[i+optind+1]); if (aux[i]->fp == 0 || idx[i] == 0) { fprintf(stderr, "ERROR: fail to open index BAM file '%s'\n", argv[i+optind+1]); return 2; } bgzf_set_cache_size(aux[i]->fp, 20); if (i == 0) h = bam_header_read(aux[0]->fp); } bam_init_header_hash(h); cnt = calloc(n, 8); fp = gzopen(argv[optind], "rb"); ks = ks_init(fp); n_plp = calloc(n, sizeof(int)); plp = calloc(n, sizeof(bam_pileup1_t*)); while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) { char *p, *q; int tid, beg, end, pos; bam_mplp_t mplp; for (p = q = str.s; *p && *p != '\t'; ++p); if (*p != '\t') goto bed_error; *p = 0; tid = bam_get_tid(h, q); *p = '\t'; if (tid < 0) goto bed_error; for (q = p = p + 1; isdigit(*p); ++p); if (*p != '\t') goto bed_error; *p = 0; beg = atoi(q); *p = '\t'; for (q = p = p + 1; isdigit(*p); ++p); if (*p == '\t' || *p == 0) { int c = *p; *p = 0; end = atoi(q); *p = c; } else goto bed_error; for (i = 0; i < n; ++i) { if (aux[i]->iter) bam_iter_destroy(aux[i]->iter); aux[i]->iter = bam_iter_query(idx[i], tid, beg, end); } mplp = bam_mplp_init(n, read_bam, (void**)aux); bam_mplp_set_maxcnt(mplp, 64000); memset(cnt, 0, 8 * n); while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) if (pos >= beg && pos < end) for (i 
= 0; i < n; ++i) cnt[i] += n_plp[i]; for (i = 0; i < n; ++i) { kputc('\t', &str); kputl(cnt[i], &str); } puts(str.s); bam_mplp_destroy(mplp); continue; bed_error: fprintf(stderr, "Errors in BED line '%s'\n", str.s); } free(n_plp); free(plp); ks_destroy(ks); gzclose(fp); free(cnt); for (i = 0; i < n; ++i) { if (aux[i]->iter) bam_iter_destroy(aux[i]->iter); bam_index_destroy(idx[i]); bam_close(aux[i]->fp); free(aux[i]); } bam_header_destroy(h); free(aux); free(idx); free(str.s); return 0; }
int32_t bam_streamer:: target_name_to_id(const char* seq_name) const { return bam_get_tid(_bfp->header,seq_name); }