bool YTranscriptFetcher::fetchBAMTranscripts(const char* filename, const char *refName, unsigned int start, unsigned int end, std::vector<YTranscript*> *transcripts,std::set<std::string> *transcriptNames) { //Open the region in the bam file fetch_data_t data; fetch_data_t *d = &data; d->beg = start-1-buffer; d->end = end+buffer; d->transcripts = transcripts; d->requestedTranscripts = transcriptNames; d->in = samopen(filename, "rb", 0); if (d->in == 0) { fprintf(stderr, "Failed to open BAM file %s\n", filename); return 0; } bam_index_t *idx; idx = bam_index_load(filename); // load BAM index if (idx == 0) { fprintf(stderr, "BAM indexing file is not available.\n"); return 0; } bam_init_header_hash(d->in->header); d->tid = bam_get_tid(d->in->header, refName); if(d->tid == -1) { fprintf(stderr, "Reference id %s not found in BAM file",refName); return 0; } bam_fetch(d->in->x.bam, idx, d->tid, d->beg, d->end, d, fetch_func); bam_index_destroy(idx); samclose(d->in); return 1; }
/*
 * Tally the reads of an indexed BAM file that fall inside the given
 * intervals.
 *
 * T is created with one slot per reference sequence of the BAM header and
 * receives a strdup'ed copy of every sequence name in T->seq_names. For each
 * interval whose sequence name is present in the header, every read returned
 * by the index query whose strand equals the interval's strand is passed to
 * table_inc. Intervals naming an unknown sequence are skipped.
 *
 * failf is called when the BAM file or its index cannot be opened.
 */
void hash_reads( table* T, const char* reads_fn, interval_stack* is )
{
    samfile_t* bam_in = samopen( reads_fn, "rb", NULL );
    if( bam_in == NULL ) {
        failf( "Can't open bam file '%s'.", reads_fn );
    }

    bam_index_t* bam_idx = bam_index_load( reads_fn );
    if( bam_idx == NULL ) {
        failf( "Can't open bam index '%s.bai'.", reads_fn );
    }

    /* bam_get_tid needs the name hash of the header */
    bam_init_header_hash( bam_in->header );

    table_create( T, bam_in->header->n_targets );

    /* keep private copies of the sequence names in the table */
    T->seq_names = (char**)malloc( sizeof(char*) * bam_in->header->n_targets );
    size_t seq = 0;
    while( seq < bam_in->header->n_targets ) {
        T->seq_names[seq] = strdup(bam_in->header->target_name[seq]);
        seq++;
    }

    log_puts( LOG_MSG, "hashing reads ... \n" );
    log_indent();

    bam1_t* aln = bam_init1();

    interval_stack::iterator region = is->begin();
    for( ; region != is->end(); ++region ) {
        const int ref_id = bam_get_tid( bam_in->header, region->seqname );
        if( ref_id >= 0 ) {
            bam_iter_t cursor = bam_iter_query( bam_idx, ref_id, region->start, region->end );

            while( bam_iter_read( bam_in->x.bam, cursor, aln ) >= 0 ) {
                if( bam1_strand(aln) != region->strand ) continue;
                table_inc( T, aln );
            }

            bam_iter_destroy(cursor);
        }
    }

    bam_destroy1(aln);

    log_unindent();
    log_printf( LOG_MSG, "done. (%zu unique reads hashed)\n", T->m );

    bam_index_destroy(bam_idx);
    samclose(bam_in);
}
// Construct a map linking tid's to sequence names (chromosome names). This code // assumes all input bam files have identical set of sequence (identical both in // name and order). std::map<int, std::string> get_sequence_name_dictionary(ControlState& state) { std::map<int, std::string> dict; std::vector<bam_info>::iterator bam_info_iter; for (bam_info_iter = state.bams_to_parse.begin(); bam_info_iter != state.bams_to_parse.end(); ++bam_info_iter) { bamFile fp = bam_open((*bam_info_iter).BamFile.c_str(), "r"); bam_header_t *header = bam_header_read(fp); bam_init_header_hash(header); for (int tid = 0; tid < header->n_targets; tid++) { dict.insert(std::make_pair(tid, header->target_name[tid])); } break; // Skip other bam files, the sequences should be identical. } return dict; }
static int load_discordant_reads(MEI_data& mei_data, std::vector<bam_info>& bam_sources, const std::string& chr_name, const SearchWindow& window, UserDefinedSettings* userSettings) { // Loop over associated bam files. for (size_t i = 0; i < bam_sources.size(); i++) { // Locate file. bam_info source = bam_sources.at(i); LOG_DEBUG(*logStream << time_log() << "Loading discordant reads from " << source.BamFile << std::endl); // Setup link to bamfile, its index and header. bamFile fp = bam_open(source.BamFile.c_str(), "r"); bam_index_t *idx = bam_index_load(source.BamFile.c_str()); if (idx == NULL) { LOG_WARN(*logStream << time_log() << "Failed to load index for " << source.BamFile.c_str() << std::endl); LOG_WARN(*logStream << "Skipping window: " << chr_name << ", " << window.getStart() << "--" << window.getEnd() << " for BAM-file: " << source.BamFile.c_str() << std::endl); continue; } bam_header_t *header = bam_header_read(fp); bam_init_header_hash(header); int tid = bam_get_tid(header, chr_name.c_str()); if (tid < 0) { LOG_WARN(*logStream << time_log() << "Could not find sequence in alignment file: '" << chr_name << "'" << std::endl); LOG_WARN(*logStream << "Skipping window: " << chr_name << ", " << window.getStart() << "--" << window.getEnd() << " for BAM-file: " << source.BamFile.c_str() << std::endl); continue; } mei_data.sample_names = get_sample_dictionary(header); // Save insert size of current bamfile in data object provided for callback function. // Note: the insert size should ideally be separate from the MEI_data object, tried to do // this using a std::pair object, which did not work. Suggestions are welcome here. mei_data.current_insert_size = source.InsertSize; mei_data.current_chr_name = chr_name; // Set up environment variable for callback function. std::pair<MEI_data*, UserDefinedSettings*> env = std::make_pair(&mei_data, userSettings); // Load discordant reads into mei_data. 
bam_fetch(fp, idx, tid, window.getStart(), window.getEnd(), &env, fetch_disc_read_callback); bam_index_destroy(idx); } return 0; }
/*
 * Build a BAM header from a reference hash.
 *
 * Each value of the hash packs two numbers: truncated to int it is the index
 * of the reference in the target arrays, and shifted right by 32 bits it is
 * the reference length. The key is the reference name; the header stores the
 * key pointer itself, not a copy.
 *
 * The returned header has n_targets set to the number of hash entries and its
 * name hash initialised.
 */
static bam_header_t *hash2header(const kh_ref_t *hash)
{
	bam_header_t *hdr = bam_header_init();
	khiter_t it;

	hdr->n_targets = kh_size(hash);
	hdr->target_name = (char**)calloc(kh_size(hash), sizeof(char*));
	hdr->target_len = (uint32_t*)calloc(kh_size(hash), 4);

	it = kh_begin(hash);
	while (it != kh_end(hash)) {
		if (kh_exist(hash, it)) {
			/* low bits select the slot, high 32 bits carry the length */
			int slot = (int)kh_value(hash, it);
			hdr->target_len[slot] = kh_value(hash, it)>>32;
			hdr->target_name[slot] = (char*)kh_key(hash, it);
		}
		++it;
	}

	bam_init_header_hash(hdr);
	return hdr;
}
/*
 * Read the binary BAM header from fp.
 *
 * The function first checks for the BGZF EOF marker (only warning when it is
 * absent), then reads the "BAM\1" magic, the plain header text, the number of
 * reference sequences and, for each reference, its name and length. The name
 * hash of the header is initialised before returning.
 *
 * Every read is validated: a short read, a negative reference count, a
 * non-positive name length or a failed allocation is treated as a truncated
 * or corrupt header instead of continuing with uninitialised lengths.
 *
 * Returns the newly allocated header, or 0 when the input is not a BAM file
 * or its header is truncated or corrupt.
 */
bam_header_t *bam_header_read(bamFile fp)
{
	bam_header_t *header;
	char buf[4];
	int magic_len;
	int32_t i = 1, name_len;
	// check EOF
	i = bgzf_check_EOF(fp);
	if (i < 0) {
		// If the file is a pipe, checking the EOF marker will *always* fail
		// with ESPIPE. Suppress the error message in this case.
		if (errno != ESPIPE) perror("[bam_header_read] bgzf_check_EOF");
	} else if (i == 0) fprintf(stderr, "[bam_header_read] EOF marker is absent. The input is probably truncated.\n");
	// read "BAM1"
	magic_len = bam_read(fp, buf, 4);
	if (magic_len != 4 || strncmp(buf, "BAM\001", 4) != 0) {
		fprintf(stderr, "[bam_header_read] invalid BAM binary header (this is not a BAM file).\n");
		return 0;
	}
	header = bam_header_init();
	// read plain text and the number of reference sequences
	if (bam_read(fp, &header->l_text, 4) != 4) goto fail;
	if (bam_is_be) bam_swap_endian_4p(&header->l_text);
	header->text = (char*)calloc(header->l_text + 1, 1);
	if (header->text == 0) goto fail;
	if ((int64_t)bam_read(fp, header->text, header->l_text) != (int64_t)header->l_text) goto fail;
	if (bam_read(fp, &header->n_targets, 4) != 4) goto fail;
	if (bam_is_be) bam_swap_endian_4p(&header->n_targets);
	if (header->n_targets < 0) goto fail;
	// read reference sequence names and lengths
	header->target_name = (char**)calloc(header->n_targets, sizeof(char*));
	header->target_len = (uint32_t*)calloc(header->n_targets, 4);
	// calloc(0, ...) may legitimately return 0, so only a non-empty table is checked
	if (header->n_targets > 0 && (header->target_name == 0 || header->target_len == 0)) goto fail;
	for (i = 0; i != header->n_targets; ++i) {
		if (bam_read(fp, &name_len, 4) != 4) goto fail;
		if (bam_is_be) bam_swap_endian_4p(&name_len);
		if (name_len <= 0) goto fail;
		header->target_name[i] = (char*)calloc(name_len, 1);
		if (header->target_name[i] == 0) goto fail;
		if (bam_read(fp, header->target_name[i], name_len) != name_len) goto fail;
		if (bam_read(fp, &header->target_len[i], 4) != 4) goto fail;
		if (bam_is_be) bam_swap_endian_4p(&header->target_len[i]);
	}
	bam_init_header_hash(header);
	return header;

fail:
	fprintf(stderr, "[bam_header_read] truncated or corrupt BAM header.\n");
	if (header->target_name == 0) {
		// No name table exists: make sure the destructor has nothing to walk.
		free(header->target_len);
		header->target_len = 0;
		header->n_targets = 0;
	}
	// Names not yet read are still 0 from calloc, so the partially filled
	// header can be handed to the regular destructor.
	bam_header_destroy(header);
	return 0;
}
bam_header_t *bam_header_dup(const bam_header_t *h0) { bam_header_t *h; int i; h = bam_header_init(); *h = *h0; h->hash = h->dict = h->rg2lib = 0; h->text = (char*)calloc(h->l_text + 1, 1); memcpy(h->text, h0->text, h->l_text); h->target_len = (uint32_t*)calloc(h->n_targets, 4); h->target_name = (char**)calloc(h->n_targets, sizeof(void*)); for (i = 0; i < h->n_targets; ++i) { h->target_len[i] = h0->target_len[i]; h->target_name[i] = strdup(h0->target_name[i]); } bam_init_header_hash(h); return h; }
/* will return non-0 on error. parsed error prof will be written to
 * alnerrprof. values are allocated here and should be freed with
 * free_alnerrprof
 *
 * path is read line by line; lines starting with '#' are skipped. Every
 * other line must hold four whitespace-separated fields: target name,
 * 1-based position, proportion and count. The proportion is stored in
 * alnerrprof->props[tid][position-1], where tid is looked up from the
 * target name in bam_header. Lines naming a target that is not in
 * bam_header are reported and skipped.
 *
 * After parsing, each per-target array is shrunk to the largest position
 * seen for that target; targets without any data get a length of 0.
 *
 * Returns 1 if path cannot be opened, if a line cannot be parsed, if a
 * position is smaller than 1 or if memory runs out. Returns 0 otherwise.
 */
int parse_alnerrprof_statsfile(alnerrprof_t *alnerrprof, const char *path, bam_header_t *bam_header)
{
    char line[BUF_SIZE];
    int i;
    int *max_obs_pos;
    const int default_read_len = 250;
    int free_bam_header_hash = 0;
    int rc;
    FILE *in;

    in = fopen(path, "r");
    if (NULL == in) {
        LOG_ERROR("Couldn't open %s\n", path);
        return 1;
    }

    /* needed for finding tid from tname */
    if (bam_header->hash == 0) {
        bam_init_header_hash(bam_header);
        free_bam_header_hash = 1;
    }

    max_obs_pos = calloc(bam_header->n_targets, sizeof(int));

    alnerrprof->num_targets = bam_header->n_targets;
    alnerrprof->prop_len = calloc(alnerrprof->num_targets, sizeof(int));
    alnerrprof->props = calloc(alnerrprof->num_targets, sizeof(double *));
    for (i=0; i<alnerrprof->num_targets; i++) {
        alnerrprof->prop_len[i] = default_read_len;/* default alloc here and realloc later */
        alnerrprof->props[i] = calloc(alnerrprof->prop_len[i], sizeof(double));
    }
    i=-1; /* make sure value is not reused by accident; triggers clang warning though */

    while (NULL != fgets(line, BUF_SIZE, in)) {
        int pos = -1;
        char tname[BUF_SIZE];
        double prop = -1;
        unsigned long int count = -1;
        int tid = -1;

        if (line[0]=='#') {
            continue;
        }

        if (4 != sscanf(line, "%s\t%d\t%lg\t%lu\n", tname, &pos, &prop, &count)) {
            LOG_ERROR("Couldn't parse line %s\n", line);
            rc = 1;
            goto free_and_exit;
        }

        assert(prop>=0.0 && prop<=1.0);

        pos = pos - 1;
        /* a position below 1 would index before the start of the array */
        if (pos < 0) {
            LOG_ERROR("Invalid position in line %s\n", line);
            rc = 1;
            goto free_and_exit;
        }
        assert(pos<MAX_READ_LEN);

        tid = bam_get_tid(bam_header, tname);
        if (-1 == tid) {
            LOG_ERROR("Target name '%s' found in error profile doesn't match any of the sequences in BAM header. Skipping and trying to continue...\n", tname);
            continue;
        }
        assert(tid<alnerrprof->num_targets);

        /* for later downsizing */
        if (pos+1 > max_obs_pos[tid]) {
            max_obs_pos[tid] = pos+1;
        }

        /* upsize if necessary */
        while (pos >= alnerrprof->prop_len[tid]) {
            int old_len = alnerrprof->prop_len[tid];
            int new_len = old_len * 2;
            int j;
            double *grown;

            LOG_DEBUG("upsizing pos+1=%d alnerrprof->prop_len[tid=%d]=%d\n\n", pos+1, tid, alnerrprof->prop_len[tid]);
            grown = realloc(alnerrprof->props[tid], new_len * sizeof(double));
            if (NULL == grown) {
                /* the old array is still valid and owned by alnerrprof */
                LOG_ERROR("Out of memory while parsing %s\n", path);
                rc = 1;
                goto free_and_exit;
            }
            /* realloc leaves the new tail uninitialised; zero it like the
             * initial calloc did so unseen positions read as 0 */
            for (j=old_len; j<new_len; j++) {
                grown[j] = 0.0;
            }
            alnerrprof->props[tid] = grown;
            alnerrprof->prop_len[tid] = new_len;
        }
        alnerrprof->props[tid][pos] = prop;
    }

    /* downsize */
    for (i=0; i<alnerrprof->num_targets; i++) {
        if (max_obs_pos[i]) {
            double *shrunk;
            LOG_DEBUG("downsizing alnerrprof->prop_len[tid=%d] to max %d\n", i, max_obs_pos[i]);
            shrunk = realloc(alnerrprof->props[i], max_obs_pos[i] * sizeof(double));
            if (NULL != shrunk) {
                /* on failure the original (larger) block stays in place */
                alnerrprof->props[i] = shrunk;
            }
        } else {
            free(alnerrprof->props[i]);/* no data for this tid: free */
            alnerrprof->props[i] = NULL;/* don't leave a dangling pointer */
        }
        alnerrprof->prop_len[i] = max_obs_pos[i];
    }

#if 0
    {/* fixme report */
        for (i=0; i<alnerrprof->num_targets; i++) {
            int j;
            fprintf(stderr, "tid=%d len=%d: ", i, alnerrprof->prop_len[i]);
            for (j=0; j<alnerrprof->prop_len[i]; j++) {
                fprintf(stderr, " %d:%g ", j, alnerrprof->props[i][j]);
            }
            fprintf(stderr, "\n");
            fprintf(stderr, "median for tid %d: %g for size %d\n",
                    i, dbl_median(alnerrprof->props[i], alnerrprof->prop_len[i]), alnerrprof->prop_len[i]);
        }
    }
#endif

    rc = 0;

free_and_exit:
    free(max_obs_pos);

    free_bam_header_hash = 0; /* FIXME segfaults often for unknown reason */
    if (free_bam_header_hash) {
        bam_destroy_header_hash(bam_header);
    }
    fclose(in);

    return rc;
}
int main_bedcov(int argc, char *argv[]) { extern void bam_init_header_hash(bam_header_t*); gzFile fp; kstring_t str; kstream_t *ks; bam_index_t **idx; bam_header_t *h = 0; aux_t **aux; int *n_plp, dret, i, n, c, min_mapQ = 0; int64_t *cnt; const bam_pileup1_t **plp; while ((c = getopt(argc, argv, "Q:")) >= 0) { switch (c) { case 'Q': min_mapQ = atoi(optarg); break; } } if (optind + 2 > argc) { fprintf(stderr, "Usage: samtools bedcov <in.bed> <in1.bam> [...]\n"); return 1; } memset(&str, 0, sizeof(kstring_t)); n = argc - optind - 1; aux = calloc(n, sizeof(aux_t*)); idx = calloc(n, sizeof(bam_index_t*)); for (i = 0; i < n; ++i) { aux[i] = calloc(1, sizeof(aux_t)); aux[i]->min_mapQ = min_mapQ; aux[i]->fp = bam_open(argv[i+optind+1], "r"); idx[i] = bam_index_load(argv[i+optind+1]); if (aux[i]->fp == 0 || idx[i] == 0) { fprintf(stderr, "ERROR: fail to open index BAM file '%s'\n", argv[i+optind+1]); return 2; } bgzf_set_cache_size(aux[i]->fp, 20); if (i == 0) h = bam_header_read(aux[0]->fp); } bam_init_header_hash(h); cnt = calloc(n, 8); fp = gzopen(argv[optind], "rb"); ks = ks_init(fp); n_plp = calloc(n, sizeof(int)); plp = calloc(n, sizeof(bam_pileup1_t*)); while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) { char *p, *q; int tid, beg, end, pos; bam_mplp_t mplp; for (p = q = str.s; *p && *p != '\t'; ++p); if (*p != '\t') goto bed_error; *p = 0; tid = bam_get_tid(h, q); *p = '\t'; if (tid < 0) goto bed_error; for (q = p = p + 1; isdigit(*p); ++p); if (*p != '\t') goto bed_error; *p = 0; beg = atoi(q); *p = '\t'; for (q = p = p + 1; isdigit(*p); ++p); if (*p == '\t' || *p == 0) { int c = *p; *p = 0; end = atoi(q); *p = c; } else goto bed_error; for (i = 0; i < n; ++i) { if (aux[i]->iter) bam_iter_destroy(aux[i]->iter); aux[i]->iter = bam_iter_query(idx[i], tid, beg, end); } mplp = bam_mplp_init(n, read_bam, (void**)aux); bam_mplp_set_maxcnt(mplp, 64000); memset(cnt, 0, 8 * n); while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) if (pos >= beg && pos < end) for (i 
= 0; i < n; ++i) cnt[i] += n_plp[i]; for (i = 0; i < n; ++i) { kputc('\t', &str); kputl(cnt[i], &str); } puts(str.s); bam_mplp_destroy(mplp); continue; bed_error: fprintf(stderr, "Errors in BED line '%s'\n", str.s); } free(n_plp); free(plp); ks_destroy(ks); gzclose(fp); free(cnt); for (i = 0; i < n; ++i) { if (aux[i]->iter) bam_iter_destroy(aux[i]->iter); bam_index_destroy(idx[i]); bam_close(aux[i]->fp); free(aux[i]); } bam_header_destroy(h); free(aux); free(idx); free(str.s); return 0; }