/**
 * Runs fetch_func over every alignment of a BAM file that overlaps a region.
 *
 * The region passed to bam_fetch is [start-1-buffer, end+buffer) on refName.
 * The callback receives a fetch_data_t carrying the output vector
 * (transcripts) and the set of requested names (transcriptNames).
 *
 * @param filename         path of the BAM file; its index is loaded from the same path
 * @param refName          reference sequence name to look up in the BAM header
 * @param start            region start, reduced by 1 and by `buffer` before the query
 * @param end              region end, extended by `buffer` before the query
 * @param transcripts      handed to the callback through fetch_data_t
 * @param transcriptNames  handed to the callback through fetch_data_t
 * @return true when the region was fetched; false when the file or its index
 *         could not be opened or refName is not in the header. Every handle
 *         opened here is released on all paths.
 */
bool YTranscriptFetcher::fetchBAMTranscripts(const char* filename, const char *refName, unsigned int start, unsigned int end, std::vector<YTranscript*> *transcripts,std::set<std::string> *transcriptNames) {
    fetch_data_t data;
    // NOTE(review): start-1-buffer is unsigned arithmetic and wraps when
    // start <= buffer -- confirm callers never pass such a start.
    data.beg = start-1-buffer;
    data.end = end+buffer;
    data.transcripts = transcripts;
    data.requestedTranscripts = transcriptNames;

    //Open the region in the bam file
    data.in = samopen(filename, "rb", 0);
    if (data.in == 0) {
        fprintf(stderr, "Failed to open BAM file %s\n", filename);
        return false;
    }

    bam_index_t *idx = bam_index_load(filename); // load BAM index
    if (idx == 0) {
        fprintf(stderr, "BAM indexing file is not available.\n");
        samclose(data.in); // do not leak the already opened file
        return false;
    }

    bam_init_header_hash(data.in->header);
    data.tid = bam_get_tid(data.in->header, refName);
    if (data.tid == -1) {
        fprintf(stderr, "Reference id %s not found in BAM file\n",refName);
        bam_index_destroy(idx);
        samclose(data.in);
        return false;
    }

    bam_fetch(data.in->x.bam, idx, data.tid, data.beg, data.end, &data, fetch_func);
    bam_index_destroy(idx);
    samclose(data.in);
    return true;
}
Exemple #2
0
/*
 * Fill table T with the reads of the BAM file reads_fn that fall inside the
 * intervals of `is`.
 *
 * T is created with one slot per target sequence of the BAM header and
 * receives a copy of every target name. For each interval whose sequence
 * name is known to the header, the reads returned by the index query are
 * passed to table_inc when their strand equals the interval's strand.
 * Intervals on sequences missing from the header are skipped.
 */
void hash_reads( table* T, const char* reads_fn, interval_stack* is )
{
    samfile_t* bam = samopen( reads_fn, "rb", NULL );
    if( bam == NULL ) {
        failf( "Can't open bam file '%s'.", reads_fn );
    }

    bam_index_t* bam_idx = bam_index_load( reads_fn );
    if( bam_idx == NULL ) {
        failf( "Can't open bam index '%s.bai'.", reads_fn );
    }

    /* required before bam_get_tid can resolve names */
    bam_init_header_hash( bam->header );

    table_create( T, bam->header->n_targets );
    T->seq_names = (char**)malloc( sizeof(char*) * bam->header->n_targets );
    for( size_t name_idx = 0; name_idx < bam->header->n_targets; name_idx++ ) {
        T->seq_names[name_idx] = strdup(bam->header->target_name[name_idx]);
    }

    log_puts( LOG_MSG, "hashing reads ... \n" );
    log_indent();

    /* one record buffer, reused for every read of every interval */
    bam1_t* rec = bam_init1();

    for( interval_stack::iterator region = is->begin(); region != is->end(); ++region ) {
        const int target = bam_get_tid( bam->header, region->seqname );
        if( target >= 0 ) {
            bam_iter_t cursor = bam_iter_query( bam_idx, target,
                                                region->start, region->end );

            while( bam_iter_read( bam->x.bam, cursor, rec ) >= 0 ) {
                if( bam1_strand(rec) != region->strand ) continue;
                table_inc( T, rec );
            }

            bam_iter_destroy(cursor);
        }
    }

    bam_destroy1(rec);

    log_unindent();
    log_printf( LOG_MSG, "done. (%zu unique reads hashed)\n", T->m );

    bam_index_destroy(bam_idx);
    samclose(bam);
}
Exemple #3
0
// Feeds the alignments of every BAM source that overlap `window` on chr_name
// to fetch_disc_read_callback, which receives (&mei_data, userSettings).
//
// A source is skipped with a warning when the file cannot be opened, when its
// index cannot be loaded, or when chr_name is absent from its header. Every
// handle opened for a source (file, index, header) is released before moving
// on to the next one, on the skip paths as well. Always returns 0.
static int load_discordant_reads(MEI_data& mei_data, std::vector<bam_info>& bam_sources, const std::string& chr_name,
                                 const SearchWindow& window, UserDefinedSettings* userSettings) {
    // Loop over associated bam files.
    for (size_t i = 0; i < bam_sources.size(); i++) {
        // Locate file (by reference: nothing below modifies the source list).
        const bam_info& source = bam_sources.at(i);
        
        LOG_DEBUG(*logStream << time_log() << "Loading discordant reads from " << source.BamFile << std::endl);
        
        // Setup link to bamfile, its index and header.
        bamFile fp = bam_open(source.BamFile.c_str(), "r");
        if (fp == NULL) {
            LOG_WARN(*logStream << time_log() << "Failed to open " << source.BamFile.c_str() << std::endl);
            LOG_WARN(*logStream << "Skipping window: " << chr_name << ", " << window.getStart() << "--" <<
                     window.getEnd() << " for BAM-file: " << source.BamFile.c_str() << std::endl);
            continue;
        }
        
        bam_index_t *idx = bam_index_load(source.BamFile.c_str());
        if (idx == NULL) {
            LOG_WARN(*logStream << time_log() << "Failed to load index for " << source.BamFile.c_str() << std::endl);
            LOG_WARN(*logStream << "Skipping window: " << chr_name << ", " << window.getStart() << "--" <<
                     window.getEnd() << " for BAM-file: " << source.BamFile.c_str() << std::endl);
            bam_close(fp);
            continue;
        }
        
        bam_header_t *header = bam_header_read(fp);
        bam_init_header_hash(header);
        int tid = bam_get_tid(header, chr_name.c_str());
        
        if (tid < 0) {
            LOG_WARN(*logStream << time_log() << "Could not find sequence in alignment file: '" << chr_name <<
                     "'" << std::endl);
            LOG_WARN(*logStream << "Skipping window: " << chr_name << ", " << window.getStart() << "--" <<
                     window.getEnd() << " for BAM-file: " << source.BamFile.c_str() << std::endl);
            bam_header_destroy(header);
            bam_index_destroy(idx);
            bam_close(fp);
            continue;
        }
        
        // NOTE(review): the header is destroyed below, so this assumes
        // get_sample_dictionary copies what it needs out of it -- confirm.
        mei_data.sample_names = get_sample_dictionary(header);
        
        // Save insert size of current bamfile in data object provided for callback function.
        // Note: the insert size should ideally be separate from the MEI_data object, tried to do
        // this using a std::pair object, which did not work.  Suggestions are welcome here.
        mei_data.current_insert_size = source.InsertSize;
        mei_data.current_chr_name = chr_name;
        
        // Set up environment variable for callback function.
        std::pair<MEI_data*, UserDefinedSettings*> env = std::make_pair(&mei_data, userSettings);
        
        // Load discordant reads into mei_data.
        bam_fetch(fp, idx, tid, window.getStart(), window.getEnd(), &env, fetch_disc_read_callback);
        
        bam_header_destroy(header);
        bam_index_destroy(idx);
        bam_close(fp);
    }
    return 0;
}
Exemple #4
0
/* Parses the alignment error profile statistics file at path into
 * alnerrprof. Will return non-0 on error (file can't be opened, a
 * line can't be parsed, a position is smaller than 1 or memory runs
 * out). Values are allocated here and should be freed with
 * free_alnerrprof.
 *
 * Lines starting with '#' are skipped. Every other line must hold
 * four whitespace separated fields: target name, 1-based position,
 * error proportion and a count (the count is parsed but not used).
 * Target names missing from bam_header are reported and skipped.
 * After parsing, each target's array is shrunk to the highest
 * position seen for it; targets without data get length 0 and a
 * NULL array.
 */
int
parse_alnerrprof_statsfile(alnerrprof_t *alnerrprof, const char *path, bam_header_t *bam_header)
{
     char line[BUF_SIZE];
     int i;
     int *max_obs_pos;
     const int default_read_len = 250;
     int free_bam_header_hash = 0;
     int rc;
     FILE *in = fopen(path, "r");


     /* needed for finding tid from tname */
     if (bam_header->hash == 0) {
          bam_init_header_hash(bam_header);
          free_bam_header_hash = 1;
     }

     max_obs_pos = calloc(bam_header->n_targets, sizeof(int));

     alnerrprof->num_targets = bam_header->n_targets;
     alnerrprof->prop_len = calloc(alnerrprof->num_targets, sizeof(int));
     alnerrprof->props = calloc(alnerrprof->num_targets, sizeof(double *));
     for (i=0; i<alnerrprof->num_targets; i++) {
          alnerrprof->prop_len[i] = default_read_len;/* default alloc here and realloc later */
          alnerrprof->props[i] = calloc(alnerrprof->prop_len[i], sizeof(double));
     }
     i=-1; /* make sure value is not reused by accident; triggers clang warning though */

     /* checked only now so that alnerrprof is in the same (allocated)
      * state on every error path */
     if (NULL == in) {
          LOG_ERROR("Couldn't open %s\n", path);
          rc = 1;
          goto free_and_exit;
     }

     while (NULL != fgets(line, BUF_SIZE, in)) {
          int pos = -1;
          char tname[BUF_SIZE];
          double prop = -1;
          unsigned long int count = -1;
          int tid = -1;
          if (line[0]=='#') {
               continue;
          }

          if (4 != sscanf(line, "%s\t%d\t%lg\t%lu\n", tname, &pos, &prop, &count)) {
               LOG_ERROR("Couldn't parse line %s\n", line);
               rc = 1;
               goto free_and_exit;
          }

          assert(prop>=0.0 && prop<=1.0);

          /* file positions are 1-based, array indices 0-based */
          pos = pos - 1;
          if (pos < 0) {
               /* would otherwise write in front of the array */
               LOG_ERROR("Invalid position in line %s\n", line);
               rc = 1;
               goto free_and_exit;
          }
          assert(pos<MAX_READ_LEN);

          tid = bam_get_tid(bam_header, tname);
          if (-1 == tid) {
               LOG_ERROR("Target name '%s' found in error profile doesn't match any of the sequences in BAM header. Skipping and trying to continue...\n", tname);
               continue;
          }
          assert(tid<alnerrprof->num_targets);

          /* for later downsizing */
          if (pos+1 > max_obs_pos[tid]) {
               max_obs_pos[tid] = pos+1;
          }

          /* upsize if necessary */
          while (pos >= alnerrprof->prop_len[tid]) {
               int old_len = alnerrprof->prop_len[tid];
               int new_len = old_len * 2;
               int j;
               double *grown;

               LOG_DEBUG("upsizing pos+1=%d alnerrprof->prop_len[tid=%d]=%d\n\n", pos+1, tid, alnerrprof->prop_len[tid]);
               grown = realloc(alnerrprof->props[tid], new_len * sizeof(double));
               if (NULL == grown) {
                    /* old array is still valid and owned by alnerrprof */
                    LOG_ERROR("Out of memory while parsing %s\n", path);
                    rc = 1;
                    goto free_and_exit;
               }
               /* realloc leaves the new tail uninitialised; positions
                * without an entry must read as 0 like the calloc'ed part */
               for (j=old_len; j<new_len; j++) {
                    grown[j] = 0.0;
               }
               alnerrprof->props[tid] = grown;
               alnerrprof->prop_len[tid] = new_len;
          }
          alnerrprof->props[tid][pos] = prop;
     }

     /* downsize */
     for (i=0; i<alnerrprof->num_targets; i++) {
          if (max_obs_pos[i]) {
               double *shrunk;
               LOG_DEBUG("downsizing alnerrprof->prop_len[tid=%d] to max %d\n", i, max_obs_pos[i]);
               shrunk = realloc(alnerrprof->props[i], max_obs_pos[i] * sizeof(double));
               if (NULL != shrunk) {
                    /* on failure the larger original array stays in place */
                    alnerrprof->props[i] = shrunk;
               }
          } else {
               free(alnerrprof->props[i]);/* no data for this tid: free */
               alnerrprof->props[i] = NULL;/* don't leave a dangling pointer */
          }
          alnerrprof->prop_len[i] = max_obs_pos[i];
     }

#if 0
     {/* fixme report */
          for (i=0; i<alnerrprof->num_targets; i++) {
               int j;
               fprintf(stderr, "tid=%d len=%d: ", i, alnerrprof->prop_len[i]);
               for (j=0; j<alnerrprof->prop_len[i]; j++) {
                    fprintf(stderr, " %d:%g ", j, alnerrprof->props[i][j]);
               }
               fprintf(stderr, "\n");
               fprintf(stderr, "median for tid %d: %g for size %d\n",
                       i,
                       dbl_median(alnerrprof->props[i], alnerrprof->prop_len[i]),
                       alnerrprof->prop_len[i]);
          }
     }
#endif

     rc = 0;

free_and_exit:

     free(max_obs_pos);

     free_bam_header_hash = 0; /* FIXME segfaults often for unknown reason */
     if (free_bam_header_hash) {
          bam_destroy_header_hash(bam_header);
     }
     if (NULL != in) {
          fclose(in);
     }

     return rc;
}
/**
 * Collects sufficient statistics from read for variants to be genotyped.
 *
 * First walks the records already in the buffer, then reads further VCF
 * records into the buffer until one lies after the read or the file ends.
 * The actual statistics collection is currently commented out.
 *
 * The buffer is expected to have been flushed before this is invoked, so
 * that no buffered record lies wholly before the read.
 */
void BCFSingleGenotypingBufferedReader::process_read(bam_hdr_t *h, bam1_t *s)
{
    //wrap bam1_t in AugmentBAMRecord
    as.initialize(h, s);

    uint32_t tid = bam_get_tid(s);
    uint32_t beg1 = as.beg1;
    uint32_t end1 = as.end1;

    //collect statistics for variant records that are in the buffer and overlap with the read
    GenotypingRecord* g;
    for (std::list<GenotypingRecord*>::iterator i=buffer.begin(); i!=buffer.end(); ++i)
    {
        g = *i;

//        std::cerr << g->pos1 << " " << g->beg1 << " " << g->end1 << " ";

        //same chromosome
        if (tid==g->rid)
        {
            if (end1 < g->beg1)
            {
                //can terminate
                return;
            }
            else if (beg1 > g->end1)
            {
                //this should not occur if the buffer was flushed before invoking process read
                continue;
            }
            //else if (beg1 <= g->beg1 && g->end1 <= end1)
            else if (beg1 <= g->pos1 && g->pos1 <= end1)
            {
//                collect_sufficient_statistics(*i, as);
            }
            else
            {
                //other types of overlap, just ignore
            }

//            std::cerr << "\n";
        }
        //prior chromosome
        else if (tid<g->rid)
        {
            //this should not occur if the buffer was flushed before invoking process read
            return;
        }
        //latter chromosome
        else if (tid>g->rid)
        {
            //in case if the buffer has a VCF record later in the list which matches it
            continue;
        }
    }

    //you will only reach here if a read occurs after or overlaps the last record in the buffer
    //adding new VCF records and collecting statistics if necessary
    bcf1_t *v = bcf_init();
    while (odr->read(v))
    {
        //return value is not needed here; the call is kept because it receives `variant`
        vm->classify_variant(odr->hdr, v, variant);
        //presumably the genotyping record takes over v, as a fresh one is allocated below
        g = create_genotyping_record(odr->hdr, v, 2, variant);
        buffer.push_back(g);

        if (tid==g->rid)
        {
            //if (end1>=g->beg1 && pos1<=g->end1)
            if (beg1 <= g->pos1 && g->pos1 <= end1)
            {
//                collect_sufficient_statistics(g, as);
            }
        }

        //VCF record occurs after the read: either on a later chromosome, or
        //further along the same one. Positions are only comparable within a
        //chromosome, so a record on an earlier chromosome never stops the scan.
        if (tid < g->rid || (tid==g->rid && end1 < g->beg1))
        {
            return;
        }
        else
        {
            v = bcf_init();
        }
    }

    //this means end of file
    bcf_destroy(v);
}
/**
 * Flush records.
 *
 * With flush_all set, every remaining VCF record is first read into the
 * buffer and then the whole buffer is emptied. Otherwise records are removed
 * from the front of the buffer for as long as they lie before read s: on an
 * earlier chromosome, or on the same chromosome and ending before the read's
 * 1-based position. Removal stops at the first record that does not.
 * Genotyping/printing of removed records is currently commented out.
 */
void BCFSingleGenotypingBufferedReader::flush(bam_hdr_t *h, bam1_t *s, bool flush_all)
{
    if (!flush_all)
    {
        //std::cerr << "partial flush\n";

        uint32_t tid = bam_get_tid(s);

        while (!buffer.empty())
        {
            GenotypingRecord* oldest = buffer.front();

            //has the read moved past this record?
            bool passed;
            if (tid==oldest->rid)
            {
                passed = bam_get_pos1(s) > oldest->end1;
            }
            else
            {
                passed = tid>oldest->rid;
            }

            if (!passed)
            {
                return;
            }

//            genotype_and_print(oldest);
            delete oldest;
            buffer.pop_front();
        }

        return;
    }

    //read all the remaining from the reference genotyping file
    bcf1_t *rec = bcf_init();
    while (odr->read(rec))
    {
        vm->classify_variant(odr->hdr, rec, variant);
        buffer.push_back(create_genotyping_record(odr->hdr, rec, 2, variant));
        rec = bcf_init();
    }
    //the last allocated record was never filled
    bcf_destroy(rec);

    while (!buffer.empty())
    {
        GenotypingRecord* oldest = buffer.front();
//        genotype_and_print(oldest);
        delete oldest;
        buffer.pop_front();
    }
}
Exemple #7
0
/* samtools bedcov: for every line of a BED file, prints the line followed by
 * one tab separated column per BAM file holding the sum of the pileup depths
 * over the positions beg <= pos < end of that region.
 *
 * Usage: bedcov [-Q min_mapQ] <in.bed> <in1.bam> [...]
 * The header of the first BAM file is used to resolve sequence names.
 * Malformed BED lines are reported on stderr and skipped.
 * Returns 0 on success, 1 on a usage error and 2 when an input file or
 * index cannot be opened. */
int main_bedcov(int argc, char *argv[])
{
	extern void bam_init_header_hash(bam_header_t*);
	gzFile fp;
	kstring_t str;
	kstream_t *ks;
	bam_index_t **idx;
	bam_header_t *h = 0;
	aux_t **aux;
	int *n_plp, dret, i, n, c, min_mapQ = 0;
	int64_t *cnt;
	const bam_pileup1_t **plp;

	while ((c = getopt(argc, argv, "Q:")) >= 0) {
		switch (c) {
		case 'Q': min_mapQ = atoi(optarg); break;
		}
	}
	if (optind + 2 > argc) {
		fprintf(stderr, "Usage: samtools bedcov <in.bed> <in1.bam> [...]\n");
		return 1;
	}
	memset(&str, 0, sizeof(kstring_t));
	n = argc - optind - 1; /* number of BAM files, at least 1 here */
	aux = calloc(n, sizeof(aux_t*));
	idx = calloc(n, sizeof(bam_index_t*));
	for (i = 0; i < n; ++i) {
		aux[i] = calloc(1, sizeof(aux_t));
		aux[i]->min_mapQ = min_mapQ;
		aux[i]->fp = bam_open(argv[i+optind+1], "r");
		idx[i] = bam_index_load(argv[i+optind+1]);
		if (aux[i]->fp == 0 || idx[i] == 0) {
			fprintf(stderr, "ERROR: fail to open index BAM file '%s'\n", argv[i+optind+1]);
			return 2;
		}
		bgzf_set_cache_size(aux[i]->fp, 20);
		if (i == 0) h = bam_header_read(aux[0]->fp);
	}
	bam_init_header_hash(h);
	cnt = calloc(n, 8);

	fp = gzopen(argv[optind], "rb");
	if (fp == 0) {
		/* reading from a NULL gzFile would fail further down */
		fprintf(stderr, "ERROR: fail to open BED file '%s'\n", argv[optind]);
		return 2;
	}
	ks = ks_init(fp);
	n_plp = calloc(n, sizeof(int));
	plp = calloc(n, sizeof(bam_pileup1_t*));
	while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) {
		char *p, *q;
		int tid, beg, end, pos;
		bam_mplp_t mplp;

		/* column 1: sequence name; terminated in place for the lookup,
		 * then restored so the line can be echoed unchanged */
		for (p = q = str.s; *p && *p != '\t'; ++p);
		if (*p != '\t') goto bed_error;
		*p = 0; tid = bam_get_tid(h, q); *p = '\t';
		if (tid < 0) goto bed_error;
		/* column 2: start; isdigit needs an unsigned char value */
		for (q = p = p + 1; isdigit((unsigned char)*p); ++p);
		if (*p != '\t') goto bed_error;
		*p = 0; beg = atoi(q); *p = '\t';
		/* column 3: end; may be the last column of the line */
		for (q = p = p + 1; isdigit((unsigned char)*p); ++p);
		if (*p == '\t' || *p == 0) {
			int sep = *p;
			*p = 0; end = atoi(q); *p = sep;
		} else goto bed_error;

		for (i = 0; i < n; ++i) {
			if (aux[i]->iter) bam_iter_destroy(aux[i]->iter);
			aux[i]->iter = bam_iter_query(idx[i], tid, beg, end);
		}
		mplp = bam_mplp_init(n, read_bam, (void**)aux);
		bam_mplp_set_maxcnt(mplp, 64000);
		memset(cnt, 0, 8 * n);
		/* only pileup columns inside the region are counted */
		while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0)
			if (pos >= beg && pos < end)
				for (i = 0; i < n; ++i) cnt[i] += n_plp[i];
		for (i = 0; i < n; ++i) {
			kputc('\t', &str);
			kputl(cnt[i], &str);
		}
		puts(str.s);
		bam_mplp_destroy(mplp);
		continue;

bed_error:
		fprintf(stderr, "Errors in BED line '%s'\n", str.s);
	}
	free(n_plp); free(plp);
	ks_destroy(ks);
	gzclose(fp);

	free(cnt);
	for (i = 0; i < n; ++i) {
		if (aux[i]->iter) bam_iter_destroy(aux[i]->iter);
		bam_index_destroy(idx[i]);
		bam_close(aux[i]->fp);
		free(aux[i]);
	}
	bam_header_destroy(h);
	free(aux); free(idx);
	free(str.s);
	return 0;
}
int32_t
bam_streamer::
target_name_to_id(const char* seq_name) const
{
    return bam_get_tid(_bfp->header,seq_name);
}