/** * Gets records for the most recent position and fills up the buffer from file i. * returns true if buffer is filled or it is not necessary to fill buffer. * returns false if no more records are found to fill buffer */ void BCFSyncedStreamReader::fill_buffer(int32_t i) { //not necessary to fill buffer if (buffer[i].size()>=2) return; int32_t pos1 = buffer[i].size()==0 ? 0 : bcf_get_pos1(buffer[i].front()); if (ftypes[i]==FT_BCF_GZ) { bcf1_t *v = get_bcf1_from_pool(); bool populated = false; while (itrs[i] && bcf_itr_next(vcfs[i], itrs[i], v) >= 0) { populated = true; bcf_unpack(v, BCF_UN_STR); buffer[i].push_back(v); insert_into_pq(i, v); if (pos1==0) { pos1 = bcf_get_pos1(v); } if (bcf_get_pos1(v)!=pos1) { break; } v = get_bcf1_from_pool(); populated = false; } if (!populated) store_bcf1_into_pool(v); } else if (ftypes[i]==FT_VCF_GZ) { while (itrs[i] && tbx_itr_next(vcfs[i], tbxs[i], itrs[i], &s) >= 0) { bcf1_t *v = get_bcf1_from_pool(); vcf_parse(&s, hdrs[i], v); bcf_unpack(v, BCF_UN_STR); buffer[i].push_back(v); insert_into_pq(i, v); if (pos1==0) { pos1 = bcf_get_pos1(v); } if (bcf_get_pos1(v)!=pos1) { break; } } } }
/** * @brief pre-fetches the next variant record * @warning we're reusing the existing htslib memory, so users should be aware that all objects from the previous iteration are now stale unless a deep copy has been performed */ void IndexedVariantIterator::fetch_next_record() { while (bcf_itr_next(m_variant_file_ptr, m_index_iter_ptr.get(), m_variant_record_ptr.get()) < 0) { ++m_interval_iter; if (m_interval_list.end() == m_interval_iter) { m_variant_file_ptr.reset(); m_variant_record = Variant{}; return; } m_index_iter_ptr.reset(bcf_itr_querys(m_variant_index_ptr.get(), m_variant_header_ptr.get(), m_interval_iter->c_str())); } }
/**
 * Reads the next record into v, hiding the hopping between regions from the
 * caller when random access is enabled.
 *
 * @param v record that receives the data
 * @return true if a record was read; false when the sequential read does not
 *         return 0, or when no further interval can be initialised
 */
bool BCFOrderedReader::read(bcf1_t *v)
{
    // sequential access: a single plain read
    if (!random_access_enabled)
    {
        return bcf_read(file, hdr, v)==0;
    }

    // the format is decided once, before iterating
    const bool binary_input = (ftype.format==bcf);

    for (;;)
    {
        if (itr)
        {
            if (binary_input)
            {
                if (bcf_itr_next(file, itr, v)>=0)
                {
                    return true;
                }
            }
            else if (tbx_itr_next(file, tbx, itr, &s)>=0)
            {
                // text input: the line is read into s and then parsed into v
                vcf_parse1(&s, hdr, v);
                return true;
            }
        }

        // no iterator or current interval exhausted: try the next interval
        if (!initialize_next_interval())
        {
            return false;
        }
    }
}
int main_vcfview(int argc, char *argv[]) { int i, c, clevel = -1, flag = 0, n_samples = -1, *imap = 0, excl_snp = 0, excl_indel = 0; char *fn_ref = 0, *fn_out = 0, moder[8], **samples = 0; bcf_hdr_t *h, *hsub = 0; htsFile *in; bcf1_t *b; while ((c = getopt(argc, argv, "l:bSt:o:T:s:GNI")) >= 0) { switch (c) { case 'l': clevel = atoi(optarg); flag |= 2; break; case 'S': flag |= 1; break; case 'b': flag |= 2; break; case 'G': n_samples = 0; break; case 't': fn_ref = optarg; flag |= 1; break; case 'o': fn_out = optarg; break; case 's': samples = hts_readlines(optarg, &n_samples); break; case 'N': excl_snp = 1; break; case 'I': excl_indel = 1; break; } } if (argc == optind) { fprintf(stderr, "\nUsage: vcfview [options] <in.bcf>|<in.vcf>|<in.vcf.gz>\n\n"); fprintf(stderr, "Options: -b output in BCF\n"); fprintf(stderr, " -S input is VCF\n"); fprintf(stderr, " -o FILE output file name [stdout]\n"); fprintf(stderr, " -l INT compression level [%d]\n", clevel); fprintf(stderr, " -t FILE list of reference names and lengths [null]\n"); fprintf(stderr, " -s FILE/STR list of samples (STR if started with ':'; FILE otherwise) [null]\n"); fprintf(stderr, " -G drop individual genotype information\n"); fprintf(stderr, " -N exclude SNPs\n"); fprintf(stderr, " -I exclude INDELs\n"); fprintf(stderr, "\n"); return 1; } strcpy(moder, "r"); if ((flag&1) == 0 && !(file_type(argv[optind])&(IS_VCF|IS_VCF_GZ))) strcat(moder, "b"); in = hts_open(argv[optind], moder, fn_ref); h = vcf_hdr_read(in); if (h == 0) { fprintf(stderr, "[E::%s] fail to read the VCF/BCF2 header\n", __func__); hts_close(in); return 1; } if (n_samples >= 0) { if (n_samples) imap = (int*)malloc(n_samples * sizeof(int)); hsub = bcf_hdr_subset(h, n_samples, samples, imap); } b = bcf_init1(); if ((flag&4) == 0) { // VCF/BCF output htsFile *out; char modew[8]; strcpy(modew, "w"); if (clevel >= 0 && clevel <= 9) sprintf(modew + 1, "%d", clevel); if (flag&2) strcat(modew, "b"); out = hts_open(fn_out? 
fn_out : "-", modew, 0); vcf_hdr_write(out, hsub? hsub : h); if (optind + 1 < argc && !(flag&1)) { // BAM input and has a region hts_idx_t *idx; if ((idx = bcf_index_load(argv[optind])) == 0) { fprintf(stderr, "[E::%s] fail to load the BCF index\n", __func__); return 1; } for (i = optind + 1; i < argc; ++i) { hts_itr_t *iter; if ((iter = bcf_itr_querys(idx, h, argv[i])) == 0) { fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, argv[i]); continue; } while (bcf_itr_next((BGZF*)in->fp, iter, b) >= 0) { if (excl_snp && bcf_is_snp(b)) continue; if (excl_indel && !bcf_is_snp(b)) continue; if (n_samples >= 0) { bcf_subset(h, b, n_samples, imap); vcf_write1(out, hsub, b); } else vcf_write1(out, h, b); } hts_itr_destroy(iter); } hts_idx_destroy(idx); } else { while (vcf_read1(in, h, b) >= 0) { if (excl_snp && bcf_is_snp(b)) continue; if (excl_indel && !bcf_is_snp(b)) continue; if (n_samples >= 0) { bcf_subset(h, b, n_samples, imap); vcf_write1(out, hsub, b); } else vcf_write1(out, h, b); } } hts_close(out); } bcf_destroy1(b); if (n_samples > 0) { for (i = 0; i < n_samples; ++i) free(samples[i]); free(samples); bcf_hdr_destroy(hsub); free(imap); } bcf_hdr_destroy(h); hts_close(in); return 0; }
/*
 * bcf_sr_next_line() - advance all readers to the next position.
 *
 * Each reader keeps pending records in buffer[1..nbuffer]; buffer[0] receives
 * the record selected for the current position. The function
 *   1. opens the next sequence in every reader once all readers have neither
 *      an iterator nor buffered records,
 *   2. tops up each reader's buffer and determines the smallest buffered
 *      position (min_pos),
 *   3. if targets are set, consults tgt_has_position() and either accepts the
 *      position, drops the rest of the sequence, or discards the records at
 *      that position and tries again,
 *   4. for every reader having a record at min_pos that matches the first
 *      such record (according to files->collapse), moves that record into
 *      buffer[0].
 *
 * Returns 0 when all sequences have been scanned; otherwise a bit mask in
 * which bit i is set if reader i contributed a record.
 */
int bcf_sr_next_line(readers_t *files)
{
    int32_t min_pos = INT_MAX;
    int ret,i,j;
    kstring_t *str = &files->tmps;

    while ( min_pos==INT_MAX )
    {
        // Need to open new chromosome?
        // A reader is at end-of-sequence when it has no iterator and an empty buffer
        int eos = 0;
        for (i=0; i<files->nreaders; i++)
            if ( !files->readers[i].itr && !files->readers[i].nbuffer ) eos++;

        if ( eos==files->nreaders )
        {
            const char *seq;
            if ( files->targets )
            {
                seq = tgt_next_seq(files->targets);
                if ( !seq ) return 0;   // all chroms scanned
            }
            else
            {
                if ( ++files->iseq >= files->nseqs ) return 0;  // all chroms scanned
                seq = files->seqs[files->iseq];
            }
            // Create a fresh iterator over the new sequence in every reader
            for (i=0; i<files->nreaders; i++)
            {
                reader_t *reader = &files->readers[i];
                if ( reader->tbx )
                    reader->itr = tbx_itr_querys(reader->tbx,seq);
                else
                    reader->itr = bcf_itr_querys(reader->bcf,reader->header,seq);
            }
        }

        // Find the smallest coordinate
        for (i=0; i<files->nreaders; i++)
        {
            reader_t *reader = &files->readers[i];
            // The buffer is considered full once its last record sits at a different position than its first
            int buffer_full = ( reader->nbuffer && reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) ? 1 : 0;
            if ( reader->itr && !buffer_full )
            {
                // Fill the buffer with records starting at the same position
                while (1)
                {
                    // Grow the buffer by 8 pre-initialised records when slot nbuffer+1 is not available
                    // NOTE(review): the realloc result is not checked for NULL
                    if ( reader->nbuffer+1 >= reader->mbuffer )
                    {
                        reader->mbuffer += 8;
                        reader->buffer = (bcf1_t**) realloc(reader->buffer, sizeof(bcf1_t*)*reader->mbuffer);
                        for (j=8; j>0; j--)
                            reader->buffer[reader->mbuffer-j] = bcf_init1();
                    }
                    // Read the next record into the first free slot, buffer[nbuffer+1]
                    if ( reader->tbx )
                    {
                        ret = tbx_itr_next((BGZF*)reader->file->fp, reader->tbx, reader->itr, str);
                        if ( ret<0 ) break;
                        // NOTE(review): the return value of vcf_parse1 is ignored here
                        vcf_parse1(str, reader->header, reader->buffer[reader->nbuffer+1]);
                    }
                    else
                    {
                        ret = bcf_itr_next((BGZF*)reader->file->fp, reader->itr, reader->buffer[reader->nbuffer+1]);
                        if ( ret<0 ) break;
                    }
                    bcf_unpack(reader->buffer[reader->nbuffer+1], BCF_UN_STR|BCF_UN_FLT);

                    // apply filter
                    // The record is skipped (its slot gets overwritten by the next read) when a filter id is
                    // set, the record carries filters, and its first filter differs from the requested one
                    if ( reader->filter_id!=-1 && reader->buffer[reader->nbuffer+1]->d.n_flt && reader->filter_id!=reader->buffer[reader->nbuffer+1]->d.flt[0] ) continue;
                    set_variant_types(reader->buffer[reader->nbuffer+1]);
                    reader->nbuffer++;
                    // Stop after buffering the first record at a new position
                    if ( reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) break;
                }
                if ( ret<0 ) { tbx_itr_destroy(reader->itr); reader->itr = NULL; } // done for this chromosome
            }
            if ( reader->nbuffer )
            {
                if ( min_pos > reader->buffer[1]->pos ) min_pos = reader->buffer[1]->pos;
            }
            // The buffer is full - either there is nothing else to read or the last record has a different coordinate
            // NOTE(review): the test is nbuffer>2, so a buffer holding exactly two records at the
            // same position is not collapsed - confirm this is intended
            if ( files->collapse && reader->nbuffer>2 && reader->buffer[1]->pos==reader->buffer[2]->pos )
            {
                collapse_buffer(files, reader);
            }
        }
        if ( files->targets && min_pos!=INT_MAX )
        {
            // NOTE(review): this `ret` shadows the function-level `ret`
            int ret = tgt_has_position(files->targets, min_pos);
            // min_pos stays set, so `continue` leaves the outer while loop with this position accepted
            if ( ret==1 ) continue;

            // The position must be skipped
            if ( ret==-1 )
            {
                // done for this chromosome, don't read the rest
                for (i=0; i<files->nreaders; i++)
                {
                    files->readers[i].nbuffer = 0;
                    if ( files->readers[i].itr )
                    {
                        tbx_itr_destroy(files->readers[i].itr);
                        files->readers[i].itr = NULL;
                    }
                }
                min_pos = INT_MAX;
                continue;
            }

            // remove the active line, save the buffer line
            for (i=0; i<files->nreaders; i++)
            {
                reader_t *reader = &files->readers[i];
                // j becomes the index of the first buffered record that is not at min_pos
                for (j=1; j<=reader->nbuffer; j++)
                    if ( reader->buffer[j]->pos!=min_pos ) break;
                if ( j==1 ) continue;   // this reader has nothing at min_pos
                if ( j<=reader->nbuffer )
                {
                    // keep only the record at the other position, moved to slot 1
                    bcf1_t *tmp = reader->buffer[1]; reader->buffer[1] = reader->buffer[j]; reader->buffer[j] = tmp;
                    reader->nbuffer = 1;
                }
                else
                    reader->nbuffer = 0;    // every buffered record was at min_pos
            }
            min_pos = INT_MAX;  // forces another pass of the outer loop
        }
    }

    //printf("[next_line] min_pos=%d\n", min_pos+1);
    //debug_buffers(files);

    // Set the current line
    ret = 0;
    bcf1_t *first = NULL;   // record of the first reader found at min_pos; later readers are matched against it
    for (i=0; i<files->nreaders; i++)
    {
        reader_t *reader = &files->readers[i];
        if ( !reader->nbuffer || reader->buffer[1]->pos!=min_pos ) continue;

        // Match the records by REF and ALT
        // NOTE(review): this `j` shadows the function-level `j`
        int j, irec = -1;   // irec: index of the matching record in the buffer, -1 if none
        if ( first )
        {
            for (j=1; j<=reader->nbuffer; j++)
            {
                bcf1_t *line = reader->buffer[j];
                if ( min_pos != line->pos ) break;  // done with this buffer

                if ( files->collapse&COLLAPSE_ANY ) { irec=j; break; }  // checking position only
                if ( files->collapse&COLLAPSE_SNPS && first->d.var_type&VCF_SNP && line->d.var_type&VCF_SNP ) { irec=j; break; }
                if ( files->collapse&COLLAPSE_INDELS && first->d.var_type&VCF_INDEL && line->d.var_type&VCF_INDEL ) { irec=j; break; }

                if ( first->rlen != line->rlen ) continue;  // REFs do not match
                if ( strcmp(first->d.allele[0], line->d.allele[0]) ) continue; // REFs do not match
                int ial,jal;
                if ( files->collapse==COLLAPSE_NONE )
                {
                    // require exact match, all alleles must be identical
                    if ( first->n_allele!=line->n_allele ) continue;   // different number of alleles
                    int nmatch = 1; // REF has been already checked
                    for (ial=1; ial<first->n_allele; ial++)
                    {
                        for (jal=1; jal<line->n_allele; jal++)
                            if ( !strcmp(first->d.allele[ial], line->d.allele[jal]) ) { nmatch++; break; }
                    }
                    if ( nmatch>=first->n_allele ) { irec=j; break; }
                }
                else
                {
                    // thorough check: the REFs and some of the alleles have to be shared
                    // (neglecting different representations of the same indel for now)
                    for (ial=1; ial<first->n_allele; ial++)
                    {
                        for (jal=1; jal<line->n_allele; jal++)
                            if ( !strcmp(first->d.allele[ial], line->d.allele[jal]) ) { irec=j; break; }
                        if ( irec>=1 ) break;
                    }
                }
                if ( irec>=1 ) break;
            }
            if ( irec==-1 ) continue;   // no matching record in this reader
        }
        else
        {
            first = reader->buffer[1];
            irec = 1;
        }
        // Move the selected record into slot 0, shift the records behind it down by one,
        // and park the old slot-0 record at the end of the buffer before shrinking it
        bcf1_t *tmp = reader->buffer[0];
        reader->buffer[0] = reader->buffer[irec];
        for (j=irec+1; j<=reader->nbuffer; j++) reader->buffer[j-1] = reader->buffer[j];
        reader->buffer[ reader->nbuffer ] = tmp;
        reader->nbuffer--;
        ret |= 1<<i;    // flag reader i as having a record on the current line
    }
    // fprintf(stdout,"[next_line] min_pos=%d mask=%d\n", min_pos+1, ret);
    // debug_buffers(stdout,files);

    return ret;
}
/*
 * query_regions() - print the records of fname overlapping the given regions
 *
 * BCF input is written through an htsFile opened on "-"; VCF, SAM and
 * unrecognised formats are treated as tabix-indexed text and printed with
 * puts(). When args->targets_fname is set, only records that also overlap
 * the targets index are printed. Every element of regs and regs itself are
 * freed before returning.
 *
 * Returns 0; failures are reported through error().
 */
static int query_regions(args_t *args, char *fname, char **regs, int nregs)
{
    int ireg;
    htsFile *in = hts_open(fname,"r");
    if ( in==NULL ) error("Could not read %s\n", fname);
    enum htsExactFormat format = hts_get_format(in)->format;

    // Optional targets index used as a second overlap filter
    regidx_t *targets = NULL;
    if ( args->targets_fname )
    {
        targets = regidx_init(args->targets_fname, NULL, NULL, 0, NULL);
        if ( targets==NULL ) error("Could not read %s\n", args->targets_fname);
    }

    switch ( format )
    {
        case bcf:
        {
            htsFile *out = hts_open("-","w");
            if ( out==NULL ) error("Could not open stdout\n", fname);
            hts_idx_t *index = bcf_index_load(fname);
            if ( index==NULL ) error("Could not load .csi index of %s\n", fname);
            bcf_hdr_t *header = bcf_hdr_read(in);
            if ( header==NULL ) error("Could not read the header: %s\n", fname);

            if ( args->print_header ) bcf_hdr_write(out,header);

            if ( !args->header_only )
            {
                bcf1_t *record = bcf_init();
                for (ireg=0; ireg<nregs; ireg++)
                {
                    hts_itr_t *iter = bcf_itr_querys(index,header,regs[ireg]);
                    while ( bcf_itr_next(in, iter, record) >=0 )
                    {
                        // write the record unless a targets index exists and the record misses it
                        if ( !targets || regidx_overlap(targets, bcf_seqname(header,record),record->pos,record->pos+record->rlen-1, NULL) )
                            bcf_write(out,header,record);
                    }
                    tbx_itr_destroy(iter);
                }
                bcf_destroy(record);
            }

            if ( hts_close(out) ) error("hts_close returned non-zero status for stdout\n");
            bcf_hdr_destroy(header);
            hts_idx_destroy(index);
            break;
        }

        case vcf:
        case sam:
        case unknown_format:
        {
            tbx_t *tabix = tbx_index_load(fname);
            if ( tabix==NULL ) error("Could not load .tbi/.csi index of %s\n", fname);
            kstring_t line = {0,0,0};

            if ( args->print_header )
            {
                // header lines are the leading lines starting with the index's meta character
                while ( hts_getline(in, KS_SEP_LINE, &line) >= 0 )
                {
                    if ( !line.l || line.s[0]!=tabix->conf.meta_char ) break;
                    puts(line.s);
                }
            }

            if ( !args->header_only )
            {
                int nnames;
                const char **names = NULL;
                // sequence names are only needed for the targets lookup
                if ( targets ) names = tbx_seqnames(tabix, &nnames);
                for (ireg=0; ireg<nregs; ireg++)
                {
                    hts_itr_t *iter = tbx_itr_querys(tabix, regs[ireg]);
                    if ( iter==NULL ) continue;
                    while ( tbx_itr_next(in, tabix, iter, &line) >= 0 )
                    {
                        if ( !targets || regidx_overlap(targets,names[iter->curr_tid],iter->curr_beg,iter->curr_end, NULL) )
                            puts(line.s);
                    }
                    tbx_itr_destroy(iter);
                }
                free(names);
            }

            free(line.s);
            tbx_destroy(tabix);
            break;
        }

        case bam:
            error("Please use \"samtools view\" for querying BAM files.\n");
            break;

        default:
            // any other format: nothing is printed
            break;
    }

    if ( targets ) regidx_destroy(targets);
    if ( hts_close(in) ) error("hts_close returned non-zero status: %s\n", fname);

    for (ireg=0; ireg<nregs; ireg++) free(regs[ireg]);
    free(regs);
    return 0;
}
/*
 * _reader_fill_buffer() - buffers all records with the same coordinate
 *
 * Pending records live in reader->buffer[1..nbuffer]. Records are read
 * (streamed, via the tabix index, or via the BCF iterator) into the first
 * free slot until a record at a different position than buffer[1] has been
 * buffered, or until reading or parsing fails. On such a failure the
 * iterator is destroyed and reader->itr is set to NULL. If files->collapse
 * is set and the first two buffered records share a position,
 * collapse_buffer() is called.
 *
 * A failure of vcf_parse1() is handled like the end of input in both the
 * streaming and the tabix branch, so a record that could not be parsed is
 * never buffered.
 */
static void _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader)
{
    // Return if the buffer is full: the coordinate of the last buffered record differs
    if ( reader->nbuffer && reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) return;

    // No iterator (sequence not present in this file) and not streaming
    if ( !reader->itr && !files->streaming ) return;

    // Fill the buffer with records starting at the same position
    int i, ret = 0;
    while (1)
    {
        if ( reader->nbuffer+1 >= reader->mbuffer )
        {
            // Increase buffer size
            // NOTE(review): the realloc result is not checked for NULL
            reader->mbuffer += 8;
            reader->buffer = (bcf1_t**) realloc(reader->buffer, sizeof(bcf1_t*)*reader->mbuffer);
            for (i=8; i>0; i--)     // initialize
            {
                reader->buffer[reader->mbuffer-i] = bcf_init1();
                reader->buffer[reader->mbuffer-i]->max_unpack = files->max_unpack;
                reader->buffer[reader->mbuffer-i]->pos = -1;    // for rare cases when VCF starts from 1
            }
        }
        if ( files->streaming )
        {
            if ( reader->file->format.format==vcf )
            {
                if ( (ret=hts_getline(reader->file, KS_SEP_LINE, &files->tmps)) < 0 ) break;   // no more lines
                // Store the status in the outer `ret` so a parse failure reaches the ret<0 handling below
                if ( (ret=vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1])) < 0 ) break;
            }
            else if ( reader->file->format.format==bcf )
            {
                if ( (ret=bcf_read1(reader->file, reader->header, reader->buffer[reader->nbuffer+1])) < 0 ) break; // no more lines
            }
            else
            {
                fprintf(stderr,"[%s:%d %s] fixme: not ready for this\n", __FILE__,__LINE__,__FUNCTION__);
                exit(1);
            }
        }
        else if ( reader->tbx_idx )
        {
            if ( (ret=tbx_itr_next(reader->file, reader->tbx_idx, reader->itr, &files->tmps)) < 0 ) break;  // no more lines
            // Do not buffer a record that failed to parse
            if ( (ret=vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1])) < 0 ) break;
        }
        else
        {
            if ( (ret=bcf_itr_next(reader->file, reader->itr, reader->buffer[reader->nbuffer+1])) < 0 ) break; // no more lines
            bcf_subset_format(reader->header,reader->buffer[reader->nbuffer+1]);
        }

        // apply filter
        if ( !reader->nfilter_ids )
            bcf_unpack(reader->buffer[reader->nbuffer+1], BCF_UN_STR);
        else
        {
            bcf_unpack(reader->buffer[reader->nbuffer+1], BCF_UN_STR|BCF_UN_FLT);
            // a rejected record is not counted; its slot is overwritten by the next read
            if ( !has_filter(reader, reader->buffer[reader->nbuffer+1]) ) continue;
        }
        reader->nbuffer++;

        if ( reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) break;    // the buffer is full
    }
    if ( ret<0 )
    {
        // done for this region
        tbx_itr_destroy(reader->itr);
        reader->itr = NULL;
    }
    if ( files->collapse && reader->nbuffer>=2 && reader->buffer[1]->pos==reader->buffer[2]->pos )
        collapse_buffer(files, reader);
}
/** * Gets records for the most recent position and fills up the buffer from file i. * returns true if buffer is filled or it is not necessary to fill buffer. * returns false if no more records are found to fill buffer */ void BCFSyncedReader::fill_buffer(int32_t i) { if (buffer[i].size()>=2) return; if (random_access) { int32_t pos1 = buffer[i].size()==0 ? 0 : bcf_get_pos1(buffer[i].front()); if (ftypes[i].format==bcf) { bcf1_t *v = get_bcf1_from_pool(); bool populated = false; while (itrs[i] && bcf_itr_next(files[i], itrs[i], v)>=0) { populated = true; bcf_unpack(v, BCF_UN_STR); //check to ensure order if (!buffer[i].empty()) { if (!bcf_is_in_order(buffer[i].back(), v)) { fprintf(stderr, "[E:%s:%d %s] VCF file not in order: %s\n", __FILE__, __LINE__, __FUNCTION__, file_names[i].c_str()); exit(1); } } buffer[i].push_back(v); insert_into_pq(i, v); if (pos1==0) { pos1 = bcf_get_pos1(v); } if (bcf_get_pos1(v)!=pos1) { break; } v = get_bcf1_from_pool(); populated = false; } if (!populated) store_bcf1_into_pool(v); } else if (ftypes[i].format==vcf) { while (itrs[i] && tbx_itr_next(files[i], tbxs[i], itrs[i], &s)>=0) { bcf1_t *v = get_bcf1_from_pool(); vcf_parse(&s, hdrs[i], v); bcf_unpack(v, BCF_UN_STR); //check to ensure order if (!buffer[i].empty()) { if (!bcf_is_in_order(buffer[i].back(), v)) { fprintf(stderr, "[E:%s:%d %s] VCF file not in order: %s\n", __FILE__, __LINE__, __FUNCTION__, file_names[i].c_str()); exit(1); } } buffer[i].push_back(v); insert_into_pq(i, v); if (pos1==0) { pos1 = bcf_get_pos1(v); } if (bcf_get_pos1(v)!=pos1) { break; } } } } else { int32_t rid = buffer[i].size()==0 ? -1 : bcf_get_rid(buffer[i].front()); int32_t pos1 = buffer[i].size()==0 ? 
0 : bcf_get_pos1(buffer[i].front()); bcf1_t *v = get_bcf1_from_pool(); bool populated = false; while (bcf_read(files[i], hdrs[i], v)>=0) { populated = true; bcf_unpack(v, BCF_UN_STR); //check to ensure order if (!buffer[i].empty()) { if (!bcf_is_in_order(buffer[i].back(), v)) { fprintf(stderr, "[E:%s:%d %s] VCF file not in order: %s\n", __FILE__, __LINE__, __FUNCTION__, file_names[i].c_str()); exit(1); } } buffer[i].push_back(v); insert_into_pq(i, v); if (rid==-1) { rid = bcf_get_rid(v); pos1 = bcf_get_pos1(v); } if (bcf_get_rid(v)!=rid || bcf_get_pos1(v)!=pos1) { break; } v = get_bcf1_from_pool(); populated = false; } if (!populated) store_bcf1_into_pool(v); } }