예제 #1
0
/**
 * Advances to the next interval for which an index iterator can be created.
 *
 * Every pass consumes one entry of `intervals` (interval_index is advanced),
 * renders it into the scratch string `s` and queries the index that matches
 * the file type. Intervals whose query yields no iterator are skipped.
 *
 * Returns true as soon as `itr` holds a usable iterator.
 * Returns false only once all intervals have been consumed.
 */
bool BCFOrderedReader::initialize_next_interval()
{
    while (interval_index!=intervals.size())
    {
        // Consume the interval on every pass, whatever the file type is.
        // Advancing only inside the format branches left interval_index
        // untouched for any other file type, so the loop never terminated.
        intervals[interval_index++].to_string(&s);

        if (ftype.format==bcf)
        {
            itr = bcf_itr_querys(idx, hdr, s.s);
        }
        else if (ftype.format==vcf && ftype.compression==bgzf)
        {
            itr = tbx_itr_querys(tbx, s.s);
        }
        else
        {
            // Neither of the two index-backed file types: nothing to query,
            // leave itr as it is and move on.
            continue;
        }

        if (itr)
        {
            return true;
        }
    }

    return false;
}
예제 #2
0
/**
 * Jump to a single interval, replacing any intervals set previously.
 *
 * Requires a loaded index. The interval list is reset to contain only
 * `interval` and random access is switched on before the index is queried,
 * so that state is replaced even when the query itself fails.
 *
 * @interval - interval to jump to; it is rendered to its string form for the
 *             index query.
 *
 * Returns true if an iterator over the interval was obtained, false otherwise.
 */
bool BCFOrderedReader::jump_to_interval(GenomeInterval& interval)
{
    if (!index_loaded)
    {
        return false;
    }

    intervals_present = true;
    random_access_enabled = true;
    intervals.clear();
    intervals.push_back(interval);
    interval_index = 0;

    const bool is_bcf = (ftype.format==bcf);
    const bool is_bgzf_vcf = (ftype.format==vcf && ftype.compression==bgzf);

    // Other file types have no index-backed access; the interval stays unconsumed.
    if (!is_bcf && !is_bgzf_vcf)
    {
        return false;
    }

    intervals[interval_index++].to_string(&s);

    if (is_bcf)
    {
        itr = bcf_itr_querys(idx, hdr, s.s);
    }
    else
    {
        itr = tbx_itr_querys(tbx, s.s);
    }

    return itr ? true : false;
}
/**
 * @brief pre-fetches the next variant record, moving on to the next interval whenever the current index iterator yields nothing more
 * @note once the interval list is exhausted the file pointer is released and the current record is replaced by an empty Variant
 * @warning the existing htslib memory is reused, so every object obtained from the previous iteration is stale unless a deep copy was made
 */
void IndexedVariantIterator::fetch_next_record() {
  for (;;) {
    const auto read_status = bcf_itr_next(m_variant_file_ptr, m_index_iter_ptr.get(), m_variant_record_ptr.get());
    if (read_status >= 0) {
      // non-negative status: the record buffer has been refilled
      return;
    }

    // negative status: nothing more from this interval, try the following one
    ++m_interval_iter;
    const bool out_of_intervals = (m_interval_list.end() == m_interval_iter);
    if (out_of_intervals) {
      m_variant_file_ptr.reset();
      m_variant_record = Variant{};
      return;
    }

    m_index_iter_ptr.reset(bcf_itr_querys(m_variant_index_ptr.get(), m_variant_header_ptr.get(), m_interval_iter->c_str()));
  }
}
/**
 * @brief builds an iterator over the records of an indexed variant file, restricted to a list of intervals
 * @param file_ptr shared pointer to the htslib file handle, forwarded to the VariantIterator base
 * @param index_ptr shared pointer to the htslib index used for the interval queries
 * @param header_ptr shared pointer to the htslib header, forwarded to the VariantIterator base
 * @param interval_list intervals to visit; when empty, all_intervals is used instead
 * @note the constructor body pre-fetches the first record through fetch_next_record()
 */
IndexedVariantIterator::IndexedVariantIterator(const std::shared_ptr<htsFile>& file_ptr,
                                               const std::shared_ptr<hts_idx_t>& index_ptr,
                                               const std::shared_ptr<bcf_hdr_t>& header_ptr,
                                               const std::vector<std::string> interval_list) :
  VariantIterator { file_ptr, header_ptr },
  m_variant_index_ptr { index_ptr },
  // interval_list is declared const, so move() cannot steal its storage here: the vector is copied
  m_interval_list { interval_list.empty() ? all_intervals : move(interval_list) },
  // NOTE(review): the two initializers below read members initialized above them; this relies on
  // the class declaring its members in this same order -- confirm against the header
  m_interval_iter { m_interval_list.begin() },
  // index iterator for the first interval of the list
  m_index_iter_ptr { utils::make_unique_hts_itr(bcf_itr_querys(m_variant_index_ptr.get(), m_variant_header_ptr.get(), m_interval_iter->c_str())) }
{
  fetch_next_record();
}
예제 #5
0
/**
 * Initialize buffer for next interval.
 * This should only be invoked if the buffer is empty.
 * Returns true if successful.
 */
bool BCFSyncedReader::initialize_next_interval()
{
    if (random_access)
    {
        while (intervals_index < intervals.size())
        {
            GenomeInterval interval = intervals[intervals_index++];

            for (size_t i=0; i<nfiles; ++i)
            {
                hts_itr_destroy(itrs[i]);
                itrs[i] = 0;
                interval.to_string(&s);

                if (ftypes[i].format==bcf)
                {
                    itrs[i] = bcf_itr_querys(idxs[i], hdrs[i], s.s);
                }
                else if (ftypes[i].format==vcf)
                {
                    itrs[i] = tbx_itr_querys(tbxs[i], s.s);
                }

                fill_buffer(i);
            }

            //make sure pq is not empty
            //it is possible for the pq to be empty as iterators may be returned
            //as the sequence might be a valid sequence stated in the header
            if (pq.size()!=0)
            {
                return true;
            }
        }

        return false;
    }
    else
    {
        for (size_t i=0; i<nfiles; ++i)
        {
            fill_buffer(i);
        }

        if (pq.size()!=0)
        {
            return true;
        }

        return false;
    }
}
예제 #6
0
/**
 * Initialize buffer for next interval.
 * This should only be invoked if the buffer is empty.
 * Returns true if successful.
 */
bool BCFSyncedStreamReader::initialize_next_interval()
{
    while (intervals_index < intervals.size())
    {
        GenomeInterval interval = intervals[intervals_index++];

        for (int32_t i = 0; i<nfiles; ++i)
        {
            int32_t ftype = hts_file_type(vcf_files[i].c_str());
            hts_itr_destroy(itrs[i]);
            itrs[i] = 0;
            interval.to_string(&s);

            if (ftype==FT_BCF_GZ)
            {
                itrs[i] = bcf_itr_querys(idxs[i], hdrs[i], s.s);
            }
            else if (ftype==FT_VCF_GZ)
            {
                itrs[i] = tbx_itr_querys(tbxs[i], s.s);
            }

            fill_buffer(i);
        }

        //make sure pq is not empty
        //it is possible for the pq to be empty as iterators may be returned
        //as the sequence might be a valid sequence stated in the header
        if (pq.size()!=0)
        {
            return true;
        }
    }

    return false;
}
예제 #7
0
파일: vcfview.c 프로젝트: mp15/htslib
/*
 * Entry point of the "vcfview" command: reads a VCF/BCF file and writes it
 * back out as VCF or BCF, optionally restricted to regions, to a subset of
 * samples, or with SNPs/INDELs excluded.
 *
 * argv[optind] is the input file; any further arguments are regions, which
 * are only honoured for non-VCF input (they need the BCF index).
 *
 * Bits of `flag`:
 *   1 - input is VCF (set by -S and -t)
 *   2 - output is BCF (set by -b and -l)
 *   4 - never set in this function, so the output block always runs
 *
 * Returns 0 on success and 1 on a usage error or when the input, the output,
 * the header or the index cannot be opened/read.
 */
int main_vcfview(int argc, char *argv[])
{
	int i, c, clevel = -1, flag = 0, n_samples = -1, *imap = 0, excl_snp = 0, excl_indel = 0, ret = 0;
	char *fn_ref = 0, *fn_out = 0, moder[8], **samples = 0;
	bcf_hdr_t *h, *hsub = 0;
	htsFile *in;
	bcf1_t *b;

	while ((c = getopt(argc, argv, "l:bSt:o:T:s:GNI")) >= 0) {
		switch (c) {
		case 'l': clevel = atoi(optarg); flag |= 2; break;
		case 'S': flag |= 1; break;
		case 'b': flag |= 2; break;
		case 'G': n_samples = 0; break;
		case 't': fn_ref = optarg; flag |= 1; break;
		case 'o': fn_out = optarg; break;
		case 's': samples = hts_readlines(optarg, &n_samples); break;
		case 'N': excl_snp = 1; break;
		case 'I': excl_indel = 1; break;
		}
	}
	if (argc == optind) {
		fprintf(stderr, "\nUsage:   vcfview [options] <in.bcf>|<in.vcf>|<in.vcf.gz>\n\n");
		fprintf(stderr, "Options: -b           output in BCF\n");
		fprintf(stderr, "         -S           input is VCF\n");
		fprintf(stderr, "         -o FILE      output file name [stdout]\n");
		fprintf(stderr, "         -l INT       compression level [%d]\n", clevel);
		fprintf(stderr, "         -t FILE      list of reference names and lengths [null]\n");
		fprintf(stderr, "         -s FILE/STR  list of samples (STR if started with ':'; FILE otherwise) [null]\n");
		fprintf(stderr, "         -G           drop individual genotype information\n");
		fprintf(stderr, "         -N           exclude SNPs\n");
		fprintf(stderr, "         -I           exclude INDELs\n");
		fprintf(stderr, "\n");
		return 1;
	}
	// open in binary mode unless the input was declared, or is detected, to be VCF
	strcpy(moder, "r");
	if ((flag&1) == 0 && !(file_type(argv[optind])&(IS_VCF|IS_VCF_GZ))) strcat(moder, "b");

	in = hts_open(argv[optind], moder, fn_ref);
	if (in == 0) { // the handle is dereferenced by everything below
		fprintf(stderr, "[E::%s] fail to open '%s'\n", __func__, argv[optind]);
		return 1;
	}
	h = vcf_hdr_read(in);
	if (h == 0) {
		fprintf(stderr, "[E::%s] fail to read the VCF/BCF2 header\n", __func__);
		hts_close(in);
		return 1;
	}
	if (n_samples >= 0) { // -G (zero samples) or -s (a sample list) was given
		if (n_samples) imap = (int*)malloc(n_samples * sizeof(int));
		hsub = bcf_hdr_subset(h, n_samples, samples, imap);
	}
	b = bcf_init1();

	if ((flag&4) == 0) { // VCF/BCF output
		htsFile *out;
		char modew[8];
		// "w", then an optional compression level digit, then "b" for BCF
		strcpy(modew, "w");
		if (clevel >= 0 && clevel <= 9) sprintf(modew + 1, "%d", clevel);
		if (flag&2) strcat(modew, "b");
		out = hts_open(fn_out? fn_out : "-", modew, 0);
		if (out == 0) {
			fprintf(stderr, "[E::%s] fail to open '%s' for writing\n", __func__, fn_out? fn_out : "-");
			ret = 1;
		} else {
			vcf_hdr_write(out, hsub? hsub : h);
			if (optind + 1 < argc && !(flag&1)) { // BCF input and has a region
				hts_idx_t *idx;
				if ((idx = bcf_index_load(argv[optind])) == 0) {
					fprintf(stderr, "[E::%s] fail to load the BCF index\n", __func__);
					ret = 1; // fall through to the common cleanup instead of returning with files open
				} else {
					for (i = optind + 1; i < argc; ++i) {
						hts_itr_t *iter;
						if ((iter = bcf_itr_querys(idx, h, argv[i])) == 0) {
							fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, argv[i]);
							continue;
						}
						while (bcf_itr_next((BGZF*)in->fp, iter, b) >= 0) {
							if (excl_snp && bcf_is_snp(b)) continue;
							if (excl_indel && !bcf_is_snp(b)) continue;
							if (n_samples >= 0) {
								bcf_subset(h, b, n_samples, imap);
								vcf_write1(out, hsub, b);
							} else vcf_write1(out, h, b);
						}
						hts_itr_destroy(iter);
					}
					hts_idx_destroy(idx);
				}
			} else { // no region, or VCF input: stream the whole file
				while (vcf_read1(in, h, b) >= 0) {
					if (excl_snp && bcf_is_snp(b)) continue;
					if (excl_indel && !bcf_is_snp(b)) continue;
					if (n_samples >= 0) {
						bcf_subset(h, b, n_samples, imap);
						vcf_write1(out, hsub, b);
					} else vcf_write1(out, h, b);
				}
			}
			hts_close(out);
		}
	}

	bcf_destroy1(b);
	if (samples) {
		for (i = 0; i < n_samples; ++i) free(samples[i]);
		free(samples);
	}
	// hsub also exists when -G gave n_samples == 0, so release it independently of the sample list
	if (hsub) bcf_hdr_destroy(hsub);
	free(imap);
	bcf_hdr_destroy(h);
	hts_close(in);
	return ret;
}
예제 #8
0
/*
 * Advances all readers to the next position that has at least one record and
 * selects, for every reader, the record at that position that matches the
 * record of the first reader found there.
 *
 * Buffer layout used throughout: buffer[0] holds the line currently exposed
 * for a reader, buffer[1..nbuffer] hold records read ahead from the file.
 *
 * Returns a bit mask in which bit i is set when reader i has a record at the
 * selected position (that record is moved to readers[i].buffer[0]).
 * Returns 0 when all sequences have been scanned.
 */
int bcf_sr_next_line(readers_t *files)
{
    // INT_MAX doubles as the "no position selected yet" marker
    int32_t min_pos = INT_MAX;
    int ret,i,j;
    kstring_t *str = &files->tmps;

    while ( min_pos==INT_MAX )
    {
        // Need to open new chromosome?
        // A reader is exhausted when it has neither an iterator nor buffered records
        int eos = 0;
        for (i=0; i<files->nreaders; i++)
            if ( !files->readers[i].itr && !files->readers[i].nbuffer ) eos++;
        if ( eos==files->nreaders )
        {
            const char *seq;
            if ( files->targets )
            {
                seq = tgt_next_seq(files->targets);
                if ( !seq ) return 0;   // all chroms scanned
            }
            else
            {
                if ( ++files->iseq >= files->nseqs ) return 0;  // all chroms scanned
                seq = files->seqs[files->iseq];
            }
            // Open an iterator over the whole sequence in every reader
            for (i=0; i<files->nreaders; i++)
            {
                reader_t *reader = &files->readers[i];
                if ( reader->tbx )
                    reader->itr = tbx_itr_querys(reader->tbx,seq);
                else
                    reader->itr = bcf_itr_querys(reader->bcf,reader->header,seq);
            }
        }

        // Find the smallest coordinate
        for (i=0; i<files->nreaders; i++)
        {
            reader_t *reader = &files->readers[i];
            // Full when the last buffered record sits at a different position than the first one
            int buffer_full = ( reader->nbuffer && reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) ? 1 : 0;
            if ( reader->itr && !buffer_full )
            {
                // Fill the buffer with records starting at the same position
                while (1)
                {
                    // Slot nbuffer+1 is written below; grow the buffer by 8 pre-allocated records when it is missing
                    if ( reader->nbuffer+1 >= reader->mbuffer ) 
                    {
                        reader->mbuffer += 8;
                        reader->buffer = (bcf1_t**) realloc(reader->buffer, sizeof(bcf1_t*)*reader->mbuffer);
                        for (j=8; j>0; j--)
                            reader->buffer[reader->mbuffer-j] = bcf_init1();
                    }
                    if ( reader->tbx )
                    {
                        ret = tbx_itr_next((BGZF*)reader->file->fp, reader->tbx, reader->itr, str);
                        if ( ret<0 ) break;
                        vcf_parse1(str, reader->header, reader->buffer[reader->nbuffer+1]);
                    }
                    else
                    {
                        ret = bcf_itr_next((BGZF*)reader->file->fp, reader->itr, reader->buffer[reader->nbuffer+1]);
                        if ( ret<0 ) break;
                    }
                    bcf_unpack(reader->buffer[reader->nbuffer+1], BCF_UN_STR|BCF_UN_FLT);
                    // apply filter
                    // (nbuffer is not advanced on a rejected record, so its slot is overwritten by the next read)
                    if ( reader->filter_id!=-1 && reader->buffer[reader->nbuffer+1]->d.n_flt && reader->filter_id!=reader->buffer[reader->nbuffer+1]->d.flt[0] ) continue;
                    set_variant_types(reader->buffer[reader->nbuffer+1]);
                    reader->nbuffer++;
                    // Stop after the first record at a new position; it stays buffered as the last entry
                    if ( reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) break;
                }
                if ( ret<0 ) { tbx_itr_destroy(reader->itr); reader->itr = NULL; } // done for this chromosome
            }
            if ( reader->nbuffer )
            {
                if ( min_pos > reader->buffer[1]->pos ) min_pos = reader->buffer[1]->pos; 
            }
            // The buffer is full - either there is nothing else to read or the last record has a different coordinate
            if ( files->collapse && reader->nbuffer>2 && reader->buffer[1]->pos==reader->buffer[2]->pos )
            {
                collapse_buffer(files, reader);
            }
        }
        if ( files->targets && min_pos!=INT_MAX )
        {
            // NOTE: this `ret` shadows the function-scope `ret`
            int ret = tgt_has_position(files->targets, min_pos);
            // min_pos is set at this point, so this `continue` ends the enclosing while loop
            if ( ret==1 ) continue;

            // The position must be skipped
            if ( ret==-1 )
            {
                // done for this chromosome, don't read the rest
                for (i=0; i<files->nreaders; i++) 
                {
                    files->readers[i].nbuffer = 0;
                    if ( files->readers[i].itr )
                    {
                        tbx_itr_destroy(files->readers[i].itr);
                        files->readers[i].itr = NULL;
                    }
                }
                min_pos = INT_MAX;
                continue;
            }

            // remove the active line, save the buffer line
            for (i=0; i<files->nreaders; i++)
            {
                reader_t *reader = &files->readers[i];
                // j ends on the first buffered record that is not at min_pos
                for (j=1; j<=reader->nbuffer; j++)
                    if ( reader->buffer[j]->pos!=min_pos ) break;
                if ( j==1 ) continue;   // this reader has nothing at min_pos
                if ( j<=reader->nbuffer )
                {
                    // keep only that record, swapped into slot 1
                    bcf1_t *tmp = reader->buffer[1]; reader->buffer[1] = reader->buffer[j]; reader->buffer[j] = tmp;
                    reader->nbuffer = 1;
                }
                else 
                    reader->nbuffer = 0;
            }
            min_pos = INT_MAX;
        }
    }

    //printf("[next_line] min_pos=%d\n", min_pos+1);
    //debug_buffers(files);

    // Set the current line
    ret = 0;
    // Record of the first reader that has a line at min_pos; the other readers are matched against it
    bcf1_t *first = NULL;
    for (i=0; i<files->nreaders; i++)
    {
        reader_t *reader = &files->readers[i];
        if ( !reader->nbuffer || reader->buffer[1]->pos!=min_pos ) continue;

        // Match the records by REF and ALT
        // NOTE: this `j` shadows the function-scope `j`; irec is the buffer index of the chosen record
        int j, irec = -1;
        if ( first )
        {
            for (j=1; j<=reader->nbuffer; j++)
            {
                bcf1_t *line = reader->buffer[j];
                if ( min_pos != line->pos ) break;  // done with this buffer

                if ( files->collapse&COLLAPSE_ANY ) { irec=j; break; }  // checking position only
                if ( files->collapse&COLLAPSE_SNPS && first->d.var_type&VCF_SNP && line->d.var_type&VCF_SNP ) { irec=j; break; }
                if ( files->collapse&COLLAPSE_INDELS && first->d.var_type&VCF_INDEL && line->d.var_type&VCF_INDEL ) { irec=j; break; }

                if ( first->rlen != line->rlen ) continue;  // REFs do not match
                if ( strcmp(first->d.allele[0], line->d.allele[0]) ) continue; // REFs do not match
                int ial,jal;
                if ( files->collapse==COLLAPSE_NONE )
                {
                    // require exact match, all alleles must be identical
                    if ( first->n_allele!=line->n_allele ) continue;   // different number of alleles
                    int nmatch = 1; // REF has been already checked
                    for (ial=1; ial<first->n_allele; ial++)
                    {
                        for (jal=1; jal<line->n_allele; jal++)
                            if ( !strcmp(first->d.allele[ial], line->d.allele[jal]) ) { nmatch++; break; }
                    }
                    if ( nmatch>=first->n_allele ) { irec=j; break; }
                }
                else
                {
                    // thorough check: the REFs and some of the alleles have to be shared
                    // (neglecting different representations of the same indel for now)
                    for (ial=1; ial<first->n_allele; ial++)
                    {
                        for (jal=1; jal<line->n_allele; jal++)
                            if ( !strcmp(first->d.allele[ial], line->d.allele[jal]) ) { irec=j; break; }
                        if ( irec>=1 ) break;
                    }
                }
                if ( irec>=1 ) break;
            }
            if ( irec==-1 ) continue;   // nothing in this reader matches the first record
        }
        else 
        {
            first = reader->buffer[1];
            irec  = 1;
        }
        // Move the chosen record to slot 0, close the gap it leaves, and park the
        // previous slot-0 record at the end of the buffer so it can be reused
        bcf1_t *tmp = reader->buffer[0];
        reader->buffer[0] = reader->buffer[irec];
        for (j=irec+1; j<=reader->nbuffer; j++)
            reader->buffer[j-1] = reader->buffer[j];
        reader->buffer[ reader->nbuffer ] = tmp;
        reader->nbuffer--;
        // NOTE(review): bit i of an int marks reader i, which presumably caps the usable number of readers at the width of int -- confirm
        ret |= 1<<i;
    }
    // fprintf(stdout,"[next_line] min_pos=%d mask=%d\n", min_pos+1, ret);
    // debug_buffers(stdout,files);

    return ret;
}
예제 #9
0
파일: tabix.c 프로젝트: Illumina/akt
/*
 * Prints to stdout the records of `fname` that overlap each of the `nregs`
 * regions in `regs`, optionally preceded by the header.
 *
 * BCF input is queried through its BCF index and written through htslib;
 * VCF, SAM and unrecognised formats are queried through the tabix index and
 * printed line by line. When args->targets_fname is set, records are further
 * restricted to those overlapping the regions loaded from that file.
 *
 * Regions for which no iterator can be created are skipped in both branches.
 *
 * Takes ownership of `regs`: every element and the array itself are freed
 * before returning.
 *
 * Returns 0. Failures are reported through error(), which is assumed not to
 * return (the code following each check relies on that).
 */
static int query_regions(args_t *args, char *fname, char **regs, int nregs)
{
    int i;
    htsFile *fp = hts_open(fname,"r");
    if ( !fp ) error("Could not read %s\n", fname);
    enum htsExactFormat format = hts_get_format(fp)->format;

    regidx_t *reg_idx = NULL;
    if ( args->targets_fname )
    {
        reg_idx = regidx_init(args->targets_fname, NULL, NULL, 0, NULL);
        if ( !reg_idx ) error("Could not read %s\n", args->targets_fname);
    }

    if ( format == bcf )
    {
        htsFile *out = hts_open("-","w");
        if ( !out ) error("Could not open stdout\n");
        hts_idx_t *idx = bcf_index_load(fname);
        if ( !idx ) error("Could not load .csi index of %s\n", fname);
        bcf_hdr_t *hdr = bcf_hdr_read(fp);
        if ( !hdr ) error("Could not read the header: %s\n", fname);
        // a failed write must not go unnoticed, the output would be silently truncated
        if ( args->print_header && bcf_hdr_write(out,hdr)!=0 )
            error("Could not write the header to stdout\n");
        if ( !args->header_only )
        {
            bcf1_t *rec = bcf_init();
            for (i=0; i<nregs; i++)
            {
                hts_itr_t *itr = bcf_itr_querys(idx,hdr,regs[i]);
                if ( !itr ) continue;   // same handling as in the tabix branch below
                while ( bcf_itr_next(fp, itr, rec) >=0 )
                {
                    if ( reg_idx && !regidx_overlap(reg_idx, bcf_seqname(hdr,rec),rec->pos,rec->pos+rec->rlen-1, NULL) ) continue;
                    if ( bcf_write(out,hdr,rec)!=0 ) error("Could not write a record to stdout\n");
                }
                tbx_itr_destroy(itr);
            }
            bcf_destroy(rec);
        }
        if ( hts_close(out) ) error("hts_close returned non-zero status for stdout\n");
        bcf_hdr_destroy(hdr);
        hts_idx_destroy(idx);
    }
    else if ( format==vcf || format==sam || format==unknown_format )
    {
        tbx_t *tbx = tbx_index_load(fname);
        if ( !tbx ) error("Could not load .tbi/.csi index of %s\n", fname);
        kstring_t str = {0,0,0};
        if ( args->print_header )
        {
            // the header is the leading run of lines that start with the index's meta character
            while ( hts_getline(fp, KS_SEP_LINE, &str) >= 0 )
            {
                if ( !str.l || str.s[0]!=tbx->conf.meta_char ) break;
                puts(str.s);
            }
        }
        if ( !args->header_only )
        {
            int nseq;
            const char **seq = NULL;
            // sequence names are only needed to look records up in the target regions
            if ( reg_idx ) seq = tbx_seqnames(tbx, &nseq);
            for (i=0; i<nregs; i++)
            {
                hts_itr_t *itr = tbx_itr_querys(tbx, regs[i]);
                if ( !itr ) continue;
                while (tbx_itr_next(fp, tbx, itr, &str) >= 0)
                {
                    if ( reg_idx && !regidx_overlap(reg_idx,seq[itr->curr_tid],itr->curr_beg,itr->curr_end, NULL) ) continue;
                    puts(str.s);
                }
                tbx_itr_destroy(itr);
            }
            free(seq);
        }
        free(str.s);
        tbx_destroy(tbx);
    }
    else if ( format==bam )
        error("Please use \"samtools view\" for querying BAM files.\n");

    if ( reg_idx ) regidx_destroy(reg_idx);
    if ( hts_close(fp) ) error("hts_close returned non-zero status: %s\n", fname);

    for (i=0; i<nregs; i++) free(regs[i]);
    free(regs);
    return 0;
}