Exemplo n.º 1
0
/**
 * Entry point of the "vcfview" sub-command: reads a VCF/BCF file and writes
 * every record to an output stream, optionally in a different format.
 *
 * Options handled here:
 *   -o <b|u|z|v>  output type: FT_BCF_GZ, FT_BCF, FT_VCF_GZ or FT_VCF
 *   -l <int>      compression level; also ORs FT_GZ into the output type
 *   -v            input type is VCF (only consulted when the input is "-")
 *   -b, -u, -z    same as -o b, -o u, -o z
 *   -n <file>     output file name (defaults to "-")
 *   -h, -?        print usage
 *
 * Exactly one positional argument (the input file) is required.
 *
 * Returns 0 after copying all records and 1 after printing the usage text.
 * Open/header failures are reported through error(), which is presumably
 * fatal (the code does not return after calling it) - confirm against its
 * definition.
 */
int main_vcfview(int argc, char *argv[])
{
    int c, clevel = -1, in_type = FT_BCF, out_type = FT_VCF;
    char *fname_out = NULL, moder[8], modew[8];

    while ((c = getopt(argc, argv, "l:bvo:n:z?hu")) >= 0) {
        switch (c) {
            case 'o':
                // Only the first character of the argument selects the type.
                switch (optarg[0]) {
                    case 'b': out_type = FT_BCF_GZ; break;
                    case 'u': out_type = FT_BCF; break;
                    case 'z': out_type = FT_VCF_GZ; break;
                    case 'v': out_type = FT_VCF; break;
                    default: error("The output type \"%s\" not recognised\n", optarg);
                }
                break;
            case 'l': clevel = atoi(optarg); out_type |= FT_GZ; break;
            case 'v': in_type  = FT_VCF; break;
            case 'b': out_type = FT_BCF_GZ; break;
            case 'u': out_type = FT_BCF; break;
            case 'z': out_type = FT_VCF_GZ; break;
            case 'n': fname_out = optarg; break;
            case '?':
            case 'h': usage(); return 1;
        }
    }
    if (argc != optind + 1) { usage(); return 1; }

    // Init reader: binary mode when the file type says BCF, or when reading
    // from "-" and the user did not override the BCF default with -v.
    strcpy(moder, "r");
    if ( (!strcmp("-", argv[optind]) && (in_type & FT_BCF)) || (hts_file_type(argv[optind]) & FT_BCF) )
        strcat(moder, "b");
    htsFile *fp_in = hts_open(argv[optind], moder, NULL);
    if ( !fp_in ) error("Fail to open: %s\n", argv[optind]);
    bcf_hdr_t *hdr = vcf_hdr_read(fp_in);
    if ( !hdr ) error("Fail to read VCF/BCF header: %s\n", argv[optind]);
    bcf1_t *rec = bcf_init1();

    // Init writer: mode is "w" followed by the optional level digit and the
    // z/b/u flags; at most 5 characters, which fits modew[8].
    strcpy(modew, "w");
    if (clevel >= 0 && clevel <= 9) sprintf(modew + 1, "%d", clevel);
    if (out_type & FT_GZ) strcat(modew, "z");
    if (out_type & FT_BCF) strcat(modew, "b");
    if (out_type == FT_BCF) strcat(modew, "u"); // uncompressed BCF output
    htsFile *fp_out = hts_open(fname_out ? fname_out : "-", modew, NULL);
    // The output handle must be validated just like the input handle,
    // otherwise a failed open is passed straight to the writer functions.
    if ( !fp_out ) error("Fail to open: %s\n", fname_out ? fname_out : "-");

    vcf_hdr_write(fp_out, hdr);
    while ( vcf_read1(fp_in, hdr, rec) >= 0 ) vcf_write1(fp_out, hdr, rec);

    bcf_destroy1(rec);
    bcf_hdr_destroy(hdr);
    hts_close(fp_in);
    hts_close(fp_out);

    return 0;
}
Exemplo n.º 2
0
/**
 * Initialize buffer for next interval.
 * This should only be invoked if the buffer is empty.
 * Returns true if successful.
 */
bool BCFSyncedStreamReader::initialize_next_interval()
{
    while (intervals_index < intervals.size())
    {
        GenomeInterval interval = intervals[intervals_index++];

        for (int32_t i = 0; i<nfiles; ++i)
        {
            int32_t ftype = hts_file_type(vcf_files[i].c_str());
            hts_itr_destroy(itrs[i]);
            itrs[i] = 0;
            interval.to_string(&s);

            if (ftype==FT_BCF_GZ)
            {
                itrs[i] = bcf_itr_querys(idxs[i], hdrs[i], s.s);
            }
            else if (ftype==FT_VCF_GZ)
            {
                itrs[i] = tbx_itr_querys(tbxs[i], s.s);
            }

            fill_buffer(i);
        }

        //make sure pq is not empty
        //it is possible for the pq to be empty as iterators may be returned
        //as the sequence might be a valid sequence stated in the header
        if (pq.size()!=0)
        {
            return true;
        }
    }

    return false;
}
Exemplo n.º 3
0
/**
 * Entry point of the index sub-command: builds an index for a file whose
 * detected type is FT_BCF_GZ or FT_VCF_GZ.
 *
 * Options:
 *   -f, --force          overwrite an existing index
 *   -m, --min-shift INT  min_shift passed to the index builder, range [0,30]
 *                        (default 14)
 *   -h, -?, --help       print usage
 *
 * The first positional argument is the file to index. usage() is presumably
 * non-returning (argv[optind] is used right after it is called) - confirm
 * against its definition.
 *
 * Returns 0 on success and 1 on any error.
 */
int main_vcfindex(int argc, char *argv[])
{
    int c, min_shift = 14, force = 0;

    static struct option loptions[] =
    {
        {"help",0,0,'h'},
        {"force",0,0,'f'},
        {"min-shift",1,0,'m'},
        {0,0,0,0}
    };

    while ((c = getopt_long(argc, argv, "h?fm:", loptions,NULL)) >= 0)
    {
        switch (c)
        {
            case 'f': force = 1; break;
            case 'm': min_shift = atoi(optarg); break;
            default: usage();
        }
    }
    if ( optind==argc ) usage();
    if (min_shift < 0 || min_shift > 30)
    {
        fprintf(stderr, "[E::%s] expected min_shift in range [0,30] (%d)\n", __func__, min_shift);
        return 1;
    }

    char *fname = argv[optind];
    int ftype = hts_file_type(fname);
    // Reject everything that cannot be indexed below. Previously a recognised
    // but unsupported type (e.g. plain FT_VCF) skipped both build branches and
    // reported success without producing an index.
    if (!ftype || (ftype != FT_BCF_GZ && ftype != FT_VCF_GZ))
    {
        fprintf(stderr, "[E::%s] unknown filetype; expected .vcf.gz or .bcf\n", __func__);
        return 1;
    }

    if (!force)
    {
        // Before complaining about existing index, check if the VCF file isn't newer.
        // +5 leaves room for the 4-character suffix and the terminating NUL.
        char *idx_fname = (char*)alloca(strlen(fname) + 5);
        strcat(strcpy(idx_fname, fname), min_shift <= 0 ? ".tbi" : ".csi");
        struct stat stat_tbi, stat_file;
        if ( stat(idx_fname, &stat_tbi)==0 )
        {
            // Compare timestamps only when the data file could be stat'ed too;
            // otherwise stat_file would be read uninitialised. A missing data
            // file is reported by the index builder below.
            if ( stat(fname, &stat_file)==0 && stat_file.st_mtime <= stat_tbi.st_mtime )
            {
                fprintf(stderr,"[E::%s] the index file exists. Please use '-f' to overwrite.\n", __func__);
                return 1;
            }
        }
    }

    if (ftype == FT_BCF_GZ)
    {
        if ( bcf_index_build(fname, min_shift) != 0 )
        {
            fprintf(stderr,"[E::%s] bcf_index_build failed: %s\n", __func__, fname);
            return 1;
        }
    }
    else
    {
        // ftype == FT_VCF_GZ, guaranteed by the type check above
        if ( tbx_index_build(fname, min_shift, &tbx_conf_vcf) != 0 )
        {
            fprintf(stderr,"[E::%s] tbx_index_build failed for %s\n", __func__, fname);
            return 1;
        }
    }
    return 0;
}
Exemplo n.º 4
0
/**
 * Prints per-contig record counts taken from the index of a VCF/BCF file.
 *
 * @param fname  file whose index (TBI for VCF or unknown type, CSI for BCF)
 *               is loaded
 * @param stats  bit mask; when bit 2 is set only the total number of records
 *               is printed, otherwise one "name<TAB>length<TAB>records" line
 *               is printed for every contig with a non-zero record count.
 *               A contig length of "2147483647" is printed as ".".
 *
 * Output goes to stdout, which is closed on success (as before).
 *
 * @return 0 on success, 1 on any error. All resources acquired so far are
 *         released on the error paths as well.
 */
int vcf_index_stats(char *fname, int stats)
{
    char *fn_out = NULL;
    FILE *out = fn_out ? fopen(fn_out, "w") : stdout;

    const char **seq = NULL;
    int i, nseq, ftype = hts_file_type(fname);
    int ret = 1;                // pessimistic default, cleared on success
    tbx_t *tbx = NULL;
    hts_idx_t *idx = NULL;
    bcf_hdr_t *hdr = NULL;
    bcf1_t *rec = NULL;
    uint64_t sum = 0;

    htsFile *fp = hts_open(fname,"r");
    if ( !fp ) { fprintf(stderr,"Could not read %s\n", fname); goto cleanup; }
    hdr = bcf_hdr_read(fp);
    if ( !hdr ) { fprintf(stderr,"Could not read the header: %s\n", fname); goto cleanup; }

    if ( ftype & FT_VCF || !ftype )
    {
        tbx = tbx_index_load(fname);
        if ( !tbx ) { fprintf(stderr,"Could not load TBI index: %s\n", fname); goto cleanup; }
    }
    else if ( ftype & FT_BCF )
    {
        idx = bcf_index_load(fname);
        if ( !idx ) { fprintf(stderr,"Could not load CSI index: %s\n", fname); goto cleanup; }
    }
    else
    {
        fprintf(stderr,"Could not detect the file type as VCF or BCF: %s\n", fname);
        goto cleanup;
    }

    seq = tbx ? tbx_seqnames(tbx, &nseq) : bcf_index_seqnames(idx, hdr, &nseq);
    for (i=0; i<nseq; i++)
    {
        uint64_t records, v;
        hts_idx_get_stat(tbx ? tbx->idx : idx, i, &records, &v);
        sum+=records;
        if (stats&2 || !records) continue;
        bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", seq[i], NULL);
        int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1;
        if (hkey<0)
        {
            fprintf(stderr,"could not get contig length for %s\n", seq[i]);
            goto cleanup;
        }
        fprintf(out, "%s\t%s", seq[i], strcmp(hrec->vals[hkey], "2147483647")==0 ? "." : hrec->vals[hkey]);
        fprintf(out, "\t%" PRIu64 "\n", records);
    }
    if (!sum)
    {
        // No counts found.
        // Is this because index version has no stored count data, or no records?
        rec = bcf_init1();
        if (bcf_read1(fp, hdr, rec) >= 0)
        {
            fprintf(stderr,"%s index of %s does not contain any count metadata. Please re-index with a newer version of bcftools or tabix.\n", tbx ? "TBI" : "CSI", fname);
            goto cleanup;
        }
    }
    if (stats&2) fprintf(out, "%" PRIu64 "\n", sum);
    fclose(out);
    ret = 0;

cleanup:
    // Single exit point: release whatever has been acquired so far.
    if (rec) bcf_destroy1(rec);
    free(seq);
    if (fp) hts_close(fp);
    if (hdr) bcf_hdr_destroy(hdr);
    if (tbx)
        tbx_destroy(tbx);
    if (idx)
        hts_idx_destroy(idx);
    return ret;
}
Exemplo n.º 5
0
/**
 * Entry point of the index sub-command: builds a CSI or TBI index for a file
 * whose detected type is FT_BCF_GZ or FT_VCF_GZ, or prints index statistics.
 *
 * Options:
 *   -c, --csi            generate a CSI index (default)
 *   -t, --tbi            generate a TBI index; also sets min_shift to 0
 *   -f, --force          overwrite an existing index
 *   -m, --min-shift INT  min_shift for CSI indices, range [0,30]
 *   -s, --stats          print per-contig statistics from the index
 *   -n, --nrecords       print only the total number of records
 *
 * The first positional argument is the file to index. usage() is presumably
 * non-returning (argv[optind] is used right after it is called) - confirm
 * against its definition.
 *
 * Returns 0 on success and 1 on any error; with --stats/--nrecords the
 * result of vcf_index_stats() is returned.
 */
int main_vcfindex(int argc, char *argv[])
{
    int c, force = 0, tbi = 0, stats = 0;
    int min_shift = BCF_LIDX_SHIFT;

    static struct option loptions[] =
    {
        {"csi",no_argument,NULL,'c'},
        {"tbi",no_argument,NULL,'t'},
        {"force",no_argument,NULL,'f'},
        {"min-shift",required_argument,NULL,'m'},
        {"stats",no_argument,NULL,'s'},
        {"nrecords",no_argument,NULL,'n'},
        {NULL, 0, NULL, 0}
    };

    while ((c = getopt_long(argc, argv, "ctfm:sn", loptions, NULL)) >= 0)
    {
        switch (c)
        {
            case 'c': tbi = 0; break;
            case 't': tbi = 1; min_shift = 0; break;
            case 'f': force = 1; break;
            case 'm': min_shift = atoi(optarg); break;
            case 's': stats |= 1; break;
            case 'n': stats |= 2; break;
            default: usage();
        }
    }
    if ( optind==argc ) usage();
    // stats is a bit mask (1 = --stats, 2 = --nrecords); 3 means both were given
    if (stats>2)
    {
        fprintf(stderr, "[E::%s] expected only one of --stats or --nrecords options\n", __func__);
        return 1;
    }
    if (tbi && min_shift>0)
    {
        fprintf(stderr, "[E::%s] min-shift option only expected for CSI indices \n", __func__);
        return 1;
    }
    if (min_shift < 0 || min_shift > 30)
    {
        fprintf(stderr, "[E::%s] expected min_shift in range [0,30] (%d)\n", __func__, min_shift);
        return 1;
    }

    char *fname = argv[optind];
    if (stats) return vcf_index_stats(fname, stats);
    int ftype = hts_file_type(fname);
    if (!ftype || (ftype != FT_BCF_GZ && ftype != FT_VCF_GZ))
    {
        fprintf(stderr, "[E::%s] unknown filetype; expected bgzip compressed VCF or BCF\n", __func__);
        if (!(ftype & FT_GZ))
            fprintf(stderr, "[E::%s] was the VCF/BCF compressed with bgzip?\n", __func__);
        return 1;
    }
    if (tbi && ftype == FT_BCF_GZ)
    {
        fprintf(stderr, "[Warning] TBI-index does not work for BCF files. Generating CSI instead.\n");
        tbi = 0; min_shift = BCF_LIDX_SHIFT;
    }
    if (min_shift == 0 && ftype == FT_BCF_GZ)
    {
        fprintf(stderr, "[E::%s] Require min_shift>0 for BCF files.\n", __func__);
        return 1;
    }
    if (!tbi && ftype == FT_VCF_GZ && min_shift == 0)
    {
        fprintf(stderr, "[Warning] min-shift set to 0 for VCF file. Generating TBI file.\n");
        tbi = 1;
    }

    if (!force)
    {
        // Before complaining about existing index, check if the VCF file isn't newer.
        // +5 leaves room for the 4-character suffix and the terminating NUL.
        char *idx_fname = (char*)alloca(strlen(fname) + 5);
        strcat(strcpy(idx_fname, fname), tbi ? ".tbi" : ".csi");
        struct stat stat_tbi, stat_file;
        if ( stat(idx_fname, &stat_tbi)==0 )
        {
            // Compare timestamps only when the data file could be stat'ed too;
            // otherwise stat_file would be read uninitialised. A missing data
            // file is reported by the index builder below.
            if ( stat(fname, &stat_file)==0 && stat_file.st_mtime <= stat_tbi.st_mtime )
            {
                fprintf(stderr,"[E::%s] the index file exists. Please use '-f' to overwrite.\n", __func__);
                return 1;
            }
        }
    }

    if (ftype == FT_BCF_GZ)
    {
        if ( bcf_index_build(fname, min_shift) != 0 )
        {
            fprintf(stderr,"[E::%s] bcf_index_build failed for %s\n", __func__, fname);
            return 1;
        }
    }
    else
    {
        // ftype == FT_VCF_GZ, guaranteed by the type check above
        if ( tbx_index_build(fname, min_shift, &tbx_conf_vcf) != 0 )
        {
            fprintf(stderr,"[E::%s] tbx_index_build failed for %s\n", __func__, fname);
            return 1;
        }
    }
    return 0;
}
Exemplo n.º 6
0
/**
 * Constructs a synchronised reader over several VCF/BCF files.
 *
 * For every file this opens it, reads its header, validates the file type,
 * loads its index and, when no intervals were supplied, adds the sequences
 * of that file as intervals. Any failure terminates the process via exit(1).
 *
 * @vcf_files   - files to read; the first one may be any VCF/BCF file or
 *                stdin, the remaining ones must be VCF_GZ or BCF_GZ
 * @intervals   - intervals to visit; if empty, the contigs found in the
 *                files are added instead
 * @sync_by_pos - stored as is in the member of the same name
 */
BCFSyncedStreamReader::BCFSyncedStreamReader(std::vector<std::string>& vcf_files, std::vector<GenomeInterval>& intervals, bool sync_by_pos)
:vcf_files(vcf_files), intervals(intervals), sync_by_pos(sync_by_pos)
{
    // size all per-file containers up front
    nfiles = vcf_files.size();
    vcfs.resize(nfiles, 0);
    hdrs.resize(nfiles, 0);
    idxs.resize(nfiles, 0);
    tbxs.resize(nfiles, 0);
    itrs.resize(nfiles, 0);
    ftypes.resize(nfiles, -1);

    current_interval = "";
    current_pos1 = 0;

    buffer.resize(nfiles);
    s = {0, 0, 0};

    // remember whether the caller restricted the intervals and index them by name
    exists_selected_intervals = !intervals.empty();
    for (uint32_t k = 0; k < intervals.size(); ++k)
    {
        intervals_map[intervals[k].to_string()] = k;
    }
    intervals_index = 0;

    //for each file:
    //1. check file type validity
    //2. load its index
    //3. add its sequences unless intervals were selected, so that all sequences get iterated
    for (int32_t file = 0; file < nfiles; ++file)
    {
        const char *path = vcf_files[file].c_str();
        const bool is_first = (file == 0);

        ftypes[file] = hts_file_type(path);
        vcfs[file] = bcf_open(path, "r");
        if (vcfs[file]==NULL) exit(1);
        hdrs[file] = bcf_alt_hdr_read(vcfs[file]);

        //the first file is allowed a wider set of types than the rest
        if (is_first)
        {
            if (!(ftypes[file] & (FT_VCF|FT_BCF|FT_STDIN)))
            {
                fprintf(stderr, "[E:%s:%d %s] %s not a VCF or BCF file\n", __FILE__, __LINE__, __FUNCTION__, vcf_files[file].c_str());
                exit(1);
            }
        }
        else
        {
            if (!(ftypes[file] & (FT_VCF_GZ|FT_BCF_GZ)))
            {
                fprintf(stderr, "[E:%s:%d %s] %s not a VCF_GZ or BCF file\n", __FILE__, __LINE__, __FUNCTION__, vcf_files[file].c_str());
                exit(1);
            }
        }

        //an index is mandatory for every file; only the message prefix differs
        if (!load_index(file))
        {
            if (is_first)
            {
                fprintf(stderr, "[I:%s:%d %s] index cannot be loaded for %s\n", __FILE__, __LINE__, __FUNCTION__, vcf_files[file].c_str());
            }
            else
            {
                fprintf(stderr, "[E:%s:%d %s] index cannot be loaded for %s\n", __FILE__, __LINE__, __FUNCTION__, vcf_files[file].c_str());
            }
            exit(1);
        }

        if (!exists_selected_intervals)
        {
            //add sequences from this file
            add_interval(file);
        }
    }
}
Exemplo n.º 7
0
/**
 * Detects the type of the file named by args->fname and stores the result
 * in args->file_type.
 */
static void init_data(args_t *args)
{
    int detected_type = hts_file_type(args->fname);

    args->file_type = detected_type;
}
Exemplo n.º 8
0
/**
 * Creates a regions structure from a region string or from a regions file.
 *
 * @param regions  region string (is_file == 0) or name of the regions file
 * @param is_file  non-zero if @regions names a file
 * @param ichr     column index of the sequence name
 * @param ifrom    column index of the start coordinate
 * @param ito      column index of the end coordinate; when negative,
 *                 abs(ito) is tried first and the start column is used as
 *                 the end column if that fails. Forced to 1 when the file
 *                 type is detected as VCF.
 *
 * Column indices appear to be 0-based (the error message prints them +1).
 *
 * If a tabix index can be loaded for the file, the file is kept open and
 * only the sequence names are read from the index. Otherwise the whole file
 * is parsed into memory and closed; for names ending in ".bed" or ".bed.gz"
 * (case-insensitive) the start coordinate is incremented by one.
 *
 * @return the new structure, or NULL on allocation failure, if the file
 *         cannot be opened or parsed, or if it contains no regions.
 */
bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr, int ifrom, int ito)
{
    bcf_sr_regions_t *reg;
    if ( !is_file ) return _regions_init_string(regions);

    reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t));
    if ( !reg )
    {
        fprintf(stderr,"[%s:%d %s] Could not allocate memory\n", __FILE__,__LINE__,__FUNCTION__);
        return NULL;
    }
    reg->start = reg->end = -1;
    reg->prev_start = reg->prev_seq = -1;

    reg->file = hts_open(regions, "rb");
    if ( !reg->file )
    {
        fprintf(stderr,"[%s:%d %s] Could not open file: %s\n", __FILE__,__LINE__,__FUNCTION__,regions);
        free(reg);
        return NULL;
    }

    reg->tbx = tbx_index_load(regions);
    if ( !reg->tbx )
    {
        // Guard the suffix tests with the name length: for names shorter than
        // the suffix, regions+len-N would point before the start of the string.
        size_t len = strlen(regions);
        int is_bed = ( len >= 4 && !strcasecmp(".bed",regions+len-4) ) ? 1 : 0;
        if ( !is_bed && len >= 7 && !strcasecmp(".bed.gz",regions+len-7) ) is_bed = 1;
        int ft_type = hts_file_type(regions);
        if ( ft_type & FT_VCF ) ito = 1;

        // read the whole file, tabix index is not present
        while ( hts_getline(reg->file, KS_SEP_LINE, &reg->line) > 0 )
        {
            char *chr, *chr_end;
            int from, to, ret;
            ret = _regions_parse_line(reg->line.s, ichr,ifrom,abs(ito), &chr,&chr_end,&from,&to);
            if ( ret < 0 )
            {
                // a negative ito makes the end column optional: retry with
                // the start column doubling as the end column
                if ( ito<0 )
                    ret = _regions_parse_line(reg->line.s, ichr,ifrom,ifrom, &chr,&chr_end,&from,&to);
                if ( ret<0 )
                {
                    fprintf(stderr,"[%s:%d] Could not parse the file %s, using the columns %d,%d[,%d]\n", __FILE__,__LINE__,regions,ichr+1,ifrom+1,ito+1);
                    hts_close(reg->file); reg->file = NULL;
                    free(reg->line.s);      // line buffer filled by hts_getline
                    free(reg);
                    return NULL;
                }
            }
            if ( !ret ) continue;   // nothing to add for this line
            if ( is_bed ) from++;
            // temporarily terminate the sequence name in place
            *chr_end = 0;
            _regions_add(reg, chr, from, to);
            *chr_end = '\t';
        }
        hts_close(reg->file); reg->file = NULL;
        if ( !reg->nseqs )
        {
            free(reg->line.s);      // line buffer filled by hts_getline
            free(reg);
            return NULL;
        }
        return reg;
    }

    // tabix index present: take the sequence names from the index
    reg->seq_names = (char**) tbx_seqnames(reg->tbx, &reg->nseqs);
    if ( !reg->seq_hash )
        reg->seq_hash = khash_str2int_init();
    int i;
    for (i=0; i<reg->nseqs; i++)
    {
        khash_str2int_set(reg->seq_hash,reg->seq_names[i],i);
    }
    reg->fname  = strdup(regions);
    reg->is_bin = 1;
    return reg;
}