int main_vcfview(int argc, char *argv[]) { int c, clevel = -1, in_type = FT_BCF, out_type = FT_VCF; char *fname_out = NULL, moder[8], modew[8]; while ((c = getopt(argc, argv, "l:bvo:n:z?hu")) >= 0) { switch (c) { case 'o': switch (optarg[0]) { case 'b': out_type = FT_BCF_GZ; break; case 'u': out_type = FT_BCF; break; case 'z': out_type = FT_VCF_GZ; break; case 'v': out_type = FT_VCF; break; default: error("The output type \"%s\" not recognised\n", optarg); } break; case 'l': clevel = atoi(optarg); out_type |= FT_GZ; break; case 'v': in_type = FT_VCF; break; case 'b': out_type = FT_BCF_GZ; break; case 'u': out_type = FT_BCF; break; case 'z': out_type = FT_VCF_GZ; break; case 'n': fname_out = optarg; break; case '?': case 'h': usage(); return 1; break; } } if (argc!=optind+1) { usage(); return 1; } // Init reader strcpy(moder, "r"); if ( (!strcmp("-",argv[optind]) && (in_type & FT_BCF)) || (hts_file_type(argv[optind]) & FT_BCF)) strcat(moder, "b"); htsFile *fp_in = hts_open(argv[optind], moder, NULL); if ( !fp_in ) error("Fail to open: %s\n", argv[optind]); bcf_hdr_t *hdr = vcf_hdr_read(fp_in); if ( !hdr ) error("Fail to read VCF/BCF header: %s\n", argv[optind]); bcf1_t *rec = bcf_init1(); // Init writer strcpy(modew, "w"); if (clevel >= 0 && clevel <= 9) sprintf(modew + 1, "%d", clevel); if (out_type & FT_GZ) strcat(modew,"z"); if (out_type & FT_BCF) strcat(modew, "b"); if (out_type == FT_BCF) strcat(modew, "u"); // uncompressed BCF output htsFile *fp_out = hts_open(fname_out ? fname_out : "-", modew, NULL); vcf_hdr_write(fp_out, hdr); while ( vcf_read1(fp_in, hdr, rec) >= 0) vcf_write1(fp_out, hdr, rec); bcf_destroy1(rec); bcf_hdr_destroy(hdr); hts_close(fp_in); hts_close(fp_out); return 0; }
/** * Initialize buffer for next interval. * This should only be invoked if the buffer is empty. * Returns true if successful. */ bool BCFSyncedStreamReader::initialize_next_interval() { while (intervals_index < intervals.size()) { GenomeInterval interval = intervals[intervals_index++]; for (int32_t i = 0; i<nfiles; ++i) { int32_t ftype = hts_file_type(vcf_files[i].c_str()); hts_itr_destroy(itrs[i]); itrs[i] = 0; interval.to_string(&s); if (ftype==FT_BCF_GZ) { itrs[i] = bcf_itr_querys(idxs[i], hdrs[i], s.s); } else if (ftype==FT_VCF_GZ) { itrs[i] = tbx_itr_querys(tbxs[i], s.s); } fill_buffer(i); } //make sure pq is not empty //it is possible for the pq to be empty as iterators may be returned //as the sequence might be a valid sequence stated in the header if (pq.size()!=0) { return true; } } return false; }
int main_vcfindex(int argc, char *argv[]) { int c, min_shift = 14, force = 0; static struct option loptions[] = { {"help",0,0,'h'}, {"force",0,0,'f'}, {"min-shift",1,0,'m'}, {0,0,0,0} }; while ((c = getopt_long(argc, argv, "h?fm:", loptions,NULL)) >= 0) { switch (c) { case 'f': force = 1; break; case 'm': min_shift = atoi(optarg); break; default: usage(); } } if ( optind==argc ) usage(); if (min_shift < 0 || min_shift > 30) { fprintf(stderr, "[E::%s] expected min_shift in range [0,30] (%d)\n", __func__, min_shift); return 1; } char *fname = argv[optind]; int ftype = hts_file_type(fname); if (!ftype) { fprintf(stderr, "[E::%s] unknown filetype; expected .vcf.gz or .bcf\n", __func__); return 1; } if (!force) { // Before complaining about existing index, check if the VCF file isn't newer. char *idx_fname = (char*)alloca(strlen(fname) + 5); strcat(strcpy(idx_fname, fname), min_shift <= 0 ? ".tbi" : ".csi"); struct stat stat_tbi, stat_file; if ( stat(idx_fname, &stat_tbi)==0 ) { stat(fname, &stat_file); if ( stat_file.st_mtime <= stat_tbi.st_mtime ) { fprintf(stderr,"[E::%s] the index file exists. Please use '-f' to overwrite.\n", __func__); return 1; } } } if (ftype == FT_BCF_GZ) { if ( bcf_index_build(fname, min_shift) != 0 ) { fprintf(stderr,"[E::%s] bcf_index_build failed: %s\n", __func__, fname); return 1; } } else if (ftype == FT_VCF_GZ) { if ( tbx_index_build(fname, min_shift, &tbx_conf_vcf) != 0 ) { fprintf(stderr,"[E::%s] tbx_index_build failed for %s\n", __func__, fname); return 1; } } return 0; }
int vcf_index_stats(char *fname, int stats) { char *fn_out = NULL; FILE *out; out = fn_out ? fopen(fn_out, "w") : stdout; const char **seq; int i, nseq, ftype = hts_file_type(fname); tbx_t *tbx = NULL; hts_idx_t *idx = NULL; htsFile *fp = hts_open(fname,"r"); if ( !fp ) { fprintf(stderr,"Could not read %s\n", fname); return 1; } bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) { fprintf(stderr,"Could not read the header: %s\n", fname); return 1; } if ( ftype & FT_VCF || !ftype ) { tbx = tbx_index_load(fname); if ( !tbx ) { fprintf(stderr,"Could not load TBI index: %s\n", fname); return 1; } } else if ( ftype & FT_BCF ) { idx = bcf_index_load(fname); if ( !idx ) { fprintf(stderr,"Could not load CSI index: %s\n", fname); return 1; } } else { fprintf(stderr,"Could not detect the file type as VCF or BCF: %s\n", fname); return 1; } seq = tbx ? tbx_seqnames(tbx, &nseq) : bcf_index_seqnames(idx, hdr, &nseq); uint64_t sum = 0; for (i=0; i<nseq; i++) { uint64_t records, v; hts_idx_get_stat(tbx ? tbx->idx : idx, i, &records, &v); sum+=records; if (stats&2 || !records) continue; bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", seq[i], NULL); int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1; if (hkey<0) { fprintf(stderr,"could not get contig length for %s\n", seq[i]); return 1; } fprintf(out, "%s\t%s", seq[i], strcmp(hrec->vals[hkey], "2147483647")==0 ? "." : hrec->vals[hkey]); fprintf(out, "\t%" PRIu64 "\n", records); } if (!sum) { // No counts found. // Is this because index version has no stored count data, or no records? bcf1_t *rec = bcf_init1(); if (bcf_read1(fp, hdr, rec) >= 0) { fprintf(stderr,"%s index of %s does not contain any count metadata. Please re-index with a newer version of bcftools or tabix.\n", tbx ? "TBI" : "CSI", fname); return 1; } bcf_destroy1(rec); } if (stats&2) fprintf(out, "%" PRIu64 "\n", sum); free(seq); fclose(out); hts_close(fp); bcf_hdr_destroy(hdr); if (tbx) tbx_destroy(tbx); if (idx) hts_idx_destroy(idx); return 0; }
int main_vcfindex(int argc, char *argv[]) { int c, force = 0, tbi = 0, stats = 0; int min_shift = BCF_LIDX_SHIFT; static struct option loptions[] = { {"csi",no_argument,NULL,'c'}, {"tbi",no_argument,NULL,'t'}, {"force",no_argument,NULL,'f'}, {"min-shift",required_argument,NULL,'m'}, {"stats",no_argument,NULL,'s'}, {"nrecords",no_argument,NULL,'n'}, {NULL, 0, NULL, 0} }; while ((c = getopt_long(argc, argv, "ctfm:sn", loptions, NULL)) >= 0) { switch (c) { case 'c': tbi = 0; break; case 't': tbi = 1; min_shift = 0; break; case 'f': force = 1; break; case 'm': min_shift = atoi(optarg); break; case 's': stats |= 1; break; case 'n': stats |= 2; break; default: usage(); } } if ( optind==argc ) usage(); if (stats>2) { fprintf(stderr, "[E::%s] expected only one of --stats or --nrecords options\n", __func__); return 1; } if (tbi && min_shift>0) { fprintf(stderr, "[E::%s] min-shift option only expected for CSI indices \n", __func__); return 1; } if (min_shift < 0 || min_shift > 30) { fprintf(stderr, "[E::%s] expected min_shift in range [0,30] (%d)\n", __func__, min_shift); return 1; } char *fname = argv[optind]; if (stats) return vcf_index_stats(fname, stats); int ftype = hts_file_type(fname); if (!ftype || (ftype != FT_BCF_GZ && ftype != FT_VCF_GZ)) { fprintf(stderr, "[E::%s] unknown filetype; expected bgzip compressed VCF or BCF\n", __func__); if (!(ftype & FT_GZ)) fprintf(stderr, "[E::%s] was the VCF/BCF compressed with bgzip?\n", __func__); return 1; } if (tbi && ftype == FT_BCF_GZ) { fprintf(stderr, "[Warning] TBI-index does not work for BCF files. Generating CSI instead.\n"); tbi = 0; min_shift = BCF_LIDX_SHIFT; } if (min_shift == 0 && ftype == FT_BCF_GZ) { fprintf(stderr, "[E::%s] Require min_shift>0 for BCF files.\n", __func__); return 1; } if (!tbi && ftype == FT_VCF_GZ && min_shift == 0) { fprintf(stderr, "[Warning] min-shift set to 0 for VCF file. 
Generating TBI file.\n"); tbi = 1; } if (!force) { // Before complaining about existing index, check if the VCF file isn't newer. char *idx_fname = (char*)alloca(strlen(fname) + 5); strcat(strcpy(idx_fname, fname), tbi ? ".tbi" : ".csi"); struct stat stat_tbi, stat_file; if ( stat(idx_fname, &stat_tbi)==0 ) { stat(fname, &stat_file); if ( stat_file.st_mtime <= stat_tbi.st_mtime ) { fprintf(stderr,"[E::%s] the index file exists. Please use '-f' to overwrite.\n", __func__); return 1; } } } if (ftype == FT_BCF_GZ) { if ( bcf_index_build(fname, min_shift) != 0 ) { fprintf(stderr,"[E::%s] bcf_index_build failed for %s\n", __func__, fname); return 1; } } else { if ( tbx_index_build(fname, min_shift, &tbx_conf_vcf) != 0 ) { fprintf(stderr,"[E::%s] tbx_index_build failed for %s\n", __func__, fname); return 1; } } return 0; }
/**
 * Constructor.
 *
 * Opens every file, reads its header and loads its index; the process
 * exits with status 1 if a file cannot be opened, has an unsupported
 * type or has no loadable index.
 *
 * @vcf_files   - files to read; the first may be VCF, BCF or STDIN,
 *                all others must be VCF_GZ or BCF_GZ
 * @intervals   - if empty, add_interval() is called for each file
 * @sync_by_pos - stored in the member of the same name
 */
BCFSyncedStreamReader::BCFSyncedStreamReader(std::vector<std::string>& vcf_files, std::vector<GenomeInterval>& intervals, bool sync_by_pos)
:vcf_files(vcf_files), intervals(intervals), sync_by_pos(sync_by_pos)
{
    nfiles = vcf_files.size();

    //per file state, one slot for each input file
    vcfs.resize(nfiles, 0);
    hdrs.resize(nfiles, 0);
    idxs.resize(nfiles, 0);
    tbxs.resize(nfiles, 0);
    itrs.resize(nfiles, 0);
    ftypes.resize(nfiles, -1);
    buffer.resize(nfiles);

    current_interval = "";
    current_pos1 = 0;
    s = {0, 0, 0};

    //remember the position of each interval supplied by the caller
    exists_selected_intervals = (intervals.size()!=0);
    for (uint32_t k = 0; k < intervals.size(); ++k)
    {
        intervals_map[intervals[k].to_string()] = k;
    }
    intervals_index = 0;

    //for each file:
    //1. check file type validity
    //2. load the index
    //3. add the sequences found in the indexed file, this allows us to iterate through all sequences.
    for (int32_t i = 0; i < nfiles; ++i)
    {
        ftypes[i] = hts_file_type(vcf_files[i].c_str());

        vcfs[i] = bcf_open(vcf_files[i].c_str(), "r");
        if (vcfs[i]==NULL)
        {
            exit(1);
        }
        hdrs[i] = bcf_alt_hdr_read(vcfs[i]);

        //only the first file is allowed to be uncompressed or STDIN
        if (i == 0)
        {
            if ((ftypes[i] & (FT_VCF|FT_BCF|FT_STDIN)) == 0)
            {
                fprintf(stderr, "[E:%s:%d %s] %s not a VCF or BCF file\n", __FILE__, __LINE__, __FUNCTION__, vcf_files[i].c_str());
                exit(1);
            }
        }
        else
        {
            if ((ftypes[i] & (FT_VCF_GZ|FT_BCF_GZ)) == 0)
            {
                fprintf(stderr, "[E:%s:%d %s] %s not a VCF_GZ or BCF file\n", __FILE__, __LINE__, __FUNCTION__, vcf_files[i].c_str());
                exit(1);
            }
        }

        if (!load_index(i))
        {
            //the message tag differs between the first and the remaining files
            fprintf(stderr,
                    i == 0 ? "[I:%s:%d %s] index cannot be loaded for %s\n"
                           : "[E:%s:%d %s] index cannot be loaded for %s\n",
                    __FILE__, __LINE__, __FUNCTION__, vcf_files[i].c_str());
            exit(1);
        }

        if (!exists_selected_intervals)
        {
            //add sequences from file i
            add_interval(i);
        }
    }
}
/**
 * Stores the result of hts_file_type() for args->fname in args->file_type.
 *
 * @param args  program arguments; fname is read, file_type is written
 */
static void init_data(args_t *args)
{
    const char *path = args->fname;

    args->file_type = hts_file_type(path);
}
bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr, int ifrom, int ito) { bcf_sr_regions_t *reg; if ( !is_file ) return _regions_init_string(regions); reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t)); reg->start = reg->end = -1; reg->prev_start = reg->prev_seq = -1; reg->file = hts_open(regions, "rb"); if ( !reg->file ) { fprintf(stderr,"[%s:%d %s] Could not open file: %s\n", __FILE__,__LINE__,__FUNCTION__,regions); free(reg); return NULL; } reg->tbx = tbx_index_load(regions); if ( !reg->tbx ) { int len = strlen(regions); int is_bed = strcasecmp(".bed",regions+len-4) ? 0 : 1; if ( !is_bed && !strcasecmp(".bed.gz",regions+len-7) ) is_bed = 1; int ft_type = hts_file_type(regions); if ( ft_type & FT_VCF ) ito = 1; // read the whole file, tabix index is not present while ( hts_getline(reg->file, KS_SEP_LINE, ®->line) > 0 ) { char *chr, *chr_end; int from, to, ret; ret = _regions_parse_line(reg->line.s, ichr,ifrom,abs(ito), &chr,&chr_end,&from,&to); if ( ret < 0 ) { if ( ito<0 ) ret = _regions_parse_line(reg->line.s, ichr,ifrom,ifrom, &chr,&chr_end,&from,&to); if ( ret<0 ) { fprintf(stderr,"[%s:%d] Could not parse the file %s, using the columns %d,%d[,%d]\n", __FILE__,__LINE__,regions,ichr+1,ifrom+1,ito+1); hts_close(reg->file); reg->file = NULL; free(reg); return NULL; } } if ( !ret ) continue; if ( is_bed ) from++; *chr_end = 0; _regions_add(reg, chr, from, to); *chr_end = '\t'; } hts_close(reg->file); reg->file = NULL; if ( !reg->nseqs ) { free(reg); return NULL; } return reg; } reg->seq_names = (char**) tbx_seqnames(reg->tbx, ®->nseqs); if ( !reg->seq_hash ) reg->seq_hash = khash_str2int_init(); int i; for (i=0; i<reg->nseqs; i++) { khash_str2int_set(reg->seq_hash,reg->seq_names[i],i); } reg->fname = strdup(regions); reg->is_bin = 1; return reg; }