/**
 * Constructor.
 *
 * Opens a VCF/BCF file, reads its header and tries to load an index for it.
 *
 * @file_name - name of the file to open; "+" is mapped to "-" before opening
 * @intervals - intervals to read; random access is enabled only if this is
 *              non-empty and an index could be loaded
 *
 * Exits with status 1 if the file cannot be opened, is not VCF/BCF, or its
 * header cannot be read. A missing index is reported but is not fatal.
 */
BCFOrderedReader::BCFOrderedReader(std::string file_name, std::vector<GenomeInterval>& intervals)
{
    this->file_name = (file_name=="+") ? "-" : file_name;
    file = NULL;
    hdr = NULL;
    idx = NULL;
    tbx = NULL;
    itr = NULL;

    this->intervals = intervals;
    interval_index = 0;
    index_loaded = false;

    // Use the normalized name everywhere so that opening, index loading and
    // diagnostics all refer to the same path.
    const char* name = this->file_name.c_str();

    file = hts_open(name, "r");
    if (!file)
    {
        fprintf(stderr, "[%s:%d %s] Cannot open %s\n", __FILE__, __LINE__, __FUNCTION__, name);
        exit(1);
    }

    ftype = file->format;
    if (ftype.format!=vcf && ftype.format!=bcf)
    {
        fprintf(stderr, "[%s:%d %s] Not a VCF/BCF file: %s\n", __FILE__, __LINE__, __FUNCTION__, name);
        exit(1);
    }

    s = {0, 0, 0};

    hdr = bcf_alt_hdr_read(file);
    if (!hdr)
    {
        fprintf(stderr, "[E:%s:%d %s] header cannot be read for %s\n", __FILE__, __LINE__, __FUNCTION__, name);
        exit(1);
    }

    intervals_present = intervals.size()!=0;

    if (ftype.format==bcf)
    {
        if ((idx = bcf_index_load(name)))
        {
            index_loaded = true;
        }
        else if (intervals_present)
        {
            // Not fatal: the reader falls back to sequential reading.
            fprintf(stderr, "[E:%s] index cannot be loaded for %s for random access, ignoring specified intervals and reading from start.\n", __FUNCTION__, name);
        }
    }
    else if (ftype.format==vcf)
    {
        if (ftype.compression==bgzf)
        {
            if ((tbx = tbx_index_load(name)))
            {
                index_loaded = true;
            }
            else if (intervals_present)
            {
                // Not fatal: the reader falls back to sequential reading.
                fprintf(stderr, "[E:%s] index cannot be loaded for %s for random access, ignoring specified intervals and reading from start.\n", __FUNCTION__, name);
            }
        }
        else if (intervals_present)
        {
            // No index is attempted for a VCF that is not bgzf-compressed.
            fprintf(stderr, "[E:%s] no random access support for VCF file: %s\n", __FUNCTION__, name);
        }
    }

    random_access_enabled = intervals_present && index_loaded;
}
/**
 * Constructor.
 *
 * Opens every file, reads its header, validates its file type and loads its
 * index. Exits with status 1 on the first file that fails any of these steps.
 *
 * @vcf_files   - names of the files to read; the first file may be VCF, BCF
 *                or STDIN, all remaining files must be VCF_GZ or BCF
 * @intervals   - if empty, will add the contigs found in the header files
 * @sync_by_pos - stored in the member of the same name
 */
BCFSyncedStreamReader::BCFSyncedStreamReader(std::vector<std::string>& vcf_files, std::vector<GenomeInterval>& intervals, bool sync_by_pos)
:vcf_files(vcf_files), intervals(intervals), sync_by_pos(sync_by_pos)
{
    nfiles = vcf_files.size();
    vcfs.resize(nfiles, 0);
    hdrs.resize(nfiles, 0);
    idxs.resize(nfiles, 0);
    tbxs.resize(nfiles, 0);
    itrs.resize(nfiles, 0);
    ftypes.resize(nfiles, -1);

    current_interval = "";
    current_pos1 = 0;

    buffer.resize(nfiles);
    s = {0, 0, 0};

    exists_selected_intervals = (intervals.size()!=0);
    for (uint32_t i=0; i<intervals.size(); ++i)
    {
        intervals_map[intervals[i].to_string()] = i;
    }
    intervals_index = 0;

    //1. check file type validity
    //2. loads indices
    //3. adds sequences found in all indexed files, this allows us to iterate through all sequences.
    for (int32_t i = 0; i<nfiles; ++i)
    {
        const char* name = vcf_files[i].c_str();

        ftypes[i] = hts_file_type(name);
        vcfs[i] = bcf_open(name, "r");
        if (vcfs[i]==NULL)
        {
            fprintf(stderr, "[E:%s:%d %s] Cannot open %s\n", __FILE__, __LINE__, __FUNCTION__, name);
            exit(1);
        }
        hdrs[i] = bcf_alt_hdr_read(vcfs[i]);

        // The first file accepts a wider set of types than the others; the
        // remaining handling is identical for every file.
        const bool is_first = (i==0);
        const int32_t accepted_types = is_first ? (FT_VCF|FT_BCF|FT_STDIN) : (FT_VCF_GZ|FT_BCF_GZ);
        const char* accepted_desc = is_first ? "VCF or BCF" : "VCF_GZ or BCF";

        if (!(ftypes[i] & accepted_types))
        {
            fprintf(stderr, "[E:%s:%d %s] %s not a %s file\n", __FILE__, __LINE__, __FUNCTION__, name, accepted_desc);
            exit(1);
        }

        // Checked after the type test so that a file of the wrong type is
        // still reported as such rather than as a header failure.
        if (hdrs[i]==NULL)
        {
            fprintf(stderr, "[E:%s:%d %s] header cannot be read for %s\n", __FILE__, __LINE__, __FUNCTION__, name);
            exit(1);
        }

        if (!load_index(i))
        {
            fprintf(stderr, "[E:%s:%d %s] index cannot be loaded for %s\n", __FILE__, __LINE__, __FUNCTION__, name);
            exit(1);
        }

        if (!exists_selected_intervals)
        {
            //add sequences from file i
            add_interval(i);
        }
    }
}
/**
 * Constructor.
 *
 * Opens every file, checks that it is VCF/BCF, reads its header, loads its
 * index when intervals are given, and checks that its contigs match those of
 * the first file. Exits with status 1 on the first failure.
 *
 * @file_names  - names of the files to read; "+" and "-" both denote STDIN and
 *                are rewritten to "-" in this vector; at most one is allowed
 * @intervals   - if empty, will add the contigs found in the header files
 * @sync_by_pos - stored in the member of the same name
 */
BCFSyncedReader::BCFSyncedReader(std::vector<std::string>& file_names, std::vector<GenomeInterval>& intervals, bool sync_by_pos)
:file_names(file_names), intervals(intervals), sync_by_pos(sync_by_pos)
{
    nfiles = file_names.size();
    files.resize(nfiles, 0);
    hdrs.resize(nfiles, 0);
    idxs.resize(nfiles, 0);
    tbxs.resize(nfiles, 0);
    itrs.resize(nfiles, 0);
    ftypes.resize(nfiles);

    current_interval = "";
    current_pos1 = 0;

    buffer.resize(nfiles);
    s = {0, 0, 0};

    random_access = (intervals.size()!=0);
    for (size_t i=0; i<intervals.size(); ++i)
    {
        intervals_map[intervals[i].to_string()] = i;
    }
    intervals_index = 0;

    uint32_t no_stdins = 0;

    for (size_t i = 0; i<nfiles; ++i)
    {
        // Inspect the i-th name (not the first one) so that each STDIN entry
        // is counted exactly once and "+" is normalized at any position.
        // NOTE(review): inside this constructor `file_names` is the parameter,
        // which shadows the member of the same name initialized above - confirm
        // whether the member also needs the normalized name.
        if (file_names[i]=="+" || file_names[i]=="-")
        {
            file_names[i] = "-";
            ++no_stdins;
        }

        if (no_stdins>1)
        {
            fprintf(stderr, "[E:%s:%d %s] BCFSyncedReader does not support reading from more than one STDIN stream\n", __FILE__, __LINE__, __FUNCTION__);
            exit(1);
        }

        files[i] = hts_open(file_names[i].c_str(), "r");
        if (files[i]==NULL)
        {
            fprintf(stderr, "[%s:%d %s] Cannot open %s\n", __FILE__, __LINE__, __FUNCTION__, file_names[i].c_str());
            exit(1);
        }
        ftypes[i] = files[i]->format;

        //check format
        if (ftypes[i].format!=vcf && ftypes[i].format!=bcf)
        {
            fprintf(stderr, "[E:%s:%d %s] %s not a VCF or BCF file\n", __FILE__, __LINE__, __FUNCTION__, file_names[i].c_str());
            exit(1);
        }

        //read header
        hdrs[i] = bcf_alt_hdr_read(files[i]);
        if (!hdrs[i])
        {
            fprintf(stderr, "[E:%s:%d %s] header cannot be read for %s\n", __FILE__, __LINE__, __FUNCTION__, file_names[i].c_str());
            exit(1);
        }

        //load index if intervals are specified
        if (random_access && !load_index(i))
        {
            fprintf(stderr, "[E:%s:%d %s] index cannot be loaded for %s for random access\n", __FILE__, __LINE__, __FUNCTION__, file_names[i].c_str());
            exit(1);
        }

        //check contigs consistency against the first file
        if (i)
        {
            int32_t nseqs0 = 0;
            const char ** seqnames0 = bcf_hdr_seqnames(hdrs[0], &nseqs0);
            int32_t nseqs = 0;
            const char ** seqnames = bcf_hdr_seqnames(hdrs[i], &nseqs);

            bool consistent = seqnames0!=NULL && seqnames!=NULL
                              && nseqs0!=0 && nseqs!=0 && nseqs0==nseqs;
            for (int32_t j=0; consistent && j<nseqs; ++j)
            {
                consistent = (strcmp(seqnames0[j], seqnames[j])==0);
            }

            // Only the arrays are freed here, not the names they point to.
            free(seqnames0);
            free(seqnames);

            if (!consistent)
            {
                fprintf(stderr, "[E:%s:%d %s] contigs in header not consistent with first file for %s\n", __FILE__, __LINE__, __FUNCTION__, file_names[i].c_str());
                exit(1);
            }
        }
    }
}