static int query_chroms(char *fname) { const char **seq; int i, nseq, ftype = file_type(fname); if ( ftype & IS_TXT || !ftype ) { tbx_t *tbx = tbx_index_load(fname); if ( !tbx ) error("Could not load .tbi index of %s\n", fname); seq = tbx_seqnames(tbx, &nseq); for (i=0; i<nseq; i++) printf("%s\n", seq[i]); free(seq); tbx_destroy(tbx); } else if ( ftype==IS_BCF ) { htsFile *fp = hts_open(fname,"r"); if ( !fp ) error("Could not read %s\n", fname); bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Could not read the header: %s\n", fname); hts_close(fp); hts_idx_t *idx = bcf_index_load(fname); if ( !idx ) error("Could not load .csi index of %s\n", fname); seq = bcf_index_seqnames(idx, hdr, &nseq); for (i=0; i<nseq; i++) printf("%s\n", seq[i]); free(seq); bcf_hdr_destroy(hdr); hts_idx_destroy(idx); } else if ( ftype==IS_BAM ) // todo: BAM error("BAM: todo\n"); return 0; }
/**
 * Opens hts_file for reading and tries to load its tabix index.
 *
 * @param hts_file   path of the file to read
 * @param intervals  intervals of interest; copied into the reader
 *
 * The process exits with status 1 when the file cannot be opened, or when
 * intervals were supplied but no index could be loaded.  Without intervals
 * a missing index is tolerated.  random_access_enabled is true only when
 * intervals were supplied and the index was loaded.
 */
TBXOrderedReader::TBXOrderedReader(std::string hts_file, std::vector<GenomeInterval>& intervals)
{
    this->hts_file = hts_file;
    this->intervals = intervals;
    interval_index = 0;

    hts = NULL;
    tbx = NULL;
    itr = NULL;
    s = {0, 0, 0};

    // Must be set here: the code below only ever assigns true, so a failed
    // index load would otherwise leave the flag unset.
    index_loaded = false;

    hts = hts_open(hts_file.c_str(), "r");
    if (!hts)
    {
        // Fail early instead of handing a NULL handle to later reads.
        fprintf(stderr, "[E:%s] cannot open %s\n", __FUNCTION__, hts_file.c_str());
        exit(1);
    }

    intervals_present = intervals.size()!=0;

    if ((tbx = tbx_index_load(hts_file.c_str())))
    {
        index_loaded = true;
    }
    else if (intervals_present)
    {
        // Intervals are useless without an index.
        fprintf(stderr, "[E:%s] index cannot be loaded for %s\n", __FUNCTION__, hts_file.c_str());
        exit(1);
    }

    random_access_enabled = intervals_present && index_loaded;
}
/*
 * Attach a tabix-indexed targets file to a set of readers.
 *
 * files: reader set whose `targets` field is assigned on success
 * fname: path of the targets file; its tabix index is loaded as well
 *
 * Returns 1 on success.  Returns 0, leaving files->targets untouched and
 * releasing everything acquired so far, when memory cannot be allocated,
 * the file cannot be opened, or its index cannot be loaded.
 */
int bcf_sr_set_targets(readers_t *files, const char *fname)
{
    regions_t *tgts = (regions_t *) calloc(1,sizeof(regions_t));
    if ( !tgts ) return 0;

    tgts->file = hts_open(fname, "rb", NULL);
    if ( !tgts->file )
    {
        free(tgts);
        return 0;
    }

    tgts->tbx = tbx_index_load(fname);
    if ( !tgts->tbx )
    {
        /* Without the index there are no sequence names to query. */
        hts_close(tgts->file);
        free(tgts);
        return 0;
    }

    tgts->seq_names = (char**) tbx_seqnames(tgts->tbx, &tgts->nseqs);
    tgts->cseq = -1;
    files->targets = tgts;
    return 1;
}
/**
 * Opens a bgzip-compressed, tabix-indexed file and positions the iterator
 * on its first indexed sequence.
 *
 * @param file  path of the data file; the index age check looks at
 *              "<file>.tbi"
 *
 * The process exits with status 1 when the file is not BGZF, when the data
 * file is newer than "<file>.tbi", when the data file cannot be opened, or
 * when the index cannot be loaded.  When the index lists no sequences,
 * `iter` is NULL and `current_chrom` is chroms.end().
 */
Tabix::Tabix(string& file) {
    has_jumped = false;
    filename = file;
    const char* cfilename = file.c_str();

    if ( bgzf_is_bgzf(cfilename)!=1 )
    {
        cerr << "[tabix++] was bgzip used to compress this file? " << file << endl;
        exit(1);
    }

    // Common source of errors: new VCF is used with an old index.
    // The timestamps are compared only when both stat() calls succeed;
    // otherwise the structs are uninitialised. A missing index is reported
    // by the index load below.
    const string index_name = file + ".tbi";
    struct stat stat_tbi, stat_vcf;
    const bool have_tbi = stat(index_name.c_str(), &stat_tbi) == 0;
    const bool have_vcf = stat(cfilename, &stat_vcf) == 0;
    if ( have_tbi && have_vcf && stat_vcf.st_mtime > stat_tbi.st_mtime )
    {
        cerr << "[tabix++] the index file is older than the vcf file. Please use '-f' to overwrite or reindex." << endl;
        exit(1);
    }

    if ((fn = hts_open(cfilename, "r")) == 0) {
        cerr << "[tabix++] fail to open the data file." << endl;
        exit(1);
    }

    if ((tbx = tbx_index_load(cfilename)) == NULL) {
        cerr << "[tabix++] failed to load the index file." << endl;
        exit(1);
    }

    // Copy the sequence names out of the index; only the array returned by
    // tbx_seqnames is freed here.
    int nseq;
    const char** seq = tbx_seqnames(tbx, &nseq);
    for (int i=0; i<nseq; i++) {
        chroms.push_back(seq[i]);
    }
    free(seq);

    idxconf = &tbx_conf_vcf;

    // set up the iterator, defaults to the beginning
    if (nseq == 0) {
        // the vcf file contains only the header according to the index
        iter = NULL;
        current_chrom = chroms.end();
    } else {
        current_chrom = chroms.begin();
        iter = tbx_itr_querys(tbx, current_chrom->c_str());
    }
}
/// Load the tabix index for name() into _tidx unless it is already set.
///
/// Throws illumina::common::GeneralException (via BOOST_THROW_EXCEPTION)
/// when tbx_index_load() yields no index.
void
hts_streamer::
_load_index()
{
    if (! _tidx)
    {
        _tidx = tbx_index_load(name());
        if (_tidx) return;

        std::ostringstream msg;
        msg << "Failed to load index for hts file: '" << name() << "'";
        BOOST_THROW_EXCEPTION(illumina::common::GeneralException(msg.str()));
    }
}
/*
 * Scan a tabix-indexed BED file of phenotypes and record, for every
 * phenotype that passes filter_phenotype, its id, chromosome, start (BED
 * start + 1), end, strand flag and, when grouping is active, its group.
 *
 * fbed: path of the BED file; its tabix index must be loadable.
 *
 * The first line must start with the index's meta character.  Every data
 * line must have at least 6 columns because column 6 (strand) is read.
 * The filter is applied to column 4, or to column 5 when grp_mode is not
 * GRP_NONE.  Problems are reported through vrb.error().
 */
void cis_data::scanPhenotypes(string fbed) {
	int n_includedP = 0;
	int n_excludedP = 0;
	int n_negativeStrd = 0;

	//Open BED file
	vrb.title("Scanning phenotype data in [" + fbed + "]");
	htsFile *fp = hts_open(fbed.c_str(),"r");
	if (!fp) vrb.error("Cannot open file");
	tbx_t * tbx = tbx_index_load(fbed.c_str());
	if (!tbx) vrb.error("Cannot open index file");

	//Read header
	kstring_t str = {0,0,0};
	if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line");

	//Scan file
	vector < string > tokens;
	while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
		if (!str.l || str.s[0] == tbx->conf.meta_char) continue;

		stb.split(string(str.s), tokens);
		// tokens[5] (strand) is read below, so 6 columns are the minimum.
		if (tokens.size() < 6) vrb.error("Incorrect number of columns!");

		const bool keep = (grp_mode == GRP_NONE) ? filter_phenotype.check(tokens[3]) : filter_phenotype.check(tokens[4]);
		if (!keep) {
			n_excludedP ++;
			continue;
		}

		phenotype_id.push_back(tokens[3]);
		phenotype_chr.push_back(tokens[0]);
		phenotype_start.push_back(atoi(tokens[1].c_str()) + 1);
		phenotype_end.push_back(atoi(tokens[2].c_str()));
		if (grp_mode > 0 && full_test) phenotype_grp.push_back("ALL_GENES");
		if (grp_mode > 0 && !full_test) phenotype_grp.push_back(tokens[4]);
		phenotype_neg.push_back(tokens[5] == "-");
		if (phenotype_neg.back()) n_negativeStrd ++;
		n_includedP++;
	}

	//Finalize & verbose
	free(str.s);	// line buffer grown by hts_getline
	tbx_destroy(tbx);
	if (hts_close(fp)) vrb.error("Cannot properly close file");
	phenotype_count = phenotype_id.size();
	vrb.bullet(stb.str(n_includedP) + " phenotypes included");
	if (n_excludedP > 0) vrb.bullet(stb.str(n_excludedP) + " phenotypes excluded by user");
	if (n_negativeStrd > 0 ) vrb.bullet(stb.str(n_negativeStrd) + " phenotypes are on the negative strand");
	if (phenotype_count == 0) vrb.leave("Cannot find phenotypes in region!");
}
/**
 * Loads the index of the ith file.
 *
 * Only bgzf-compressed files are indexed: BCF goes into idxs[i] and VCF
 * into tbxs[i].  Any other combination is left alone and counts as success.
 *
 * @return false if an index load was attempted and failed, true otherwise
 */
bool BCFSyncedReader::load_index(int32_t i)
{
    if (ftypes[i].format==bcf && ftypes[i].compression==bgzf)
    {
        idxs[i] = bcf_index_load(file_names[i].c_str());
        return idxs[i] ? true : false;
    }

    if (ftypes[i].format==vcf && ftypes[i].compression==bgzf)
    {
        tbxs[i] = tbx_index_load(file_names[i].c_str());
        return tbxs[i] ? true : false;
    }

    return true;
}
/**
 * Loads the index of the ith file.
 *
 * FT_BCF_GZ files get their index in idxs[i], FT_VCF_GZ files in tbxs[i];
 * every other file type is left alone and counts as success.
 *
 * @return false if an index load was attempted and failed, true otherwise
 */
bool BCFSyncedStreamReader::load_index(int32_t i)
{
    if (ftypes[i]==FT_BCF_GZ)
    {
        idxs[i] = bcf_index_load(vcf_files[i].c_str());
        return idxs[i] ? true : false;
    }

    if (ftypes[i]==FT_VCF_GZ)
    {
        tbxs[i] = tbx_index_load(vcf_files[i].c_str());
        return tbxs[i] ? true : false;
    }

    return true;
}
int main(int argc, char* argv[]) { namespace po = boost::program_options; std::string file; std::string output; try { // Declare the supported options. po::options_description desc("Allowed options"); desc.add_options() ("help,h", "produce help message") ("version", "Show version") ("input-file", po::value< std::string >(), "The input files") ("output-file", po::value<std::string>(), "The output file name.") ; po::positional_options_description popts; popts.add("input-file", 1); popts.add("output-file", 1); po::options_description cmdline_options; cmdline_options .add(desc) ; po::variables_map vm; po::store(po::command_line_parser(argc, argv). options(cmdline_options).positional(popts).run(), vm); po::notify(vm); if (vm.count("version")) { std::cout << "vcfhdr2json version " << HAPLOTYPES_VERSION << "\n"; return 0; } if (vm.count("help")) { std::cout << desc << "\n"; return 1; } if (vm.count("input-file")) { file = vm["input-file"].as< std::string > (); } if (vm.count("output-file")) { output = vm["output-file"].as< std::string >(); } if(file.size() == 0) { std::cerr << "Please specify an input file.\n"; return 1; } if (output == "") { std::cerr << "Please specify an output file.\n"; return 1; } } catch (po::error & e) { std::cerr << e.what() << "\n"; return 1; } try { Json::StyledWriter writer; htsFile * fp = bcf_open(file.c_str(), "r"); bcf_hdr_t * hdr = bcf_hdr_read(fp); Json::Value root; Json::Value a; for (int i = 0; i < bcf_hdr_nsamples(hdr); ++i) { a.append(hdr->samples[i]); } root["samples"] = a; Json::Value fields; for (int i = 0; i < hdr->nhrec; i++) { Json::Value field; field["key"] = hdr->hrec[i]->key; if (!hdr->hrec[i]->value) { Json::Value values; for (int j = 0; j < hdr->hrec[i]->nkeys; j++) { values[hdr->hrec[i]->keys[j]] = hdr->hrec[i]->vals[j]; } field["values"] = values; } else { field["value"] = hdr->hrec[i]->value; } fields.append(field); } root["fields"] = fields; tbx_t * tbx_idx = tbx_index_load(file.c_str()); if ( !tbx_idx ) { hts_idx_t * 
csi_idx = bcf_index_load(file.c_str()); if(!csi_idx) { root["tabix"] = Json::Value::null; } else { root["tabix"] = Json::Value(); root["tabix"]["chromosomes"] = Json::Value(); int count = 0; const char ** tbx_names = bcf_index_seqnames(csi_idx, hdr, &count); for (int i = 0; i < count; ++i) { root["tabix"]["chromosomes"].append(tbx_names[i]); } free(tbx_names); hts_idx_destroy(csi_idx); } } else { root["tabix"] = Json::Value(); root["tabix"]["chromosomes"] = Json::Value(); int count = 0; const char ** tbx_names = tbx_seqnames(tbx_idx, &count); for (int i = 0; i < count; ++i) { root["tabix"]["chromosomes"].append(tbx_names[i]); } free(tbx_names); tbx_destroy(tbx_idx); } std::ofstream out(output.c_str()); out << writer.write(root); bcf_close(fp); bcf_hdr_destroy(hdr); } catch(std::runtime_error & e) { std::cerr << e.what() << std::endl; return 1; } catch(std::logic_error & e) { std::cerr << e.what() << std::endl; return 1; } return 0; }
/*
 * Read the header of a VCF (text) or BCF (binary) file.
 *
 * Text input: header lines are collected up to and including the first
 * line that starts with a single '#'.  When fp->fn_aux is set, one
 * "##contig=<ID=..,length=..>" line per record of that file (first two
 * fields of each line) is inserted just before that line.  After parsing,
 * the tabix index of fp->fn is consulted if it can be loaded: every
 * sequence it lists that has no contig record gets one with length -1,
 * and the header is re-synced and its text regenerated.
 *
 * Binary input is forwarded to bcf_hdr_read().
 *
 * Returns the header.  For text input, returns 0 when a non-header line
 * is met before the sample line, when fp->fn_aux cannot be opened, or
 * when no header text was read at all.
 */
bcf_hdr_t *vcf_hdr_read(htsFile *fp)
{
    if (!fp->is_bin) {
        kstring_t txt, *s = &fp->line;
        bcf_hdr_t *h;
        h = bcf_hdr_init();
        txt.l = txt.m = 0; txt.s = 0;
        while (hts_getline(fp, KS_SEP_LINE, s) >= 0) {
            if (s->l == 0) continue;
            if (s->s[0] != '#') {
                if (hts_verbose >= 2)
                    fprintf(stderr, "[E::%s] no sample line\n", __func__);
                free(txt.s);
                bcf_hdr_destroy(h);
                return 0;
            }
            if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here
                int dret;
                gzFile f;
                kstream_t *ks;
                kstring_t tmp;
                tmp.l = tmp.m = 0; tmp.s = 0;
                f = gzopen(fp->fn_aux, "r");
                if (f == 0) {
                    // Reading from a NULL gzFile below would crash.
                    if (hts_verbose >= 2)
                        fprintf(stderr, "[E::%s] could not open %s\n", __func__, fp->fn_aux);
                    free(txt.s);
                    bcf_hdr_destroy(h);
                    return 0;
                }
                ks = ks_init(f);
                while (ks_getuntil(ks, 0, &tmp, &dret) >= 0) {
                    int c;
                    kputs("##contig=<ID=", &txt); kputs(tmp.s, &txt);
                    ks_getuntil(ks, 0, &tmp, &dret);
                    kputs(",length=", &txt); kputw(atol(tmp.s), &txt);
                    kputsn(">\n", 2, &txt);
                    if (dret != '\n')
                        while ((c = ks_getc(ks)) != '\n' && c != -1); // skip the rest of the line
                }
                free(tmp.s);
                ks_destroy(ks);
                gzclose(f);
            }
            kputsn(s->s, s->l, &txt);
            if (s->s[1] != '#') break;
            kputc('\n', &txt);
        }
        if (txt.s == 0) {
            // Nothing was read: there is no text to hand to the parser.
            if (hts_verbose >= 2)
                fprintf(stderr, "[E::%s] no sample line\n", __func__);
            bcf_hdr_destroy(h);
            return 0;
        }
        h->l_text = txt.l + 1; // including NULL
        h->text = txt.s;
        bcf_hdr_parse(h);

        // check tabix index, are all contigs listed in the header? add the missing ones
        tbx_t *idx = tbx_index_load(fp->fn);
        if ( idx )
        {
            int i, n, need_sync = 0;
            const char **names = tbx_seqnames(idx, &n);
            for (i=0; i<n; i++)
            {
                bcf_hrec_t *hrec = bcf_hdr_get_hrec(h, BCF_DT_CTG, (char*) names[i]);
                if ( hrec ) continue;
                hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
                hrec->key = strdup("contig");
                bcf_hrec_add_key(hrec, "ID", strlen("ID"));
                bcf_hrec_set_val(hrec, hrec->nkeys-1, (char*) names[i], strlen(names[i]), 0);
                bcf_hrec_add_key(hrec, "length", strlen("length"));
                bcf_hrec_set_val(hrec, hrec->nkeys-1, "-1", strlen("-1"), 0);   // what is a good default value?
                bcf_hdr_add_hrec(h, hrec);
                need_sync = 1;
            }
            free(names);
            tbx_destroy(idx);
            if ( need_sync )
            {
                bcf_hdr_sync(h);
                bcf_hdr_fmt_text(h);
            }
        }
        return h;
    } else return bcf_hdr_read((BGZF*)fp->fp);
}
/*
 * Append a reader for fname to the reader set.
 *
 * files: reader set; gains one zero-initialised slot in files->readers
 * fname: .vcf.gz (tabix-indexed) or .bcf (indexed) file.  The pointer
 *        itself is stored in the reader, not a copy.
 *
 * For .vcf.gz the file is opened once in "r" mode only to read the header
 * and then reopened in "rb" mode for reading.  Unless files->region is set,
 * the sequence names of the new header that are not yet in files->seqs are
 * appended to it.
 *
 * Returns 1 on success and 0 on failure (allocation failure, unsupported
 * file type, file/index/header could not be read).  Apart from the
 * allocation failure, the new slot stays counted in files->nreaders on
 * failure, as before.
 */
int bcf_sr_add_reader(readers_t *files, const char *fname)
{
    reader_t *grown = (reader_t*) realloc(files->readers, sizeof(reader_t)*(files->nreaders+1));
    if ( !grown ) return 0;     // old array is still valid and still owned by files
    files->readers = grown;

    reader_t *reader = &files->readers[files->nreaders++];
    memset(reader,0,sizeof(reader_t));

    int type = file_type(fname);
    if ( type==IS_VCF_GZ )
    {
        reader->tbx = tbx_index_load(fname);
        if ( !reader->tbx )
        {
            fprintf(stderr,"[add_reader] Could not load the index of %s\n", fname);
            return 0;
        }

        // This is just to read the header
        htsFile *file = hts_open(fname, "r", NULL);
        if ( !file ) return 0;
        reader->header = vcf_hdr_read(file);
        hts_close(file);

        // The VCF opened in binary tabix mode
        reader->file = hts_open(fname, "rb", NULL);
        if ( !reader->file ) return 0;
    }
    else if ( type==IS_BCF )
    {
        reader->file = hts_open(fname, "rb", NULL);
        if ( !reader->file ) return 0;
        reader->header = vcf_hdr_read(reader->file);

        reader->bcf = bcf_index_load(fname);
        if ( !reader->bcf )
        {
            fprintf(stderr,"[add_reader] Could not load the index of %s\n", fname);
            return 0;   // not indexed..?
        }
    }
    else
    {
        fprintf(stderr,"Expected .vcf.gz or .bcf file\n");
        return 0;
    }

    // The header is dereferenced below (filter lookup, sequence names).
    if ( !reader->header )
    {
        fprintf(stderr,"[add_reader] Could not read the header of %s\n", fname);
        return 0;
    }

    reader->fname = fname;
    reader->filter_id = -1;
    if ( files->apply_filters )
        reader->filter_id = bcf_id2int(reader->header, BCF_DT_ID, "PASS");

    // Update list of chromosomes
    if ( files->region )
    {
        if ( !files->seqs )
        {
            files->mseqs = files->nseqs = 1;
            files->seqs = (const char**) malloc(sizeof(const char*));
            files->seqs[0] = files->region;
        }
    }
    else
    {
        int n,i,j;
        const char **names = bcf_seqnames(reader->header, &n);
        for (i=0; i<n; i++)
        {
            for (j=0; j<files->nseqs; j++)
                if ( !strcmp(names[i],files->seqs[j]) ) break;
            if ( j<files->nseqs ) continue;     // already have this chr

            // Grow in steps of 30, and only when the array is full.
            if ( files->nseqs >= files->mseqs )
            {
                files->mseqs += 30;
                files->seqs = (const char**) realloc(files->seqs, sizeof(const char*)*files->mseqs);
            }
            files->seqs[files->nseqs++] = names[i];
        }
        free(names);
    }
    files->iseq = -1;
    return 1;
}
/**
 * Opens a VCF/BCF file, reads its header and tries to load an index.
 *
 * @param file_name  path of the file; "+" is stored as "-" in the member
 *                   used for opening
 * @param intervals  intervals of interest; copied into the reader
 *
 * Exits with status 1 when the file cannot be opened, is neither VCF nor
 * BCF, or its header cannot be read.  A missing index is not fatal: when
 * intervals were supplied a message is printed and reading falls back to
 * sequential access.  random_access_enabled is true only when intervals
 * were supplied and an index was loaded.
 */
BCFOrderedReader::BCFOrderedReader(std::string file_name, std::vector<GenomeInterval>& intervals)
{
    if (file_name=="+")
    {
        this->file_name = "-";
    }
    else
    {
        this->file_name = file_name;
    }

    file = NULL;
    hdr = NULL;
    idx = NULL;
    tbx = NULL;
    itr = NULL;

    this->intervals = intervals;
    interval_index = 0;
    index_loaded = false;

    file = hts_open(this->file_name.c_str(), "r");
    if (file==NULL)
    {
        fprintf(stderr, "[%s:%d %s] Cannot open %s\n", __FILE__, __LINE__, __FUNCTION__, file_name.c_str());
        exit(1);
    }

    ftype = file->format;
    const bool is_bcf = (ftype.format==bcf);
    const bool is_vcf = (ftype.format==vcf);
    if (!is_vcf && !is_bcf)
    {
        fprintf(stderr, "[%s:%d %s] Not a VCF/BCF file: %s\n", __FILE__, __LINE__, __FUNCTION__, file_name.c_str());
        exit(1);
    }

    s = {0, 0, 0};

    hdr = bcf_alt_hdr_read(file);
    if (hdr==NULL)
    {
        exit(1);
    }

    intervals_present = intervals.size()!=0;

    if (is_bcf)
    {
        idx = bcf_index_load(file_name.c_str());
        if (idx)
        {
            index_loaded = true;
        }
        else if (intervals_present)
        {
            fprintf(stderr, "[E:%s] index cannot be loaded for %s for random access, ignoring specified intervals and reading from start.\n", __FUNCTION__, file_name.c_str());
        }
    }
    else if (is_vcf)
    {
        if (ftype.compression!=bgzf)
        {
            // Plain or otherwise compressed VCF: no index is attempted.
            if (intervals_present)
            {
                fprintf(stderr, "[E:%s] no random access support for VCF file: %s\n", __FUNCTION__, file_name.c_str());
            }
        }
        else
        {
            tbx = tbx_index_load(file_name.c_str());
            if (tbx)
            {
                index_loaded = true;
            }
            else if (intervals_present)
            {
                fprintf(stderr, "[E:%s] index cannot be loaded for %s for random access, ignoring specified intervals and reading from start.\n", __FUNCTION__, file_name.c_str());
            }
        }
    }

    random_access_enabled = intervals_present && index_loaded;
};
/*
 * Append a reader for fname to a synced-reader set.
 *
 * files: reader set; files->has_line and files->readers each grow by one
 *        entry and files->nreaders is incremented before anything can fail
 * fname: file to open; the pointer itself is stored in the reader
 *
 * With files->require_index set, only FT_VCF_GZ (tabix index) and
 * FT_BCF_GZ (BCF index) inputs are accepted and the index must load.
 * Without it, any BCF or VCF input is accepted and the set is switched to
 * streaming mode, which permits a single reader and no regions.
 *
 * Unless regions were given explicitly or the set is streaming, every
 * sequence name of the new reader (from the tabix index when present,
 * otherwise from the header) is added to files->regions.
 *
 * Returns 1 on success, 0 on any failure.
 */
int bcf_sr_add_reader(bcf_srs_t *files, const char *fname)
{
    files->has_line = (int*) realloc(files->has_line, sizeof(int)*(files->nreaders+1));
    files->has_line[files->nreaders] = 0;
    files->readers  = (bcf_sr_t*) realloc(files->readers, sizeof(bcf_sr_t)*(files->nreaders+1));
    bcf_sr_t *reader = &files->readers[files->nreaders++];
    memset(reader,0,sizeof(bcf_sr_t));

    reader->file = hts_open(fname, "r");
    if ( !reader->file ) return 0;

    if ( reader->file->is_bin )
        reader->type = FT_BCF;
    else
        reader->type = FT_VCF;
    if ( reader->file->is_compressed ) reader->type |= FT_GZ;

    if ( !files->require_index )
    {
        /* Streaming: any VCF/BCF flavour will do. */
        if ( !(reader->type & FT_BCF) && !(reader->type & FT_VCF) )
        {
            fprintf(stderr,"File type not recognised: %s\n", fname);
            return 0;
        }
        reader->header = bcf_hdr_read(reader->file);
        files->streaming = 1;
    }
    else if ( reader->type==FT_VCF_GZ )
    {
        reader->tbx_idx = tbx_index_load(fname);
        if ( !reader->tbx_idx )
        {
            fprintf(stderr,"[add_reader] Could not load the index of %s\n", fname);
            return 0;
        }
        reader->header = bcf_hdr_read(reader->file);
    }
    else if ( reader->type==FT_BCF_GZ )
    {
        reader->header = bcf_hdr_read(reader->file);
        reader->bcf_idx = bcf_index_load(fname);
        if ( !reader->bcf_idx )
        {
            fprintf(stderr,"[add_reader] Could not load the index of %s\n", fname);
            return 0;   /* not indexed..? */
        }
    }
    else
    {
        fprintf(stderr,"Index required, expected .vcf.gz or .bcf file: %s\n", fname);
        return 0;
    }

    if ( files->streaming )
    {
        if ( files->nreaders>1 )
        {
            fprintf(stderr,"[%s:%d %s] Error: %d readers, yet require_index not set\n", __FILE__,__LINE__,__FUNCTION__,files->nreaders);
            return 0;
        }
        if ( files->regions )
        {
            fprintf(stderr,"[%s:%d %s] Error: cannot tabix-jump in streaming mode\n", __FILE__,__LINE__,__FUNCTION__);
            return 0;
        }
    }

    if ( !reader->header ) return 0;

    reader->fname = fname;
    if ( files->apply_filters )
        reader->filter_ids = init_filters(reader->header, files->apply_filters, &reader->nfilter_ids);

    /* Update list of chromosomes */
    if ( files->explicit_regs || files->streaming ) return 1;

    int n_names, k = 0;
    const char **seq_names;
    if ( reader->tbx_idx )
        seq_names = tbx_seqnames(reader->tbx_idx, &n_names);
    else
        seq_names = bcf_hdr_seqnames(reader->header, &n_names);

    while ( k < n_names )
    {
        if ( files->regions )
            _regions_add(files->regions, seq_names[k], -1, -1);
        else
            files->regions = _regions_init_string(seq_names[k]);
        k++;
    }
    free(seq_names);

    return 1;
}
int convert(int argc, char **argv) { if (argc < 2) return convert_help(); int c; char *in=NULL, *out=NULL, *bim=NULL, *vid=NULL, *tmp_dir=NULL, *ped=NULL; uint32_t num_fields, num_records, col = 2; int i_is_set = 0, o_is_set = 0, f_is_set = 0, b_is_set = 0, v_is_set = 0, t_is_set = 0, p_is_set = 0, r_is_set = 0; while((c = getopt (argc, argv, "hi:o:f:r:b:v:t:p:c:")) != -1) { switch (c) { case 'c': col = atoi(optarg); break; case 'p': p_is_set = 1; ped = optarg; break; case 't': t_is_set = 1; tmp_dir = optarg; break; case 'v': v_is_set = 1; vid = optarg; break; case 'b': b_is_set = 1; bim = optarg; break; case 'i': i_is_set = 1; in = optarg; break; case 'o': o_is_set = 1; out = optarg; break; case 'f': f_is_set = 1; num_fields = atoi(optarg); break; case 'r': r_is_set = 1; num_records = atoi(optarg); break; case 'h': convert_help(); return 1; case '?': if ( (optopt == 'i') || (optopt == 'f') || (optopt == 'r') || (optopt == 't') || (optopt == 's') || (optopt == 'p') || (optopt == 'c') || (optopt == 'o') ) fprintf (stderr, "Option -%c requires an argument.\n", optopt); else if (isprint (optopt)) fprintf (stderr, "Unknown option `-%c'.\n", optopt); else fprintf (stderr, "Unknown option character `\\x%x'.\n", optopt); default: convert_help(); return 1; } } char *type = argv[0]; if (i_is_set == 0) { printf("Input file is not set\n"); return convert_help(); } if (strcmp(type, "bcf") == 0) { if ( (f_is_set == 0) || (r_is_set == 0) ) { fprintf(stderr,"Attempting to autodetect num of records " "and fields from %s\n", in); //Try and auto detect the sizes, need the index tbx_t *tbx = NULL; hts_idx_t *idx = NULL; htsFile *fp = hts_open(in,"rb"); if ( !fp ) { fprintf(stderr,"Could not read %s\n", in); return 1; } bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) { fprintf(stderr,"Could not read the header: %s\n", in); return 1; } if (hts_get_format(fp)->format==vcf) { tbx = tbx_index_load(in); if ( !tbx ) { fprintf(stderr,"Could not load TBI index: %s\n", in); return 1; } } else 
if ( hts_get_format(fp)->format==bcf ) { idx = bcf_index_load(in); if ( !idx ) { fprintf(stderr,"Could not load CSI index: %s\n", in); return 1; } } else { fprintf(stderr, "Could not detect the file type as VCF or BCF: %s\n", in); return 1; } num_fields = hdr->n[BCF_DT_SAMPLE]; num_records = 0; const char **seq; int nseq; seq = tbx ? tbx_seqnames(tbx, &nseq) : bcf_index_seqnames(idx, hdr, &nseq); int i; uint32_t sum = 0; for (i = 0; i < nseq; ++i) { uint64_t records, v; hts_idx_get_stat(tbx ? tbx->idx: idx, i, &records, &v); num_records += records; } fprintf(stderr, "Number of records:%u\tNumber of fields:%u\n", num_records, num_fields); free(seq); hts_close(fp); bcf_hdr_destroy(hdr); if (idx) hts_idx_destroy(idx); if (tbx) tbx_destroy(tbx); } if (o_is_set == 0) { out = (char*)malloc(strlen(in) + 5); // 5 for ext and \0 strcpy(out,in); strcat(out, ".gqt"); } if (b_is_set == 0) { bim = (char*)malloc(strlen(in) + 5); // 5 for ext and \0 strcpy(bim,in); strcat(bim, ".bim"); } if (v_is_set == 0) { vid = (char*)malloc(strlen(in) + 5); // 5 for ext and \0 strcpy(vid,in); strcat(vid, ".vid"); } if (t_is_set == 0) { tmp_dir = (char*)malloc(3*sizeof(char)); // "./\0" strcpy(tmp_dir,"./"); } int r = bcf_wahbm(in, out, bim, vid, tmp_dir, num_fields, num_records); return r; } if (strcmp(type, "ped") == 0) { if (o_is_set == 0) { if (p_is_set == 1) { out = (char*)malloc(strlen(ped) + 4); // 4 for ext and \0 strcpy(out,ped); strcat(out, ".db"); } else { out = (char*)malloc(strlen(in) + 4); // 4 for ext and \0 strcpy(out,in); strcat(out, ".db"); } } fprintf(stderr, "Creating sample database %s\n", out); return ped_ped(in, ped, col, out); } return convert_help(); }
int main_tabix(int argc, char *argv[]) { int c, min_shift = -1, is_force = 0, is_all = 0; tbx_conf_t conf = tbx_conf_gff, *conf_ptr = NULL; while ((c = getopt(argc, argv, "0fap:s:b:e:S:c:m:")) >= 0) if (c == '0') conf.preset |= TBX_UCSC; else if (c == 'f') is_force = 1; else if (c == 'a') is_all = 1; else if (c == 'm') min_shift = atoi(optarg); else if (c == 's') conf.sc = atoi(optarg); else if (c == 'b') conf.bc = atoi(optarg); else if (c == 'e') conf.ec = atoi(optarg); else if (c == 'c') conf.meta_char = *optarg; else if (c == 'S') conf.line_skip = atoi(optarg); else if (c == 'p') { if (strcmp(optarg, "gff") == 0) conf_ptr = &tbx_conf_gff; else if (strcmp(optarg, "bed") == 0) conf_ptr = &tbx_conf_bed; else if (strcmp(optarg, "sam") == 0) conf_ptr = &tbx_conf_sam; else if (strcmp(optarg, "vcf") == 0) conf_ptr = &tbx_conf_vcf; else { fprintf(stderr, "The type '%s' not recognised\n", optarg); return 1; } } if (optind == argc) { fprintf(stderr, "\nUsage: bcftools tabix [options] <in.gz> [reg1 [...]]\n\n"); fprintf(stderr, "Options: -p STR preset: gff, bed, sam or vcf [gff]\n"); fprintf(stderr, " -s INT column number for sequence names (suppressed by -p) [1]\n"); fprintf(stderr, " -b INT column number for region start [4]\n"); fprintf(stderr, " -e INT column number for region end (if no end, set INT to -b) [5]\n"); fprintf(stderr, " -0 specify coordinates are zero-based\n"); fprintf(stderr, " -S INT skip first INT lines [0]\n"); fprintf(stderr, " -c CHAR skip lines starting with CHAR [null]\n"); fprintf(stderr, " -a print all records\n"); fprintf(stderr, " -f force to overwrite existing index\n"); fprintf(stderr, " -m INT set the minimal interval size to 1<<INT; 0 for the old tabix index [0]\n"); fprintf(stderr, "\n"); return 1; } if (is_all) { // read without random access kstring_t s; BGZF *fp; s.l = s.m = 0; s.s = 0; fp = bgzf_open(argv[optind], "r"); while (bgzf_getline(fp, '\n', &s) >= 0) puts(s.s); bgzf_close(fp); free(s.s); } else if (optind + 2 > argc) { // 
create index if ( !conf_ptr ) { // auto-detect file type by file name int l = strlen(argv[optind]); int strcasecmp(const char *s1, const char *s2); if (l>=7 && strcasecmp(argv[optind]+l-7, ".gff.gz") == 0) conf_ptr = &tbx_conf_gff; else if (l>=7 && strcasecmp(argv[optind]+l-7, ".bed.gz") == 0) conf_ptr = &tbx_conf_bed; else if (l>=7 && strcasecmp(argv[optind]+l-7, ".sam.gz") == 0) conf_ptr = &tbx_conf_sam; else if (l>=7 && strcasecmp(argv[optind]+l-7, ".vcf.gz") == 0) conf_ptr = &tbx_conf_vcf; } if ( conf_ptr ) conf = *conf_ptr; if (!is_force) { char *fn; FILE *fp; fn = (char*)alloca(strlen(argv[optind]) + 5); strcat(strcpy(fn, argv[optind]), min_shift <= 0? ".tbi" : ".csi"); if ((fp = fopen(fn, "rb")) != 0) { fclose(fp); fprintf(stderr, "[E::%s] the index file exists; use option '-f' to overwrite\n", __func__); return 1; } } if ( tbx_index_build(argv[optind], min_shift, &conf) ) { fprintf(stderr,"tbx_index_build failed: Is the file bgzip-compressed? Was wrong -p [type] option used?\n"); return 1; } } else { // read with random access tbx_t *tbx; BGZF *fp; kstring_t s; int i; if ((tbx = tbx_index_load(argv[optind])) == 0) return 1; if ((fp = bgzf_open(argv[optind], "r")) == 0) return 1; s.s = 0; s.l = s.m = 0; for (i = optind + 1; i < argc; ++i) { hts_itr_t *itr; if ((itr = tbx_itr_querys(tbx, argv[i])) == 0) continue; while (tbx_bgzf_itr_next(fp, tbx, itr, &s) >= 0) puts(s.s); tbx_itr_destroy(itr); } free(s.s); bgzf_close(fp); tbx_destroy(tbx); } return 0; }
int vcf_index_stats(char *fname, int stats) { char *fn_out = NULL; FILE *out; out = fn_out ? fopen(fn_out, "w") : stdout; const char **seq; int i, nseq; tbx_t *tbx = NULL; hts_idx_t *idx = NULL; htsFile *fp = hts_open(fname,"r"); if ( !fp ) { fprintf(stderr,"Could not read %s\n", fname); return 1; } bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) { fprintf(stderr,"Could not read the header: %s\n", fname); return 1; } if ( hts_get_format(fp)->format==vcf ) { tbx = tbx_index_load(fname); if ( !tbx ) { fprintf(stderr,"Could not load TBI index: %s\n", fname); return 1; } } else if ( hts_get_format(fp)->format==bcf ) { idx = bcf_index_load(fname); if ( !idx ) { fprintf(stderr,"Could not load CSI index: %s\n", fname); return 1; } } else { fprintf(stderr,"Could not detect the file type as VCF or BCF: %s\n", fname); return 1; } seq = tbx ? tbx_seqnames(tbx, &nseq) : bcf_index_seqnames(idx, hdr, &nseq); uint64_t sum = 0; for (i=0; i<nseq; i++) { uint64_t records, v; hts_idx_get_stat(tbx ? tbx->idx : idx, i, &records, &v); sum+=records; if (stats&2 || !records) continue; bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", seq[i], NULL); int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1; fprintf(out,"%s\t%s\t%" PRIu64 "\n", seq[i], hkey<0?".":hrec->vals[hkey], records); } if (!sum) { // No counts found. // Is this because index version has no stored count data, or no records? bcf1_t *rec = bcf_init1(); if (bcf_read1(fp, hdr, rec) >= 0) { fprintf(stderr,"%s index of %s does not contain any count metadata. Please re-index with a newer version of bcftools or tabix.\n", tbx ? "TBI" : "CSI", fname); return 1; } bcf_destroy1(rec); } if (stats&2) fprintf(out, "%" PRIu64 "\n", sum); free(seq); fclose(out); hts_close(fp); bcf_hdr_destroy(hdr); if (tbx) tbx_destroy(tbx); if (idx) hts_idx_destroy(idx); return 0; }
/*
 * Scan a tabix-indexed BED file of phenotypes and append every phenotype
 * that is not yet known and passes filter_phenotype: id, chromosome,
 * start (BED start + 1), end, and an id -> index entry.
 *
 * fbed: path of the BED file; its tabix index must be loadable.
 *
 * When regionPhenotype.chr is not "NA", only the records returned by the
 * tabix query for that region are read; otherwise the whole file is read
 * and lines that are empty or start with the meta character are skipped.
 * Every processed line must have at least 7 columns.  Problems are
 * reported through vrb.error().
 */
void union_data::scanPhenotypes(string fbed) {
	int n_includedP = 0;
	int n_excludedP = 0;

	//Open BED file
	vrb.title("Scanning phenotype data in [" + fbed + "]");
	htsFile *fp = hts_open(fbed.c_str(),"r");
	if (!fp) vrb.error("Cannot open file");
	tbx_t * tbx = tbx_index_load(fbed.c_str());
	if (!tbx) vrb.error("Cannot open index file");

	//Read header
	kstring_t str = {0,0,0};
	if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line");

	//Pick the record source: region iterator or plain sequential read
	const bool by_region = (regionPhenotype.chr != "NA");
	hts_itr_t *itr = NULL;
	if (by_region) {
		itr = tbx_itr_querys(tbx, regionPhenotype.get().c_str());
		vrb.bullet("target region [" + regionPhenotype.get() + "]");
		if (!itr) vrb.error("Cannot jump to region!");
	}

	//Scan file
	vector < string > tokens;
	unsigned int linecount = 0;
	while ((by_region ? tbx_itr_next(fp, tbx, itr, &str) : hts_getline(fp, KS_SEP_LINE, &str)) >= 0) {
		linecount ++;
		if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");

		//Sequential reads also return empty and meta lines; skip those
		if (!by_region && (!str.l || str.s[0] == tbx->conf.meta_char)) continue;

		stb.split(string(str.s), tokens);
		if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
		if (phenotype_id_to_idx.count(tokens[3])) continue;

		if (filter_phenotype.check(tokens[3])) {
			phenotype_id.push_back(tokens[3]);
			phenotype_chr.push_back(tokens[0]);
			phenotype_start.push_back(atoi(tokens[1].c_str()) + 1);
			phenotype_end.push_back(atoi(tokens[2].c_str()));
			pair < string, int > temp (tokens[3],phenotype_id_to_idx.size());
			phenotype_id_to_idx.insert(temp);
			n_includedP++;
		} else n_excludedP ++;
	}
	if (by_region) tbx_itr_destroy(itr);

	//Finalize & verbose
	free(str.s);	// line buffer grown by the reads above
	tbx_destroy(tbx);
	if (hts_close(fp)) vrb.error("Cannot properly close file");
	phenotype_count = phenotype_id.size();
	vrb.bullet(stb.str(n_includedP) + " new phenotypes included");
	if (n_excludedP > 0) vrb.bullet(stb.str(n_excludedP) + " phenotypes excluded by user");
	if (phenotype_count == 0) vrb.leave("Cannot find phenotypes in region!");
}
/*
 * Replace the stored phenotypes with those found in `region` of a
 * tabix-indexed BED file.
 *
 * fbed:   path of the BED file; its tabix index must be loadable
 * region: region string handed to the tabix query
 *
 * All phenotype containers are cleared first.  The header line supplies
 * the sample names (columns 7 onwards), each mapped through findSample();
 * samples mapped to a negative index are ignored.  For every record that
 * passes filter_phenotype, the values of the mapped samples are stored;
 * "NA" becomes bcf_float_missing, samples without a column stay 0.
 *
 * A record with more columns than the header is rejected through
 * vrb.error(), since there is no sample mapping for the extra columns.
 * The result of the region query is not checked here (that check is
 * disabled); NOTE(review): this relies on tbx_itr_next()/tbx_itr_destroy()
 * tolerating a NULL iterator -- confirm against the htslib version in use.
 */
void union_data::readPhenotypes(string fbed, string region) {
	int n_includedS = 0;
	int n_includedP = 0;
	int n_excludedP = 0;
	vector < int > mappingS;
	phenotype_id.clear();
	phenotype_chr.clear();
	phenotype_start.clear();
	phenotype_end.clear();
	phenotype_val.clear();
	phenotype_count=0;
	phenotype_id_to_idx.clear();

	//Open BED file
	//vrb.title("Reading phenotype data in [" + fbed + "]");
	htsFile *fp = hts_open(fbed.c_str(),"r");
	if (!fp) vrb.error("Cannot open file");
	tbx_t *tbx = tbx_index_load(fbed.c_str());
	if (!tbx) vrb.error("Cannot open index file");
	kstring_t str = {0,0,0};
	if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line!");

	//Process sample names
	vector < string > tokens;
	stb.split(string(str.s), tokens);
	if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
	for (size_t t = 6 ; t < tokens.size() ; t ++) {
		mappingS.push_back(findSample(tokens[t]));
		if (mappingS.back() >= 0) n_includedS++;
	}
	const size_t n_header_columns = tokens.size();

	unsigned int linecount =0;

	//Read phenotypes
	hts_itr_t *itr = tbx_itr_querys(tbx, region.c_str());
	//vrb.bullet("target region [" + regionPhenotype.get() + "]");
	//if (!itr) vrb.error("Cannot jump to region!");

	//Read data
	while (tbx_itr_next(fp, tbx, itr, &str) >= 0) {
		linecount ++;
		if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
		stb.split(string(str.s), tokens);
		if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
		//mappingS has one entry per header sample column; more would index past its end
		if (tokens.size() > n_header_columns) vrb.error("Incorrect number of columns!");
		if (filter_phenotype.check(tokens[3])) {
			phenotype_id.push_back(tokens[3]);
			phenotype_chr.push_back(tokens[0]);
			phenotype_start.push_back(atoi(tokens[1].c_str()) + 1);
			phenotype_end.push_back(atoi(tokens[2].c_str()));
			phenotype_val.push_back(vector < float > (sample_count, 0.0));
			for (size_t t = 6 ; t < tokens.size() ; t ++) {
				if (mappingS[t-6] >= 0) {
					if (tokens[t] == "NA") phenotype_val.back()[mappingS[t-6]] = bcf_float_missing;
					else phenotype_val.back()[mappingS[t-6]] = stof(tokens[t]);
				}
			}
			pair < string, int > temp (tokens[3],n_includedP);
			phenotype_id_to_idx.insert(temp);
			n_includedP++;
		} else n_excludedP ++;
	}
	tbx_itr_destroy(itr);

	//Finalize & verbose
	free(str.s);	// line buffer grown by the reads above
	tbx_destroy(tbx);
	if (hts_close(fp)) vrb.error("Cannot properly close file");
	phenotype_count = phenotype_id.size();
	//vrb.bullet(stb.str(n_includedP) + " phenotypes included");
	//if (n_excludedP > 0) vrb.bullet(stb.str(n_excludedP) + " phenotypes excluded by user");
	//if (phenotype_count == 0) vrb.leave("Cannot find phenotypes in target region!");
}
/**
 * tabix workhorse function
 *
 * Serves GET requests handled by "tabix-handler" for files ending in ".gz"
 * that have a ".tbi" companion.  Query parameters:
 *   format   xml, json/jsonp (with optional "callback"), html; any other
 *            value selects the plain handler.  Required.
 *   limit    maximum number of records shown; -1 means no limit
 *   region   tabix region; when present and non-empty only that region is
 *            shown (nothing at all if the region cannot be queried)
 *
 * The leading lines that start with the index's meta character are passed
 * to the header callback, the records to the show callback.
 *
 * Returns DECLINED for requests this handler does not serve or when
 * "format" is missing, HTTP_NOT_FOUND when the index file is missing or
 * the data file cannot be opened, HTTP_INTERNAL_SERVER_ERROR when the
 * index cannot be loaded, OK otherwise.
 */
static int tabix_handler(request_rec *r)
    {
    htsFile *fp=NULL;
    hts_itr_t *itr=NULL;
    kstring_t line = {0,0,0};
    int print_header=1;
    int print_body=1;
    struct tabix_callback_t handler;
    int http_status=OK;

    memset((void*)&handler,0,sizeof(struct tabix_callback_t));
    handler.r=r;
    handler.limit=DEFAULT_LIMIT_RECORDS;

    if (!r->handler || strcmp(r->handler, "tabix-handler")) return (DECLINED);
    if (strcmp(r->method, "GET")!=0) return DECLINED;
    if(r->canonical_filename==NULL) return DECLINED;
    /* file must be b-gzipped */
    if( !( str_ends_with(r->canonical_filename,".gz") )) return DECLINED;
    /* file must be indexed with tabix */
    if( !( fileExtExists(r->canonical_filename,".tbi") )) return HTTP_NOT_FOUND;

    handler.httParams = HttpParamParseGET(r);
    if(handler.httParams==NULL) return DECLINED;

    handler.file_format=E_FORMAT_UNDEFINED;
    if(str_ends_with(r->canonical_filename,".vcf.gz"))
        {
        handler.file_format=E_FORMAT_VCF;
        }
    else if(str_ends_with(r->canonical_filename,".bed.gz"))
        {
        handler.file_format=E_FORMAT_BED;
        }

    /* only one loop, we use this to cleanup the code, instead of using a goto statement */
    do  {
        const char* format=HttpParamGet(handler.httParams,"format");
        const char* limit=HttpParamGet(handler.httParams,"limit");
        const char* region=HttpParamGet(handler.httParams,"region");
        int iterator_was_requested=FALSE;
        /* TRUE when `line` holds the record that ended the header scan */
        int first_record_pending=FALSE;

        if(limit!=NULL)
            {
            handler.limit=atol(limit);
            }

        if(format==NULL)
            {
            http_status=DECLINED;
            break;
            }
        else if(strcmp(format,"xml")==0)
            {
            SETUP_HANDLER(xml);
            }
        else if(strcmp(format,"json")==0 || strcmp(format,"jsonp")==0)
            {
            handler.jsonp_callback=HttpParamGet(handler.httParams,"callback");
            SETUP_HANDLER(json);
            }
        else if(strcmp(format,"html")==0)
            {
            SETUP_HANDLER(html);
            }
        else
            {
            SETUP_HANDLER(plain);
            }

        fp=hts_open(r->canonical_filename,"r");
        if(fp==NULL)
            {
            http_status=HTTP_NOT_FOUND;
            break;
            }
        //read index
        handler.tbx = tbx_index_load(r->canonical_filename);
        if(handler.tbx==NULL)
            {
            http_status=HTTP_INTERNAL_SERVER_ERROR;
            break;
            }
        if(region!=NULL && !str_is_empty(region))
            {
            iterator_was_requested=TRUE;
            itr = tbx_itr_querys(handler.tbx,region);
            }

        handler.startdocument(&handler);

        if(print_header)
            {
            handler.startheader(&handler);
            while ( hts_getline(fp, KS_SEP_LINE, &line) >= 0 )
                {
                if ( !line.l || line.s[0]!=handler.tbx->conf.meta_char )
                    {
                    /* A non-empty line here is already the first record:
                       remember it, or sequential output would lose it. */
                    first_record_pending = ( line.l > 0 ? TRUE : FALSE );
                    break;
                    }
                handler.header(&handler,&line);
                handler.count++;
                }
            handler.enddheader(&handler);
            }
        handler.count=0;//Reset
        if(print_body)
            {
            handler.startbody(&handler);
            if(iterator_was_requested)
                {
                /* the iterator seeks on its own, the pending line is not needed */
                if(itr!=NULL)
                    {
                    while ((handler.limit==-1 || handler.count< handler.limit) &&
                           tbx_itr_next(fp, handler.tbx, itr, &line) >= 0)
                        {
                        if(handler.show(&handler,&line)<0) break;
                        handler.count++;
                        }
                    }
                }
            else
                {
                int keep_going=TRUE;
                if(first_record_pending &&
                   (handler.limit==-1 || handler.count< handler.limit))
                    {
                    if(handler.show(&handler,&line)<0) keep_going=FALSE;
                    else handler.count++;
                    }
                while (keep_going &&
                       (handler.limit==-1 || handler.count< handler.limit) &&
                       hts_getline(fp, KS_SEP_LINE, &line) >= 0)
                    {
                    if(handler.show(&handler,&line)<0) break;
                    handler.count++;
                    }
                }
            handler.endbody(&handler);
            }
        handler.enddocument(&handler);
        } while(0);/* always abort */

    //cleanup
    if(itr!=NULL) tbx_itr_destroy(itr);
    HttpParamFree(handler.httParams);
    free(line.s);
    if(fp!=NULL) hts_close(fp);
    if(handler.tbx!=NULL) tbx_destroy(handler.tbx);
    return http_status;
    }
/*
 * Load genotype-like quantifications from a bgzipped, tabix-indexed BED file
 * for one region, replacing whatever genotype data was held before.
 *
 * fbed   : path of the BED file (a tabix index must be loadable for it).
 * region : region string passed to tbx_itr_querys(); if no iterator can be
 *          built for it, no records are read and the containers stay empty.
 *
 * The first line of the file must be a header line starting with the index's
 * meta character; columns 7+ of that line are taken as sample names. Each data
 * row must have at least 7 columns; column 4 is the record id, columns 1-3 the
 * chromosome/start/end (start is stored +1), and "NA" values are stored as
 * bcf_float_missing.
 */
void union_data::readGenotypesBED(string fbed,string region) {
	int n_includedG = 0;
	int n_excludedG_user = 0;
	int n_includedS = 0;
	int n_excludedS = 0;
	int n_missingS = 0;
	vector < int > mappingS;
	genotype_id.clear();
	genotype_chr.clear();
	genotype_start.clear();
	genotype_end.clear();
	genotype_val.clear();
	genotype_count=0;
	genotype_id_to_idx.clear();

	//Opening files
	htsFile *fp = hts_open(fbed.c_str(),"r");
	if (!fp) vrb.error("Cannot open file!");
	tbx_t * tbx = tbx_index_load(fbed.c_str());
	if (!tbx) vrb.error("Cannot load index file!");
	kstring_t str = {0,0,0};
	if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line!");

	//Process sample names: one mappingS entry per sample column (-1 = column ignored)
	vector < string > tokens;
	stb.split(string(str.s), tokens);
	if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
	for (size_t i0 = 6 ; i0 < tokens.size() ; i0 ++) {
		const string & sid = tokens[i0];
		if (filter_sample.check(sid)) {
			mappingS.push_back(findSample(sid));
			if (mappingS.back() >= 0) n_includedS ++;
			else n_missingS ++;
		} else {
			mappingS.push_back(-1);
			n_excludedS ++;
		}
	}
	//vrb.bullet(stb.str(n_includedS) + " samples included");
	//if (n_excludedS > 0) vrb.bullet(stb.str(n_excludedS) + " samples excluded by user");
	//if (n_missingS > 0) vrb.bullet(stb.str(n_missingS) + " samples without phenotype data");
	//if (n_includedS != sample_count) vrb.error("Cannot find genotype for " + stb.str(sample_count - n_includedS) + " samples!");

	unsigned int linecount = 0;

	//Jump to interesting region
	hts_itr_t *itr = tbx_itr_querys(tbx, region.c_str());
	//vrb.bullet("target region [" + regionGenotype.get() + "]");
	//if (!itr) vrb.error("Cannot jump to region!");
	while (itr != NULL && tbx_itr_next(fp, tbx, itr, &str) >= 0) {
		linecount ++;
		if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
		stb.split(string(str.s), tokens);
		if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
		if (filter_genotype.check(tokens[3])) {
			genotype_id.push_back(tokens[3]);
			genotype_chr.push_back(tokens[0]);
			genotype_start.push_back(atoi(tokens[1].c_str()) + 1);
			genotype_end.push_back(atoi(tokens[2].c_str()));
			genotype_val.push_back(vector < float > (sample_count, 0.0));
			vector < float > & values = genotype_val.back();
			// Never index mappingS past the header's sample columns, even if
			// this row carries more fields than the header line did.
			for (size_t t = 6 ; t < tokens.size() && (t - 6) < mappingS.size() ; t ++) {
				const int target = mappingS[t-6];
				if (target < 0) continue;
				if (tokens[t] == "NA") values[target] = bcf_float_missing;
				else values[target] = stof(tokens[t]);
			}
			pair < string, int > temp (tokens[3],n_includedG);
			genotype_id_to_idx.insert(temp);
			n_includedG++;
		} else n_excludedG_user ++;
	}
	if (itr != NULL) tbx_itr_destroy(itr);

	//Finalize & verbose
	free(str.s);	// line buffer grown by hts_getline/tbx_itr_next
	tbx_destroy(tbx);
	if (hts_close(fp)) vrb.error("Cannot properly close file!");
	genotype_count = n_includedG;
	//vrb.bullet(stb.str(n_includedG) + " variants included");
	//if (n_excludedG_user > 0) vrb.bullet(stb.str(n_excludedG_user) + " variants excluded by user");
	//if (genotype_count == 0) vrb.leave("Cannot find variants in target region!");
}
/*
 * Print the records of `fname` that overlap each of the `nregs` region strings
 * in `regs`, optionally preceded by the header.
 *
 * BCF input is queried through its index and written to stdout through
 * htslib; VCF/SAM/unknown text formats are queried through the tabix index and
 * printed line by line. If args->targets_fname is set, records are further
 * restricted to those overlapping the regions loaded from that file.
 *
 * Takes ownership of `regs`: every string and the array itself are freed
 * before returning. Always returns 0; failures are reported through error().
 */
static int query_regions(args_t *args, char *fname, char **regs, int nregs)
{
    int i;
    htsFile *fp = hts_open(fname,"r");
    if ( !fp ) error("Could not read %s\n", fname);
    enum htsExactFormat format = hts_get_format(fp)->format;

    regidx_t *reg_idx = NULL;
    if ( args->targets_fname )
    {
        reg_idx = regidx_init(args->targets_fname, NULL, NULL, 0, NULL);
        if ( !reg_idx ) error("Could not read %s\n", args->targets_fname);
    }

    if ( format == bcf )
    {
        htsFile *out = hts_open("-","w");
        if ( !out ) error("Could not open stdout\n");
        hts_idx_t *idx = bcf_index_load(fname);
        if ( !idx ) error("Could not load .csi index of %s\n", fname);
        bcf_hdr_t *hdr = bcf_hdr_read(fp);
        if ( !hdr ) error("Could not read the header: %s\n", fname);
        if ( args->print_header )
        {
            if ( bcf_hdr_write(out,hdr)!=0 ) error("Failed to write to stdout\n");
        }
        if ( !args->header_only )
        {
            bcf1_t *rec = bcf_init();
            for (i=0; i<nregs; i++)
            {
                hts_itr_t *itr = bcf_itr_querys(idx,hdr,regs[i]);
                if ( !itr ) continue;   /* unknown region: nothing to print */
                while ( bcf_itr_next(fp, itr, rec) >=0 )
                {
                    if ( reg_idx && !regidx_overlap(reg_idx, bcf_seqname(hdr,rec),rec->pos,rec->pos+rec->rlen-1, NULL) ) continue;
                    if ( bcf_write(out,hdr,rec)!=0 ) error("Failed to write to stdout\n");
                }
                bcf_itr_destroy(itr);
            }
            bcf_destroy(rec);
        }
        if ( hts_close(out) ) error("hts_close returned non-zero status for stdout\n");
        bcf_hdr_destroy(hdr);
        hts_idx_destroy(idx);
    }
    else if ( format==vcf || format==sam || format==unknown_format )
    {
        tbx_t *tbx = tbx_index_load(fname);
        if ( !tbx ) error("Could not load .tbi/.csi index of %s\n", fname);
        kstring_t str = {0,0,0};
        if ( args->print_header )
        {
            /* header lines are the leading lines starting with the meta character */
            while ( hts_getline(fp, KS_SEP_LINE, &str) >= 0 )
            {
                if ( !str.l || str.s[0]!=tbx->conf.meta_char ) break;
                puts(str.s);
            }
        }
        if ( !args->header_only )
        {
            int nseq;
            const char **seq = NULL;
            /* sequence names are only needed to translate tids for the target filter */
            if ( reg_idx ) seq = tbx_seqnames(tbx, &nseq);
            for (i=0; i<nregs; i++)
            {
                hts_itr_t *itr = tbx_itr_querys(tbx, regs[i]);
                if ( !itr ) continue;
                while (tbx_itr_next(fp, tbx, itr, &str) >= 0)
                {
                    if ( reg_idx && !regidx_overlap(reg_idx,seq[itr->curr_tid],itr->curr_beg,itr->curr_end, NULL) ) continue;
                    puts(str.s);
                }
                tbx_itr_destroy(itr);
            }
            free(seq);
        }
        free(str.s);
        tbx_destroy(tbx);
    }
    else if ( format==bam )
        error("Please use \"samtools view\" for querying BAM files.\n");

    if ( reg_idx ) regidx_destroy(reg_idx);
    if ( hts_close(fp) ) error("hts_close returned non-zero status: %s\n", fname);

    for (i=0; i<nregs; i++) free(regs[i]);
    free(regs);
    return 0;
}
/*
 * Scan a bgzipped, tabix-indexed BED file and register every record id that
 * is not known yet (id, chromosome, start+1, end), without loading values.
 *
 * fbed : path of the BED file (a tabix index must be loadable for it).
 *
 * When regionGenotype.chr is not "NA" only that region is scanned through the
 * index; otherwise the whole file is read sequentially and header lines are
 * skipped. genotype_count is increased by the number of newly added records.
 */
void union_data::scanGenotypesBED(string fbed) {
	int n_includedG = 0;
	int n_excludedG_user = 0;

	//Opening files
	htsFile *fp = hts_open(fbed.c_str(),"r");
	if (!fp) vrb.error("Cannot open file!");
	tbx_t * tbx = tbx_index_load(fbed.c_str());
	if (!tbx) vrb.error("Cannot load index file!");
	kstring_t str = {0,0,0};
	if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line!");

	//Read genotype data
	vector < string > tokens;
	unsigned int linecount = 0;

	// Shared per-record logic for both access modes: tokenise the current
	// line and register the record unless it is already known or filtered.
	auto registerRecord = [&]() {
		stb.split(string(str.s), tokens);
		if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
		if (genotype_id_to_idx.count(tokens[3])) return;
		if (filter_genotype.check(tokens[3])) {
			genotype_id.push_back(tokens[3]);
			genotype_chr.push_back(tokens[0]);
			genotype_start.push_back(atoi(tokens[1].c_str()) + 1);
			genotype_end.push_back(atoi(tokens[2].c_str()));
			pair < string, int > temp (tokens[3],genotype_id_to_idx.size());
			genotype_id_to_idx.insert(temp);
			n_includedG++;
		} else n_excludedG_user ++;
	};

	if (regionGenotype.chr != "NA"){
		//Jump to interesting region
		hts_itr_t *itr = tbx_itr_querys(tbx, regionGenotype.get().c_str());
		vrb.bullet("target region [" + regionGenotype.get() + "]");
		if (!itr) vrb.error("Cannot jump to region!");
		while (tbx_itr_next(fp, tbx, itr, &str) >= 0) {
			linecount ++;
			if (linecount % 1000000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
			registerRecord();
		}
		tbx_itr_destroy(itr);
	} else {
		while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
			linecount ++;
			if (linecount % 1000000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
			// empty lines and further header lines carry no record
			if (str.l && str.s[0] != tbx->conf.meta_char) registerRecord();
		}
	}

	//Finalize & verbose
	free(str.s);	// line buffer grown by hts_getline/tbx_itr_next
	tbx_destroy(tbx);
	genotype_count += n_includedG;
	if (hts_close(fp)) vrb.error("Cannot properly close file!");
	vrb.bullet(stb.str(n_includedG) + " new variants included");
	if (n_excludedG_user > 0) vrb.bullet(stb.str(n_excludedG_user) + " variants excluded by user");
	if (n_includedG == 0) vrb.leave("Cannot find variants in target region!");
}
int beds_database_add(struct beds_options *opts, const char *fname, char *columns) { if ( opts->n_files == opts->m_files ) { opts->m_files = opts->m_files == 0 ? 2 : opts->m_files +2; opts->files = (struct beds_anno_file*)realloc(opts->files, opts->m_files*sizeof(struct beds_anno_file)); } struct beds_anno_file *file = &opts->files[opts->n_files]; memset(file, 0, sizeof(struct beds_anno_file)); file->id = opts->n_files; file->fname = strdup(fname); file->fp = hts_open(fname, "r"); if (file->fp == NULL) error("Failed to open %s : %s", fname, strerror(errno)); // int n; file->idx = tbx_index_load(fname); if ( file->idx == NULL) error("Failed to load index of %s.", fname); opts->n_files++; file->last_id = -1; file->last_start = -1; file->last_end = -1; kstring_t string = KSTRING_INIT; int no_columns = 0; int i; if ( columns == NULL && file->no_such_chrom == 0) { warnings("No columns string specified for %s. Will annotate all tags in this data.", fname); file->no_such_chrom = 1; no_columns = 1; } else { int *splits = NULL; kputs(columns, &string); int nfields; splits = ksplit(&string, ',', &nfields); file->m_cols = nfields; file->cols = (struct anno_col*)malloc(sizeof(struct anno_col) * file->m_cols); for ( i = 0; i < nfields; ++i ) { char *ss = string.s + splits[i]; struct anno_col *col = &file->cols[file->n_cols]; col->icol = i; col->replace = REPLACE_MISSING; if (*ss == '+') { col->replace = REPLACE_MISSING; ss++; } else if ( *ss == '-' ) { col->replace = REPLACE_EXISTING; ss++; } if (ss[0] == '\0') continue; if ( strncmp(ss, "INFO/", 5) == 0) ss += 5; col->hdr_key = strdup(ss); col->icol = -1; // debug_print("%s, %d", col->hdr_key, file->n_cols); file->n_cols++; } string.l = 0; } while (1) { string.l =0; if ( hts_getline(file->fp, KS_SEP_LINE, &string) < 0 ) break; // only accept header line in the beginning for file if ( string.s[0] != '#' ) break; if ( strncmp(string.s, "##INFO=", 7) == 0) { char *ss = string.s + 11; char *se = ss; while (se && *se != ',') se++; 
struct anno_col *col = NULL; // if no column string specified, init all header lines if ( no_columns ) { if ( file->n_cols == file->m_cols ) { file->m_cols = file->m_cols == 0 ? 2 : file->m_cols + 2; file->cols = (struct anno_col *) realloc(file->cols, file->m_cols*sizeof(struct anno_col)); } col = &file->cols[file->n_cols++]; col->icol = -1; col->hdr_key = strndup(ss, se-ss+1); col->hdr_key[se-ss] = '\0'; } else { for ( i = 0; i < file->n_cols; ++i ) { if ( strncmp(file->cols[i].hdr_key, ss, se-ss) == 0) break; } // if header line is not set in the column string, skip if ( i == file->n_cols ) continue; col = &file->cols[i]; } // specify setter functions here col->setter.bed = beds_setter_info_string; bcf_hdr_append(opts->hdr_out, string.s); bcf_hdr_sync(opts->hdr_out); int hdr_id = bcf_hdr_id2int(opts->hdr_out, BCF_DT_ID,col->hdr_key); assert ( bcf_hdr_idinfo_exists(opts->hdr_out, BCF_HL_INFO, hdr_id) ); } string.l = 0; // set column number for each col if ( strncasecmp(string.s, "#chr", 4) == 0) { int nfields; int *splits = ksplit(&string, '\t', &nfields); if (nfields < 4) { fprintf(stderr, "[error] Bad header of bed database : %s. 
n_fields : %d, %s", fname, nfields, string.s); fprintf(stderr, "[notice] this error usually happened because the header line is seperated by spaces but not tab!"); exit(1); } int k; for ( k = 3; k < nfields; ++k ) { char *ss = string.s + splits[k]; for (i = 0; i < file->n_cols; ++i ) { struct anno_col *col = &file->cols[i]; if ( strcmp(col->hdr_key, ss) == 0) break; } // if name line specify more names than column string or header, skip if ( i == file->n_cols ) continue; struct anno_col *col = &file->cols[i]; col->icol = k; } } } for ( i = 0; i < file->n_cols; ++i ) { struct anno_col *col = &file->cols[i]; if ( col->hdr_key && col->icol == -1 ) error("No column %s found in bed database : %s", col->hdr_key, fname); int hdr_id = bcf_hdr_id2int(opts->hdr_out, BCF_DT_ID, col->hdr_key); assert(hdr_id>-1); col->number = bcf_hdr_id2length(opts->hdr_out, BCF_HL_INFO, hdr_id); if ( col->number == BCF_VL_A || col->number == BCF_VL_R || col->number == BCF_VL_G) error("Only support fixed INFO number for bed database. %s", col->hdr_key); col->ifile = file->id; } if ( string.m ) free(string.s); if ( opts->beds_is_inited == 0 ) opts->beds_is_inited = 1; return 0; }
int main_vcfset(int argc, char *argv[]) { vcfset_conf_t vcfset_conf; char *vcf_header = NULL; int rc = 0; char *vcf_in1, *vcf_in2, *vcf_out; long int num_vars_vcf1; long int num_vars_vcf1_ign, num_vars_out; static int only_passed = 0; static int only_pos = 0; static int only_snvs = 0; static int only_indels = 0; static int count_only = 0; tbx_t *vcf2_tbx = NULL; /* index for second vcf file */ htsFile *vcf2_hts = NULL; char *add_info_field = NULL; int vcf_concat_findex = 0; vcf_in1 = vcf_in2 = vcf_out = NULL; num_vars_vcf1 = 0; num_vars_vcf1_ign = num_vars_out = 0; /* default vcfset options */ memset(&vcfset_conf, 0, sizeof(vcfset_conf_t)); /* vcfset_conf.vcf_in1 = NULL; */ /* vcfset_conf.vcf_in2 = NULL; */ /* vcfset_conf.vcf_out = stdout;*/ /* keep in sync with long_opts_str and usage * * getopt is a pain in the whole when it comes to syncing of long * and short args and usage. check out gopt, libcfu... */ while (1) { int c; static struct option long_opts[] = { /* see usage sync */ {"help", no_argument, NULL, 'h'}, {"verbose", no_argument, &verbose, 1}, {"debug", no_argument, &debug, 1}, {"only-passed", no_argument, &only_passed, 1}, {"only-pos", no_argument, &only_pos, 1}, {"only-indels", no_argument, &only_indels, 1}, {"only-snvs", no_argument, &only_snvs, 1}, {"count-only", no_argument, &count_only, 1}, {"vcf1", required_argument, NULL, '1'}, {"vcf2", required_argument, NULL, '2'}, {"vcfout", required_argument, NULL, 'o'}, {"action", required_argument, NULL, 'a'}, {"add-info", required_argument, NULL, 'I'}, {0, 0, 0, 0} /* sentinel */ }; /* keep in sync with long_opts and usage */ static const char *long_opts_str = "h1:2:o:a:I:"; /* getopt_long stores the option index here. */ int long_opts_index = 0; c = getopt_long(argc-1, argv+1, /* skipping 'lofreq', just leaving 'command', i.e. 
call */ long_opts_str, long_opts, & long_opts_index); if (c == -1) { break; } switch (c) { /* keep in sync with long_opts etc */ case 'h': usage(& vcfset_conf); free(vcf_in1); free(vcf_in2); free(vcf_out); return 0; case '1': vcf_in1 = strdup(optarg); break; case '2': vcf_in2 = strdup(optarg); break; case 'o': if (0 != strcmp(optarg, "-")) { if (file_exists(optarg)) { LOG_FATAL("Cowardly refusing to overwrite file '%s'. Exiting...\n", optarg); free(vcf_in1); free(vcf_in2); return 1; } } vcf_out = strdup(optarg); break; case 'a': if (0 == strcmp(optarg, "intersect")) { vcfset_conf.vcf_setop = SETOP_INTERSECT; } else if (0 == strcmp(optarg, "complement")) { vcfset_conf.vcf_setop = SETOP_COMPLEMENT; } else if (0 == strcmp(optarg, "concat")) { vcfset_conf.vcf_setop = SETOP_CONCAT; } else { LOG_FATAL("Unknown action '%s'. Exiting...\n", optarg); free(vcf_in1); free(vcf_in2); free(vcf_out); return 1; } break; case 'I': add_info_field = strdup(optarg); break; case '?': LOG_FATAL("%s\n", "unrecognized arguments found. 
Exiting...\n"); free(vcf_in1); free(vcf_in2); free(vcf_out); return 1; default: break; } } vcfset_conf.only_passed = only_passed; vcfset_conf.only_pos = only_pos; vcfset_conf.only_snvs = only_snvs; vcfset_conf.only_indels = only_indels; if (vcfset_conf.only_indels && vcfset_conf.only_snvs) { LOG_FATAL("%s\n", "Can't take only indels *and* only snvs into account"); return 1; } if (0 != argc - optind - 1) { if (vcfset_conf.vcf_setop == SETOP_CONCAT) { vcf_concat_findex = optind; } else { LOG_FATAL("%s\n", "Unrecognized arguments found\n"); return 1; } } else { if (vcfset_conf.vcf_setop == SETOP_CONCAT) { LOG_FATAL("%s\n", "No extra files for concat given\n"); return 1; } } #if 0 int i; for (i=optind+1; i<argc; i++) { LOG_FIXME("argv[%d]=%s\n", i, argv[i]); } #endif if (argc == 2) { fprintf(stderr, "\n"); usage(& vcfset_conf); free(vcf_in1); free(vcf_in2); free(vcf_out); return 1; } if (vcfset_conf.vcf_setop == SETOP_UNKNOWN) { LOG_FATAL("%s\n", "No set operation specified"); usage(& vcfset_conf); free(vcf_in1); free(vcf_in2); free(vcf_out); return 1; } if (vcf_in1 == NULL || (vcf_in2 == NULL && vcfset_conf.vcf_setop != SETOP_CONCAT)) { LOG_FATAL("%s\n\n", "At least one vcf input file not specified"); usage(& vcfset_conf); free(vcf_in1); free(vcf_in2); free(vcf_out); return 1; } if (vcf_in2 != NULL && vcfset_conf.vcf_setop == SETOP_CONCAT) { LOG_FATAL("%s\n\n", "For concat just use the -1 option followed by all other vcf files instead of using -2"); usage(& vcfset_conf); free(vcf_in1); free(vcf_in2); free(vcf_out); return 1; } if (vcf_file_open(& vcfset_conf.vcf_in1, vcf_in1, HAS_GZIP_EXT(vcf_in1), 'r')) { LOG_ERROR("Couldn't open %s\n", vcf_in1); free(vcf_in1); free(vcf_in2); free(vcf_out); return 1; } if (vcf_in2) { vcf2_hts = hts_open(vcf_in2, "r"); if (!vcf2_hts) { LOG_FATAL("Couldn't load %s\n", vcf_in2); return 1; } vcf2_tbx = tbx_index_load(vcf_in2); if (!vcf2_tbx) { LOG_FATAL("Couldn't load tabix index for %s\n", vcf_in2); return 1; } } /* vcf_out default if 
not set: stdout==- */ if (! vcf_out) { vcf_out = malloc(2 * sizeof(char)); strcpy(vcf_out, "-"); } if (! count_only) { if (vcf_file_open(& vcfset_conf.vcf_out, vcf_out, HAS_GZIP_EXT(vcf_out), 'w')) { LOG_ERROR("Couldn't open %s\n", vcf_out); free(vcf_in1); free(vcf_in2); free(vcf_out); return 1; } } /* use meta-data/header of vcf_in1 for output */ LOG_DEBUG("Getting header from %s\n", vcf_in1); if (0 != vcf_parse_header(&vcf_header, & vcfset_conf.vcf_in1)) { LOG_WARN("%s\n", "vcf_parse_header() failed"); if (vcf_file_seek(& vcfset_conf.vcf_in1, 0, SEEK_SET)) { LOG_FATAL("%s\n", "Couldn't rewind file to parse variants" " after header parsing failed"); return -1; } } else { if (! count_only) { /* vcf_write_header would write *default* header */ vcf_write_header(& vcfset_conf.vcf_out, vcf_header); } free(vcf_header); } /* parse first vcf file */ LOG_DEBUG("Starting to parse variants from %s\n", vcf_in1); while (1) { var_t *var1 = NULL; int rc; int is_indel; kstring_t var2_kstr = {0, 0, 0}; hts_itr_t *var2_itr = NULL; char regbuf[1024]; int var2_match = 0; vcf_new_var(&var1); rc = vcf_parse_var(& vcfset_conf.vcf_in1, var1); if (rc) { free(var1); if (vcfset_conf.vcf_setop != SETOP_CONCAT) { break; } else { vcf_concat_findex++; if (vcf_concat_findex==argc) { break; } /* set vcf1 up anew and simply continue as if nothing happened */ vcf_file_close(& vcfset_conf.vcf_in1); free(vcf_in1); vcf_in1 = strdup(argv[vcf_concat_findex]); LOG_DEBUG("updated vcf_in1 = %s\n", vcf_in1); if (vcf_file_open(& vcfset_conf.vcf_in1, vcf_in1, HAS_GZIP_EXT(vcf_in1), 'r')) { LOG_ERROR("Couldn't open %s\n", vcf_in1); free(vcf_in1); free(vcf_in2); free(vcf_out); return 1; } if (0 != vcf_skip_header(& vcfset_conf.vcf_in1)) { LOG_WARN("skip header failed for %s\n", vcf_in1); } continue; } } is_indel = vcf_var_is_indel(var1); if (vcfset_conf.only_snvs && is_indel) { free(var1); continue; } else if (vcfset_conf.only_indels && ! is_indel) { free(var1); continue; } if (! 
vcfset_conf.only_pos && NULL != strchr(var1->alt, ',')) { LOG_FATAL("%s\n", "No support for multi-allelic SNVs in vcf1"); return -1; } if (vcfset_conf.only_passed && ! VCF_VAR_PASSES(var1)) { #ifdef TRACE LOG_DEBUG("Skipping non-passing var1 %s:%d\n", var1->chrom, var1->pos); #endif num_vars_vcf1_ign += 1; vcf_free_var(& var1); continue; } if (add_info_field) { vcf_var_add_to_info(var1, add_info_field); } num_vars_vcf1 += 1; #ifdef TRACE LOG_DEBUG("Got passing var1 %s:%d\n", var1->chrom, var1->pos); #endif if (vcfset_conf.vcf_setop == SETOP_CONCAT) { num_vars_out += 1; if (! count_only) { vcf_write_var(& vcfset_conf.vcf_out, var1); } vcf_free_var(& var1); /* skip comparison against vcf2 */ continue; } /* use index access to vcf2 */ snprintf(regbuf, 1024, "%s:%ld-%ld", var1->chrom, var1->pos+1, var1->pos+1); var2_itr = tbx_itr_querys(vcf2_tbx, regbuf); if (! var2_itr) { var2_match = 0; } else { var2_match = 0; while (tbx_itr_next(vcf2_hts, vcf2_tbx, var2_itr, &var2_kstr) >= 0) { var_t *var2 = NULL; int var2_is_indel = 0; vcf_new_var(&var2); rc = vcf_parse_var_from_line(var2_kstr.s, var2); /* LOG_FIXME("%d:%s>%s looking at var2 %d:%s>%s (reg %s)\n", var1->pos+1, var1->ref, var1->alt, var2->pos+1, var2->ref, var2->alt, regbuf); */ if (rc) { LOG_FATAL("%s\n", "Error while parsing variant returned from tabix"); return -1; } var2_is_indel = vcf_var_is_indel(var2); /* iterator returns anything overlapping with that * position, i.e. this also includes up/downstream * indels, so make sure actual position matches */ if (var1->pos != var2->pos) { var2_match = 0; } else if (vcfset_conf.only_passed && ! VCF_VAR_PASSES(var2)) { var2_match = 0; } else if (vcfset_conf.only_snvs && var2_is_indel) { var2_match = 0; } else if (vcfset_conf.only_indels && ! 
var2_is_indel) { var2_match = 0; } else if (vcfset_conf.only_pos) { #ifdef TRACE LOG_DEBUG("Pos match for var2 %s:%d\n", var2->chrom, var2->pos); #endif var2_match = 1; } else { if (0==strcmp(var1->ref, var2->ref) && 0==strcmp(var1->alt, var2->alt)) { #ifdef TRACE LOG_DEBUG("Full match for var2 %s:%d\n", var2->chrom, var2->pos); #endif var2_match = 1;/* FIXME: check type as well i.e. snv vs indel */ } } vcf_free_var(&var2); if (var2_match) { break;/* no need to continue */ } } } if (vcfset_conf.vcf_setop == SETOP_COMPLEMENT) { /* relative complement : elements in A but not B */ if (!var2_match) { num_vars_out += 1; if (! count_only) { vcf_write_var(& vcfset_conf.vcf_out, var1); } } } else if (vcfset_conf.vcf_setop == SETOP_INTERSECT) { if (var2_match) { num_vars_out += 1; if (! count_only) { vcf_write_var(& vcfset_conf.vcf_out, var1); } } } else { LOG_FATAL("Internal error: unsupported vcf_setop %d\n", vcfset_conf.vcf_setop); return 1; } vcf_free_var(& var1); tbx_itr_destroy(var2_itr); }/* while (1) */ vcf_file_close(& vcfset_conf.vcf_in1); if (vcf_in2) { hts_close(vcf2_hts); tbx_destroy(vcf2_tbx); } LOG_VERBOSE("Parsed %d variants from 1st vcf file (ignoring %d non-passed of those)\n", num_vars_vcf1 + num_vars_vcf1_ign, num_vars_vcf1_ign); LOG_VERBOSE("Wrote %d variants to output\n", num_vars_out); if (! count_only) { vcf_file_close(& vcfset_conf.vcf_out); } if (0==rc) { if (count_only) { printf("%ld\n", num_vars_out); } LOG_VERBOSE("%s\n", "Successful exit."); } free(vcf_in1); free(vcf_in2); free(vcf_out); return rc; }
/*
 * Open `fname` and append it to the set of synchronized readers in `files`.
 *
 * With files->require_index set, the file must be a bgzf-compressed VCF
 * (tabix index) or BCF (BCF index); without it the file is read in streaming
 * mode, which is limited to a single reader and cannot be combined with
 * regions. Unless regions were given explicitly or the set is streaming, the
 * new reader's sequence names are added to files->regions.
 *
 * Returns 1 on success. Returns 0 on failure with files->errnum set; note
 * that on failures after the open, the reader slot has already been added.
 * `fname` is stored by pointer, not copied.
 */
int bcf_sr_add_reader(bcf_srs_t *files, const char *fname)
{
    htsFile *fh = hts_open(fname, "r");
    if ( fh == NULL )
    {
        files->errnum = open_failed;
        return 0;
    }

    /* grow the per-reader arrays by one slot and claim it */
    const int slot = files->nreaders;
    files->has_line = (int*) realloc(files->has_line, sizeof(int)*(slot+1));
    files->has_line[slot] = 0;
    files->readers = (bcf_sr_t*) realloc(files->readers, sizeof(bcf_sr_t)*(slot+1));
    files->nreaders = slot + 1;

    bcf_sr_t *reader = &files->readers[slot];
    memset(reader,0,sizeof(bcf_sr_t));
    reader->file = fh;
    files->errnum = 0;

    /* only VCF and BCF are accepted, indexed or not */
    const int is_vcf = ( fh->format.format==vcf );
    const int is_bcf = ( fh->format.format==bcf );
    if ( !is_vcf && !is_bcf )
    {
        files->errnum = file_type_error;
        return 0;
    }

    if ( !files->require_index )
    {
        reader->header = bcf_hdr_read(fh);
        files->streaming = 1;
    }
    else
    {
        if ( fh->format.compression!=bgzf )
        {
            files->errnum = not_bgzf;
            return 0;
        }
        if ( is_vcf )
        {
            /* VCF: index first, header afterwards */
            reader->tbx_idx = tbx_index_load(fname);
            if ( reader->tbx_idx == NULL )
            {
                files->errnum = idx_load_failed;
                return 0;
            }
            reader->header = bcf_hdr_read(fh);
        }
        else
        {
            /* BCF: header first, index afterwards */
            reader->header = bcf_hdr_read(fh);
            reader->bcf_idx = bcf_index_load(fname);
            if ( reader->bcf_idx == NULL )
            {
                files->errnum = idx_load_failed;
                return 0;
            }
        }
    }

    if ( files->streaming )
    {
        if ( files->nreaders>1 )
        {
            files->errnum = api_usage_error;
            fprintf(stderr,"[%s:%d %s] Error: %d readers, yet require_index not set\n", __FILE__,__LINE__,__FUNCTION__,files->nreaders);
            return 0;
        }
        if ( files->regions )
        {
            files->errnum = api_usage_error;
            fprintf(stderr,"[%s:%d %s] Error: cannot tabix-jump in streaming mode\n", __FILE__,__LINE__,__FUNCTION__);
            return 0;
        }
    }

    if ( reader->header == NULL )
    {
        files->errnum = header_error;
        return 0;
    }

    reader->fname = fname;
    if ( files->apply_filters )
    {
        reader->filter_ids = init_filters(reader->header, files->apply_filters, &reader->nfilter_ids);
    }

    /* Update list of chromosomes */
    if ( !files->explicit_regs && !files->streaming )
    {
        int nnames = 0, k;
        const char **seqs;
        if ( reader->tbx_idx )
            seqs = tbx_seqnames(reader->tbx_idx, &nnames);
        else
            seqs = bcf_hdr_seqnames(reader->header, &nnames);

        for (k = 0; k < nnames; k++)
        {
            if ( files->regions )
                _regions_add(files->regions, seqs[k], -1, -1);
            else
                files->regions = _regions_init_string(seqs[k]);
        }
        free(seqs);
    }

    return 1;
}
/*
 * Load phenotype quantifications from a bgzipped, tabix-indexed BED file.
 *
 * fbed : path of the BED file (a tabix index must be loadable for it).
 *
 * The first line must be a header line starting with the index's meta
 * character; columns 7+ of it are sample names, mapped through findSample().
 * When regionPhenotype.chr is not "NA" only that region is read through the
 * index; otherwise the whole file is read sequentially and header lines are
 * skipped. Each data row needs at least 7 columns: chr, start, end, id,
 * group, strand, then one value per sample ("NA" is stored as
 * bcf_float_missing). Start positions are stored +1.
 */
void cis_data::readPhenotypes(string fbed) {
	int n_includedS = 0;
	int n_includedP = 0;
	int n_excludedP = 0;
	int n_negativeStrd = 0;
	vector < int > mappingS;

	//Open BED file
	vrb.title("Reading phenotype data in [" + fbed + "]");
	htsFile *fp = hts_open(fbed.c_str(),"r");
	if (!fp) vrb.error("Cannot open file");
	tbx_t *tbx = tbx_index_load(fbed.c_str());
	if (!tbx) vrb.error("Cannot open index file");
	kstring_t str = {0,0,0};
	if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line!");

	//Process sample names: one mappingS entry per sample column (<0 = sample unknown)
	vector < string > tokens;
	stb.split(string(str.s), tokens);
	if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
	for (size_t t = 6 ; t < tokens.size() ; t ++) {
		mappingS.push_back(findSample(tokens[t]));
		if (mappingS.back() >= 0) n_includedS++;
	}

	unsigned int linecount = 0;

	// Shared per-record logic for both access modes: tokenise the current
	// line and store the phenotype unless the user filter rejects it.
	auto storeRecord = [&]() {
		stb.split(string(str.s), tokens);
		if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
		// the filter is applied to the id column, or to the group column in group mode
		const bool keep = (grp_mode == GRP_NONE && filter_phenotype.check(tokens[3])) || (grp_mode != GRP_NONE && filter_phenotype.check(tokens[4]));
		if (!keep) {
			n_excludedP ++;
			return;
		}
		phenotype_id.push_back(tokens[3]);
		phenotype_chr.push_back(tokens[0]);
		phenotype_start.push_back(atoi(tokens[1].c_str()) + 1);
		phenotype_end.push_back(atoi(tokens[2].c_str()));
		if (grp_mode > 0 && full_test) phenotype_grp.push_back("ALL_GENES");
		if (grp_mode > 0 && !full_test) phenotype_grp.push_back(tokens[4]);
		phenotype_neg.push_back(tokens[5] == "-");
		if (phenotype_neg.back()) n_negativeStrd ++;
		phenotype_val.push_back(vector < float > (sample_count, 0.0));
		vector < float > & values = phenotype_val.back();
		// Never index mappingS past the header's sample columns, even if
		// this row carries more fields than the header line did.
		for (size_t t = 6 ; t < tokens.size() && (t - 6) < mappingS.size() ; t ++) {
			const int target = mappingS[t-6];
			if (target < 0) continue;
			if (tokens[t] == "NA") values[target] = bcf_float_missing;
			else values[target] = stof(tokens[t]);
		}
		n_includedP++;
	};

	//Read phenotypes
	if (regionPhenotype.chr != "NA"){
		hts_itr_t *itr = tbx_itr_querys(tbx, regionPhenotype.get().c_str());
		vrb.bullet("target region [" + regionPhenotype.get() + "]");
		if (!itr) vrb.error("Cannot jump to region!");
		//Read data
		while (tbx_itr_next(fp, tbx, itr, &str) >= 0) {
			linecount ++;
			if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
			storeRecord();
		}
		tbx_itr_destroy(itr);
	} else {
		while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
			linecount ++;
			if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
			// empty lines and further header lines carry no record
			if (str.l && str.s[0] != tbx->conf.meta_char) storeRecord();
		}
	}

	//Finalize & verbose
	free(str.s);	// line buffer grown by hts_getline/tbx_itr_next
	tbx_destroy(tbx);
	if (hts_close(fp)) vrb.error("Cannot properly close file");
	phenotype_count = phenotype_id.size();
	vrb.bullet(stb.str(n_includedP) + " phenotypes included");
	if (n_excludedP > 0) vrb.bullet(stb.str(n_excludedP) + " phenotypes excluded by user");
	if (n_negativeStrd > 0 ) vrb.bullet(stb.str(n_negativeStrd) + " phenotypes are on the negative strand");
	if (phenotype_count == 0) vrb.leave("Cannot find phenotypes in target region!");
}
bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr, int ifrom, int ito) { bcf_sr_regions_t *reg; if ( !is_file ) return _regions_init_string(regions); reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t)); reg->start = reg->end = -1; reg->prev_start = reg->prev_seq = -1; reg->file = hts_open(regions, "rb"); if ( !reg->file ) { fprintf(stderr,"[%s:%d %s] Could not open file: %s\n", __FILE__,__LINE__,__FUNCTION__,regions); free(reg); return NULL; } reg->tbx = tbx_index_load(regions); if ( !reg->tbx ) { int len = strlen(regions); int is_bed = strcasecmp(".bed",regions+len-4) ? 0 : 1; if ( !is_bed && !strcasecmp(".bed.gz",regions+len-7) ) is_bed = 1; if ( reg->file->format.format==vcf ) ito = 1; // read the whole file, tabix index is not present while ( hts_getline(reg->file, KS_SEP_LINE, ®->line) > 0 ) { char *chr, *chr_end; int from, to, ret; ret = _regions_parse_line(reg->line.s, ichr,ifrom,abs(ito), &chr,&chr_end,&from,&to); if ( ret < 0 ) { if ( ito<0 ) ret = _regions_parse_line(reg->line.s, ichr,ifrom,ifrom, &chr,&chr_end,&from,&to); if ( ret<0 ) { fprintf(stderr,"[%s:%d] Could not parse the file %s, using the columns %d,%d[,%d]\n", __FILE__,__LINE__,regions,ichr+1,ifrom+1,ito+1); hts_close(reg->file); reg->file = NULL; free(reg); return NULL; } } if ( !ret ) continue; if ( is_bed ) from++; *chr_end = 0; _regions_add(reg, chr, from, to); *chr_end = '\t'; } hts_close(reg->file); reg->file = NULL; if ( !reg->nseqs ) { free(reg); return NULL; } return reg; } reg->seq_names = (char**) tbx_seqnames(reg->tbx, ®->nseqs); if ( !reg->seq_hash ) reg->seq_hash = khash_str2int_init(); int i; for (i=0; i<reg->nseqs; i++) { khash_str2int_set(reg->seq_hash,reg->seq_names[i],i); } reg->fname = strdup(regions); reg->is_bin = 1; return reg; }