/* Tear down a synced-reader set: per-reader index/header/iterator/buffer,
 * the shared sequence and sample lists, the optional targets state, and
 * finally the container itself. */
void bcf_sr_destroy(readers_t *files)
{
    // NOTE(review): with zero readers this returns without freeing `files`,
    // files->seqs, files->samples or files->targets — looks like a leak for
    // an empty set; confirm callers never reach this state with allocations.
    if ( !files->nreaders ) return;
    int i;
    for (i=0; i<files->nreaders; i++)
    {
        reader_t *reader = &files->readers[i];
        if ( reader->tbx ) tbx_destroy(reader->tbx);          // tabix index (VCF)
        if ( reader->bcf ) hts_idx_destroy(reader->bcf);      // CSI index (BCF)
        bcf_hdr_destroy(reader->header);
        hts_close(reader->file);
        if ( reader->itr ) tbx_itr_destroy(reader->itr);
        int j;
        // mbuffer is the allocated capacity; every slot holds a live bcf1_t
        for (j=0; j<reader->mbuffer; j++) bcf_destroy1(reader->buffer[j]);
        free(reader->buffer);
        if ( reader->samples ) free(reader->samples);
    }
    free(files->readers);
    free(files->seqs);
    for (i=0; i<files->n_smpl; i++) free(files->samples[i]);
    free(files->samples);
    if (files->targets)
    {
        if (files->targets->itr) tbx_itr_destroy(files->targets->itr);
        tbx_destroy(files->targets->tbx);
        // line.m non-zero means the kstring owns an allocated buffer
        if (files->targets->line.m) free(files->targets->line.s);
        hts_close(files->targets->file);
        free(files->targets->seq_names);
        free(files->targets);
    }
    if ( files->tmps.m ) free(files->tmps.s);  // scratch kstring
    free(files);
}
// Fetch the next record from the current iterator into `line`.
// When the caller has not jumped to a region, transparently advances
// through all remaining sequences of the file.
// Returns false when no further record is available.
//
// Fix: the kstring buffer filled by tbx_itr_next() was never freed,
// leaking one allocation per call.
bool Tabix::getNextLine(string& line) {
    kstring_t str = {0,0,0};
    if (iter == NULL) {
        return false;
    }
    // Try the active iterator first — identical for both jumped and
    // sequential modes (the two branches previously duplicated this).
    if (tbx_itr_next(fn, tbx, iter, &str) >= 0) {
        line = string(str.s);
        free(str.s);
        return true;
    }
    if (!has_jumped) {
        // We've never jumped, so step through all remaining sequences.
        ++current_chrom;
        while (current_chrom != chroms.end()) {
            tbx_itr_destroy(iter);
            iter = tbx_itr_querys(tbx, current_chrom->c_str());
            if (iter && tbx_itr_next(fn, tbx, iter, &str) >= 0) {
                line = string(str.s);
                free(str.s);
                return true;
            }
            ++current_chrom;
        }
    }
    free(str.s);  // free(NULL) is a no-op if nothing was ever read
    return false;
}
/* Advance the targets reader to its next sequence and (re)open a tabix
 * iterator over it. Returns the sequence name, or NULL when exhausted. */
static char *tgt_next_seq(regions_t *tgt)
{
    tgt->cseq++;
    if ( tgt->cseq >= tgt->nseqs ) return NULL;
    char *seq = tgt->seq_names[tgt->cseq];
    if ( tgt->itr ) tbx_itr_destroy(tgt->itr);
    tgt->itr = tbx_itr_querys(tgt->tbx, seq);
    tgt->tpos.to = -1;   /* invalidate the cached target interval */
    return seq;
}
// Release the tabix iterator/index, the hts file handle and the line buffer.
hts_streamer::
~hts_streamer()
{
    if (_titr) tbx_itr_destroy(_titr);
    if (_tidx) tbx_destroy(_tidx);
    if (_hfp) hts_close(_hfp);
    free(_kstr.s);   // free(NULL) is a no-op
}
void hts_streamer:: resetRegion( const char* region) { if (_titr) tbx_itr_destroy(_titr); _titr = tbx_itr_querys(_tidx, region); _is_stream_end = (! _titr); }
/* Free everything owned by a single synced reader: indexes, header,
 * file handle, iterator, record buffer, sample and filter lists. */
static void bcf_sr_destroy1(bcf_sr_t *reader)
{
    int i;
    if ( reader->tbx_idx ) tbx_destroy(reader->tbx_idx);
    if ( reader->bcf_idx ) hts_idx_destroy(reader->bcf_idx);
    bcf_hdr_destroy(reader->header);
    hts_close(reader->file);
    if ( reader->itr ) tbx_itr_destroy(reader->itr);
    /* every allocated buffer slot holds a live record */
    for (i=0; i<reader->mbuffer; i++)
        bcf_destroy1(reader->buffer[i]);
    free(reader->buffer);
    free(reader->samples);
    free(reader->filter_ids);
}
/* Position the regions reader on sequence `seq`.
 * Returns 0 on success, -1 if the sequence is not present.
 *
 * Fix: the source contained the mojibake "®->iseq" — an HTML-entity
 * corruption of "&reg->iseq" — which does not compile; restored. */
int bcf_sr_regions_seek(bcf_sr_regions_t *reg, const char *seq)
{
    reg->iseq = reg->start = reg->end = -1;
    if ( khash_str2int_get(reg->seq_hash, seq, &reg->iseq) < 0 )
        return -1;  // sequence seq not in regions

    // using in-memory regions: nothing else to reposition
    if ( reg->regs ) return 0;

    // reading regions from tabix: (re)open the iterator on this sequence
    if ( reg->itr ) tbx_itr_destroy(reg->itr);
    reg->itr = tbx_itr_querys(reg->tbx, seq);
    return reg->itr ? 0 : -1;
}
// Collect the leading meta/header lines (those starting with the tabix
// meta character) into `header`, then rewind iteration to the first
// sequence so subsequent getNextLine() calls start from the beginning.
//
// Fixes: the kstring buffer filled by hts_getline() was never freed
// (leak per call); the inner `if (iter)` was redundant under the outer
// `iter != NULL` check.
void Tabix::getHeader(string& header) {
    header.clear();
    kstring_t str = {0,0,0};
    while ( hts_getline(fn, KS_SEP_LINE, &str) >= 0 ) {
        if ( !str.l || str.s[0]!=tbx->conf.meta_char ) {
            break;  // first non-meta line ends the header
        } else {
            header += string(str.s);
            header += "\n";
        }
    }
    free(str.s);
    // set back to start
    if (iter != NULL) {
        current_chrom = chroms.begin();
        tbx_itr_destroy(iter);
        iter = tbx_itr_querys(tbx, current_chrom->c_str());
    }
}
/* Destroy a regions object: iterator, index, file handle, allele strings,
 * line buffer, in-memory region arrays, name table and the object itself. */
void bcf_sr_regions_destroy(bcf_sr_regions_t *reg)
{
    int i;
    free(reg->fname);
    if ( reg->itr ) tbx_itr_destroy(reg->itr);
    if ( reg->tbx ) tbx_destroy(reg->tbx);
    if ( reg->file ) hts_close(reg->file);
    free(reg->als);        /* free(NULL) is a no-op */
    free(reg->als_str.s);
    free(reg->line.s);
    if ( reg->regs )
    {
        /* free only in-memory names; tbx-owned names are const */
        for (i=0; i<reg->nseqs; i++)
        {
            free(reg->seq_names[i]);
            free(reg->regs[i].regs);
        }
    }
    free(reg->regs);
    free(reg->seq_names);
    khash_str2int_destroy(reg->seq_hash);
    free(reg);
}
/* Entry point of the vcfset command: intersect, complement or concat
 * VCF files. vcf1 is streamed; vcf2 is accessed through its tabix index.
 * Returns 0 on success, non-zero on error. */
int main_vcfset(int argc, char *argv[])
{
     vcfset_conf_t vcfset_conf;
     char *vcf_header = NULL;
     /* NOTE(review): this outer rc is shadowed by the `int rc` declared
      * inside the parse loop below, so it stays 0 — the final `if (0==rc)`
      * and `return rc` always see 0; confirm that is intended. */
     int rc = 0;
     char *vcf_in1, *vcf_in2, *vcf_out;
     long int num_vars_vcf1;
     long int num_vars_vcf1_ign, num_vars_out;
     static int only_passed = 0;
     static int only_pos = 0;
     static int only_snvs = 0;
     static int only_indels = 0;
     static int count_only = 0;
     tbx_t *vcf2_tbx = NULL; /* index for second vcf file */
     htsFile *vcf2_hts = NULL;
     char *add_info_field = NULL;
     int vcf_concat_findex = 0;

     vcf_in1 = vcf_in2 = vcf_out = NULL;
     num_vars_vcf1 = 0;
     num_vars_vcf1_ign = num_vars_out = 0;

     /* default vcfset options */
     memset(&vcfset_conf, 0, sizeof(vcfset_conf_t));
     /* vcfset_conf.vcf_in1 = NULL; */
     /* vcfset_conf.vcf_in2 = NULL; */
     /* vcfset_conf.vcf_out = stdout;*/

     /* keep in sync with long_opts_str and usage
      *
      * getopt is a pain in the whole when it comes to syncing of long
      * and short args and usage. check out gopt, libcfu...
      */
     while (1) {
          int c;
          static struct option long_opts[] = {
               /* see usage sync */
               {"help", no_argument, NULL, 'h'},
               {"verbose", no_argument, &verbose, 1},
               {"debug", no_argument, &debug, 1},
               {"only-passed", no_argument, &only_passed, 1},
               {"only-pos", no_argument, &only_pos, 1},
               {"only-indels", no_argument, &only_indels, 1},
               {"only-snvs", no_argument, &only_snvs, 1},
               {"count-only", no_argument, &count_only, 1},
               {"vcf1", required_argument, NULL, '1'},
               {"vcf2", required_argument, NULL, '2'},
               {"vcfout", required_argument, NULL, 'o'},
               {"action", required_argument, NULL, 'a'},
               {"add-info", required_argument, NULL, 'I'},
               {0, 0, 0, 0} /* sentinel */
          };
          /* keep in sync with long_opts and usage */
          static const char *long_opts_str = "h1:2:o:a:I:";

          /* getopt_long stores the option index here. */
          int long_opts_index = 0;
          c = getopt_long(argc-1, argv+1, /* skipping 'lofreq', just leaving 'command', i.e. call */
                          long_opts_str, long_opts, & long_opts_index);
          if (c == -1) {
               break;
          }
          switch (c) {
               /* keep in sync with long_opts etc */
          case 'h':
               usage(& vcfset_conf);
               free(vcf_in1); free(vcf_in2); free(vcf_out);
               return 0;
          case '1':
               vcf_in1 = strdup(optarg);
               break;
          case '2':
               vcf_in2 = strdup(optarg);
               break;
          case 'o':
               /* refuse to clobber an existing output file */
               if (0 != strcmp(optarg, "-")) {
                    if (file_exists(optarg)) {
                         LOG_FATAL("Cowardly refusing to overwrite file '%s'. Exiting...\n", optarg);
                         free(vcf_in1); free(vcf_in2);
                         return 1;
                    }
               }
               vcf_out = strdup(optarg);
               break;
          case 'a':
               if (0 == strcmp(optarg, "intersect")) {
                    vcfset_conf.vcf_setop = SETOP_INTERSECT;
               } else if (0 == strcmp(optarg, "complement")) {
                    vcfset_conf.vcf_setop = SETOP_COMPLEMENT;
               } else if (0 == strcmp(optarg, "concat")) {
                    vcfset_conf.vcf_setop = SETOP_CONCAT;
               } else {
                    LOG_FATAL("Unknown action '%s'. Exiting...\n", optarg);
                    free(vcf_in1); free(vcf_in2); free(vcf_out);
                    return 1;
               }
               break;
          case 'I':
               add_info_field = strdup(optarg);
               break;
          case '?':
               LOG_FATAL("%s\n", "unrecognized arguments found. Exiting...\n");
               free(vcf_in1); free(vcf_in2); free(vcf_out);
               return 1;
          default:
               break;
          }
     }

     vcfset_conf.only_passed = only_passed;
     vcfset_conf.only_pos = only_pos;
     vcfset_conf.only_snvs = only_snvs;
     vcfset_conf.only_indels = only_indels;

     if (vcfset_conf.only_indels && vcfset_conf.only_snvs) {
          LOG_FATAL("%s\n", "Can't take only indels *and* only snvs into account");
          return 1;
     }

     /* extra positional args are only legal for concat (the list of
      * further input files); remember where they start */
     if (0 != argc - optind - 1) {
          if (vcfset_conf.vcf_setop == SETOP_CONCAT) {
               vcf_concat_findex = optind;
          } else {
               LOG_FATAL("%s\n", "Unrecognized arguments found\n");
               return 1;
          }
     } else {
          if (vcfset_conf.vcf_setop == SETOP_CONCAT) {
               LOG_FATAL("%s\n", "No extra files for concat given\n");
               return 1;
          }
     }
#if 0
     int i;
     for (i=optind+1; i<argc; i++) {
          LOG_FIXME("argv[%d]=%s\n", i, argv[i]);
     }
#endif

     if (argc == 2) {
          fprintf(stderr, "\n");
          usage(& vcfset_conf);
          free(vcf_in1); free(vcf_in2); free(vcf_out);
          return 1;
     }

     if (vcfset_conf.vcf_setop == SETOP_UNKNOWN) {
          LOG_FATAL("%s\n", "No set operation specified");
          usage(& vcfset_conf);
          free(vcf_in1); free(vcf_in2); free(vcf_out);
          return 1;
     }

     if (vcf_in1 == NULL || (vcf_in2 == NULL && vcfset_conf.vcf_setop != SETOP_CONCAT)) {
          LOG_FATAL("%s\n\n", "At least one vcf input file not specified");
          usage(& vcfset_conf);
          free(vcf_in1); free(vcf_in2); free(vcf_out);
          return 1;
     }
     if (vcf_in2 != NULL && vcfset_conf.vcf_setop == SETOP_CONCAT) {
          LOG_FATAL("%s\n\n", "For concat just use the -1 option followed by all other vcf files instead of using -2");
          usage(& vcfset_conf);
          free(vcf_in1); free(vcf_in2); free(vcf_out);
          return 1;
     }

     if (vcf_file_open(& vcfset_conf.vcf_in1, vcf_in1,
                       HAS_GZIP_EXT(vcf_in1), 'r')) {
          LOG_ERROR("Couldn't open %s\n", vcf_in1);
          free(vcf_in1); free(vcf_in2); free(vcf_out);
          return 1;
     }

     /* vcf2 is accessed randomly through its tabix index */
     if (vcf_in2) {
          vcf2_hts = hts_open(vcf_in2, "r");
          if (!vcf2_hts) {
               LOG_FATAL("Couldn't load %s\n", vcf_in2);
               return 1;
          }
          vcf2_tbx = tbx_index_load(vcf_in2);
          if (!vcf2_tbx) {
               LOG_FATAL("Couldn't load tabix index for %s\n", vcf_in2);
               return 1;
          }
     }

     /* vcf_out default if not set: stdout==- */
     if (! vcf_out) {
          vcf_out = malloc(2 * sizeof(char));
          strcpy(vcf_out, "-");
     }
     if (! count_only) {
          if (vcf_file_open(& vcfset_conf.vcf_out, vcf_out,
                            HAS_GZIP_EXT(vcf_out), 'w')) {
               LOG_ERROR("Couldn't open %s\n", vcf_out);
               free(vcf_in1); free(vcf_in2); free(vcf_out);
               return 1;
          }
     }

     /* use meta-data/header of vcf_in1 for output */
     LOG_DEBUG("Getting header from %s\n", vcf_in1);
     if (0 !=  vcf_parse_header(&vcf_header, & vcfset_conf.vcf_in1)) {
          LOG_WARN("%s\n", "vcf_parse_header() failed");
          /* header parsing consumed input: rewind so the variant parser
           * can start from the top */
          if (vcf_file_seek(& vcfset_conf.vcf_in1, 0, SEEK_SET)) {
               LOG_FATAL("%s\n", "Couldn't rewind file to parse variants"
                         " after header parsing failed");
               return -1;
          }
     } else {
          if (! count_only) {
               /* vcf_write_header would write *default* header */
               vcf_write_header(& vcfset_conf.vcf_out, vcf_header);
          }
          free(vcf_header);
     }

     /* parse first vcf file */
     LOG_DEBUG("Starting to parse variants from %s\n", vcf_in1);
     while (1) {
          var_t *var1 = NULL;
          int rc;   /* NOTE(review): shadows the outer rc (see above) */
          int is_indel;
          kstring_t var2_kstr = {0, 0, 0};
          hts_itr_t *var2_itr = NULL;
          char regbuf[1024];
          int var2_match = 0;

          vcf_new_var(&var1);
          rc = vcf_parse_var(& vcfset_conf.vcf_in1, var1);
          if (rc) {
               /* EOF on vcf1: done, unless we are concat'ing and more
                * input files remain on the command line */
               free(var1);
               if (vcfset_conf.vcf_setop != SETOP_CONCAT) {
                    break;
               } else {
                    vcf_concat_findex++;
                    if (vcf_concat_findex==argc) {
                         break;
                    }
                    /* set vcf1 up anew and simply continue as if nothing happened */
                    vcf_file_close(& vcfset_conf.vcf_in1);
                    free(vcf_in1);
                    vcf_in1 = strdup(argv[vcf_concat_findex]);
                    LOG_DEBUG("updated vcf_in1 = %s\n", vcf_in1);
                    if (vcf_file_open(& vcfset_conf.vcf_in1, vcf_in1,
                                      HAS_GZIP_EXT(vcf_in1), 'r')) {
                         LOG_ERROR("Couldn't open %s\n", vcf_in1);
                         free(vcf_in1); free(vcf_in2); free(vcf_out);
                         return 1;
                    }
                    if (0 != vcf_skip_header(& vcfset_conf.vcf_in1)) {
                         LOG_WARN("skip header failed for %s\n", vcf_in1);
                    }
                    continue;
               }
          }
          is_indel = vcf_var_is_indel(var1);
          if (vcfset_conf.only_snvs && is_indel) {
               /* NOTE(review): plain free() here vs vcf_free_var()
                * elsewhere — possible leak of var1 members; confirm */
               free(var1);
               continue;
          } else if (vcfset_conf.only_indels && ! is_indel) {
               free(var1);
               continue;
          }

          if (! vcfset_conf.only_pos && NULL != strchr(var1->alt, ',')) {
               LOG_FATAL("%s\n", "No support for multi-allelic SNVs in vcf1");
               return -1;
          }
          if (vcfset_conf.only_passed && ! VCF_VAR_PASSES(var1)) {
#ifdef TRACE
               LOG_DEBUG("Skipping non-passing var1 %s:%d\n", var1->chrom, var1->pos);
#endif
               num_vars_vcf1_ign += 1;
               vcf_free_var(& var1);
               continue;
          }
          if (add_info_field) {
               vcf_var_add_to_info(var1, add_info_field);
          }
          num_vars_vcf1 += 1;
#ifdef TRACE
          LOG_DEBUG("Got passing var1 %s:%d\n", var1->chrom, var1->pos);
#endif
          if (vcfset_conf.vcf_setop == SETOP_CONCAT) {
               num_vars_out += 1;
               if (! count_only) {
                    vcf_write_var(& vcfset_conf.vcf_out, var1);
               }
               vcf_free_var(& var1);
               /* skip comparison against vcf2 */
               continue;
          }

          /* use index access to vcf2; pos is 0-based internally,
           * tabix regions are 1-based */
          snprintf(regbuf, 1024, "%s:%ld-%ld",
                   var1->chrom, var1->pos+1, var1->pos+1);
          var2_itr = tbx_itr_querys(vcf2_tbx, regbuf);
          if (! var2_itr) {
               var2_match = 0;
          } else {
               var2_match = 0;
               while (tbx_itr_next(vcf2_hts, vcf2_tbx, var2_itr, &var2_kstr) >= 0) {
                    var_t *var2 = NULL;
                    int var2_is_indel = 0;

                    vcf_new_var(&var2);
                    rc = vcf_parse_var_from_line(var2_kstr.s, var2);
                    /* LOG_FIXME("%d:%s>%s looking at var2 %d:%s>%s (reg %s)\n",
                       var1->pos+1, var1->ref, var1->alt,
                       var2->pos+1, var2->ref, var2->alt, regbuf); */
                    if (rc) {
                         LOG_FATAL("%s\n", "Error while parsing variant returned from tabix");
                         return -1;
                    }
                    var2_is_indel = vcf_var_is_indel(var2);

                    /* iterator returns anything overlapping with that
                     * position, i.e. this also includes up/downstream
                     * indels, so make sure actual position matches */
                    if (var1->pos != var2->pos) {
                         var2_match = 0;
                    } else if (vcfset_conf.only_passed && ! VCF_VAR_PASSES(var2)) {
                         var2_match = 0;
                    } else if (vcfset_conf.only_snvs && var2_is_indel) {
                         var2_match = 0;
                    } else if (vcfset_conf.only_indels && ! var2_is_indel) {
                         var2_match = 0;
                    } else if (vcfset_conf.only_pos) {
#ifdef TRACE
                         LOG_DEBUG("Pos match for var2 %s:%d\n", var2->chrom, var2->pos);
#endif
                         var2_match = 1;
                    } else {
                         if (0==strcmp(var1->ref, var2->ref) && 0==strcmp(var1->alt, var2->alt)) {
#ifdef TRACE
                              LOG_DEBUG("Full match for var2 %s:%d\n", var2->chrom, var2->pos);
#endif
                              var2_match = 1;/* FIXME: check type as well i.e. snv vs indel */
                         }
                    }
                    vcf_free_var(&var2);
                    if (var2_match) {
                         break;/* no need to continue */
                    }
               }
          }

          if (vcfset_conf.vcf_setop == SETOP_COMPLEMENT) {
               /* relative complement : elements in A but not B */
               if (!var2_match) {
                    num_vars_out += 1;
                    if (! count_only) {
                         vcf_write_var(& vcfset_conf.vcf_out, var1);
                    }
               }
          } else if (vcfset_conf.vcf_setop == SETOP_INTERSECT) {
               if (var2_match) {
                    num_vars_out += 1;
                    if (! count_only) {
                         vcf_write_var(& vcfset_conf.vcf_out, var1);
                    }
               }
          } else {
               LOG_FATAL("Internal error: unsupported vcf_setop %d\n", vcfset_conf.vcf_setop);
               return 1;
          }

          vcf_free_var(& var1);
          tbx_itr_destroy(var2_itr);
     }/* while (1) */

     vcf_file_close(& vcfset_conf.vcf_in1);
     if (vcf_in2) {
          hts_close(vcf2_hts);
          tbx_destroy(vcf2_tbx);
     }

     /* NOTE(review): num_vars_* are long int but printed with %d —
      * format/argument mismatch on LP64; confirm and fix with %ld */
     LOG_VERBOSE("Parsed %d variants from 1st vcf file (ignoring %d non-passed of those)\n",
                 num_vars_vcf1 + num_vars_vcf1_ign, num_vars_vcf1_ign);
     LOG_VERBOSE("Wrote %d variants to output\n",
                 num_vars_out);

     if (! count_only) {
          vcf_file_close(& vcfset_conf.vcf_out);
     }

     if (0==rc) {
          if (count_only) {
               printf("%ld\n", num_vars_out);
          }
          LOG_VERBOSE("%s\n", "Successful exit.");
     }

     free(vcf_in1); free(vcf_in2); free(vcf_out);

     return rc;
}
// Read phenotype data from a tabix-indexed BED file. If a target region
// is configured (regionPhenotype), only that region is loaded via the
// index; otherwise the whole file is streamed. Populates the phenotype_*
// member vectors and reports counts.
//
// Fix: the kstring buffer `str` filled by hts_getline()/tbx_itr_next()
// was never freed — one leaked allocation per call.
void cis_data::readPhenotypes(string fbed) {
	int n_includedS = 0;
	int n_includedP = 0;
	int n_excludedP = 0;
	int n_negativeStrd = 0;
	vector < int > mappingS;

	//Open BED file
	vrb.title("Reading phenotype data in [" + fbed + "]");
	htsFile *fp = hts_open(fbed.c_str(),"r");
	if (!fp) vrb.error("Cannot open file");
	tbx_t *tbx = tbx_index_load(fbed.c_str());
	if (!tbx) vrb.error("Cannot open index file");
	kstring_t str = {0,0,0};
	if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line!");

	//Process sample names: columns 7+ of the header are sample ids
	vector < string > tokens;
	stb.split(string(str.s), tokens);
	if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
	for (int t = 6 ; t < tokens.size() ; t ++) {
		mappingS.push_back(findSample(tokens[t]));
		if (mappingS.back() >= 0) n_includedS++;
	}

	//Read phenotypes
	unsigned int linecount = 0;
	if (regionPhenotype.chr != "NA"){
		//Indexed access restricted to the target region
		hts_itr_t *itr = tbx_itr_querys(tbx, regionPhenotype.get().c_str());
		vrb.bullet("target region [" + regionPhenotype.get() + "]");
		if (!itr) vrb.error("Cannot jump to region!");
		while (tbx_itr_next(fp, tbx, itr, &str) >= 0) {
			linecount ++;
			if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
			stb.split(string(str.s), tokens);
			if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
			//Filter on phenotype id (col 4) or group id (col 5) depending on mode
			if ((grp_mode == GRP_NONE && filter_phenotype.check(tokens[3])) || (grp_mode != GRP_NONE && filter_phenotype.check(tokens[4]))) {
				phenotype_id.push_back(tokens[3]);
				phenotype_chr.push_back(tokens[0]);
				phenotype_start.push_back(atoi(tokens[1].c_str()) + 1);	//BED is 0-based half-open
				phenotype_end.push_back(atoi(tokens[2].c_str()));
				if (grp_mode > 0 && full_test) phenotype_grp.push_back("ALL_GENES");
				if (grp_mode > 0 && !full_test) phenotype_grp.push_back(tokens[4]);
				phenotype_neg.push_back(tokens[5] == "-");
				if (phenotype_neg.back()) n_negativeStrd ++;
				phenotype_val.push_back(vector < float > (sample_count, 0.0));
				for (int t = 6 ; t < tokens.size() ; t ++) {
					if (mappingS[t-6] >= 0) {
						if (tokens[t] == "NA") phenotype_val.back()[mappingS[t-6]] = bcf_float_missing;
						else phenotype_val.back()[mappingS[t-6]] = stof(tokens[t]);
					}
				}
				n_includedP++;
			} else n_excludedP ++;
		}
		tbx_itr_destroy(itr);
	}else{
		//Full scan of the file, skipping meta lines
		while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
			linecount ++;
			if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
			stb.split(string(str.s), tokens);
			if (str.l && str.s[0] != tbx->conf.meta_char) {
				if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
				if ((grp_mode == GRP_NONE && filter_phenotype.check(tokens[3])) || (grp_mode != GRP_NONE && filter_phenotype.check(tokens[4]))) {
					phenotype_id.push_back(tokens[3]);
					phenotype_chr.push_back(tokens[0]);
					phenotype_start.push_back(atoi(tokens[1].c_str()) + 1);
					phenotype_end.push_back(atoi(tokens[2].c_str()));
					if (grp_mode > 0 && full_test) phenotype_grp.push_back("ALL_GENES");
					if (grp_mode > 0 && !full_test) phenotype_grp.push_back(tokens[4]);
					phenotype_neg.push_back(tokens[5] == "-");
					if (phenotype_neg.back()) n_negativeStrd ++;
					phenotype_val.push_back(vector < float > (sample_count, 0.0));
					for (int t = 6 ; t < tokens.size() ; t ++) {
						if (mappingS[t-6] >= 0) {
							if (tokens[t] == "NA") phenotype_val.back()[mappingS[t-6]] = bcf_float_missing;
							else phenotype_val.back()[mappingS[t-6]] = stof(tokens[t]);
						}
					}
					n_includedP++;
				} else n_excludedP ++;
			}
		}
	}
	free(str.s);	//was leaked

	//Finalize & verbose
	tbx_destroy(tbx);
	if (hts_close(fp)) vrb.error("Cannot properly close file");
	phenotype_count = phenotype_id.size();
	vrb.bullet(stb.str(n_includedP) + " phenotypes included");
	if (n_excludedP > 0) vrb.bullet(stb.str(n_excludedP) + " phenotypes excluded by user");
	if (n_negativeStrd > 0 ) vrb.bullet(stb.str(n_negativeStrd) + " phenotypes are on the negative strand");
	if (phenotype_count == 0) vrb.leave("Cannot find phenotypes in target region!");
}
// Load genotype dosages for one region from a tabix-indexed BED file,
// replacing any previously loaded genotype data.
//
// Fixes: tbx_itr_querys() can return NULL for a region absent from the
// index — the (commented-out) NULL check meant tbx_itr_next() would then
// dereference a NULL iterator; also the kstring buffer was never freed.
void union_data::readGenotypesBED(string fbed,string region) {
	string buffer;
	int n_includedG = 0;
	int n_excludedG_user = 0;
	int n_includedS = 0;
	int n_excludedS = 0;
	int n_missingS = 0;
	vector < int > mappingS;
	genotype_id.clear();
	genotype_chr.clear();
	genotype_start.clear();
	genotype_end.clear();
	genotype_val.clear();
	genotype_count=0;
	genotype_id_to_idx.clear();

	//Opening files
	htsFile *fp = hts_open(fbed.c_str(),"r");
	if (!fp) vrb.error("Cannot open file!");
	tbx_t * tbx = tbx_index_load(fbed.c_str());
	if (!tbx) vrb.error("Cannot load index file!");
	kstring_t str = {0,0,0};
	if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line!");

	//Process sample names: columns 7+ of the header
	vector < string > tokens;
	stb.split(string(str.s), tokens);
	if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
	for (int i0 = 6 ; i0 < tokens.size() ; i0 ++) {
		string sid = tokens[i0];
		if (filter_sample.check(sid)) {
			mappingS.push_back(findSample(sid));
			if (mappingS.back() >= 0) n_includedS ++;
			else n_missingS ++;
		} else {
			mappingS.push_back(-1);
			n_excludedS ++;
		}
	}
	//vrb.bullet(stb.str(n_includedS) + " samples included");
	//if (n_excludedS > 0) vrb.bullet(stb.str(n_excludedS) + " samples excluded by user");
	//if (n_missingS > 0) vrb.bullet(stb.str(n_missingS) + " samples without phenotype data");
	//if (n_includedS != sample_count) vrb.error("Cannot find genotype for " + stb.str(sample_count - n_includedS) + " samples!");

	unsigned int linecount = 0;

	//Jump to interesting region; a NULL iterator means the region has no
	//entries in the index — treat as empty rather than crashing
	hts_itr_t *itr = tbx_itr_querys(tbx, region.c_str());
	//vrb.bullet("target region [" + regionGenotype.get() + "]");
	if (itr) {
		while (tbx_itr_next(fp, tbx, itr, &str) >= 0) {
			linecount ++;
			if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
			stb.split(string(str.s), tokens);
			if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
			if (filter_genotype.check(tokens[3])) {
				genotype_id.push_back(tokens[3]);
				genotype_chr.push_back(tokens[0]);
				genotype_start.push_back(atoi(tokens[1].c_str()) + 1);	//BED is 0-based half-open
				genotype_end.push_back(atoi(tokens[2].c_str()));
				genotype_val.push_back(vector < float > (sample_count, 0.0));
				for (int t = 6 ; t < tokens.size() ; t ++) {
					if (mappingS[t-6] >= 0) {
						if (tokens[t] == "NA") genotype_val.back()[mappingS[t-6]] = bcf_float_missing;
						else genotype_val.back()[mappingS[t-6]] = stof(tokens[t]);
					}
				}
				pair < string, int > temp (tokens[3],n_includedG);
				genotype_id_to_idx.insert(temp);
				n_includedG++;
			} else n_excludedG_user ++;
		}
		tbx_itr_destroy(itr);
	}
	free(str.s);	//was leaked

	//Finalize & verbose
	tbx_destroy(tbx);
	if (hts_close(fp)) vrb.error("Cannot properly close file!");
	genotype_count = n_includedG;
	//vrb.bullet(stb.str(n_includedG) + " variants included");
	//if (n_excludedG_user > 0) vrb.bullet(stb.str(n_excludedG_user) + " variants excluded by user");
	//if (genotype_count == 0) vrb.leave("Cannot find variants in target region!");
}
// Scan a tabix-indexed BED genotype file and record ids/coordinates of
// variants not seen before (no dosage values are loaded). Appends to the
// genotype_* vectors and the id->index map.
//
// Fix: the kstring buffer `str` was never freed — leak per call.
void union_data::scanGenotypesBED(string fbed) {
	string buffer;
	int n_includedG = 0;
	int n_excludedG_user = 0;

	//Opening files
	htsFile *fp = hts_open(fbed.c_str(),"r");
	if (!fp) vrb.error("Cannot open file!");
	tbx_t * tbx = tbx_index_load(fbed.c_str());
	if (!tbx) vrb.error("Cannot load index file!");
	kstring_t str = {0,0,0};
	if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line!");

	//Read genotype data
	vector < string > tokens;
	unsigned int linecount = 0;

	if (regionGenotype.chr != "NA"){
		//Jump to interesting region (vrb.error does not return on failure)
		hts_itr_t *itr = tbx_itr_querys(tbx, regionGenotype.get().c_str());
		vrb.bullet("target region [" + regionGenotype.get() + "]");
		if (!itr) vrb.error("Cannot jump to region!");
		while (tbx_itr_next(fp, tbx, itr, &str) >= 0) {
			linecount ++;
			if (linecount % 1000000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
			stb.split(string(str.s), tokens);
			if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
			if (genotype_id_to_idx.count(tokens[3])) continue;	//already known
			if (filter_genotype.check(tokens[3])) {
				genotype_id.push_back(tokens[3]);
				genotype_chr.push_back(tokens[0]);
				genotype_start.push_back(atoi(tokens[1].c_str()) + 1);	//BED is 0-based half-open
				genotype_end.push_back(atoi(tokens[2].c_str()));
				pair < string, int > temp (tokens[3],genotype_id_to_idx.size());
				genotype_id_to_idx.insert(temp);
				n_includedG++;
			} else n_excludedG_user ++;
		}
		tbx_itr_destroy(itr);
	}else{
		//Full scan, skipping meta lines
		while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
			linecount ++;
			if (linecount % 1000000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
			stb.split(string(str.s), tokens);
			if (str.l && str.s[0] != tbx->conf.meta_char) {
				if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
				if (genotype_id_to_idx.count(tokens[3])) continue;
				if (filter_genotype.check(tokens[3])) {
					genotype_id.push_back(tokens[3]);
					genotype_chr.push_back(tokens[0]);
					genotype_start.push_back(atoi(tokens[1].c_str()) + 1);
					genotype_end.push_back(atoi(tokens[2].c_str()));
					pair < string, int > temp (tokens[3],genotype_id_to_idx.size());
					genotype_id_to_idx.insert(temp);
					n_includedG++;
				} else n_excludedG_user ++;
			}
		}
	}
	free(str.s);	//was leaked

	//Finalize & verbose
	tbx_destroy(tbx);
	genotype_count += n_includedG;
	if (hts_close(fp)) vrb.error("Cannot properly close file!");
	vrb.bullet(stb.str(n_includedG) + " new variants included");
	if (n_excludedG_user > 0) vrb.bullet(stb.str(n_excludedG_user) + " variants excluded by user");
	if (n_includedG == 0) vrb.leave("Cannot find variants in target region!");
}
/* Advance all synced readers to the next position present in at least one
 * of them (optionally restricted to target regions). Buffer slot 0 holds
 * the "current" line of each matched reader; slots 1..nbuffer are queued
 * records at the same position. Returns a bitmask with bit i set when
 * reader i has a record at the chosen position, or 0 at end of data. */
int bcf_sr_next_line(readers_t *files)
{
    int32_t min_pos = INT_MAX;
    int ret,i,j;
    kstring_t *str = &files->tmps;

    while ( min_pos==INT_MAX )
    {
        // Need to open new chromosome? Yes when every reader has neither
        // an active iterator nor buffered records.
        int eos = 0;
        for (i=0; i<files->nreaders; i++)
            if ( !files->readers[i].itr && !files->readers[i].nbuffer ) eos++;
        if ( eos==files->nreaders )
        {
            const char *seq;
            if ( files->targets )
            {
                seq = tgt_next_seq(files->targets);
                if ( !seq ) return 0;   // all chroms scanned
            }
            else
            {
                if ( ++files->iseq >= files->nseqs ) return 0;  // all chroms scanned
                seq = files->seqs[files->iseq];
            }
            for (i=0; i<files->nreaders; i++)
            {
                reader_t *reader = &files->readers[i];
                // tabix iterator for indexed VCF, CSI iterator for BCF
                if ( reader->tbx ) reader->itr = tbx_itr_querys(reader->tbx,seq);
                else reader->itr = bcf_itr_querys(reader->bcf,reader->header,seq);
            }
        }

        // Find the smallest coordinate across readers
        for (i=0; i<files->nreaders; i++)
        {
            reader_t *reader = &files->readers[i];
            // buffer is "full" when the last queued record already has a
            // different position than the first — no more reading needed
            int buffer_full = ( reader->nbuffer && reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) ? 1 : 0;
            if ( reader->itr && !buffer_full )
            {
                // Fill the buffer with records starting at the same position
                while (1)
                {
                    if ( reader->nbuffer+1 >= reader->mbuffer )
                    {
                        // grow by 8 and pre-allocate the new slots
                        reader->mbuffer += 8;
                        reader->buffer = (bcf1_t**) realloc(reader->buffer, sizeof(bcf1_t*)*reader->mbuffer);
                        for (j=8; j>0; j--) reader->buffer[reader->mbuffer-j] = bcf_init1();
                    }
                    if ( reader->tbx )
                    {
                        ret = tbx_itr_next((BGZF*)reader->file->fp, reader->tbx, reader->itr, str);
                        if ( ret<0 ) break;
                        vcf_parse1(str, reader->header, reader->buffer[reader->nbuffer+1]);
                    }
                    else
                    {
                        ret = bcf_itr_next((BGZF*)reader->file->fp, reader->itr, reader->buffer[reader->nbuffer+1]);
                        if ( ret<0 ) break;
                    }
                    bcf_unpack(reader->buffer[reader->nbuffer+1], BCF_UN_STR|BCF_UN_FLT);
                    // apply filter: drop records whose first FILTER id differs
                    if ( reader->filter_id!=-1 && reader->buffer[reader->nbuffer+1]->d.n_flt && reader->filter_id!=reader->buffer[reader->nbuffer+1]->d.flt[0] ) continue;
                    set_variant_types(reader->buffer[reader->nbuffer+1]);
                    reader->nbuffer++;
                    // stop once a record with a new position has been queued
                    if ( reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) break;
                }
                if ( ret<0 ) { tbx_itr_destroy(reader->itr); reader->itr = NULL; } // done for this chromosome
            }
            if ( reader->nbuffer )
            {
                if ( min_pos > reader->buffer[1]->pos ) min_pos = reader->buffer[1]->pos;
            }
            // The buffer is full - either there is nothing else to read or the last record has a different coordinate
            if ( files->collapse && reader->nbuffer>2 && reader->buffer[1]->pos==reader->buffer[2]->pos )
            {
                collapse_buffer(files, reader);
            }
        }
        if ( files->targets && min_pos!=INT_MAX )
        {
            int ret = tgt_has_position(files->targets, min_pos);
            if ( ret==1 ) continue;   // position is in targets: accept it

            // The position must be skipped
            if ( ret==-1 )
            {
                // done for this chromosome, don't read the rest
                for (i=0; i<files->nreaders; i++)
                {
                    files->readers[i].nbuffer = 0;
                    if ( files->readers[i].itr )
                    {
                        tbx_itr_destroy(files->readers[i].itr);
                        files->readers[i].itr = NULL;
                    }
                }
                min_pos = INT_MAX;
                continue;
            }

            // remove the active line, save the buffer line
            for (i=0; i<files->nreaders; i++)
            {
                reader_t *reader = &files->readers[i];
                for (j=1; j<=reader->nbuffer; j++)
                    if ( reader->buffer[j]->pos!=min_pos ) break;
                if ( j==1 ) continue;   // nothing buffered at min_pos
                if ( j<=reader->nbuffer )
                {
                    // keep the first record with a different position
                    bcf1_t *tmp = reader->buffer[1]; reader->buffer[1] = reader->buffer[j]; reader->buffer[j] = tmp;
                    reader->nbuffer = 1;
                }
                else reader->nbuffer = 0;
            }
            min_pos = INT_MAX;
        }
    }
    //printf("[next_line] min_pos=%d\n", min_pos+1);
    //debug_buffers(files);

    // Set the current line
    ret = 0; bcf1_t *first = NULL;   // first matched record defines REF/ALT to compare against
    for (i=0; i<files->nreaders; i++)
    {
        reader_t *reader = &files->readers[i];
        if ( !reader->nbuffer || reader->buffer[1]->pos!=min_pos ) continue;

        // Match the records by REF and ALT
        int j, irec = -1;
        if ( first )
        {
            for (j=1; j<=reader->nbuffer; j++)
            {
                bcf1_t *line = reader->buffer[j];
                if ( min_pos != line->pos ) break;  // done with this buffer

                if ( files->collapse&COLLAPSE_ANY ) { irec=j; break; }  // checking position only
                if ( files->collapse&COLLAPSE_SNPS && first->d.var_type&VCF_SNP && line->d.var_type&VCF_SNP ) { irec=j; break; }
                if ( files->collapse&COLLAPSE_INDELS && first->d.var_type&VCF_INDEL && line->d.var_type&VCF_INDEL ) { irec=j; break; }
                if ( first->rlen != line->rlen ) continue;  // REFs do not match
                if ( strcmp(first->d.allele[0], line->d.allele[0]) ) continue; // REFs do not match

                int ial,jal;
                if ( files->collapse==COLLAPSE_NONE )
                {
                    // require exact match, all alleles must be identical
                    if ( first->n_allele!=line->n_allele ) continue;   // different number of alleles
                    int nmatch = 1; // REF has been already checked
                    for (ial=1; ial<first->n_allele; ial++)
                    {
                        for (jal=1; jal<line->n_allele; jal++)
                            if ( !strcmp(first->d.allele[ial], line->d.allele[jal]) ) { nmatch++; break; }
                    }
                    if ( nmatch>=first->n_allele ) { irec=j; break; }
                }
                else
                {
                    // thorough check: the REFs and some of the alleles have to be shared
                    // (neglecting different representations of the same indel for now)
                    for (ial=1; ial<first->n_allele; ial++)
                    {
                        for (jal=1; jal<line->n_allele; jal++)
                            if ( !strcmp(first->d.allele[ial], line->d.allele[jal]) ) { irec=j; break; }
                        if ( irec>=1 ) break;
                    }
                }
                if ( irec>=1 ) break;
            }
            if ( irec==-1 ) continue;
        }
        else
        {
            first = reader->buffer[1];
            irec = 1;
        }
        // rotate the matched record into slot 0, shift the rest down,
        // and recycle the previous slot-0 record at the end
        bcf1_t *tmp = reader->buffer[0];
        reader->buffer[0] = reader->buffer[irec];
        for (j=irec+1; j<=reader->nbuffer; j++) reader->buffer[j-1] = reader->buffer[j];
        reader->buffer[ reader->nbuffer ] = tmp;
        reader->nbuffer--;
        ret |= 1<<i;
    }
    // fprintf(stdout,"[next_line] min_pos=%d mask=%d\n", min_pos+1, ret);
    // debug_buffers(stdout,files);

    return ret;
}
/**
 * tabix workhorse function
 *
 * Apache handler serving tabix-indexed, bgzipped files over HTTP.
 * Accepts GET with query params: format (xml|json|jsonp|html|plain),
 * limit (max records), region (tabix region string). Streams the header
 * (meta lines) and then either the requested region or the whole body
 * through the format-specific callback table in `handler`.
 */
static int tabix_handler(request_rec *r)
{
    htsFile *fp=NULL;
    hts_itr_t *itr=NULL;
    kstring_t line = {0,0,0};
    int print_header=1;
    int print_body=1;
    struct tabix_callback_t handler;
    int http_status=OK;
    memset((void*)&handler,0,sizeof(struct tabix_callback_t));
    handler.r=r;
    handler.limit=DEFAULT_LIMIT_RECORDS;

    /* only handle GET requests mapped to this handler with a real file */
    if (!r->handler || strcmp(r->handler, "tabix-handler")) return (DECLINED);
    if (strcmp(r->method, "GET")!=0) return DECLINED;
    if(r->canonical_filename==NULL) return DECLINED;
    /* file must be b-gzipped */
    if( !( str_ends_with(r->canonical_filename,".gz") )) return DECLINED;
    /* file must be indexed with tabix */
    if( !( fileExtExists(r->canonical_filename,".tbi") )) return 404;
    handler.httParams = HttpParamParseGET(r);
    if(handler.httParams==NULL) return DECLINED;
    handler.file_format=E_FORMAT_UNDEFINED;
    if(str_ends_with(r->canonical_filename,".vcf.gz"))
    {
        handler.file_format=E_FORMAT_VCF;
    }
    else if(str_ends_with(r->canonical_filename,".bed.gz"))
    {
        handler.file_format=E_FORMAT_BED;
    }

    /* only one loop, we use this to cleanup the code, instead of using a goto statement */
    do
    {
        const char* format=HttpParamGet(handler.httParams,"format");
        const char* limit=HttpParamGet(handler.httParams,"limit");
        const char* region=HttpParamGet(handler.httParams,"region");
        int iterator_was_requested=FALSE;

        if(limit!=NULL)
        {
            handler.limit=atol(limit);
        }

        /* select the output callback set based on ?format= */
        if(format==NULL)
        {
            http_status=DECLINED;
            break;
        }
        else if(strcmp(format,"xml")==0)
        {
            SETUP_HANDLER(xml);
        }
        else if(strcmp(format,"json")==0 || strcmp(format,"jsonp")==0)
        {
            handler.jsonp_callback=HttpParamGet(handler.httParams,"callback");
            SETUP_HANDLER(json);
        }
        else if(strcmp(format,"html")==0)
        {
            SETUP_HANDLER(html);
        }
        else
        {
            SETUP_HANDLER(plain);
        }

        fp=hts_open(r->canonical_filename,"r");
        if(fp==NULL)
        {
            http_status=HTTP_NOT_FOUND;
            break;
        }
        //read index
        handler.tbx = tbx_index_load(r->canonical_filename);
        if(handler.tbx==NULL)
        {
            http_status=HTTP_INTERNAL_SERVER_ERROR;
            break;
        }

        /* a NULL itr after a requested region yields an empty body below */
        if(region!=NULL && !str_is_empty(region))
        {
            iterator_was_requested=TRUE;
            itr = tbx_itr_querys(handler.tbx,region);
        }

        handler.startdocument(&handler);
        if(print_header)
        {
            /* header = leading lines starting with the meta char */
            handler.startheader(&handler);
            while ( hts_getline(fp, KS_SEP_LINE, &line) >= 0 )
            {
                if ( !line.l || line.s[0]!=handler.tbx->conf.meta_char ) break;
                handler.header(&handler,&line);
                handler.count++;
            }
            handler.enddheader(&handler);
        }

        handler.count=0;//Reset
        if(print_body)
        {
            handler.startbody(&handler);
            if(iterator_was_requested)
            {
                if(itr!=NULL)
                {
                    while ((handler.limit==-1 || handler.count< handler.limit) && tbx_itr_next(fp, handler.tbx, itr, &line) >= 0)
                    {
                        if(handler.show(&handler,&line)<0) break;
                        handler.count++;
                    }
                }
            }
            else
            {
                /* no region: stream the whole file */
                while ((handler.limit==-1 || handler.count< handler.limit) && \
                       hts_getline(fp, KS_SEP_LINE, &line) >= 0)
                {
                    if(handler.show(&handler,&line)<0) break;
                    handler.count++;
                }
            }
            handler.endbody(&handler);
        }
        handler.enddocument(&handler);
    } while(0);/* always abort */

    //cleanup
    if(itr!=NULL) tbx_itr_destroy(itr);
    HttpParamFree(handler.httParams);
    free(line.s);
    if(fp!=NULL) hts_close(fp);
    if(handler.tbx!=NULL) tbx_destroy(handler.tbx);
    return http_status;
}
/*
 * query_regions() - print records overlapping the requested regions.
 *
 * Opens fname, detects whether it is BCF (.csi index) or tabix-indexed text
 * (VCF/SAM/other), then for each region in regs[] writes the matching records
 * to stdout, optionally post-filtered by the targets file in
 * args->targets_fname. Frees regs[] and its elements before returning.
 *
 * Fixes vs. previous version:
 *  - error("Could not open stdout\n", fname) passed a stray argument with no
 *    matching format directive; the argument is dropped.
 *  - bcf_itr_querys() can return NULL for a region absent from the index; the
 *    result is now checked before bcf_itr_next() (the text branch already did).
 *  - the BCF iterator is released with bcf_itr_destroy() to match its creator.
 */
static int query_regions(args_t *args, char *fname, char **regs, int nregs)
{
    int i;
    htsFile *fp = hts_open(fname,"r");
    if ( !fp ) error("Could not read %s\n", fname);
    enum htsExactFormat format = hts_get_format(fp)->format;

    // Optional target regions used to post-filter the iterator output
    regidx_t *reg_idx = NULL;
    if ( args->targets_fname )
    {
        reg_idx = regidx_init(args->targets_fname, NULL, NULL, 0, NULL);
        if ( !reg_idx ) error("Could not read %s\n", args->targets_fname);
    }

    if ( format == bcf )
    {
        htsFile *out = hts_open("-","w");
        if ( !out ) error("Could not open stdout\n");
        hts_idx_t *idx = bcf_index_load(fname);
        if ( !idx ) error("Could not load .csi index of %s\n", fname);
        bcf_hdr_t *hdr = bcf_hdr_read(fp);
        if ( !hdr ) error("Could not read the header: %s\n", fname);
        if ( args->print_header )
            bcf_hdr_write(out,hdr);
        if ( !args->header_only )
        {
            bcf1_t *rec = bcf_init();
            for (i=0; i<nregs; i++)
            {
                hts_itr_t *itr = bcf_itr_querys(idx,hdr,regs[i]);
                if ( !itr ) continue;   // region not present in the index
                while ( bcf_itr_next(fp, itr, rec) >=0 )
                {
                    // skip records not overlapping the targets, if any were given
                    if ( reg_idx && !regidx_overlap(reg_idx, bcf_seqname(hdr,rec),rec->pos,rec->pos+rec->rlen-1, NULL) ) continue;
                    bcf_write(out,hdr,rec);
                }
                bcf_itr_destroy(itr);
            }
            bcf_destroy(rec);
        }
        if ( hts_close(out) ) error("hts_close returned non-zero status for stdout\n");
        bcf_hdr_destroy(hdr);
        hts_idx_destroy(idx);
    }
    else if ( format==vcf || format==sam || format==unknown_format )
    {
        tbx_t *tbx = tbx_index_load(fname);
        if ( !tbx ) error("Could not load .tbi/.csi index of %s\n", fname);
        kstring_t str = {0,0,0};
        if ( args->print_header )
        {
            // meta lines only occur at the top of the file
            while ( hts_getline(fp, KS_SEP_LINE, &str) >= 0 )
            {
                if ( !str.l || str.s[0]!=tbx->conf.meta_char ) break;
                puts(str.s);
            }
        }
        if ( !args->header_only )
        {
            int nseq;
            const char **seq = NULL;
            if ( reg_idx ) seq = tbx_seqnames(tbx, &nseq);  // needed to name the iterator's current sequence
            for (i=0; i<nregs; i++)
            {
                hts_itr_t *itr = tbx_itr_querys(tbx, regs[i]);
                if ( !itr ) continue;
                while (tbx_itr_next(fp, tbx, itr, &str) >= 0)
                {
                    if ( reg_idx && !regidx_overlap(reg_idx,seq[itr->curr_tid],itr->curr_beg,itr->curr_end, NULL) ) continue;
                    puts(str.s);
                }
                tbx_itr_destroy(itr);
            }
            free(seq);
        }
        free(str.s);
        tbx_destroy(tbx);
    }
    else if ( format==bam )
        error("Please use \"samtools view\" for querying BAM files.\n");

    if ( reg_idx ) regidx_destroy(reg_idx);
    if ( hts_close(fp) ) error("hts_close returned non-zero status: %s\n", fname);

    for (i=0; i<nregs; i++) free(regs[i]);
    free(regs);
    return 0;
}
Tabix::~Tabix(void) { tbx_itr_destroy(iter); tbx_destroy(tbx); }
/*
 * _reader_fill_buffer() - buffers all records with the same coordinate
 *
 * Reads records into reader->buffer until the next record's position differs
 * from buffer[1]->pos (records appear to be kept at indices 1..nbuffer, with
 * buffer[nbuffer+1] used as the staging slot -- TODO confirm slot 0's role
 * against the caller). Supports three input modes: streaming VCF/BCF,
 * tabix-indexed text, and csi-indexed BCF. On end-of-region the iterator is
 * destroyed and reader->itr reset to NULL.
 */
static void _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader)
{
    // Return if the buffer is full: the coordinate of the last buffered record differs
    if ( reader->nbuffer && reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) return;

    // No iterator (sequence not present in this file) and not streaming
    if ( !reader->itr && !files->streaming ) return;

    // Fill the buffer with records starting at the same position
    int i, ret = 0;
    while (1)
    {
        if ( reader->nbuffer+1 >= reader->mbuffer )
        {
            // Increase buffer size by 8 slots and pre-allocate the new records
            reader->mbuffer += 8;
            reader->buffer = (bcf1_t**) realloc(reader->buffer, sizeof(bcf1_t*)*reader->mbuffer);
            for (i=8; i>0; i--)     // initialize
            {
                reader->buffer[reader->mbuffer-i] = bcf_init1();
                reader->buffer[reader->mbuffer-i]->max_unpack = files->max_unpack;
                reader->buffer[reader->mbuffer-i]->pos = -1;    // for rare cases when VCF starts from 1
            }
        }
        if ( files->streaming )
        {
            if ( reader->file->format.format==vcf )
            {
                if ( (ret=hts_getline(reader->file, KS_SEP_LINE, &files->tmps)) < 0 ) break;   // no more lines
                // NOTE(review): this inner `ret` shadows the outer one, so a parse
                // failure exits the loop without the end-of-region cleanup below
                // seeing a negative ret -- harmless while streaming (itr is NULL),
                // but worth confirming upstream.
                int ret = vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1]);
                if ( ret<0 ) break;
            }
            else if ( reader->file->format.format==bcf )
            {
                if ( (ret=bcf_read1(reader->file, reader->header, reader->buffer[reader->nbuffer+1])) < 0 ) break; // no more lines
            }
            else
            {
                fprintf(stderr,"[%s:%d %s] fixme: not ready for this\n", __FILE__,__LINE__,__FUNCTION__);
                exit(1);
            }
        }
        else if ( reader->tbx_idx )
        {
            // tabix-indexed text: fetch the next line, then parse it as VCF
            if ( (ret=tbx_itr_next(reader->file, reader->tbx_idx, reader->itr, &files->tmps)) < 0 ) break;  // no more lines
            vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1]);
        }
        else
        {
            // csi-indexed BCF: read the record, then restrict to the selected samples
            if ( (ret=bcf_itr_next(reader->file, reader->itr, reader->buffer[reader->nbuffer+1])) < 0 ) break; // no more lines
            bcf_subset_format(reader->header,reader->buffer[reader->nbuffer+1]);
        }

        // apply filter
        if ( !reader->nfilter_ids )
            bcf_unpack(reader->buffer[reader->nbuffer+1], BCF_UN_STR);
        else
        {
            bcf_unpack(reader->buffer[reader->nbuffer+1], BCF_UN_STR|BCF_UN_FLT);
            if ( !has_filter(reader, reader->buffer[reader->nbuffer+1]) ) continue;   // record rejected: reuse the staging slot
        }
        reader->nbuffer++;
        if ( reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) break;    // the buffer is full
    }
    if ( ret<0 )
    {
        // done for this region
        tbx_itr_destroy(reader->itr);
        reader->itr = NULL;
    }
    if ( files->collapse && reader->nbuffer>=2 && reader->buffer[1]->pos==reader->buffer[2]->pos ) collapse_buffer(files, reader);
}
bool Tabix::setRegion(string& region) { tbx_itr_destroy(iter); iter = tbx_itr_querys(tbx, region.c_str()); has_jumped = true; return true; }
// Load phenotype records overlapping `region` from the bgzipped, tabix-indexed
// BED file `fbed` into the parallel phenotype_* vectors. Header columns 7+ are
// sample names mapped onto the loaded sample set; "NA" values become
// bcf_float_missing. Fixes vs. previous version:
//  - tbx_itr_querys() can return NULL for a region absent from the index; the
//    read loop is now guarded instead of dereferencing the NULL iterator.
//  - the kstring line buffer (str.s) was leaked; it is now freed.
void union_data::readPhenotypes(string fbed, string region) {
    int n_includedS = 0;
    int n_includedP = 0;
    int n_excludedP = 0;
    vector < int > mappingS;

    // Reset any phenotype data from a previous region
    phenotype_id.clear();
    phenotype_chr.clear();
    phenotype_start.clear();
    phenotype_end.clear();
    phenotype_val.clear();
    phenotype_count = 0;
    phenotype_id_to_idx.clear();

    //Open BED file
    //vrb.title("Reading phenotype data in [" + fbed + "]");
    htsFile *fp = hts_open(fbed.c_str(),"r");
    if (!fp) vrb.error("Cannot open file");
    tbx_t *tbx = tbx_index_load(fbed.c_str());
    if (!tbx) vrb.error("Cannot open index file");
    kstring_t str = {0,0,0};
    if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line!");

    //Process sample names (header columns 7+) and map them onto loaded samples
    vector < string > tokens;
    stb.split(string(str.s), tokens);
    if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
    for (int t = 6 ; t < tokens.size() ; t ++) {
        mappingS.push_back(findSample(tokens[t]));
        if (mappingS.back() >= 0) n_includedS++;
    }

    //Read phenotypes overlapping the requested region
    unsigned int linecount = 0;
    hts_itr_t *itr = tbx_itr_querys(tbx, region.c_str());
    if (itr) {   // guard: tbx_itr_next would dereference a NULL iterator
        while (tbx_itr_next(fp, tbx, itr, &str) >= 0) {
            linecount ++;
            if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
            stb.split(string(str.s), tokens);
            if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
            if (filter_phenotype.check(tokens[3])) {
                phenotype_id.push_back(tokens[3]);
                phenotype_chr.push_back(tokens[0]);
                phenotype_start.push_back(atoi(tokens[1].c_str()) + 1);   // BED start is 0-based
                phenotype_end.push_back(atoi(tokens[2].c_str()));
                phenotype_val.push_back(vector < float > (sample_count, 0.0));
                for (int t = 6 ; t < tokens.size() ; t ++) {
                    if (mappingS[t-6] >= 0) {
                        if (tokens[t] == "NA") phenotype_val.back()[mappingS[t-6]] = bcf_float_missing;
                        else phenotype_val.back()[mappingS[t-6]] = stof(tokens[t]);
                    }
                }
                pair < string, int > temp (tokens[3],n_includedP);
                phenotype_id_to_idx.insert(temp);
                n_includedP++;
            } else n_excludedP ++;
        }
        tbx_itr_destroy(itr);
    }

    //Finalize & verbose
    free(str.s);   // fix: line buffer was leaked
    tbx_destroy(tbx);
    if (hts_close(fp)) vrb.error("Cannot properly close file");
    phenotype_count = phenotype_id.size();
    //vrb.bullet(stb.str(n_includedP) + " phenotypes included");
    //if (n_excludedP > 0) vrb.bullet(stb.str(n_excludedP) + " phenotypes excluded by user");
    //if (phenotype_count == 0) vrb.leave("Cannot find phenotypes in target region!");
}
int main_tabix(int argc, char *argv[]) { int c, min_shift = -1, is_force = 0, is_all = 0; tbx_conf_t conf = tbx_conf_gff, *conf_ptr = NULL; while ((c = getopt(argc, argv, "0fap:s:b:e:S:c:m:")) >= 0) if (c == '0') conf.preset |= TBX_UCSC; else if (c == 'f') is_force = 1; else if (c == 'a') is_all = 1; else if (c == 'm') min_shift = atoi(optarg); else if (c == 's') conf.sc = atoi(optarg); else if (c == 'b') conf.bc = atoi(optarg); else if (c == 'e') conf.ec = atoi(optarg); else if (c == 'c') conf.meta_char = *optarg; else if (c == 'S') conf.line_skip = atoi(optarg); else if (c == 'p') { if (strcmp(optarg, "gff") == 0) conf_ptr = &tbx_conf_gff; else if (strcmp(optarg, "bed") == 0) conf_ptr = &tbx_conf_bed; else if (strcmp(optarg, "sam") == 0) conf_ptr = &tbx_conf_sam; else if (strcmp(optarg, "vcf") == 0) conf_ptr = &tbx_conf_vcf; else { fprintf(stderr, "The type '%s' not recognised\n", optarg); return 1; } } if (optind == argc) { fprintf(stderr, "\nUsage: bcftools tabix [options] <in.gz> [reg1 [...]]\n\n"); fprintf(stderr, "Options: -p STR preset: gff, bed, sam or vcf [gff]\n"); fprintf(stderr, " -s INT column number for sequence names (suppressed by -p) [1]\n"); fprintf(stderr, " -b INT column number for region start [4]\n"); fprintf(stderr, " -e INT column number for region end (if no end, set INT to -b) [5]\n"); fprintf(stderr, " -0 specify coordinates are zero-based\n"); fprintf(stderr, " -S INT skip first INT lines [0]\n"); fprintf(stderr, " -c CHAR skip lines starting with CHAR [null]\n"); fprintf(stderr, " -a print all records\n"); fprintf(stderr, " -f force to overwrite existing index\n"); fprintf(stderr, " -m INT set the minimal interval size to 1<<INT; 0 for the old tabix index [0]\n"); fprintf(stderr, "\n"); return 1; } if (is_all) { // read without random access kstring_t s; BGZF *fp; s.l = s.m = 0; s.s = 0; fp = bgzf_open(argv[optind], "r"); while (bgzf_getline(fp, '\n', &s) >= 0) puts(s.s); bgzf_close(fp); free(s.s); } else if (optind + 2 > argc) { // 
create index if ( !conf_ptr ) { // auto-detect file type by file name int l = strlen(argv[optind]); int strcasecmp(const char *s1, const char *s2); if (l>=7 && strcasecmp(argv[optind]+l-7, ".gff.gz") == 0) conf_ptr = &tbx_conf_gff; else if (l>=7 && strcasecmp(argv[optind]+l-7, ".bed.gz") == 0) conf_ptr = &tbx_conf_bed; else if (l>=7 && strcasecmp(argv[optind]+l-7, ".sam.gz") == 0) conf_ptr = &tbx_conf_sam; else if (l>=7 && strcasecmp(argv[optind]+l-7, ".vcf.gz") == 0) conf_ptr = &tbx_conf_vcf; } if ( conf_ptr ) conf = *conf_ptr; if (!is_force) { char *fn; FILE *fp; fn = (char*)alloca(strlen(argv[optind]) + 5); strcat(strcpy(fn, argv[optind]), min_shift <= 0? ".tbi" : ".csi"); if ((fp = fopen(fn, "rb")) != 0) { fclose(fp); fprintf(stderr, "[E::%s] the index file exists; use option '-f' to overwrite\n", __func__); return 1; } } if ( tbx_index_build(argv[optind], min_shift, &conf) ) { fprintf(stderr,"tbx_index_build failed: Is the file bgzip-compressed? Was wrong -p [type] option used?\n"); return 1; } } else { // read with random access tbx_t *tbx; BGZF *fp; kstring_t s; int i; if ((tbx = tbx_index_load(argv[optind])) == 0) return 1; if ((fp = bgzf_open(argv[optind], "r")) == 0) return 1; s.s = 0; s.l = s.m = 0; for (i = optind + 1; i < argc; ++i) { hts_itr_t *itr; if ((itr = tbx_itr_querys(tbx, argv[i])) == 0) continue; while (tbx_bgzf_itr_next(fp, tbx, itr, &s) >= 0) puts(s.s); tbx_itr_destroy(itr); } free(s.s); bgzf_close(fp); tbx_destroy(tbx); } return 0; }
// Scan the bgzipped, tabix-indexed BED file `fbed` and register phenotype
// ids/coordinates (no values) not already present in phenotype_id_to_idx.
// When regionPhenotype is set, only that region is scanned via the index;
// otherwise the whole file is streamed. Fixes vs. previous version:
//  - `!hts_getline(...)` never catches failure: hts_getline returns a negative
//    value on EOF/error, which is truthy, so a bad file went on to dereference
//    str.s; the check now uses `<= 0`, matching readPhenotypes();
//  - the kstring line buffer (str.s) was leaked; it is now freed.
void union_data::scanPhenotypes(string fbed) {
    int n_includedP = 0;
    int n_excludedP = 0;

    //Open BED file
    vrb.title("Scanning phenotype data in [" + fbed + "]");
    htsFile *fp = hts_open(fbed.c_str(),"r");
    if (!fp) vrb.error("Cannot open file");
    tbx_t * tbx = tbx_index_load(fbed.c_str());
    if (!tbx) vrb.error("Cannot open index file");

    //Read header
    kstring_t str = {0,0,0};
    if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line");

    //Scan file
    vector < string > tokens;
    unsigned int linecount = 0;
    if (regionPhenotype.chr != "NA"){
        // Indexed scan restricted to the configured region
        hts_itr_t *itr = tbx_itr_querys(tbx, regionPhenotype.get().c_str());
        vrb.bullet("target region [" + regionPhenotype.get() + "]");
        if (!itr) vrb.error("Cannot jump to region!");
        //Read data
        while (tbx_itr_next(fp, tbx, itr, &str) >= 0) {
            linecount ++;
            if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
            stb.split(string(str.s), tokens);
            if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
            if (phenotype_id_to_idx.count(tokens[3])) continue;   // already registered
            if (filter_phenotype.check(tokens[3])) {
                phenotype_id.push_back(tokens[3]);
                phenotype_chr.push_back(tokens[0]);
                phenotype_start.push_back(atoi(tokens[1].c_str()) + 1);   // BED start is 0-based
                phenotype_end.push_back(atoi(tokens[2].c_str()));
                pair < string, int > temp (tokens[3],phenotype_id_to_idx.size());
                phenotype_id_to_idx.insert(temp);
                n_includedP++;
            } else n_excludedP ++;
        }
        tbx_itr_destroy(itr);
    }else{
        // No region configured: stream every non-meta line of the file
        while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
            linecount ++;
            if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
            if (str.l && str.s[0] != tbx->conf.meta_char) {
                stb.split(string(str.s), tokens);
                if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
                if (phenotype_id_to_idx.count(tokens[3])) continue;   // already registered
                if (filter_phenotype.check(tokens[3])) {
                    phenotype_id.push_back(tokens[3]);
                    phenotype_chr.push_back(tokens[0]);
                    phenotype_start.push_back(atoi(tokens[1].c_str()) + 1);   // BED start is 0-based
                    phenotype_end.push_back(atoi(tokens[2].c_str()));
                    pair < string, int > temp (tokens[3],phenotype_id_to_idx.size());
                    phenotype_id_to_idx.insert(temp);
                    n_includedP++;
                } else n_excludedP ++;
            }
        }
    }

    //Finalize & verbose
    free(str.s);   // fix: line buffer was leaked
    tbx_destroy(tbx);
    if (hts_close(fp)) vrb.error("Cannot properly close file");
    phenotype_count = phenotype_id.size();
    vrb.bullet(stb.str(n_includedP) + " new phenotypes included");
    if (n_excludedP > 0) vrb.bullet(stb.str(n_excludedP) + " phenotypes excluded by user");
    if (phenotype_count == 0) vrb.leave("Cannot find phenotypes in region!");
}