void bcf_sr_destroy(readers_t *files) { if ( !files->nreaders ) return; int i; for (i=0; i<files->nreaders; i++) { reader_t *reader = &files->readers[i]; if ( reader->tbx ) tbx_destroy(reader->tbx); if ( reader->bcf ) hts_idx_destroy(reader->bcf); bcf_hdr_destroy(reader->header); hts_close(reader->file); if ( reader->itr ) tbx_itr_destroy(reader->itr); int j; for (j=0; j<reader->mbuffer; j++) bcf_destroy1(reader->buffer[j]); free(reader->buffer); if ( reader->samples ) free(reader->samples); } free(files->readers); free(files->seqs); for (i=0; i<files->n_smpl; i++) free(files->samples[i]); free(files->samples); if (files->targets) { if (files->targets->itr) tbx_itr_destroy(files->targets->itr); tbx_destroy(files->targets->tbx); if (files->targets->line.m) free(files->targets->line.s); hts_close(files->targets->file); free(files->targets->seq_names); free(files->targets); } if ( files->tmps.m ) free(files->tmps.s); free(files); }
static int query_chroms(char *fname) { const char **seq; int i, nseq, ftype = file_type(fname); if ( ftype & IS_TXT || !ftype ) { tbx_t *tbx = tbx_index_load(fname); if ( !tbx ) error("Could not load .tbi index of %s\n", fname); seq = tbx_seqnames(tbx, &nseq); for (i=0; i<nseq; i++) printf("%s\n", seq[i]); free(seq); tbx_destroy(tbx); } else if ( ftype==IS_BCF ) { htsFile *fp = hts_open(fname,"r"); if ( !fp ) error("Could not read %s\n", fname); bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Could not read the header: %s\n", fname); hts_close(fp); hts_idx_t *idx = bcf_index_load(fname); if ( !idx ) error("Could not load .csi index of %s\n", fname); seq = bcf_index_seqnames(idx, hdr, &nseq); for (i=0; i<nseq; i++) printf("%s\n", seq[i]); free(seq); bcf_hdr_destroy(hdr); hts_idx_destroy(idx); } else if ( ftype==IS_BAM ) // todo: BAM error("BAM: todo\n"); return 0; }
hts_streamer:: ~hts_streamer() { if (_titr) tbx_itr_destroy(_titr); if (_tidx) tbx_destroy(_tidx); if (_hfp) hts_close(_hfp); if (_kstr.s) free(_kstr.s); }
static void bcf_sr_destroy1(bcf_sr_t *reader) { if ( reader->tbx_idx ) tbx_destroy(reader->tbx_idx); if ( reader->bcf_idx ) hts_idx_destroy(reader->bcf_idx); bcf_hdr_destroy(reader->header); hts_close(reader->file); if ( reader->itr ) tbx_itr_destroy(reader->itr); int j; for (j=0; j<reader->mbuffer; j++) bcf_destroy1(reader->buffer[j]); free(reader->buffer); free(reader->samples); free(reader->filter_ids); }
int beds_file_destroy(struct beds_anno_file *file) { int i; hts_close(file->fp); tbx_destroy(file->idx); for ( i = 0; i < file->n_cols; ++i ) free(file->cols[i].hdr_key); free(file->cols); for ( i = 0; i < file->max; ++i ) beds_anno_tsv_destroy(file->buffer[i]); if ( file->fname ) free(file->fname); if ( file->max ) free(file->buffer); return 0; }
void cis_data::scanPhenotypes(string fbed) { int n_includedP = 0; int n_excludedP = 0; int n_negativeStrd = 0; //Open BED file vrb.title("Scanning phenotype data in [" + fbed + "]"); htsFile *fp = hts_open(fbed.c_str(),"r"); if (!fp) vrb.error("Cannot open file"); tbx_t * tbx = tbx_index_load(fbed.c_str()); if (!tbx) vrb.error("Cannot open index file"); //Read header kstring_t str = {0,0,0}; if (!hts_getline(fp, KS_SEP_LINE, &str) || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line"); //Scan file vector < string > tokens; while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) { if (str.l && str.s[0] != tbx->conf.meta_char) { stb.split(string(str.s), tokens); if (tokens.size() < 5) vrb.error("Incorrect number of columns!"); if ((grp_mode == GRP_NONE && filter_phenotype.check(tokens[3])) || (grp_mode != GRP_NONE && filter_phenotype.check(tokens[4]))) { phenotype_id.push_back(tokens[3]); phenotype_chr.push_back(tokens[0]); phenotype_start.push_back(atoi(tokens[1].c_str()) + 1); phenotype_end.push_back(atoi(tokens[2].c_str())); if (grp_mode > 0 && full_test) phenotype_grp.push_back("ALL_GENES"); if (grp_mode > 0 && !full_test) phenotype_grp.push_back(tokens[4]); phenotype_neg.push_back(tokens[5] == "-"); if (phenotype_neg.back()) n_negativeStrd ++; n_includedP++; } else n_excludedP ++; } } //Finalize & verbose tbx_destroy(tbx); if (hts_close(fp)) vrb.error("Cannot properly close file"); phenotype_count = phenotype_id.size(); vrb.bullet(stb.str(n_includedP) + " phenotypes included"); if (n_excludedP > 0) vrb.bullet(stb.str(n_excludedP) + " phenotypes excluded by user"); if (n_negativeStrd > 0 ) vrb.bullet(stb.str(n_negativeStrd) + " phenotypes are on the negative strand"); if (phenotype_count == 0) vrb.leave("Cannot find phenotypes in region!"); }
void bcf_sr_regions_destroy(bcf_sr_regions_t *reg) { int i; free(reg->fname); if ( reg->itr ) tbx_itr_destroy(reg->itr); if ( reg->tbx ) tbx_destroy(reg->tbx); if ( reg->file ) hts_close(reg->file); if ( reg->als ) free(reg->als); if ( reg->als_str.s ) free(reg->als_str.s); free(reg->line.s); if ( reg->regs ) { // free only in-memory names, tbx names are const for (i=0; i<reg->nseqs; i++) { free(reg->seq_names[i]); free(reg->regs[i].regs); } } free(reg->regs); free(reg->seq_names); khash_str2int_destroy(reg->seq_hash); free(reg); }
int main_vcfset(int argc, char *argv[]) { vcfset_conf_t vcfset_conf; char *vcf_header = NULL; int rc = 0; char *vcf_in1, *vcf_in2, *vcf_out; long int num_vars_vcf1; long int num_vars_vcf1_ign, num_vars_out; static int only_passed = 0; static int only_pos = 0; static int only_snvs = 0; static int only_indels = 0; static int count_only = 0; tbx_t *vcf2_tbx = NULL; /* index for second vcf file */ htsFile *vcf2_hts = NULL; char *add_info_field = NULL; int vcf_concat_findex = 0; vcf_in1 = vcf_in2 = vcf_out = NULL; num_vars_vcf1 = 0; num_vars_vcf1_ign = num_vars_out = 0; /* default vcfset options */ memset(&vcfset_conf, 0, sizeof(vcfset_conf_t)); /* vcfset_conf.vcf_in1 = NULL; */ /* vcfset_conf.vcf_in2 = NULL; */ /* vcfset_conf.vcf_out = stdout;*/ /* keep in sync with long_opts_str and usage * * getopt is a pain in the whole when it comes to syncing of long * and short args and usage. check out gopt, libcfu... */ while (1) { int c; static struct option long_opts[] = { /* see usage sync */ {"help", no_argument, NULL, 'h'}, {"verbose", no_argument, &verbose, 1}, {"debug", no_argument, &debug, 1}, {"only-passed", no_argument, &only_passed, 1}, {"only-pos", no_argument, &only_pos, 1}, {"only-indels", no_argument, &only_indels, 1}, {"only-snvs", no_argument, &only_snvs, 1}, {"count-only", no_argument, &count_only, 1}, {"vcf1", required_argument, NULL, '1'}, {"vcf2", required_argument, NULL, '2'}, {"vcfout", required_argument, NULL, 'o'}, {"action", required_argument, NULL, 'a'}, {"add-info", required_argument, NULL, 'I'}, {0, 0, 0, 0} /* sentinel */ }; /* keep in sync with long_opts and usage */ static const char *long_opts_str = "h1:2:o:a:I:"; /* getopt_long stores the option index here. */ int long_opts_index = 0; c = getopt_long(argc-1, argv+1, /* skipping 'lofreq', just leaving 'command', i.e. call */ long_opts_str, long_opts, & long_opts_index); if (c == -1) { break; } switch (c) { /* keep in sync with long_opts etc */ case 'h': usage(& vcfset_conf); free(vcf_in1); free(vcf_in2); free(vcf_out); return 0; case '1': vcf_in1 = strdup(optarg); break; case '2': vcf_in2 = strdup(optarg); break; case 'o': if (0 != strcmp(optarg, "-")) { if (file_exists(optarg)) { LOG_FATAL("Cowardly refusing to overwrite file '%s'. Exiting...\n", optarg); free(vcf_in1); free(vcf_in2); return 1; } } vcf_out = strdup(optarg); break; case 'a': if (0 == strcmp(optarg, "intersect")) { vcfset_conf.vcf_setop = SETOP_INTERSECT; } else if (0 == strcmp(optarg, "complement")) { vcfset_conf.vcf_setop = SETOP_COMPLEMENT; } else if (0 == strcmp(optarg, "concat")) { vcfset_conf.vcf_setop = SETOP_CONCAT; } else { LOG_FATAL("Unknown action '%s'. Exiting...\n", optarg); free(vcf_in1); free(vcf_in2); free(vcf_out); return 1; } break; case 'I': add_info_field = strdup(optarg); break; case '?': LOG_FATAL("%s\n", "unrecognized arguments found. Exiting...\n"); free(vcf_in1); free(vcf_in2); free(vcf_out); return 1; default: break; } } vcfset_conf.only_passed = only_passed; vcfset_conf.only_pos = only_pos; vcfset_conf.only_snvs = only_snvs; vcfset_conf.only_indels = only_indels; if (vcfset_conf.only_indels && vcfset_conf.only_snvs) { LOG_FATAL("%s\n", "Can't take only indels *and* only snvs into account"); return 1; } if (0 != argc - optind - 1) { if (vcfset_conf.vcf_setop == SETOP_CONCAT) { vcf_concat_findex = optind; } else { LOG_FATAL("%s\n", "Unrecognized arguments found\n"); return 1; } } else { if (vcfset_conf.vcf_setop == SETOP_CONCAT) { LOG_FATAL("%s\n", "No extra files for concat given\n"); return 1; } } #if 0 int i; for (i=optind+1; i<argc; i++) { LOG_FIXME("argv[%d]=%s\n", i, argv[i]); } #endif if (argc == 2) { fprintf(stderr, "\n"); usage(& vcfset_conf); free(vcf_in1); free(vcf_in2); free(vcf_out); return 1; } if (vcfset_conf.vcf_setop == SETOP_UNKNOWN) { LOG_FATAL("%s\n", "No set operation specified"); usage(& vcfset_conf); free(vcf_in1); free(vcf_in2); free(vcf_out); return 1; } if (vcf_in1 == NULL || (vcf_in2 == NULL && vcfset_conf.vcf_setop != SETOP_CONCAT)) { LOG_FATAL("%s\n\n", "At least one vcf input file not specified"); usage(& vcfset_conf); free(vcf_in1); free(vcf_in2); free(vcf_out); return 1; } if (vcf_in2 != NULL && vcfset_conf.vcf_setop == SETOP_CONCAT) { LOG_FATAL("%s\n\n", "For concat just use the -1 option followed by all other vcf files instead of using -2"); usage(& vcfset_conf); free(vcf_in1); free(vcf_in2); free(vcf_out); return 1; } if (vcf_file_open(& vcfset_conf.vcf_in1, vcf_in1, HAS_GZIP_EXT(vcf_in1), 'r')) { LOG_ERROR("Couldn't open %s\n", vcf_in1); free(vcf_in1); free(vcf_in2); free(vcf_out); return 1; } if (vcf_in2) { vcf2_hts = hts_open(vcf_in2, "r"); if (!vcf2_hts) { LOG_FATAL("Couldn't load %s\n", vcf_in2); return 1; } vcf2_tbx = tbx_index_load(vcf_in2); if (!vcf2_tbx) { LOG_FATAL("Couldn't load tabix index for %s\n", vcf_in2); return 1; } } /* vcf_out default if not set: stdout==- */ if (! vcf_out) { vcf_out = malloc(2 * sizeof(char)); strcpy(vcf_out, "-"); } if (! count_only) { if (vcf_file_open(& vcfset_conf.vcf_out, vcf_out, HAS_GZIP_EXT(vcf_out), 'w')) { LOG_ERROR("Couldn't open %s\n", vcf_out); free(vcf_in1); free(vcf_in2); free(vcf_out); return 1; } } /* use meta-data/header of vcf_in1 for output */ LOG_DEBUG("Getting header from %s\n", vcf_in1); if (0 != vcf_parse_header(&vcf_header, & vcfset_conf.vcf_in1)) { LOG_WARN("%s\n", "vcf_parse_header() failed"); if (vcf_file_seek(& vcfset_conf.vcf_in1, 0, SEEK_SET)) { LOG_FATAL("%s\n", "Couldn't rewind file to parse variants" " after header parsing failed"); return -1; } } else { if (! count_only) { /* vcf_write_header would write *default* header */ vcf_write_header(& vcfset_conf.vcf_out, vcf_header); } free(vcf_header); } /* parse first vcf file */ LOG_DEBUG("Starting to parse variants from %s\n", vcf_in1); while (1) { var_t *var1 = NULL; int rc; int is_indel; kstring_t var2_kstr = {0, 0, 0}; hts_itr_t *var2_itr = NULL; char regbuf[1024]; int var2_match = 0; vcf_new_var(&var1); rc = vcf_parse_var(& vcfset_conf.vcf_in1, var1); if (rc) { free(var1); if (vcfset_conf.vcf_setop != SETOP_CONCAT) { break; } else { vcf_concat_findex++; if (vcf_concat_findex==argc) { break; } /* set vcf1 up anew and simply continue as if nothing happened */ vcf_file_close(& vcfset_conf.vcf_in1); free(vcf_in1); vcf_in1 = strdup(argv[vcf_concat_findex]); LOG_DEBUG("updated vcf_in1 = %s\n", vcf_in1); if (vcf_file_open(& vcfset_conf.vcf_in1, vcf_in1, HAS_GZIP_EXT(vcf_in1), 'r')) { LOG_ERROR("Couldn't open %s\n", vcf_in1); free(vcf_in1); free(vcf_in2); free(vcf_out); return 1; } if (0 != vcf_skip_header(& vcfset_conf.vcf_in1)) { LOG_WARN("skip header failed for %s\n", vcf_in1); } continue; } } is_indel = vcf_var_is_indel(var1); if (vcfset_conf.only_snvs && is_indel) { free(var1); continue; } else if (vcfset_conf.only_indels && ! is_indel) { free(var1); continue; } if (! vcfset_conf.only_pos && NULL != strchr(var1->alt, ',')) { LOG_FATAL("%s\n", "No support for multi-allelic SNVs in vcf1"); return -1; } if (vcfset_conf.only_passed && ! VCF_VAR_PASSES(var1)) { #ifdef TRACE LOG_DEBUG("Skipping non-passing var1 %s:%d\n", var1->chrom, var1->pos); #endif num_vars_vcf1_ign += 1; vcf_free_var(& var1); continue; } if (add_info_field) { vcf_var_add_to_info(var1, add_info_field); } num_vars_vcf1 += 1; #ifdef TRACE LOG_DEBUG("Got passing var1 %s:%d\n", var1->chrom, var1->pos); #endif if (vcfset_conf.vcf_setop == SETOP_CONCAT) { num_vars_out += 1; if (! count_only) { vcf_write_var(& vcfset_conf.vcf_out, var1); } vcf_free_var(& var1); /* skip comparison against vcf2 */ continue; } /* use index access to vcf2 */ snprintf(regbuf, 1024, "%s:%ld-%ld", var1->chrom, var1->pos+1, var1->pos+1); var2_itr = tbx_itr_querys(vcf2_tbx, regbuf); if (! var2_itr) { var2_match = 0; } else { var2_match = 0; while (tbx_itr_next(vcf2_hts, vcf2_tbx, var2_itr, &var2_kstr) >= 0) { var_t *var2 = NULL; int var2_is_indel = 0; vcf_new_var(&var2); rc = vcf_parse_var_from_line(var2_kstr.s, var2); /* LOG_FIXME("%d:%s>%s looking at var2 %d:%s>%s (reg %s)\n", var1->pos+1, var1->ref, var1->alt, var2->pos+1, var2->ref, var2->alt, regbuf); */ if (rc) { LOG_FATAL("%s\n", "Error while parsing variant returned from tabix"); return -1; } var2_is_indel = vcf_var_is_indel(var2); /* iterator returns anything overlapping with that * position, i.e. this also includes up/downstream * indels, so make sure actual position matches */ if (var1->pos != var2->pos) { var2_match = 0; } else if (vcfset_conf.only_passed && ! VCF_VAR_PASSES(var2)) { var2_match = 0; } else if (vcfset_conf.only_snvs && var2_is_indel) { var2_match = 0; } else if (vcfset_conf.only_indels && ! var2_is_indel) { var2_match = 0; } else if (vcfset_conf.only_pos) { #ifdef TRACE LOG_DEBUG("Pos match for var2 %s:%d\n", var2->chrom, var2->pos); #endif var2_match = 1; } else { if (0==strcmp(var1->ref, var2->ref) && 0==strcmp(var1->alt, var2->alt)) { #ifdef TRACE LOG_DEBUG("Full match for var2 %s:%d\n", var2->chrom, var2->pos); #endif var2_match = 1;/* FIXME: check type as well i.e. snv vs indel */ } } vcf_free_var(&var2); if (var2_match) { break;/* no need to continue */ } } } if (vcfset_conf.vcf_setop == SETOP_COMPLEMENT) { /* relative complement : elements in A but not B */ if (!var2_match) { num_vars_out += 1; if (! count_only) { vcf_write_var(& vcfset_conf.vcf_out, var1); } } } else if (vcfset_conf.vcf_setop == SETOP_INTERSECT) { if (var2_match) { num_vars_out += 1; if (! count_only) { vcf_write_var(& vcfset_conf.vcf_out, var1); } } } else { LOG_FATAL("Internal error: unsupported vcf_setop %d\n", vcfset_conf.vcf_setop); return 1; } vcf_free_var(& var1); tbx_itr_destroy(var2_itr); }/* while (1) */ vcf_file_close(& vcfset_conf.vcf_in1); if (vcf_in2) { hts_close(vcf2_hts); tbx_destroy(vcf2_tbx); } LOG_VERBOSE("Parsed %d variants from 1st vcf file (ignoring %d non-passed of those)\n", num_vars_vcf1 + num_vars_vcf1_ign, num_vars_vcf1_ign); LOG_VERBOSE("Wrote %d variants to output\n", num_vars_out); if (! count_only) { vcf_file_close(& vcfset_conf.vcf_out); } if (0==rc) { if (count_only) { printf("%ld\n", num_vars_out); } LOG_VERBOSE("%s\n", "Successful exit."); } free(vcf_in1); free(vcf_in2); free(vcf_out); return rc; }
int vcf_index_stats(char *fname, int stats) { char *fn_out = NULL; FILE *out; out = fn_out ? fopen(fn_out, "w") : stdout; const char **seq; int i, nseq; tbx_t *tbx = NULL; hts_idx_t *idx = NULL; htsFile *fp = hts_open(fname,"r"); if ( !fp ) { fprintf(stderr,"Could not read %s\n", fname); return 1; } bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) { fprintf(stderr,"Could not read the header: %s\n", fname); return 1; } if ( hts_get_format(fp)->format==vcf ) { tbx = tbx_index_load(fname); if ( !tbx ) { fprintf(stderr,"Could not load TBI index: %s\n", fname); return 1; } } else if ( hts_get_format(fp)->format==bcf ) { idx = bcf_index_load(fname); if ( !idx ) { fprintf(stderr,"Could not load CSI index: %s\n", fname); return 1; } } else { fprintf(stderr,"Could not detect the file type as VCF or BCF: %s\n", fname); return 1; } seq = tbx ? tbx_seqnames(tbx, &nseq) : bcf_index_seqnames(idx, hdr, &nseq); uint64_t sum = 0; for (i=0; i<nseq; i++) { uint64_t records, v; hts_idx_get_stat(tbx ? tbx->idx : idx, i, &records, &v); sum+=records; if (stats&2 || !records) continue; bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", seq[i], NULL); int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1; fprintf(out,"%s\t%s\t%" PRIu64 "\n", seq[i], hkey<0?".":hrec->vals[hkey], records); } if (!sum) { // No counts found. // Is this because index version has no stored count data, or no records? bcf1_t *rec = bcf_init1(); if (bcf_read1(fp, hdr, rec) >= 0) { fprintf(stderr,"%s index of %s does not contain any count metadata. Please re-index with a newer version of bcftools or tabix.\n", tbx ? "TBI" : "CSI", fname); return 1; } bcf_destroy1(rec); } if (stats&2) fprintf(out, "%" PRIu64 "\n", sum); free(seq); fclose(out); hts_close(fp); bcf_hdr_destroy(hdr); if (tbx) tbx_destroy(tbx); if (idx) hts_idx_destroy(idx); return 0; }
void union_data::scanGenotypesBED(string fbed) { string buffer; int n_includedG = 0; int n_excludedG_user = 0; //Opening files htsFile *fp = hts_open(fbed.c_str(),"r"); if (!fp) vrb.error("Cannot open file!"); tbx_t * tbx = tbx_index_load(fbed.c_str()); if (!tbx) vrb.error("Cannot load index file!"); kstring_t str = {0,0,0}; if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line!"); //Read genotype data vector < string > tokens; unsigned int linecount = 0; //Jump to interesting region if (regionGenotype.chr != "NA"){ hts_itr_t *itr = tbx_itr_querys(tbx, regionGenotype.get().c_str()); vrb.bullet("target region [" + regionGenotype.get() + "]"); if (!itr) vrb.error("Cannot jump to region!"); while (tbx_itr_next(fp, tbx, itr, &str) >= 0) { linecount ++; if (linecount % 1000000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines"); stb.split(string(str.s), tokens); if (tokens.size() < 7) vrb.error("Incorrect number of columns!"); if (genotype_id_to_idx.count(tokens[3])) continue; if (filter_genotype.check(tokens[3])) { genotype_id.push_back(tokens[3]); genotype_chr.push_back(tokens[0]); genotype_start.push_back(atoi(tokens[1].c_str()) + 1); genotype_end.push_back(atoi(tokens[2].c_str())); pair < string, int > temp (tokens[3],genotype_id_to_idx.size()); genotype_id_to_idx.insert(temp); n_includedG++; } else n_excludedG_user ++; } tbx_itr_destroy(itr); }else{ while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) { linecount ++; if (linecount % 1000000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines"); stb.split(string(str.s), tokens); if (str.l && str.s[0] != tbx->conf.meta_char) { if (tokens.size() < 7) vrb.error("Incorrect number of columns!"); if (genotype_id_to_idx.count(tokens[3])) continue; if (filter_genotype.check(tokens[3])) { genotype_id.push_back(tokens[3]); genotype_chr.push_back(tokens[0]); genotype_start.push_back(atoi(tokens[1].c_str()) + 1); genotype_end.push_back(atoi(tokens[2].c_str())); pair < string, int > temp (tokens[3],genotype_id_to_idx.size()); genotype_id_to_idx.insert(temp); n_includedG++; } else n_excludedG_user ++; } } } //Finalize & verbose tbx_destroy(tbx); genotype_count += n_includedG; if (hts_close(fp)) vrb.error("Cannot properly close file!"); vrb.bullet(stb.str(n_includedG) + " new variants included"); if (n_excludedG_user > 0) vrb.bullet(stb.str(n_excludedG_user) + " variants excluded by user"); if (n_includedG == 0) vrb.leave("Cannot find variants in target region!"); }
void union_data::readGenotypesBED(string fbed,string region) { string buffer; int n_includedG = 0; int n_excludedG_user = 0; int n_includedS = 0; int n_excludedS = 0; int n_missingS = 0; vector < int > mappingS; genotype_id.clear(); genotype_chr.clear(); genotype_start.clear(); genotype_end.clear(); genotype_val.clear(); genotype_count=0; genotype_id_to_idx.clear(); //Opening files htsFile *fp = hts_open(fbed.c_str(),"r"); if (!fp) vrb.error("Cannot open file!"); tbx_t * tbx = tbx_index_load(fbed.c_str()); if (!tbx) vrb.error("Cannot load index file!"); kstring_t str = {0,0,0}; if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line!"); //Process sample names vector < string > tokens; stb.split(string(str.s), tokens); if (tokens.size() < 7) vrb.error("Incorrect number of columns!"); for (int i0 = 6 ; i0 < tokens.size() ; i0 ++) { string sid = tokens[i0]; if (filter_sample.check(sid)) { mappingS.push_back(findSample(sid)); if (mappingS.back() >= 0) n_includedS ++; else n_missingS ++; } else { mappingS.push_back(-1); n_excludedS ++; } } //vrb.bullet(stb.str(n_includedS) + " samples included"); //if (n_excludedS > 0) vrb.bullet(stb.str(n_excludedS) + " samples excluded by user"); //if (n_missingS > 0) vrb.bullet(stb.str(n_missingS) + " samples without phenotype data"); //if (n_includedS != sample_count) vrb.error("Cannot find genotype for " + stb.str(sample_count - n_includedS) + " samples!"); unsigned int linecount = 0; //Jump to interesting region hts_itr_t *itr = tbx_itr_querys(tbx, region.c_str()); //vrb.bullet("target region [" + regionGenotype.get() + "]"); //if (!itr) vrb.error("Cannot jump to region!"); while (tbx_itr_next(fp, tbx, itr, &str) >= 0) { linecount ++; if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines"); stb.split(string(str.s), tokens); if (tokens.size() < 7) vrb.error("Incorrect number of columns!"); if (filter_genotype.check(tokens[3])) { genotype_id.push_back(tokens[3]); genotype_chr.push_back(tokens[0]); genotype_start.push_back(atoi(tokens[1].c_str()) + 1); genotype_end.push_back(atoi(tokens[2].c_str())); genotype_val.push_back(vector < float > (sample_count, 0.0)); for (int t = 6 ; t < tokens.size() ; t ++) { if (mappingS[t-6] >= 0) { if (tokens[t] == "NA") genotype_val.back()[mappingS[t-6]] = bcf_float_missing; else genotype_val.back()[mappingS[t-6]] = stof(tokens[t]); } } pair < string, int > temp (tokens[3],n_includedG); genotype_id_to_idx.insert(temp); n_includedG++; } else n_excludedG_user ++; } tbx_itr_destroy(itr); //Finalize & verbose tbx_destroy(tbx); if (hts_close(fp)) vrb.error("Cannot properly close file!"); genotype_count = n_includedG; //vrb.bullet(stb.str(n_includedG) + " variants included"); //if (n_excludedG_user > 0) vrb.bullet(stb.str(n_excludedG_user) + " variants excluded by user"); //if (genotype_count == 0) vrb.leave("Cannot find variants in target region!"); }
int main(int argc, char* argv[]) { namespace po = boost::program_options; std::string file; std::string output; try { // Declare the supported options. po::options_description desc("Allowed options"); desc.add_options() ("help,h", "produce help message") ("version", "Show version") ("input-file", po::value< std::string >(), "The input files") ("output-file", po::value<std::string>(), "The output file name.") ; po::positional_options_description popts; popts.add("input-file", 1); popts.add("output-file", 1); po::options_description cmdline_options; cmdline_options .add(desc) ; po::variables_map vm; po::store(po::command_line_parser(argc, argv). options(cmdline_options).positional(popts).run(), vm); po::notify(vm); if (vm.count("version")) { std::cout << "vcfhdr2json version " << HAPLOTYPES_VERSION << "\n"; return 0; } if (vm.count("help")) { std::cout << desc << "\n"; return 1; } if (vm.count("input-file")) { file = vm["input-file"].as< std::string > (); } if (vm.count("output-file")) { output = vm["output-file"].as< std::string >(); } if(file.size() == 0) { std::cerr << "Please specify an input file.\n"; return 1; } if (output == "") { std::cerr << "Please specify an output file.\n"; return 1; } } catch (po::error & e) { std::cerr << e.what() << "\n"; return 1; } try { Json::StyledWriter writer; htsFile * fp = bcf_open(file.c_str(), "r"); bcf_hdr_t * hdr = bcf_hdr_read(fp); Json::Value root; Json::Value a; for (int i = 0; i < bcf_hdr_nsamples(hdr); ++i) { a.append(hdr->samples[i]); } root["samples"] = a; Json::Value fields; for (int i = 0; i < hdr->nhrec; i++) { Json::Value field; field["key"] = hdr->hrec[i]->key; if (!hdr->hrec[i]->value) { Json::Value values; for (int j = 0; j < hdr->hrec[i]->nkeys; j++) { values[hdr->hrec[i]->keys[j]] = hdr->hrec[i]->vals[j]; } field["values"] = values; } else { field["value"] = hdr->hrec[i]->value; } fields.append(field); } root["fields"] = fields; tbx_t * tbx_idx = tbx_index_load(file.c_str()); if ( !tbx_idx ) { hts_idx_t * csi_idx = bcf_index_load(file.c_str()); if(!csi_idx) { root["tabix"] = Json::Value::null; } else { root["tabix"] = Json::Value(); root["tabix"]["chromosomes"] = Json::Value(); int count = 0; const char ** tbx_names = bcf_index_seqnames(csi_idx, hdr, &count); for (int i = 0; i < count; ++i) { root["tabix"]["chromosomes"].append(tbx_names[i]); } free(tbx_names); hts_idx_destroy(csi_idx); } } else { root["tabix"] = Json::Value(); root["tabix"]["chromosomes"] = Json::Value(); int count = 0; const char ** tbx_names = tbx_seqnames(tbx_idx, &count); for (int i = 0; i < count; ++i) { root["tabix"]["chromosomes"].append(tbx_names[i]); } free(tbx_names); tbx_destroy(tbx_idx); } std::ofstream out(output.c_str()); out << writer.write(root); bcf_close(fp); bcf_hdr_destroy(hdr); } catch(std::runtime_error & e) { std::cerr << e.what() << std::endl; return 1; } catch(std::logic_error & e) { std::cerr << e.what() << std::endl; return 1; } return 0; }
bcf_hdr_t *vcf_hdr_read(htsFile *fp) { if (!fp->is_bin) { kstring_t txt, *s = &fp->line; bcf_hdr_t *h; h = bcf_hdr_init(); txt.l = txt.m = 0; txt.s = 0; while (hts_getline(fp, KS_SEP_LINE, s) >= 0) { if (s->l == 0) continue; if (s->s[0] != '#') { if (hts_verbose >= 2) fprintf(stderr, "[E::%s] no sample line\n", __func__); free(txt.s); bcf_hdr_destroy(h); return 0; } if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here int dret; gzFile f; kstream_t *ks; kstring_t tmp; tmp.l = tmp.m = 0; tmp.s = 0; f = gzopen(fp->fn_aux, "r"); ks = ks_init(f); while (ks_getuntil(ks, 0, &tmp, &dret) >= 0) { int c; kputs("##contig=<ID=", &txt); kputs(tmp.s, &txt); ks_getuntil(ks, 0, &tmp, &dret); kputs(",length=", &txt); kputw(atol(tmp.s), &txt); kputsn(">\n", 2, &txt); if (dret != '\n') while ((c = ks_getc(ks)) != '\n' && c != -1); // skip the rest of the line } free(tmp.s); ks_destroy(ks); gzclose(f); } kputsn(s->s, s->l, &txt); if (s->s[1] != '#') break; kputc('\n', &txt); } h->l_text = txt.l + 1; // including NULL h->text = txt.s; bcf_hdr_parse(h); // check tabix index, are all contigs listed in the header? add the missing ones tbx_t *idx = tbx_index_load(fp->fn); if ( idx ) { int i, n, need_sync = 0; const char **names = tbx_seqnames(idx, &n); for (i=0; i<n; i++) { bcf_hrec_t *hrec = bcf_hdr_get_hrec(h, BCF_DT_CTG, (char*) names[i]); if ( hrec ) continue; hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t)); hrec->key = strdup("contig"); bcf_hrec_add_key(hrec, "ID", strlen("ID")); bcf_hrec_set_val(hrec, hrec->nkeys-1, (char*) names[i], strlen(names[i]), 0); bcf_hrec_add_key(hrec, "length", strlen("length")); bcf_hrec_set_val(hrec, hrec->nkeys-1, "-1", strlen("-1"), 0); // what is a good default value? bcf_hdr_add_hrec(h, hrec); need_sync = 1; } free(names); tbx_destroy(idx); if ( need_sync ) { bcf_hdr_sync(h); bcf_hdr_fmt_text(h); } } return h; } else return bcf_hdr_read((BGZF*)fp->fp); }
/** * tabix workhorse function */ static int tabix_handler(request_rec *r) { htsFile *fp=NULL; hts_itr_t *itr=NULL; kstring_t line = {0,0,0}; int print_header=1; int print_body=1; struct tabix_callback_t handler; int http_status=OK; memset((void*)&handler,0,sizeof(struct tabix_callback_t)); handler.r=r; handler.limit=DEFAULT_LIMIT_RECORDS; if (!r->handler || strcmp(r->handler, "tabix-handler")) return (DECLINED); if (strcmp(r->method, "GET")!=0) return DECLINED; if(r->canonical_filename==NULL) return DECLINED; /* file must be b-gzipped */ if( !( str_ends_with(r->canonical_filename,".gz") )) return DECLINED; /* file must be indexed with tabix */ if( !( fileExtExists(r->canonical_filename,".tbi") )) return 404; handler.httParams = HttpParamParseGET(r); if(handler.httParams==NULL) return DECLINED; handler.file_format=E_FORMAT_UNDEFINED; if(str_ends_with(r->canonical_filename,".vcf.gz")) { handler.file_format=E_FORMAT_VCF; } else if(str_ends_with(r->canonical_filename,".bed.gz")) { handler.file_format=E_FORMAT_BED; } /* only one loop, we use this to cleanup the code, instead of using a goto statement */ do { const char* format=HttpParamGet(handler.httParams,"format"); const char* limit=HttpParamGet(handler.httParams,"limit"); const char* region=HttpParamGet(handler.httParams,"region"); int iterator_was_requested=FALSE; if(limit!=NULL) { handler.limit=atol(limit); } if(format==NULL) { http_status=DECLINED; break; } else if(strcmp(format,"xml")==0) { SETUP_HANDLER(xml); } else if(strcmp(format,"json")==0 || strcmp(format,"jsonp")==0) { handler.jsonp_callback=HttpParamGet(handler.httParams,"callback"); SETUP_HANDLER(json); } else if(strcmp(format,"html")==0) { SETUP_HANDLER(html); } else { SETUP_HANDLER(plain); } fp=hts_open(r->canonical_filename,"r"); if(fp==NULL) { http_status=HTTP_NOT_FOUND; break; } //read index handler.tbx = tbx_index_load(r->canonical_filename); if(handler.tbx==NULL) { http_status=HTTP_INTERNAL_SERVER_ERROR; break; } if(region!=NULL && !str_is_empty(region)) { iterator_was_requested=TRUE; itr = tbx_itr_querys(handler.tbx,region); } handler.startdocument(&handler); if(print_header) { handler.startheader(&handler); while ( hts_getline(fp, KS_SEP_LINE, &line) >= 0 ) { if ( !line.l || line.s[0]!=handler.tbx->conf.meta_char ) break; handler.header(&handler,&line); handler.count++; } handler.enddheader(&handler); } handler.count=0;//Reset if(print_body) { handler.startbody(&handler); if(iterator_was_requested) { if(itr!=NULL) { while ((handler.limit==-1 || handler.count< handler.limit) && tbx_itr_next(fp, handler.tbx, itr, &line) >= 0) { if(handler.show(&handler,&line)<0) break; handler.count++; } } } else { while ((handler.limit==-1 || handler.count< handler.limit) && \ hts_getline(fp, KS_SEP_LINE, &line) >= 0) { if(handler.show(&handler,&line)<0) break; handler.count++; } } handler.endbody(&handler); } handler.enddocument(&handler); } while(0);/* always abort */ //cleanup if(itr!=NULL) tbx_itr_destroy(itr); HttpParamFree(handler.httParams); free(line.s); if(fp!=NULL) hts_close(fp); if(handler.tbx!=NULL) tbx_destroy(handler.tbx); return http_status; }
int convert(int argc, char **argv) { if (argc < 2) return convert_help(); int c; char *in=NULL, *out=NULL, *bim=NULL, *vid=NULL, *tmp_dir=NULL, *ped=NULL; uint32_t num_fields, num_records, col = 2; int i_is_set = 0, o_is_set = 0, f_is_set = 0, b_is_set = 0, v_is_set = 0, t_is_set = 0, p_is_set = 0, r_is_set = 0; while((c = getopt (argc, argv, "hi:o:f:r:b:v:t:p:c:")) != -1) { switch (c) { case 'c': col = atoi(optarg); break; case 'p': p_is_set = 1; ped = optarg; break; case 't': t_is_set = 1; tmp_dir = optarg; break; case 'v': v_is_set = 1; vid = optarg; break; case 'b': b_is_set = 1; bim = optarg; break; case 'i': i_is_set = 1; in = optarg; break; case 'o': o_is_set = 1; out = optarg; break; case 'f': f_is_set = 1; num_fields = atoi(optarg); break; case 'r': r_is_set = 1; num_records = atoi(optarg); break; case 'h': convert_help(); return 1; case '?': if ( (optopt == 'i') || (optopt == 'f') || (optopt == 'r') || (optopt == 't') || (optopt == 's') || (optopt == 'p') || (optopt == 'c') || (optopt == 'o') ) fprintf (stderr, "Option -%c requires an argument.\n", optopt); else if (isprint (optopt)) fprintf (stderr, "Unknown option `-%c'.\n", optopt); else fprintf (stderr, "Unknown option character `\\x%x'.\n", optopt); default: convert_help(); return 1; } } char *type = argv[0]; if (i_is_set == 0) { printf("Input file is not set\n"); return convert_help(); } if (strcmp(type, "bcf") == 0) { if ( (f_is_set == 0) || (r_is_set == 0) ) { fprintf(stderr,"Attempting to autodetect num of records " "and fields from %s\n", in); //Try and auto detect the sizes, need the index tbx_t *tbx = NULL; hts_idx_t *idx = NULL; htsFile *fp = hts_open(in,"rb"); if ( !fp ) { fprintf(stderr,"Could not read %s\n", in); return 1; } bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) { fprintf(stderr,"Could not read the header: %s\n", in); return 1; } if (hts_get_format(fp)->format==vcf) { tbx = tbx_index_load(in); if ( !tbx ) { fprintf(stderr,"Could not load TBI index: %s\n", in); return 1; } } else if ( hts_get_format(fp)->format==bcf ) { idx = bcf_index_load(in); if ( !idx ) { fprintf(stderr,"Could not load CSI index: %s\n", in); return 1; } } else { fprintf(stderr, "Could not detect the file type as VCF or BCF: %s\n", in); return 1; } num_fields = hdr->n[BCF_DT_SAMPLE]; num_records = 0; const char **seq; int nseq; seq = tbx ? tbx_seqnames(tbx, &nseq) : bcf_index_seqnames(idx, hdr, &nseq); int i; uint32_t sum = 0; for (i = 0; i < nseq; ++i) { uint64_t records, v; hts_idx_get_stat(tbx ? tbx->idx: idx, i, &records, &v); num_records += records; } fprintf(stderr, "Number of records:%u\tNumber of fields:%u\n", num_records, num_fields); free(seq); hts_close(fp); bcf_hdr_destroy(hdr); if (idx) hts_idx_destroy(idx); if (tbx) tbx_destroy(tbx); } if (o_is_set == 0) { out = (char*)malloc(strlen(in) + 5); // 5 for ext and \0 strcpy(out,in); strcat(out, ".gqt"); } if (b_is_set == 0) { bim = (char*)malloc(strlen(in) + 5); // 5 for ext and \0 strcpy(bim,in); strcat(bim, ".bim"); } if (v_is_set == 0) { vid = (char*)malloc(strlen(in) + 5); // 5 for ext and \0 strcpy(vid,in); strcat(vid, ".vid"); } if (t_is_set == 0) { tmp_dir = (char*)malloc(3*sizeof(char)); // "./\0" strcpy(tmp_dir,"./"); } int r = bcf_wahbm(in, out, bim, vid, tmp_dir, num_fields, num_records); return r; } if (strcmp(type, "ped") == 0) { if (o_is_set == 0) { if (p_is_set == 1) { out = (char*)malloc(strlen(ped) + 4); // 4 for ext and \0 strcpy(out,ped); strcat(out, ".db"); } else { out = (char*)malloc(strlen(in) + 4); // 4 for ext and \0 strcpy(out,in); strcat(out, ".db"); } } fprintf(stderr, "Creating sample database %s\n", out); return ped_ped(in, ped, col, out); } return convert_help(); }
int main_tabix(int argc, char *argv[]) { int c, min_shift = -1, is_force = 0, is_all = 0; tbx_conf_t conf = tbx_conf_gff, *conf_ptr = NULL; while ((c = getopt(argc, argv, "0fap:s:b:e:S:c:m:")) >= 0) if (c == '0') conf.preset |= TBX_UCSC; else if (c == 'f') is_force = 1; else if (c == 'a') is_all = 1; else if (c == 'm') min_shift = atoi(optarg); else if (c == 's') conf.sc = atoi(optarg); else if (c == 'b') conf.bc = atoi(optarg); else if (c == 'e') conf.ec = atoi(optarg); else if (c == 'c') conf.meta_char = *optarg; else if (c == 'S') conf.line_skip = atoi(optarg); else if (c == 'p') { if (strcmp(optarg, "gff") == 0) conf_ptr = &tbx_conf_gff; else if (strcmp(optarg, "bed") == 0) conf_ptr = &tbx_conf_bed; else if (strcmp(optarg, "sam") == 0) conf_ptr = &tbx_conf_sam; else if (strcmp(optarg, "vcf") == 0) conf_ptr = &tbx_conf_vcf; else { fprintf(stderr, "The type '%s' not recognised\n", optarg); return 1; } } if (optind == argc) { fprintf(stderr, "\nUsage: bcftools tabix [options] <in.gz> [reg1 [...]]\n\n"); fprintf(stderr, "Options: -p STR preset: gff, bed, sam or vcf [gff]\n"); fprintf(stderr, " -s INT column number for sequence names (suppressed by -p) [1]\n"); fprintf(stderr, " -b INT column number for region start [4]\n"); fprintf(stderr, " -e INT column number for region end (if no end, set INT to -b) [5]\n"); fprintf(stderr, " -0 specify coordinates are zero-based\n"); fprintf(stderr, " -S INT skip first INT lines [0]\n"); fprintf(stderr, " -c CHAR skip lines starting with CHAR [null]\n"); fprintf(stderr, " -a print all records\n"); fprintf(stderr, " -f force to overwrite existing index\n"); fprintf(stderr, " -m INT set the minimal interval size to 1<<INT; 0 for the old tabix index [0]\n"); fprintf(stderr, "\n"); return 1; } if (is_all) { // read without random access kstring_t s; BGZF *fp; s.l = s.m = 0; s.s = 0; fp = bgzf_open(argv[optind], "r"); while (bgzf_getline(fp, '\n', &s) >= 0) puts(s.s); bgzf_close(fp); free(s.s); } else if (optind + 2 > argc) { // create index if ( !conf_ptr ) { // auto-detect file type by file name int l = strlen(argv[optind]); int strcasecmp(const char *s1, const char *s2); if (l>=7 && strcasecmp(argv[optind]+l-7, ".gff.gz") == 0) conf_ptr = &tbx_conf_gff; else if (l>=7 && strcasecmp(argv[optind]+l-7, ".bed.gz") == 0) conf_ptr = &tbx_conf_bed; else if (l>=7 && strcasecmp(argv[optind]+l-7, ".sam.gz") == 0) conf_ptr = &tbx_conf_sam; else if (l>=7 && strcasecmp(argv[optind]+l-7, ".vcf.gz") == 0) conf_ptr = &tbx_conf_vcf; } if ( conf_ptr ) conf = *conf_ptr; if (!is_force) { char *fn; FILE *fp; fn = (char*)alloca(strlen(argv[optind]) + 5); strcat(strcpy(fn, argv[optind]), min_shift <= 0? ".tbi" : ".csi"); if ((fp = fopen(fn, "rb")) != 0) { fclose(fp); fprintf(stderr, "[E::%s] the index file exists; use option '-f' to overwrite\n", __func__); return 1; } } if ( tbx_index_build(argv[optind], min_shift, &conf) ) { fprintf(stderr,"tbx_index_build failed: Is the file bgzip-compressed? Was wrong -p [type] option used?\n"); return 1; } } else { // read with random access tbx_t *tbx; BGZF *fp; kstring_t s; int i; if ((tbx = tbx_index_load(argv[optind])) == 0) return 1; if ((fp = bgzf_open(argv[optind], "r")) == 0) return 1; s.s = 0; s.l = s.m = 0; for (i = optind + 1; i < argc; ++i) { hts_itr_t *itr; if ((itr = tbx_itr_querys(tbx, argv[i])) == 0) continue; while (tbx_bgzf_itr_next(fp, tbx, itr, &s) >= 0) puts(s.s); tbx_itr_destroy(itr); } free(s.s); bgzf_close(fp); tbx_destroy(tbx); } return 0; }
void union_data::scanPhenotypes(string fbed) { int n_includedP = 0; int n_excludedP = 0; //Open BED file vrb.title("Scanning phenotype data in [" + fbed + "]"); htsFile *fp = hts_open(fbed.c_str(),"r"); if (!fp) vrb.error("Cannot open file"); tbx_t * tbx = tbx_index_load(fbed.c_str()); if (!tbx) vrb.error("Cannot open index file"); //Read header kstring_t str = {0,0,0}; if (!hts_getline(fp, KS_SEP_LINE, &str) || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line"); //Scan file vector < string > tokens; unsigned int linecount =0; if (regionPhenotype.chr != "NA"){ hts_itr_t *itr = tbx_itr_querys(tbx, regionPhenotype.get().c_str()); vrb.bullet("target region [" + regionPhenotype.get() + "]"); if (!itr) vrb.error("Cannot jump to region!"); //Read data while (tbx_itr_next(fp, tbx, itr, &str) >= 0) { linecount ++; if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines"); stb.split(string(str.s), tokens); if (tokens.size() < 7) vrb.error("Incorrect number of columns!"); if (phenotype_id_to_idx.count(tokens[3])) continue; if (filter_phenotype.check(tokens[3])) { phenotype_id.push_back(tokens[3]); phenotype_chr.push_back(tokens[0]); phenotype_start.push_back(atoi(tokens[1].c_str()) + 1); phenotype_end.push_back(atoi(tokens[2].c_str())); pair < string, int > temp (tokens[3],phenotype_id_to_idx.size()); phenotype_id_to_idx.insert(temp); n_includedP++; } else n_excludedP ++; } tbx_itr_destroy(itr); }else{ while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) { linecount ++; if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines"); if (str.l && str.s[0] != tbx->conf.meta_char) { stb.split(string(str.s), tokens); if (tokens.size() < 7) vrb.error("Incorrect number of columns!"); if (phenotype_id_to_idx.count(tokens[3])) continue; if (filter_phenotype.check(tokens[3])) { phenotype_id.push_back(tokens[3]); phenotype_chr.push_back(tokens[0]); phenotype_start.push_back(atoi(tokens[1].c_str()) + 1); phenotype_end.push_back(atoi(tokens[2].c_str())); pair < string, int > temp (tokens[3],phenotype_id_to_idx.size()); phenotype_id_to_idx.insert(temp); n_includedP++; } else n_excludedP ++; } } } //Finalize & verbose tbx_destroy(tbx); if (hts_close(fp)) vrb.error("Cannot properly close file"); phenotype_count = phenotype_id.size(); vrb.bullet(stb.str(n_includedP) + " new phenotypes included"); if (n_excludedP > 0) vrb.bullet(stb.str(n_excludedP) + " phenotypes excluded by user"); if (phenotype_count == 0) vrb.leave("Cannot find phenotypes in region!"); }
void union_data::readPhenotypes(string fbed, string region) { int n_includedS = 0; int n_includedP = 0; int n_excludedP = 0; vector < int > mappingS; phenotype_id.clear(); phenotype_chr.clear(); phenotype_start.clear(); phenotype_end.clear(); phenotype_val.clear(); phenotype_count=0; phenotype_id_to_idx.clear(); //Open BED file //vrb.title("Reading phenotype data in [" + fbed + "]"); htsFile *fp = hts_open(fbed.c_str(),"r"); if (!fp) vrb.error("Cannot open file"); tbx_t *tbx = tbx_index_load(fbed.c_str()); if (!tbx) vrb.error("Cannot open index file"); kstring_t str = {0,0,0}; if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line!"); //Process sample names vector < string > tokens; stb.split(string(str.s), tokens); if (tokens.size() < 7) vrb.error("Incorrect number of columns!"); for (int t = 6 ; t < tokens.size() ; t ++) { mappingS.push_back(findSample(tokens[t])); if (mappingS.back() >= 0) n_includedS++; } unsigned int linecount =0; //Read phenotypes hts_itr_t *itr = tbx_itr_querys(tbx, region.c_str()); //vrb.bullet("target region [" + regionPhenotype.get() + "]"); //if (!itr) vrb.error("Cannot jump to region!"); //Read data while (tbx_itr_next(fp, tbx, itr, &str) >= 0) { linecount ++; if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines"); stb.split(string(str.s), tokens); if (tokens.size() < 7) vrb.error("Incorrect number of columns!"); if (filter_phenotype.check(tokens[3])) { phenotype_id.push_back(tokens[3]); phenotype_chr.push_back(tokens[0]); phenotype_start.push_back(atoi(tokens[1].c_str()) + 1); phenotype_end.push_back(atoi(tokens[2].c_str())); phenotype_val.push_back(vector < float > (sample_count, 0.0)); for (int t = 6 ; t < tokens.size() ; t ++) { if (mappingS[t-6] >= 0) { if (tokens[t] == "NA") phenotype_val.back()[mappingS[t-6]] = bcf_float_missing; else phenotype_val.back()[mappingS[t-6]] = stof(tokens[t]); } } pair < string, int > temp (tokens[3],n_includedP); phenotype_id_to_idx.insert(temp); n_includedP++; } else n_excludedP ++; } tbx_itr_destroy(itr); //Finalize & verbose tbx_destroy(tbx); if (hts_close(fp)) vrb.error("Cannot properly close file"); phenotype_count = phenotype_id.size(); //vrb.bullet(stb.str(n_includedP) + " phenotypes included"); //if (n_excludedP > 0) vrb.bullet(stb.str(n_excludedP) + " phenotypes excluded by user"); //if (phenotype_count == 0) vrb.leave("Cannot find phenotypes in target region!"); }
Tabix::~Tabix(void) { tbx_itr_destroy(iter); tbx_destroy(tbx); }
void cis_data::readPhenotypes(string fbed) { int n_includedS = 0; int n_includedP = 0; int n_excludedP = 0; int n_negativeStrd = 0; vector < int > mappingS; //Open BED file vrb.title("Reading phenotype data in [" + fbed + "]"); htsFile *fp = hts_open(fbed.c_str(),"r"); if (!fp) vrb.error("Cannot open file"); tbx_t *tbx = tbx_index_load(fbed.c_str()); if (!tbx) vrb.error("Cannot open index file"); kstring_t str = {0,0,0}; if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line!"); //Process sample names vector < string > tokens; stb.split(string(str.s), tokens); if (tokens.size() < 7) vrb.error("Incorrect number of columns!"); for (int t = 6 ; t < tokens.size() ; t ++) { mappingS.push_back(findSample(tokens[t])); if (mappingS.back() >= 0) n_includedS++; } //Read phenotypes unsigned int linecount =0; //Read phenotypes if (regionPhenotype.chr != "NA"){ hts_itr_t *itr = tbx_itr_querys(tbx, regionPhenotype.get().c_str()); vrb.bullet("target region [" + regionPhenotype.get() + "]"); if (!itr) vrb.error("Cannot jump to region!"); //Read data while (tbx_itr_next(fp, tbx, itr, &str) >= 0) { linecount ++; if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines"); stb.split(string(str.s), tokens); if (tokens.size() < 7) vrb.error("Incorrect number of columns!"); if ((grp_mode == GRP_NONE && filter_phenotype.check(tokens[3])) || (grp_mode != GRP_NONE && filter_phenotype.check(tokens[4]))) { phenotype_id.push_back(tokens[3]); phenotype_chr.push_back(tokens[0]); phenotype_start.push_back(atoi(tokens[1].c_str()) + 1); phenotype_end.push_back(atoi(tokens[2].c_str())); if (grp_mode > 0 && full_test) phenotype_grp.push_back("ALL_GENES"); if (grp_mode > 0 && !full_test) phenotype_grp.push_back(tokens[4]); phenotype_neg.push_back(tokens[5] == "-"); if (phenotype_neg.back()) n_negativeStrd ++; phenotype_val.push_back(vector < float > (sample_count, 0.0)); for (int t = 6 ; t < tokens.size() ; t ++) { if (mappingS[t-6] >= 0) { if (tokens[t] == "NA") phenotype_val.back()[mappingS[t-6]] = bcf_float_missing; else phenotype_val.back()[mappingS[t-6]] = stof(tokens[t]); } } n_includedP++; } else n_excludedP ++; } tbx_itr_destroy(itr); }else{ while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) { linecount ++; if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines"); stb.split(string(str.s), tokens); if (str.l && str.s[0] != tbx->conf.meta_char) { if (tokens.size() < 7) vrb.error("Incorrect number of columns!"); if ((grp_mode == GRP_NONE && filter_phenotype.check(tokens[3])) || (grp_mode != GRP_NONE && filter_phenotype.check(tokens[4]))) { phenotype_id.push_back(tokens[3]); phenotype_chr.push_back(tokens[0]); phenotype_start.push_back(atoi(tokens[1].c_str()) + 1); phenotype_end.push_back(atoi(tokens[2].c_str())); if (grp_mode > 0 && full_test) phenotype_grp.push_back("ALL_GENES"); if (grp_mode > 0 && !full_test) phenotype_grp.push_back(tokens[4]); phenotype_neg.push_back(tokens[5] == "-"); if (phenotype_neg.back()) n_negativeStrd ++; phenotype_val.push_back(vector < float > (sample_count, 0.0)); for (int t = 6 ; t < tokens.size() ; t ++) { if (mappingS[t-6] >= 0) { if (tokens[t] == "NA") phenotype_val.back()[mappingS[t-6]] = bcf_float_missing; else phenotype_val.back()[mappingS[t-6]] = stof(tokens[t]); } } n_includedP++; } else n_excludedP ++; } } } //Finalize & verbose tbx_destroy(tbx); if (hts_close(fp)) vrb.error("Cannot properly close file"); phenotype_count = phenotype_id.size(); vrb.bullet(stb.str(n_includedP) + " phenotypes included"); if (n_excludedP > 0) vrb.bullet(stb.str(n_excludedP) + " phenotypes excluded by user"); if (n_negativeStrd > 0 ) vrb.bullet(stb.str(n_negativeStrd) + " phenotypes are on the negative strand"); if (phenotype_count == 0) vrb.leave("Cannot find phenotypes in target region!"); }
static int query_regions(args_t *args, char *fname, char **regs, int nregs) { int i; htsFile *fp = hts_open(fname,"r"); if ( !fp ) error("Could not read %s\n", fname); enum htsExactFormat format = hts_get_format(fp)->format; regidx_t *reg_idx = NULL; if ( args->targets_fname ) { reg_idx = regidx_init(args->targets_fname, NULL, NULL, 0, NULL); if ( !reg_idx ) error("Could not read %s\n", args->targets_fname); } if ( format == bcf ) { htsFile *out = hts_open("-","w"); if ( !out ) error("Could not open stdout\n", fname); hts_idx_t *idx = bcf_index_load(fname); if ( !idx ) error("Could not load .csi index of %s\n", fname); bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Could not read the header: %s\n", fname); if ( args->print_header ) bcf_hdr_write(out,hdr); if ( !args->header_only ) { bcf1_t *rec = bcf_init(); for (i=0; i<nregs; i++) { hts_itr_t *itr = bcf_itr_querys(idx,hdr,regs[i]); while ( bcf_itr_next(fp, itr, rec) >=0 ) { if ( reg_idx && !regidx_overlap(reg_idx, bcf_seqname(hdr,rec),rec->pos,rec->pos+rec->rlen-1, NULL) ) continue; bcf_write(out,hdr,rec); } tbx_itr_destroy(itr); } bcf_destroy(rec); } if ( hts_close(out) ) error("hts_close returned non-zero status for stdout\n"); bcf_hdr_destroy(hdr); hts_idx_destroy(idx); } else if ( format==vcf || format==sam || format==unknown_format ) { tbx_t *tbx = tbx_index_load(fname); if ( !tbx ) error("Could not load .tbi/.csi index of %s\n", fname); kstring_t str = {0,0,0}; if ( args->print_header ) { while ( hts_getline(fp, KS_SEP_LINE, &str) >= 0 ) { if ( !str.l || str.s[0]!=tbx->conf.meta_char ) break; puts(str.s); } } if ( !args->header_only ) { int nseq; const char **seq = NULL; if ( reg_idx ) seq = tbx_seqnames(tbx, &nseq); for (i=0; i<nregs; i++) { hts_itr_t *itr = tbx_itr_querys(tbx, regs[i]); if ( !itr ) continue; while (tbx_itr_next(fp, tbx, itr, &str) >= 0) { if ( reg_idx && !regidx_overlap(reg_idx,seq[itr->curr_tid],itr->curr_beg,itr->curr_end, NULL) ) continue; puts(str.s); } tbx_itr_destroy(itr); } free(seq); } free(str.s); tbx_destroy(tbx); } else if ( format==bam ) error("Please use \"samtools view\" for querying BAM files.\n"); if ( reg_idx ) regidx_destroy(reg_idx); if ( hts_close(fp) ) error("hts_close returned non-zero status: %s\n", fname); for (i=0; i<nregs; i++) free(regs[i]); free(regs); return 0; }