/*
 * Load the genetic map for the chromosome of the current VCF record.
 * If args->genmap_fname contains the literal "{CHROM}", it is expanded to the
 * record's chromosome name. The file must be an IMPUTE2-style map with the
 * exact header "position COMBINED_rate(cM/Mb) Genetic_Map(cM)".
 * On success fills args->genmap (positions + cM values scaled to [0,1]) and
 * returns 0; returns 0 with ngenmap=0 when no map was requested, -1 when the
 * file cannot be opened. Parse errors are fatal via error().
 */
static int load_genmap(args_t *args, bcf1_t *line)
{
    if ( !args->genmap_fname ) { args->ngenmap = 0; return 0; }

    // BUGFIX: keep the expanded file name in its own buffer. Previously the
    // name was built in `str` and `fname` pointed into str.s, but `str` is
    // reused by hts_getline() below; a realloc left `fname` dangling and the
    // later error messages could print freed memory.
    kstring_t str = {0,0,0};
    kstring_t fname_str = {0,0,0};
    char *fname;
    char *tpl = strstr(args->genmap_fname,"{CHROM}");
    if ( tpl )
    {
        kputsn(args->genmap_fname, tpl - args->genmap_fname, &fname_str);
        kputs(bcf_seqname(args->hdr,line), &fname_str);
        kputs(tpl+7, &fname_str);       // 7 == strlen("{CHROM}")
        fname = fname_str.s;
    }
    else
        fname = args->genmap_fname;

    htsFile *fp = hts_open(fname, "rb");
    if ( !fp ) { args->ngenmap = 0; free(fname_str.s); return -1; }

    // BUGFIX: the header read was unchecked (strcmp on possibly NULL str.s),
    // and the error() call passed two arguments for a single %s so the file
    // name was printed where the offending header line should have been.
    if ( hts_getline(fp, KS_SEP_LINE, &str) <= 0 )
        error("Could not read the header of %s\n", fname);
    if ( strcmp(str.s,"position COMBINED_rate(cM/Mb) Genetic_Map(cM)") )
        error("Unexpected header in %s, found:\n\t[%s], but expected:\n\t[position COMBINED_rate(cM/Mb) Genetic_Map(cM)]\n", fname, str.s);

    args->ngenmap = args->igenmap = 0;
    while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 )
    {
        args->ngenmap++;
        hts_expand(genmap_t,args->ngenmap,args->mgenmap,args->genmap);
        genmap_t *gm = &args->genmap[args->ngenmap-1];

        char *tmp, *end;
        gm->pos = strtol(str.s, &tmp, 10);
        if ( str.s==tmp ) error("Could not parse %s: %s\n", fname, str.s);

        // skip the second column (COMBINED_rate)
        tmp++;
        while ( *tmp && !isspace((unsigned char)*tmp) ) tmp++;

        // read the genetic map position in cM (third column)
        gm->rate = strtod(tmp+1, &end);
        if ( tmp+1==end ) error("Could not parse %s: %s\n", fname, str.s);
    }
    if ( !args->ngenmap ) error("Genetic map empty?\n");

    // scale to 1: divide by the last (largest) cumulative cM value
    int i;
    for (i=0; i<args->ngenmap; i++) args->genmap[i].rate /= args->genmap[args->ngenmap-1].rate;

    if ( hts_close(fp) ) error("Close failed\n");
    free(str.s);
    free(fname_str.s);
    return 0;
}
/**
 * annots_reader_next() - reads next line from annots.tab.gz and sets: class, vals
 * Returns 1 on successful read or 0 if no further record could be read.
 */
int annots_reader_next(args_t *args)
{
    args->str.l = 0;
    if ( hts_getline(args->file, '\n', &args->str) <= 0 ) return 0;

    char *line = args->str.s;

    /* First record seen: the number of value columns equals the number of
     * tab characters on the line; allocate the value array once. */
    if ( !args->mvals )
    {
        char *ptr;
        for (ptr = line; *ptr; ptr++)
            if ( *ptr == '\t' ) args->mvals++;
        args->vals = (double*) malloc(args->mvals*sizeof(double));
    }

    /* Leading column: the class label. */
    args->dclass = atoi(line);

    /* Remaining tab-separated columns: the numeric values. */
    char *ptr = column_next(line, '\t');
    int i = 0;
    while ( i < args->mvals )
    {
        if ( !*ptr ) error("Could not parse %d-th data field: is the line truncated?\nThe line was: [%s]\n",i+2,line);
        args->vals[i++] = atof(++ptr);
        ptr = column_next(ptr, '\t');
    }
    return 1;
}
/** * Parse multiple files from command line unlabeled arguments or -L denoted file list. If both are defined, the files are merged. * * @files - file names are stored in this vector * @argument_files - vector of input files * @file_list - file names stored in a file * */ void Program::parse_files(std::vector<std::string>& files, const std::vector<std::string>& arg_files, std::string file_list) { files.clear(); if (arg_files.size()!=0) { files = arg_files; } if (file_list != "") { htsFile *file = hts_open(file_list.c_str(), "r"); if (file==NULL) { std::cerr << "cannot open " << file_list << "\n"; exit(1); } kstring_t *s = &file->line; while (hts_getline(file, '\n', s) >= 0) { if (s->s[0]!='#') { files.push_back(std::string(s->s)); } } hts_close(file); } }
/**
 * Reads next record, hides the random access of different regions from the user.
 */
bool TBXOrderedReader::read(kstring_t *s)
{
    if (!random_access_enabled)
    {
        // sequential mode: just stream the next line
        return hts_getline(hts, '\n', s) >= 0;
    }

    // random access mode: drain the current iterator, then advance to the
    // next interval until a record is found or no interval remains
    for (;;)
    {
        if (itr && tbx_itr_next(hts, tbx, itr, s) >= 0)
        {
            return true;
        }
        if (!initialize_next_interval())
        {
            return false;
        }
    }
};
// Scan a tabix-indexed BED file of phenotypes and register their metadata
// (id, chr, start, end, group, strand) without reading the values.
// Positions are converted from 0-based BED to 1-based (+1 on start).
// Fatal errors are reported through vrb.error().
void cis_data::scanPhenotypes(string fbed) {
    int n_includedP = 0;
    int n_excludedP = 0;
    int n_negativeStrd = 0;

    //Open BED file
    vrb.title("Scanning phenotype data in [" + fbed + "]");
    htsFile *fp = hts_open(fbed.c_str(),"r");
    if (!fp) vrb.error("Cannot open file");
    tbx_t * tbx = tbx_index_load(fbed.c_str());
    if (!tbx) vrb.error("Cannot open index file");

    //Read header
    // BUGFIX: hts_getline() returns a negative value on error/EOF; the previous
    // `!hts_getline(...)` test only caught a zero return, so read failures
    // slipped through to the strcmp-style checks below. Use <= 0 as the other
    // readers in this file do.
    kstring_t str = {0,0,0};
    if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line");

    //Scan file
    vector < string > tokens;
    while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
        if (str.l && str.s[0] != tbx->conf.meta_char) {
            stb.split(string(str.s), tokens);
            // BUGFIX: tokens[5] (strand) and tokens[4] (group) are read below,
            // so at least 6 columns are required; the old `< 5` check allowed
            // an out-of-bounds vector access on 5-column lines.
            if (tokens.size() < 6) vrb.error("Incorrect number of columns!");
            if ((grp_mode == GRP_NONE && filter_phenotype.check(tokens[3])) || (grp_mode != GRP_NONE && filter_phenotype.check(tokens[4]))) {
                phenotype_id.push_back(tokens[3]);
                phenotype_chr.push_back(tokens[0]);
                phenotype_start.push_back(atoi(tokens[1].c_str()) + 1);   // BED start is 0-based
                phenotype_end.push_back(atoi(tokens[2].c_str()));
                if (grp_mode > 0 && full_test) phenotype_grp.push_back("ALL_GENES");
                if (grp_mode > 0 && !full_test) phenotype_grp.push_back(tokens[4]);
                phenotype_neg.push_back(tokens[5] == "-");
                if (phenotype_neg.back()) n_negativeStrd ++;
                n_includedP++;
            } else n_excludedP ++;
        }
    }

    //Finalize & verbose
    free(str.s);   // BUGFIX: the kstring buffer was leaked
    tbx_destroy(tbx);
    if (hts_close(fp)) vrb.error("Cannot properly close file");
    phenotype_count = phenotype_id.size();
    vrb.bullet(stb.str(n_includedP) + " phenotypes included");
    if (n_excludedP > 0) vrb.bullet(stb.str(n_excludedP) + " phenotypes excluded by user");
    if (n_negativeStrd > 0 ) vrb.bullet(stb.str(n_negativeStrd) + " phenotypes are on the negative strand");
    if (phenotype_count == 0) vrb.leave("Cannot find phenotypes in region!");
}
// Replace the header of an uncompressed VCF (args->fname) and stream the
// result to stdout. The replacement header comes from args->header_fname
// (if given) and/or the sample names from args->samples_fname; otherwise the
// original header lines are reused.
static void reheader_vcf(args_t *args)
{
    kstring_t hdr = {0,0,0};    // accumulates the header text to emit
    htsFile *fp = hts_open(args->fname, "r");
    if ( !fp ) error("Failed to open: %s\n", args->fname);
    // Collect the leading '#' lines into hdr. The loop exits on the first
    // data line, which is left (with its newline re-appended) in fp->line so
    // it can be written out after the new header below.
    while ( hts_getline(fp, KS_SEP_LINE, &fp->line) >=0 )
    {
        kputc('\n',&fp->line);  // hts_getline eats the newline character
        if ( fp->line.s[0]!='#' ) break;
        kputsn(fp->line.s,fp->line.l,&hdr);
    }
    int nsamples = 0;
    char **samples = NULL;
    if ( args->samples_fname )
        samples = hts_readlines(args->samples_fname, &nsamples);
    if ( args->header_fname )
    {
        // a full replacement header was supplied: discard the original one
        free(hdr.s); hdr.s = NULL; hdr.l = hdr.m = 0;
        read_header_file(args->header_fname, &hdr);
    }
    if ( samples )
    {
        // rewrite the sample columns of the #CHROM line in hdr
        set_samples(samples, nsamples, &hdr);
        int i;
        for (i=0; i<nsamples; i++) free(samples[i]);
        free(samples);
    }
    // write the (possibly replaced) header, then the first data line saved
    // above, then stream the remaining records unchanged
    int out = STDOUT_FILENO;
    if ( write(out, hdr.s, hdr.l)!=hdr.l ) error("Failed to write %d bytes\n", hdr.l);
    free(hdr.s);
    if ( fp->line.l )
    {
        if ( write(out, fp->line.s, fp->line.l)!=fp->line.l ) error("Failed to write %d bytes\n", fp->line.l);
    }
    while ( hts_getline(fp, KS_SEP_LINE, &fp->line) >=0 )   // uncompressed file implies small size, we don't worry about speed
    {
        kputc('\n',&fp->line);
        if ( write(out, fp->line.s, fp->line.l)!=fp->line.l ) error("Failed to write %d bytes\n", fp->line.l);
    }
    hts_close(fp);
}
/**
 * Collect the leading meta-character-prefixed lines (typically '#') of the
 * indexed file into `header`, one per line, then re-issue the query for the
 * first chromosome so iteration restarts from the beginning.
 *
 * NOTE(review): the first non-header line is consumed by the scan and will
 * not be returned by a later sequential read — confirm callers expect this.
 */
void Tabix::getHeader(string& header) {
    header.clear();
    kstring_t str = {0,0,0};
    while ( hts_getline(fn, KS_SEP_LINE, &str) >= 0 ) {
        if ( !str.l || str.s[0]!=tbx->conf.meta_char ) {
            break;
        } else {
            header += string(str.s);
            header += "\n";
        }
    }
    free(str.s);   // BUGFIX: the kstring buffer was leaked

    // set back to start
    if (iter != NULL){
        current_chrom = chroms.begin();
        tbx_itr_destroy(iter);   // redundant inner NULL-check removed (iter != NULL already holds)
        iter = tbx_itr_querys(tbx, current_chrom->c_str());
    }
}
/*
 * Read the header file `fname` into `hdr` (previous contents discarded),
 * normalising the tail so the result ends with exactly one newline.
 * Failures to open or close the file are fatal via error().
 */
static void read_header_file(char *fname, kstring_t *hdr)
{
    kstring_t line = {0,0,0};

    hdr->l = 0;
    htsFile *fp = hts_open(fname, "r");
    if ( !fp ) error("Could not read: %s\n", fname);

    /* NOTE(review): `> 0` stops on the first empty line as well as on EOF —
     * confirm header files are not expected to contain blank lines. */
    while ( hts_getline(fp, KS_SEP_LINE, &line) > 0 )
    {
        kputsn(line.s, line.l, hdr);
        kputc('\n', hdr);
    }
    if ( hts_close(fp) ) error("Close failed: %s\n", fname);
    free(line.s);

    /* trim trailing whitespace/newlines, then terminate with a single one */
    while ( hdr->l > 0 && isspace(hdr->s[hdr->l-1]) )
        hdr->l--;
    kputc('\n', hdr);
}
/** * Parse intervals. Processes the interval list first followed by the interval string. Duplicates are dropped. * * @intervals - intervals stored in this vector * @interval_list - file containing intervals * @interval_string - comma delimited intervals in a string * * todo: merge overlapping sites? */ void Program::parse_intervals(std::vector<GenomeInterval>& intervals, std::string interval_list, std::string interval_string) { intervals.clear(); std::map<std::string, uint32_t> m; if (interval_list!="") { htsFile *file = hts_open(interval_list.c_str(), "r"); if (file) { kstring_t *s = &file->line; while (hts_getline(file, '\n', s)>=0) { std::string ss = std::string(s->s); if (m.find(ss)==m.end()) { m[ss] = 1; GenomeInterval interval(ss); intervals.push_back(interval); } } hts_close(file); } } std::vector<std::string> v; if (interval_string!="") split(v, ",", interval_string); for (uint32_t i=0; i<v.size(); ++i) { if (m.find(v[i])==m.end()) { m[v[i]] = 1; GenomeInterval interval(v[i]); intervals.push_back(interval); } } }
/** * Parse samples. Processes the sample list. Duplicates are dropped. * * @samples - samples stored in this vector * @sample_map - samples stored in this map * @sample_list - file containing sample names */ void Program::read_sample_list(std::vector<std::string>& samples, std::string sample_list) { samples.clear(); std::map<std::string, int32_t> map; if (sample_list!="") { htsFile *file = hts_open(sample_list.c_str(), "r"); if (file) { kstring_t *s = &file->line; while (hts_getline(file, '\n', s)>=0) { std::string ss = std::string(s->s); if (map.find(ss)==map.end()) { map[ss] = 1; samples.push_back(ss); } } hts_close(file); } } }
// Scan a tabix-indexed BED genotype file and register variant metadata
// (id/chr/start/end) without loading the genotype values. Variants whose id
// is already in genotype_id_to_idx are skipped, so repeated calls accumulate:
// genotype_count is incremented, not reset. Start positions are converted
// from 0-based BED to 1-based (+1).
void union_data::scanGenotypesBED(string fbed) {
    string buffer;
    int n_includedG = 0;
    int n_excludedG_user = 0;

    //Opening files
    htsFile *fp = hts_open(fbed.c_str(),"r");
    if (!fp) vrb.error("Cannot open file!");
    tbx_t * tbx = tbx_index_load(fbed.c_str());
    if (!tbx) vrb.error("Cannot load index file!");
    kstring_t str = {0,0,0};
    // header line must start with the tabix meta character
    if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line!");

    //Read genotype data
    vector < string > tokens;
    unsigned int linecount = 0;

    //Jump to interesting region
    if (regionGenotype.chr != "NA"){
        // restricted scan: iterate only over records in the requested region
        hts_itr_t *itr = tbx_itr_querys(tbx, regionGenotype.get().c_str());
        vrb.bullet("target region [" + regionGenotype.get() + "]");
        if (!itr) vrb.error("Cannot jump to region!");
        while (tbx_itr_next(fp, tbx, itr, &str) >= 0) {
            linecount ++;
            if (linecount % 1000000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
            stb.split(string(str.s), tokens);
            if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
            if (genotype_id_to_idx.count(tokens[3])) continue;   // already registered
            if (filter_genotype.check(tokens[3])) {
                genotype_id.push_back(tokens[3]);
                genotype_chr.push_back(tokens[0]);
                genotype_start.push_back(atoi(tokens[1].c_str()) + 1);   // BED start is 0-based
                genotype_end.push_back(atoi(tokens[2].c_str()));
                pair < string, int > temp (tokens[3],genotype_id_to_idx.size());
                genotype_id_to_idx.insert(temp);
                n_includedG++;
            } else n_excludedG_user ++;
        }
        tbx_itr_destroy(itr);
    }else{
        // full scan: read every line, skipping meta/header lines
        while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
            linecount ++;
            if (linecount % 1000000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
            stb.split(string(str.s), tokens);
            if (str.l && str.s[0] != tbx->conf.meta_char) {
                if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
                if (genotype_id_to_idx.count(tokens[3])) continue;
                if (filter_genotype.check(tokens[3])) {
                    genotype_id.push_back(tokens[3]);
                    genotype_chr.push_back(tokens[0]);
                    genotype_start.push_back(atoi(tokens[1].c_str()) + 1);
                    genotype_end.push_back(atoi(tokens[2].c_str()));
                    pair < string, int > temp (tokens[3],genotype_id_to_idx.size());
                    genotype_id_to_idx.insert(temp);
                    n_includedG++;
                } else n_excludedG_user ++;
            }
        }
    }

    //Finalize & verbose
    // NOTE(review): str.s is never freed here (leak) — consider free(str.s)
    tbx_destroy(tbx);
    genotype_count += n_includedG;
    if (hts_close(fp)) vrb.error("Cannot properly close file!");
    vrb.bullet(stb.str(n_includedG) + " new variants included");
    if (n_excludedG_user > 0) vrb.bullet(stb.str(n_excludedG_user) + " variants excluded by user");
    if (n_includedG == 0) vrb.leave("Cannot find variants in target region!");
}
// Read genotype values from a tabix-indexed BED file for one target region.
// Unlike scanGenotypesBED(), this resets all genotype containers first and
// loads the per-sample values into genotype_val. BED sample columns (7+) are
// mapped onto internal sample indexes via findSample(); "NA" values become
// bcf_float_missing. Start positions are converted from 0-based BED to
// 1-based (+1).
void union_data::readGenotypesBED(string fbed,string region) {
    string buffer;
    int n_includedG = 0;
    int n_excludedG_user = 0;
    int n_includedS = 0;
    int n_excludedS = 0;
    int n_missingS = 0;
    vector < int > mappingS;   // BED column -> internal sample index, -1 if excluded/missing

    // reset all genotype state: this reader replaces, it does not accumulate
    genotype_id.clear();
    genotype_chr.clear();
    genotype_start.clear();
    genotype_end.clear();
    genotype_val.clear();
    genotype_count=0;
    genotype_id_to_idx.clear();

    //Opening files
    htsFile *fp = hts_open(fbed.c_str(),"r");
    if (!fp) vrb.error("Cannot open file!");
    tbx_t * tbx = tbx_index_load(fbed.c_str());
    if (!tbx) vrb.error("Cannot load index file!");
    kstring_t str = {0,0,0};
    if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line!");

    //Process sample names (header columns 7+)
    vector < string > tokens;
    stb.split(string(str.s), tokens);
    if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
    for (int i0 = 6 ; i0 < tokens.size() ; i0 ++) {
        string sid = tokens[i0];
        if (filter_sample.check(sid)) {
            mappingS.push_back(findSample(sid));
            if (mappingS.back() >= 0) n_includedS ++;
            else n_missingS ++;
        } else {
            mappingS.push_back(-1);
            n_excludedS ++;
        }
    }
    //vrb.bullet(stb.str(n_includedS) + " samples included");
    //if (n_excludedS > 0) vrb.bullet(stb.str(n_excludedS) + " samples excluded by user");
    //if (n_missingS > 0) vrb.bullet(stb.str(n_missingS) + " samples without phenotype data");
    //if (n_includedS != sample_count) vrb.error("Cannot find genotype for " + stb.str(sample_count - n_includedS) + " samples!");

    unsigned int linecount = 0;

    //Jump to interesting region
    // NOTE(review): itr is not NULL-checked (the check is commented out); an
    // unknown `region` would make tbx_itr_next dereference NULL — confirm
    // callers always pass regions present in the index.
    hts_itr_t *itr = tbx_itr_querys(tbx, region.c_str());
    //vrb.bullet("target region [" + regionGenotype.get() + "]");
    //if (!itr) vrb.error("Cannot jump to region!");
    while (tbx_itr_next(fp, tbx, itr, &str) >= 0) {
        linecount ++;
        if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
        stb.split(string(str.s), tokens);
        if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
        if (filter_genotype.check(tokens[3])) {
            genotype_id.push_back(tokens[3]);
            genotype_chr.push_back(tokens[0]);
            genotype_start.push_back(atoi(tokens[1].c_str()) + 1);   // BED start is 0-based
            genotype_end.push_back(atoi(tokens[2].c_str()));
            genotype_val.push_back(vector < float > (sample_count, 0.0));
            for (int t = 6 ; t < tokens.size() ; t ++) {
                if (mappingS[t-6] >= 0) {
                    if (tokens[t] == "NA") genotype_val.back()[mappingS[t-6]] = bcf_float_missing;
                    else genotype_val.back()[mappingS[t-6]] = stof(tokens[t]);
                }
            }
            pair < string, int > temp (tokens[3],n_includedG);
            genotype_id_to_idx.insert(temp);
            n_includedG++;
        } else n_excludedG_user ++;
    }
    tbx_itr_destroy(itr);

    //Finalize & verbose
    // NOTE(review): str.s is never freed here (leak)
    tbx_destroy(tbx);
    if (hts_close(fp)) vrb.error("Cannot properly close file!");
    genotype_count = n_includedG;
    //vrb.bullet(stb.str(n_includedG) + " variants included");
    //if (n_excludedG_user > 0) vrb.bullet(stb.str(n_excludedG_user) + " variants excluded by user");
    //if (genotype_count == 0) vrb.leave("Cannot find variants in target region!");
}
// Read a VCF header from a text stream. For binary (BCF) streams the call is
// delegated to bcf_hdr_read(). For text streams the '##' meta lines and the
// single '#CHROM' sample line are accumulated into `txt`; when an auxiliary
// file (fp->fn_aux) is available, ##contig lines are synthesized from it just
// before the sample line. After parsing, contigs that appear only in the
// tabix index are appended to the header with a placeholder length of -1.
// Returns NULL (0) if no sample line is found.
bcf_hdr_t *vcf_hdr_read(htsFile *fp)
{
    if (!fp->is_bin) {
        kstring_t txt, *s = &fp->line;
        bcf_hdr_t *h;
        h = bcf_hdr_init();
        txt.l = txt.m = 0; txt.s = 0;
        while (hts_getline(fp, KS_SEP_LINE, s) >= 0) {
            if (s->l == 0) continue;   // skip blank lines
            if (s->s[0] != '#') {
                // data line reached before the '#CHROM' sample line: malformed header
                if (hts_verbose >= 2) fprintf(stderr, "[E::%s] no sample line\n", __func__);
                free(txt.s);
                bcf_hdr_destroy(h);
                return 0;
            }
            if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here
                // fn_aux is read as NUL-delimited fields: name then length,
                // one contig per line; the rest of each line is skipped
                int dret;
                gzFile f;
                kstream_t *ks;
                kstring_t tmp;
                tmp.l = tmp.m = 0; tmp.s = 0;
                f = gzopen(fp->fn_aux, "r");
                ks = ks_init(f);
                while (ks_getuntil(ks, 0, &tmp, &dret) >= 0) {
                    int c;
                    kputs("##contig=<ID=", &txt); kputs(tmp.s, &txt);
                    ks_getuntil(ks, 0, &tmp, &dret);
                    kputs(",length=", &txt); kputw(atol(tmp.s), &txt);
                    kputsn(">\n", 2, &txt);
                    if (dret != '\n')
                        while ((c = ks_getc(ks)) != '\n' && c != -1); // skip the rest of the line
                }
                free(tmp.s);
                ks_destroy(ks);
                gzclose(f);
            }
            kputsn(s->s, s->l, &txt);
            if (s->s[1] != '#') break;   // the sample line terminates the header
            kputc('\n', &txt);
        }
        h->l_text = txt.l + 1; // including NULL
        h->text = txt.s;       // ownership of txt.s moves to the header
        bcf_hdr_parse(h);
        // check tabix index, are all contigs listed in the header? add the missing ones
        tbx_t *idx = tbx_index_load(fp->fn);
        if ( idx )
        {
            int i, n, need_sync = 0;
            const char **names = tbx_seqnames(idx, &n);
            for (i=0; i<n; i++)
            {
                bcf_hrec_t *hrec = bcf_hdr_get_hrec(h, BCF_DT_CTG, (char*) names[i]);
                if ( hrec ) continue;   // contig already declared in the header
                hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
                hrec->key = strdup("contig");
                bcf_hrec_add_key(hrec, "ID", strlen("ID"));
                bcf_hrec_set_val(hrec, hrec->nkeys-1, (char*) names[i], strlen(names[i]), 0);
                bcf_hrec_add_key(hrec, "length", strlen("length"));
                bcf_hrec_set_val(hrec, hrec->nkeys-1, "-1", strlen("-1"), 0); // what is a good default value?
                bcf_hdr_add_hrec(h, hrec);
                need_sync = 1;
            }
            free(names);
            tbx_destroy(idx);
            if ( need_sync )
            {
                bcf_hdr_sync(h);
                bcf_hdr_fmt_text(h);
            }
        }
        return h;
    } else return bcf_hdr_read((BGZF*)fp->fp);
}
/*
 * Print records overlapping the requested regions `regs` from the indexed
 * file `fname` to stdout. BCF input uses the .csi index and binary records;
 * VCF/SAM/other text uses the .tbi/.csi index and raw lines. Records may be
 * additionally filtered by overlap with args->targets_fname (regidx).
 * Frees `regs` before returning; returns 0. Errors are fatal via error().
 */
static int query_regions(args_t *args, char *fname, char **regs, int nregs)
{
    int i;
    htsFile *fp = hts_open(fname,"r");
    if ( !fp ) error("Could not read %s\n", fname);
    enum htsExactFormat format = hts_get_format(fp)->format;

    // optional overlap filter built from the targets file
    regidx_t *reg_idx = NULL;
    if ( args->targets_fname )
    {
        reg_idx = regidx_init(args->targets_fname, NULL, NULL, 0, NULL);
        if ( !reg_idx ) error("Could not read %s\n", args->targets_fname);
    }

    if ( format == bcf )
    {
        htsFile *out = hts_open("-","w");
        if ( !out ) error("Could not open stdout\n");   // BUGFIX: stray fname argument removed (format has no %s)
        hts_idx_t *idx = bcf_index_load(fname);
        if ( !idx ) error("Could not load .csi index of %s\n", fname);
        bcf_hdr_t *hdr = bcf_hdr_read(fp);
        if ( !hdr ) error("Could not read the header: %s\n", fname);
        if ( args->print_header )
            bcf_hdr_write(out,hdr);
        if ( !args->header_only )
        {
            bcf1_t *rec = bcf_init();
            for (i=0; i<nregs; i++)
            {
                hts_itr_t *itr = bcf_itr_querys(idx,hdr,regs[i]);
                if ( !itr ) continue;   // BUGFIX: unknown region returned NULL and crashed bcf_itr_next (the VCF branch already had this check)
                while ( bcf_itr_next(fp, itr, rec) >=0 )
                {
                    if ( reg_idx && !regidx_overlap(reg_idx, bcf_seqname(hdr,rec),rec->pos,rec->pos+rec->rlen-1, NULL) ) continue;
                    bcf_write(out,hdr,rec);
                }
                tbx_itr_destroy(itr);
            }
            bcf_destroy(rec);
        }
        if ( hts_close(out) ) error("hts_close returned non-zero status for stdout\n");
        bcf_hdr_destroy(hdr);
        hts_idx_destroy(idx);
    }
    else if ( format==vcf || format==sam || format==unknown_format )
    {
        tbx_t *tbx = tbx_index_load(fname);
        if ( !tbx ) error("Could not load .tbi/.csi index of %s\n", fname);
        kstring_t str = {0,0,0};
        if ( args->print_header )
        {
            // header lines are those starting with the tabix meta character
            while ( hts_getline(fp, KS_SEP_LINE, &str) >= 0 )
            {
                if ( !str.l || str.s[0]!=tbx->conf.meta_char ) break;
                puts(str.s);
            }
        }
        if ( !args->header_only )
        {
            int nseq;
            const char **seq = NULL;
            if ( reg_idx ) seq = tbx_seqnames(tbx, &nseq);   // needed to name the sequence for regidx_overlap
            for (i=0; i<nregs; i++)
            {
                hts_itr_t *itr = tbx_itr_querys(tbx, regs[i]);
                if ( !itr ) continue;
                while (tbx_itr_next(fp, tbx, itr, &str) >= 0)
                {
                    if ( reg_idx && !regidx_overlap(reg_idx,seq[itr->curr_tid],itr->curr_beg,itr->curr_end, NULL) ) continue;
                    puts(str.s);
                }
                tbx_itr_destroy(itr);
            }
            free(seq);
        }
        free(str.s);
        tbx_destroy(tbx);
    }
    else if ( format==bam )
        error("Please use \"samtools view\" for querying BAM files.\n");

    if ( reg_idx ) regidx_destroy(reg_idx);
    if ( hts_close(fp) ) error("hts_close returned non-zero status: %s\n", fname);
    for (i=0; i<nregs; i++) free(regs[i]);
    free(regs);
    return 0;
}
bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr, int ifrom, int ito) { bcf_sr_regions_t *reg; if ( !is_file ) return _regions_init_string(regions); reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t)); reg->start = reg->end = -1; reg->prev_start = reg->prev_seq = -1; reg->file = hts_open(regions, "rb"); if ( !reg->file ) { fprintf(stderr,"[%s:%d %s] Could not open file: %s\n", __FILE__,__LINE__,__FUNCTION__,regions); free(reg); return NULL; } reg->tbx = tbx_index_load(regions); if ( !reg->tbx ) { int len = strlen(regions); int is_bed = strcasecmp(".bed",regions+len-4) ? 0 : 1; if ( !is_bed && !strcasecmp(".bed.gz",regions+len-7) ) is_bed = 1; if ( reg->file->format.format==vcf ) ito = 1; // read the whole file, tabix index is not present while ( hts_getline(reg->file, KS_SEP_LINE, ®->line) > 0 ) { char *chr, *chr_end; int from, to, ret; ret = _regions_parse_line(reg->line.s, ichr,ifrom,abs(ito), &chr,&chr_end,&from,&to); if ( ret < 0 ) { if ( ito<0 ) ret = _regions_parse_line(reg->line.s, ichr,ifrom,ifrom, &chr,&chr_end,&from,&to); if ( ret<0 ) { fprintf(stderr,"[%s:%d] Could not parse the file %s, using the columns %d,%d[,%d]\n", __FILE__,__LINE__,regions,ichr+1,ifrom+1,ito+1); hts_close(reg->file); reg->file = NULL; free(reg); return NULL; } } if ( !ret ) continue; if ( is_bed ) from++; *chr_end = 0; _regions_add(reg, chr, from, to); *chr_end = '\t'; } hts_close(reg->file); reg->file = NULL; if ( !reg->nseqs ) { free(reg); return NULL; } return reg; } reg->seq_names = (char**) tbx_seqnames(reg->tbx, ®->nseqs); if ( !reg->seq_hash ) reg->seq_hash = khash_str2int_init(); int i; for (i=0; i<reg->nseqs; i++) { khash_str2int_set(reg->seq_hash,reg->seq_names[i],i); } reg->fname = strdup(regions); reg->is_bin = 1; return reg; }
int beds_database_add(struct beds_options *opts, const char *fname, char *columns) { if ( opts->n_files == opts->m_files ) { opts->m_files = opts->m_files == 0 ? 2 : opts->m_files +2; opts->files = (struct beds_anno_file*)realloc(opts->files, opts->m_files*sizeof(struct beds_anno_file)); } struct beds_anno_file *file = &opts->files[opts->n_files]; memset(file, 0, sizeof(struct beds_anno_file)); file->id = opts->n_files; file->fname = strdup(fname); file->fp = hts_open(fname, "r"); if (file->fp == NULL) error("Failed to open %s : %s", fname, strerror(errno)); // int n; file->idx = tbx_index_load(fname); if ( file->idx == NULL) error("Failed to load index of %s.", fname); opts->n_files++; file->last_id = -1; file->last_start = -1; file->last_end = -1; kstring_t string = KSTRING_INIT; int no_columns = 0; int i; if ( columns == NULL && file->no_such_chrom == 0) { warnings("No columns string specified for %s. Will annotate all tags in this data.", fname); file->no_such_chrom = 1; no_columns = 1; } else { int *splits = NULL; kputs(columns, &string); int nfields; splits = ksplit(&string, ',', &nfields); file->m_cols = nfields; file->cols = (struct anno_col*)malloc(sizeof(struct anno_col) * file->m_cols); for ( i = 0; i < nfields; ++i ) { char *ss = string.s + splits[i]; struct anno_col *col = &file->cols[file->n_cols]; col->icol = i; col->replace = REPLACE_MISSING; if (*ss == '+') { col->replace = REPLACE_MISSING; ss++; } else if ( *ss == '-' ) { col->replace = REPLACE_EXISTING; ss++; } if (ss[0] == '\0') continue; if ( strncmp(ss, "INFO/", 5) == 0) ss += 5; col->hdr_key = strdup(ss); col->icol = -1; // debug_print("%s, %d", col->hdr_key, file->n_cols); file->n_cols++; } string.l = 0; } while (1) { string.l =0; if ( hts_getline(file->fp, KS_SEP_LINE, &string) < 0 ) break; // only accept header line in the beginning for file if ( string.s[0] != '#' ) break; if ( strncmp(string.s, "##INFO=", 7) == 0) { char *ss = string.s + 11; char *se = ss; while (se && *se != ',') se++; 
struct anno_col *col = NULL; // if no column string specified, init all header lines if ( no_columns ) { if ( file->n_cols == file->m_cols ) { file->m_cols = file->m_cols == 0 ? 2 : file->m_cols + 2; file->cols = (struct anno_col *) realloc(file->cols, file->m_cols*sizeof(struct anno_col)); } col = &file->cols[file->n_cols++]; col->icol = -1; col->hdr_key = strndup(ss, se-ss+1); col->hdr_key[se-ss] = '\0'; } else { for ( i = 0; i < file->n_cols; ++i ) { if ( strncmp(file->cols[i].hdr_key, ss, se-ss) == 0) break; } // if header line is not set in the column string, skip if ( i == file->n_cols ) continue; col = &file->cols[i]; } // specify setter functions here col->setter.bed = beds_setter_info_string; bcf_hdr_append(opts->hdr_out, string.s); bcf_hdr_sync(opts->hdr_out); int hdr_id = bcf_hdr_id2int(opts->hdr_out, BCF_DT_ID,col->hdr_key); assert ( bcf_hdr_idinfo_exists(opts->hdr_out, BCF_HL_INFO, hdr_id) ); } string.l = 0; // set column number for each col if ( strncasecmp(string.s, "#chr", 4) == 0) { int nfields; int *splits = ksplit(&string, '\t', &nfields); if (nfields < 4) { fprintf(stderr, "[error] Bad header of bed database : %s. 
n_fields : %d, %s", fname, nfields, string.s); fprintf(stderr, "[notice] this error usually happened because the header line is seperated by spaces but not tab!"); exit(1); } int k; for ( k = 3; k < nfields; ++k ) { char *ss = string.s + splits[k]; for (i = 0; i < file->n_cols; ++i ) { struct anno_col *col = &file->cols[i]; if ( strcmp(col->hdr_key, ss) == 0) break; } // if name line specify more names than column string or header, skip if ( i == file->n_cols ) continue; struct anno_col *col = &file->cols[i]; col->icol = k; } } } for ( i = 0; i < file->n_cols; ++i ) { struct anno_col *col = &file->cols[i]; if ( col->hdr_key && col->icol == -1 ) error("No column %s found in bed database : %s", col->hdr_key, fname); int hdr_id = bcf_hdr_id2int(opts->hdr_out, BCF_DT_ID, col->hdr_key); assert(hdr_id>-1); col->number = bcf_hdr_id2length(opts->hdr_out, BCF_HL_INFO, hdr_id); if ( col->number == BCF_VL_A || col->number == BCF_VL_R || col->number == BCF_VL_G) error("Only support fixed INFO number for bed database. %s", col->hdr_key); col->ifile = file->id; } if ( string.m ) free(string.s); if ( opts->beds_is_inited == 0 ) opts->beds_is_inited = 1; return 0; }
// Read phenotype values from a tabix-indexed BED file, either restricted to
// regionPhenotype (tabix jump) or the whole file. Sample columns (7+) are
// mapped onto internal sample indexes via findSample(); "NA" becomes
// bcf_float_missing. Start positions are converted from 0-based BED to
// 1-based (+1). Group handling follows grp_mode/full_test as in
// scanPhenotypes().
void cis_data::readPhenotypes(string fbed) {
    int n_includedS = 0;
    int n_includedP = 0;
    int n_excludedP = 0;
    int n_negativeStrd = 0;
    vector < int > mappingS;   // BED column -> internal sample index, -1 if unknown

    //Open BED file
    vrb.title("Reading phenotype data in [" + fbed + "]");
    htsFile *fp = hts_open(fbed.c_str(),"r");
    if (!fp) vrb.error("Cannot open file");
    tbx_t *tbx = tbx_index_load(fbed.c_str());
    if (!tbx) vrb.error("Cannot open index file");
    kstring_t str = {0,0,0};
    if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line!");

    //Process sample names (header columns 7+)
    vector < string > tokens;
    stb.split(string(str.s), tokens);
    if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
    for (int t = 6 ; t < tokens.size() ; t ++) {
        mappingS.push_back(findSample(tokens[t]));
        if (mappingS.back() >= 0) n_includedS++;
    }

    //Read phenotypes
    unsigned int linecount =0;
    //Read phenotypes
    if (regionPhenotype.chr != "NA"){
        // restricted read: iterate only over records in the requested region
        hts_itr_t *itr = tbx_itr_querys(tbx, regionPhenotype.get().c_str());
        vrb.bullet("target region [" + regionPhenotype.get() + "]");
        if (!itr) vrb.error("Cannot jump to region!");
        //Read data
        while (tbx_itr_next(fp, tbx, itr, &str) >= 0) {
            linecount ++;
            if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
            stb.split(string(str.s), tokens);
            if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
            // filter on phenotype id (col 4) or group id (col 5) depending on grp_mode
            if ((grp_mode == GRP_NONE && filter_phenotype.check(tokens[3])) || (grp_mode != GRP_NONE && filter_phenotype.check(tokens[4]))) {
                phenotype_id.push_back(tokens[3]);
                phenotype_chr.push_back(tokens[0]);
                phenotype_start.push_back(atoi(tokens[1].c_str()) + 1);   // BED start is 0-based
                phenotype_end.push_back(atoi(tokens[2].c_str()));
                if (grp_mode > 0 && full_test) phenotype_grp.push_back("ALL_GENES");
                if (grp_mode > 0 && !full_test) phenotype_grp.push_back(tokens[4]);
                phenotype_neg.push_back(tokens[5] == "-");
                if (phenotype_neg.back()) n_negativeStrd ++;
                phenotype_val.push_back(vector < float > (sample_count, 0.0));
                for (int t = 6 ; t < tokens.size() ; t ++) {
                    if (mappingS[t-6] >= 0) {
                        if (tokens[t] == "NA") phenotype_val.back()[mappingS[t-6]] = bcf_float_missing;
                        else phenotype_val.back()[mappingS[t-6]] = stof(tokens[t]);
                    }
                }
                n_includedP++;
            } else n_excludedP ++;
        }
        tbx_itr_destroy(itr);
    }else{
        // full read: stream every line, skipping meta/header lines
        while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
            linecount ++;
            if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
            stb.split(string(str.s), tokens);
            if (str.l && str.s[0] != tbx->conf.meta_char) {
                if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
                if ((grp_mode == GRP_NONE && filter_phenotype.check(tokens[3])) || (grp_mode != GRP_NONE && filter_phenotype.check(tokens[4]))) {
                    phenotype_id.push_back(tokens[3]);
                    phenotype_chr.push_back(tokens[0]);
                    phenotype_start.push_back(atoi(tokens[1].c_str()) + 1);
                    phenotype_end.push_back(atoi(tokens[2].c_str()));
                    if (grp_mode > 0 && full_test) phenotype_grp.push_back("ALL_GENES");
                    if (grp_mode > 0 && !full_test) phenotype_grp.push_back(tokens[4]);
                    phenotype_neg.push_back(tokens[5] == "-");
                    if (phenotype_neg.back()) n_negativeStrd ++;
                    phenotype_val.push_back(vector < float > (sample_count, 0.0));
                    for (int t = 6 ; t < tokens.size() ; t ++) {
                        if (mappingS[t-6] >= 0) {
                            if (tokens[t] == "NA") phenotype_val.back()[mappingS[t-6]] = bcf_float_missing;
                            else phenotype_val.back()[mappingS[t-6]] = stof(tokens[t]);
                        }
                    }
                    n_includedP++;
                } else n_excludedP ++;
            }
        }
    }

    //Finalize & verbose
    // NOTE(review): str.s is never freed here (leak)
    tbx_destroy(tbx);
    if (hts_close(fp)) vrb.error("Cannot properly close file");
    phenotype_count = phenotype_id.size();
    vrb.bullet(stb.str(n_includedP) + " phenotypes included");
    if (n_excludedP > 0) vrb.bullet(stb.str(n_excludedP) + " phenotypes excluded by user");
    if (n_negativeStrd > 0 ) vrb.bullet(stb.str(n_negativeStrd) + " phenotypes are on the negative strand");
    if (phenotype_count == 0) vrb.leave("Cannot find phenotypes in target region!");
}
// Scan a tabix-indexed BED phenotype file and register phenotype metadata
// (id/chr/start/end) without loading the values. Already-known ids (present
// in phenotype_id_to_idx) are skipped, so repeated calls accumulate.
// Start positions are converted from 0-based BED to 1-based (+1).
void union_data::scanPhenotypes(string fbed) {
    int n_includedP = 0;
    int n_excludedP = 0;

    //Open BED file
    vrb.title("Scanning phenotype data in [" + fbed + "]");
    htsFile *fp = hts_open(fbed.c_str(),"r");
    if (!fp) vrb.error("Cannot open file");
    tbx_t * tbx = tbx_index_load(fbed.c_str());
    if (!tbx) vrb.error("Cannot open index file");

    //Read header
    // BUGFIX: hts_getline() returns a negative value on error/EOF; the previous
    // `!hts_getline(...)` test only caught a zero return. Use <= 0, consistent
    // with the other readers in this file.
    kstring_t str = {0,0,0};
    if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line");

    //Scan file
    vector < string > tokens;
    unsigned int linecount =0;
    if (regionPhenotype.chr != "NA"){
        // restricted scan: iterate only over records in the requested region
        hts_itr_t *itr = tbx_itr_querys(tbx, regionPhenotype.get().c_str());
        vrb.bullet("target region [" + regionPhenotype.get() + "]");
        if (!itr) vrb.error("Cannot jump to region!");
        //Read data
        while (tbx_itr_next(fp, tbx, itr, &str) >= 0) {
            linecount ++;
            if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
            stb.split(string(str.s), tokens);
            if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
            if (phenotype_id_to_idx.count(tokens[3])) continue;   // already registered
            if (filter_phenotype.check(tokens[3])) {
                phenotype_id.push_back(tokens[3]);
                phenotype_chr.push_back(tokens[0]);
                phenotype_start.push_back(atoi(tokens[1].c_str()) + 1);   // BED start is 0-based
                phenotype_end.push_back(atoi(tokens[2].c_str()));
                pair < string, int > temp (tokens[3],phenotype_id_to_idx.size());
                phenotype_id_to_idx.insert(temp);
                n_includedP++;
            } else n_excludedP ++;
        }
        tbx_itr_destroy(itr);
    }else{
        // full scan: stream every line, skipping meta/header lines
        while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
            linecount ++;
            if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
            if (str.l && str.s[0] != tbx->conf.meta_char) {
                stb.split(string(str.s), tokens);
                if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
                if (phenotype_id_to_idx.count(tokens[3])) continue;
                if (filter_phenotype.check(tokens[3])) {
                    phenotype_id.push_back(tokens[3]);
                    phenotype_chr.push_back(tokens[0]);
                    phenotype_start.push_back(atoi(tokens[1].c_str()) + 1);
                    phenotype_end.push_back(atoi(tokens[2].c_str()));
                    pair < string, int > temp (tokens[3],phenotype_id_to_idx.size());
                    phenotype_id_to_idx.insert(temp);
                    n_includedP++;
                } else n_excludedP ++;
            }
        }
    }

    //Finalize & verbose
    free(str.s);   // BUGFIX: the kstring buffer was leaked
    tbx_destroy(tbx);
    if (hts_close(fp)) vrb.error("Cannot properly close file");
    phenotype_count = phenotype_id.size();
    vrb.bullet(stb.str(n_includedP) + " new phenotypes included");
    if (n_excludedP > 0) vrb.bullet(stb.str(n_excludedP) + " phenotypes excluded by user");
    if (phenotype_count == 0) vrb.leave("Cannot find phenotypes in region!");
}
//Read phenotype values for one region from a tabix-indexed BED file.
//Clears all previously loaded phenotype data, maps the BED header sample columns
//onto the current sample set, then loads every phenotype overlapping `region`
//that passes the user filter. "NA" values are stored as bcf_float_missing.
void union_data::readPhenotypes(string fbed, string region) {
	int n_includedS = 0;
	int n_includedP = 0;
	int n_excludedP = 0;
	vector < int > mappingS;

	//Reset any data loaded by a previous call
	phenotype_id.clear();
	phenotype_chr.clear();
	phenotype_start.clear();
	phenotype_end.clear();
	phenotype_val.clear();
	phenotype_count=0;
	phenotype_id_to_idx.clear();

	//Open BED file
	//vrb.title("Reading phenotype data in [" + fbed + "]");
	htsFile *fp = hts_open(fbed.c_str(),"r");
	if (!fp) vrb.error("Cannot open file");
	tbx_t *tbx = tbx_index_load(fbed.c_str());
	if (!tbx) vrb.error("Cannot open index file");
	kstring_t str = {0,0,0};
	if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line!");

	//Process sample names: header columns 7+ are sample ids;
	//mappingS[i] is the index of header sample i in the current sample set (-1 if absent)
	vector < string > tokens;
	stb.split(string(str.s), tokens);
	if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
	for (int t = 6 ; t < tokens.size() ; t ++) {
		mappingS.push_back(findSample(tokens[t]));
		if (mappingS.back() >= 0) n_includedS++;
	}

	//Read phenotypes
	unsigned int linecount = 0;
	hts_itr_t *itr = tbx_itr_querys(tbx, region.c_str());
	//vrb.bullet("target region [" + regionPhenotype.get() + "]");
	//FIX: tbx_itr_querys() returns NULL for an unknown/invalid region, and passing a
	//NULL iterator to tbx_itr_next() dereferences it. The hard error on !itr was
	//deliberately commented out, so guard the loop instead: such regions now simply
	//yield zero phenotypes rather than crashing.
	if (itr) {
		//Read data
		while (tbx_itr_next(fp, tbx, itr, &str) >= 0) {
			linecount ++;
			if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
			stb.split(string(str.s), tokens);
			if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
			if (filter_phenotype.check(tokens[3])) {
				phenotype_id.push_back(tokens[3]);
				phenotype_chr.push_back(tokens[0]);
				phenotype_start.push_back(atoi(tokens[1].c_str()) + 1);	//BED start is 0-based
				phenotype_end.push_back(atoi(tokens[2].c_str()));
				phenotype_val.push_back(vector < float > (sample_count, 0.0));
				for (int t = 6 ; t < tokens.size() ; t ++) {
					if (mappingS[t-6] >= 0) {
						if (tokens[t] == "NA") phenotype_val.back()[mappingS[t-6]] = bcf_float_missing;
						else phenotype_val.back()[mappingS[t-6]] = stof(tokens[t]);
					}
				}
				pair < string, int > temp (tokens[3],n_includedP);
				phenotype_id_to_idx.insert(temp);
				n_includedP++;
			} else n_excludedP ++;
		}
		tbx_itr_destroy(itr);
	}

	//Finalize & verbose
	tbx_destroy(tbx);
	if (hts_close(fp)) vrb.error("Cannot properly close file");
	free(str.s);	//FIX: release the kstring buffer allocated by hts_getline (memory leak)
	phenotype_count = phenotype_id.size();
	//vrb.bullet(stb.str(n_includedP) + " phenotypes included");
	//if (n_excludedP > 0) vrb.bullet(stb.str(n_excludedP) + " phenotypes excluded by user");
	//if (phenotype_count == 0) vrb.leave("Cannot find phenotypes in target region!");
}
/*
 * consensus() - stream a reference fasta and apply buffered VCF variants to it.
 *
 * Reads args->ref_fname line by line. On a '>' header line, all variants still
 * cached for the previous sequence are applied, the fasta buffer is flushed and
 * a new region is initialized. For sequence lines, the text is appended to
 * args->fa_buf and variants are applied as soon as the buffer covers them.
 * Output is produced via flush_fa_buffer() (60-column lines mid-sequence,
 * a final flush at EOF).
 */
static void consensus(args_t *args)
{
    htsFile *fasta = hts_open(args->ref_fname, "rb");
    if ( !fasta ) error("Error reading %s\n", args->ref_fname);
    kstring_t str = {0,0,0};
    while ( hts_getline(fasta, KS_SEP_LINE, &str) > 0 )
    {
        if ( str.s[0]=='>' )
        {
            // new sequence encountered, apply all cached variants
            while ( args->vcf_rbuf.n )
            {
                // emit/reset the liftover chain of the finished sequence, if requested
                if (args->chain) { print_chain(args); destroy_chain(args); }
                bcf1_t *rec = args->vcf_buf[args->vcf_rbuf.f];
                // stop once the cached record belongs to another rid or lies past the target region
                if ( rec->rid!=args->rid || ( args->fa_end_pos && rec->pos > args->fa_end_pos ) ) break;
                int i = rbuf_shift(&args->vcf_rbuf);
                apply_variant(args, args->vcf_buf[i]);
            }
            flush_fa_buffer(args, 0);
            // ">NAME ..." -> region name starts right after '>'
            init_region(args, str.s+1);
            continue;
        }
        args->fa_length  += str.l;
        args->fa_src_pos += str.l;
        // determine if uppercase or lowercase is used in this fasta file
        if ( args->fa_case==-1 ) args->fa_case = toupper(str.s[0])==str.s[0] ? 1 : 0;
        if ( args->mask && args->rid>=0) mask_region(args, str.s, str.l);
        kputs(str.s, &args->fa_buf);
        // Try to apply as many pending VCF records as the buffered fasta allows.
        // rec_ptr left non-NULL below means "need more fasta": do not flush yet.
        bcf1_t **rec_ptr = NULL;
        while ( args->rid>=0 && (rec_ptr = next_vcf_line(args)) )
        {
            bcf1_t *rec = *rec_ptr;
            // still the same chr and the same region? if not, fasta buf can be flushed
            if ( rec->rid!=args->rid || ( args->fa_end_pos && rec->pos > args->fa_end_pos ) )
            {
                // save the vcf record until next time and flush
                unread_vcf_line(args, rec_ptr);
                rec_ptr = NULL;
                break;
            }
            // is the vcf record well beyond cached fasta buffer? if yes, the buf can be flushed
            if ( args->fa_ori_pos + args->fa_buf.l - args->fa_mod_off <= rec->pos )
            {
                unread_vcf_line(args, rec_ptr);
                rec_ptr = NULL;
                break;
            }
            // is the cached fasta buffer full enough? if not, read more fasta, no flushing
            if ( args->fa_ori_pos + args->fa_buf.l - args->fa_mod_off < rec->pos + rec->rlen )
            {
                unread_vcf_line(args, rec_ptr);
                break;
            }
            apply_variant(args, rec);
        }
        // rec_ptr==NULL here means no record is waiting for more fasta, safe to flush
        if ( !rec_ptr ) flush_fa_buffer(args, 60);
    }
    if (args->chain) { print_chain(args); destroy_chain(args); }
    flush_fa_buffer(args, 0);
    hts_close(fasta);
    free(str.s);
}
/*
 * bcf_sr_regions_next() - advance to the next region.
 *
 * Either walks the in-memory region list (reg->regs) or reads the next line
 * from the regions file (via tabix iterator when available, plain streaming
 * otherwise). On success sets reg->start/reg->end (0-based inclusive) and
 * returns 0; returns -1 and sets reg->iseq=-1 when no regions are left.
 *
 * FIX: the text contained mojibake -- every "&reg" had been mangled into the
 * single character '®' (a decoded "&reg;" HTML entity), which does not compile.
 * All four occurrences restored to "&reg->...".
 */
int bcf_sr_regions_next(bcf_sr_regions_t *reg)
{
    if ( reg->iseq<0 ) return -1;
    reg->start = reg->end = -1;
    reg->nals = 0;

    // using in-memory regions
    if ( reg->regs )
    {
        while ( reg->iseq < reg->nseqs )
        {
            reg->regs[reg->iseq].creg++;
            if ( reg->regs[reg->iseq].creg < reg->regs[reg->iseq].nregs ) break;
            reg->iseq++;
        }
        if ( reg->iseq >= reg->nseqs ) { reg->iseq = -1; return -1; } // no more regions left
        region1_t *creg = &reg->regs[reg->iseq].regs[reg->regs[reg->iseq].creg];
        reg->start = creg->start;
        reg->end   = creg->end;
        return 0;
    }

    // reading from tabix; default column layout is chr,from,to unless the index says otherwise
    char *chr, *chr_end;
    int ichr = 0, ifrom = 1, ito = 2, is_bed = 0, from, to;
    if ( reg->tbx )
    {
        ichr   = reg->tbx->conf.sc-1;
        ifrom  = reg->tbx->conf.bc-1;
        ito    = reg->tbx->conf.ec-1;
        if ( ito<0 ) ito = ifrom;
        is_bed = reg->tbx->conf.preset==TBX_UCSC ? 1 : 0;
    }

    int ret = 0;
    while ( !ret )
    {
        if ( reg->itr )
        {
            // tabix index present, reading a chromosome block
            ret = tbx_itr_next(reg->file, reg->tbx, reg->itr, &reg->line);
            if ( ret<0 ) { reg->iseq = -1; return -1; }
        }
        else
        {
            if ( reg->is_bin )
            {
                // Waited for seek which never came. Reopen in text mode and stream
                // through the regions, otherwise hts_getline would fail
                hts_close(reg->file);
                reg->file = hts_open(reg->fname, "r");
                if ( !reg->file )
                {
                    fprintf(stderr,"[%s:%d %s] Could not open file: %s\n", __FILE__,__LINE__,__FUNCTION__,reg->fname);
                    reg->file = NULL;
                    bcf_sr_regions_destroy(reg);
                    return -1;
                }
                reg->is_bin = 0;
            }
            // tabix index absent, reading the whole file
            ret = hts_getline(reg->file, KS_SEP_LINE, &reg->line);
            if ( ret<0 ) { reg->iseq = -1; return -1; }
        }
        ret = _regions_parse_line(reg->line.s, ichr,ifrom,ito, &chr,&chr_end,&from,&to);
        if ( ret<0 )
        {
            fprintf(stderr,"[%s:%d] Could not parse the file %s, using the columns %d,%d,%d\n", __FILE__,__LINE__,reg->fname,ichr+1,ifrom+1,ito+1);
            return -1;
        }
    }
    if ( is_bed ) from++;   // BED starts are 0-based; shift to 1-based before the -1 below

    // temporarily 0-terminate the chromosome name for the hash lookup
    *chr_end = 0;
    if ( khash_str2int_get(reg->seq_hash, chr, &reg->iseq)<0 )
    {
        fprintf(stderr,"Broken tabix index? The sequence \"%s\" not in dictionary [%s]\n", chr,reg->line.s);
        exit(1);
    }
    *chr_end = '\t';

    reg->start = from - 1;
    reg->end   = to - 1;
    return 0;
}
/*
 * _reader_fill_buffer() - buffers all records with the same coordinate
 *
 * Fills reader->buffer starting at slot 1 with consecutive records whose pos
 * equals buffer[1]->pos, growing the buffer 8 slots at a time. Reads either
 * from the stream (files->streaming), through a tabix iterator (VCF), or a
 * BCF iterator. Records failing the FILTER check are skipped. On end of
 * region/stream the iterator is destroyed.
 */
static void _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader)
{
    // Return if the buffer is full: the coordinate of the last buffered record differs
    if ( reader->nbuffer && reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) return;

    // No iterator (sequence not present in this file) and not streaming
    if ( !reader->itr && !files->streaming ) return;

    // Fill the buffer with records starting at the same position
    int i, ret = 0;
    while (1)
    {
        if ( reader->nbuffer+1 >= reader->mbuffer )
        {
            // Increase buffer size
            reader->mbuffer += 8;
            reader->buffer = (bcf1_t**) realloc(reader->buffer, sizeof(bcf1_t*)*reader->mbuffer);
            for (i=8; i>0; i--)     // initialize the 8 newly added slots
            {
                reader->buffer[reader->mbuffer-i] = bcf_init1();
                reader->buffer[reader->mbuffer-i]->max_unpack = files->max_unpack;
                reader->buffer[reader->mbuffer-i]->pos = -1;    // for rare cases when VCF starts from 1
            }
        }
        if ( files->streaming )
        {
            if ( reader->file->format.format==vcf )
            {
                if ( (ret=hts_getline(reader->file, KS_SEP_LINE, &files->tmps)) < 0 ) break;   // no more lines
                // NOTE(review): this inner `ret` shadows the outer one, so a parse
                // failure breaks the loop with outer ret>=0 and the itr-cleanup below
                // is skipped; in streaming mode reader->itr is not used, but confirm
                // the shadowing is intentional.
                int ret = vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1]);
                if ( ret<0 ) break;
            }
            else if ( reader->file->format.format==bcf )
            {
                if ( (ret=bcf_read1(reader->file, reader->header, reader->buffer[reader->nbuffer+1])) < 0 ) break; // no more lines
            }
            else
            {
                fprintf(stderr,"[%s:%d %s] fixme: not ready for this\n", __FILE__,__LINE__,__FUNCTION__);
                exit(1);
            }
        }
        else if ( reader->tbx_idx )
        {
            // VCF with a tabix index: fetch the raw line, then parse it into the next slot
            if ( (ret=tbx_itr_next(reader->file, reader->tbx_idx, reader->itr, &files->tmps)) < 0 ) break;  // no more lines
            vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1]);
        }
        else
        {
            // BCF with its native index
            if ( (ret=bcf_itr_next(reader->file, reader->itr, reader->buffer[reader->nbuffer+1])) < 0 ) break; // no more lines
            bcf_subset_format(reader->header,reader->buffer[reader->nbuffer+1]);
        }

        // apply filter
        if ( !reader->nfilter_ids )
            bcf_unpack(reader->buffer[reader->nbuffer+1], BCF_UN_STR);
        else
        {
            bcf_unpack(reader->buffer[reader->nbuffer+1], BCF_UN_STR|BCF_UN_FLT);
            if ( !has_filter(reader, reader->buffer[reader->nbuffer+1]) ) continue;
        }
        reader->nbuffer++;
        if ( reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) break;    // the buffer is full
    }
    if ( ret<0 )
    {
        // done for this region
        tbx_itr_destroy(reader->itr);
        reader->itr = NULL;
    }
    // keep duplicate records collapsed if requested
    if ( files->collapse && reader->nbuffer>=2 && reader->buffer[1]->pos==reader->buffer[2]->pos )
        collapse_buffer(files, reader);
}
/*
 * concat() - concatenate the input VCF/BCF files into args->out_fh.
 *
 * Three modes:
 *   - phased concat: keeps at most two synced readers open, stitching phased
 *     blocks together via phased_push()/phased_flush();
 *   - synced reader: merges overlapping files position by position;
 *   - naive concat: streams each file in turn, checking that chromosome
 *     blocks are contiguous and sorted.
 */
static void concat(args_t *args)
{
    int i;
    if ( args->phased_concat )  // phased concat
    {
        // keep only two open files at a time
        while ( args->ifname < args->nfnames )
        {
            int new_file = 0;
            while ( args->files->nreaders < 2 && args->ifname < args->nfnames )
            {
                if ( !bcf_sr_add_reader(args->files,args->fnames[args->ifname]) ) error("Failed to open %s: %s\n", args->fnames[args->ifname],bcf_sr_strerror(args->files->errnum));
                new_file = 1;
                args->ifname++;
                if ( args->start_pos[args->ifname-1]==-1 ) break;   // new chromosome, start with only one file open
                if ( args->ifname < args->nfnames && args->start_pos[args->ifname]==-1 ) break; // next file starts on a different chromosome
            }

            // is there a line from the previous run? Seek the newly opened reader to that position
            int seek_pos = -1;
            int seek_chr = -1;
            if ( bcf_sr_has_line(args->files,0) )
            {
                bcf1_t *line = bcf_sr_get_line(args->files,0);
                bcf_sr_seek(args->files, bcf_seqname(args->files->readers[0].header,line), line->pos);
                seek_pos = line->pos;
                seek_chr = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[0].header,line));
            }
            else if ( new_file )
                bcf_sr_seek(args->files,NULL,0);  // set to start

            int nret;
            while ( (nret = bcf_sr_next_line(args->files)) )
            {
                if ( !bcf_sr_has_line(args->files,0) )  // no input from the first reader
                {
                    // We are assuming that there is a perfect overlap, sites which are not present in both files are dropped
                    if ( ! bcf_sr_region_done(args->files,0) ) continue;
                    phased_flush(args);
                    bcf_sr_remove_reader(args->files, 0);
                }

                // Get a line to learn about current position
                for (i=0; i<args->files->nreaders; i++) if ( bcf_sr_has_line(args->files,i) ) break;
                bcf1_t *line = bcf_sr_get_line(args->files,i);

                // This can happen after bcf_sr_seek: indel may start before the coordinate which we seek to.
                if ( seek_chr>=0 && seek_pos>line->pos && seek_chr==bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[i].header,line)) ) continue;
                seek_pos = seek_chr = -1;

                // Check if the position overlaps with the next, yet unopened, reader
                int must_seek = 0;
                while ( args->ifname < args->nfnames && args->start_pos[args->ifname]!=-1 && line->pos >= args->start_pos[args->ifname] )
                {
                    must_seek = 1;
                    if ( !bcf_sr_add_reader(args->files,args->fnames[args->ifname]) ) error("Failed to open %s: %s\n", args->fnames[args->ifname],bcf_sr_strerror(args->files->errnum));
                    args->ifname++;
                }
                if ( must_seek )
                {
                    bcf_sr_seek(args->files, bcf_seqname(args->files->readers[i].header,line), line->pos);
                    seek_pos = line->pos;
                    seek_chr = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[i].header,line));
                    continue;
                }

                // We are assuming that there is a perfect overlap, sites which are not present in both files are dropped
                if ( args->files->nreaders>1 && !bcf_sr_has_line(args->files,1) && !bcf_sr_region_done(args->files,1) ) continue;

                phased_push(args, bcf_sr_get_line(args->files,0), args->files->nreaders>1 ? bcf_sr_get_line(args->files,1) : NULL);
            }

            if ( args->files->nreaders )
            {
                phased_flush(args);
                while ( args->files->nreaders ) bcf_sr_remove_reader(args->files, 0);
            }
        }
    }
    else if ( args->files )  // combining overlapping files, using synced reader
    {
        while ( bcf_sr_next_line(args->files) )
        {
            for (i=0; i<args->files->nreaders; i++)
            {
                bcf1_t *line = bcf_sr_get_line(args->files,i);
                if ( !line ) continue;
                bcf_translate(args->out_hdr, args->files->readers[i].header, line);
                bcf_write1(args->out_fh, args->out_hdr, line);
                if ( args->remove_dups ) break;     // emit each position only once
            }
        }
    }
    else    // concatenating
    {
        kstring_t tmp = {0,0,0};
        int prev_chr_id = -1, prev_pos;
        bcf1_t *line = bcf_init();
        for (i=0; i<args->nfnames; i++)
        {
            htsFile *fp = hts_open(args->fnames[i], "r");
            if ( !fp ) error("Failed to open: %s\n", args->fnames[i]);
            bcf_hdr_t *hdr = bcf_hdr_read(fp);
            if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]);
            if ( !fp->is_bin && args->output_type&FT_VCF )
            {
                line->max_unpack = BCF_UN_STR;
                // if VCF is on both input and output, avoid VCF to BCF conversion
                while ( hts_getline(fp, KS_SEP_LINE, &fp->line) >=0 )
                {
                    // CHROM is everything before the first tab
                    char *str = fp->line.s;
                    while ( *str && *str!='\t' ) str++;
                    tmp.l = 0;
                    kputsn(fp->line.s,str-fp->line.s,&tmp);
                    int chr_id = bcf_hdr_name2id(args->out_hdr, tmp.s);
                    if ( chr_id<0 ) error("The sequence \"%s\" not defined in the header: %s\n(Quick workaround: index the file.)\n", tmp.s, args->fnames[i]);
                    if ( prev_chr_id!=chr_id )
                    {
                        prev_pos = -1;
                        if ( args->seen_seq[chr_id] ) error("\nThe chromosome block %s is not contiguous, consider running with -a.\n", tmp.s);
                    }
                    char *end;
                    int pos = strtol(str+1,&end,10) - 1;
                    if ( end==str+1 ) error("Could not parse line: %s\n", fp->line.s);
                    if ( prev_pos > pos ) error("The chromosome block %s is not sorted, consider running with -a.\n", tmp.s);
                    args->seen_seq[chr_id] = 1;
                    prev_chr_id = chr_id;
                    prev_pos = pos;     // FIX: prev_pos was never updated, so the sorted-order check above could never fire
                    if ( vcf_write_line(args->out_fh, &fp->line)!=0 ) error("Failed to write %d bytes\n", (int)fp->line.l);  // FIX: cast size_t for %d
                }
            }
            else
            {
                // BCF conversion is required
                line->max_unpack = 0;
                while ( bcf_read(fp, hdr, line)==0 )
                {
                    bcf_translate(args->out_hdr, hdr, line);
                    if ( prev_chr_id!=line->rid )
                    {
                        prev_pos = -1;
                        if ( args->seen_seq[line->rid] ) error("\nThe chromosome block %s is not contiguous, consider running with -a.\n", bcf_seqname(args->out_hdr, line));
                    }
                    if ( prev_pos > line->pos ) error("The chromosome block %s is not sorted, consider running with -a.\n", bcf_seqname(args->out_hdr, line));
                    args->seen_seq[line->rid] = 1;
                    prev_chr_id = line->rid;
                    prev_pos = line->pos;   // FIX: keep the last position for the sorted-order check
                    if ( bcf_write(args->out_fh, args->out_hdr, line)!=0 ) error("Failed to write\n");
                }
            }
            bcf_hdr_destroy(hdr);
            hts_close(fp);
        }
        bcf_destroy(line);
        free(tmp.s);
    }
}
/**
 * tabix workhorse function
 *
 * Apache handler serving a tabix-indexed, bgzipped file. Declines requests it
 * does not own (wrong handler name, non-GET, no file, not *.gz); returns 404
 * if the .tbi index is missing. Emits header lines (meta-char prefixed), then
 * either the records of the requested ?region= through the tabix iterator, or
 * the whole file, formatted per the ?format= parameter (xml/json/html/plain),
 * limited by ?limit=.
 */
static int tabix_handler(request_rec *r)
{
    htsFile *fp=NULL;
    hts_itr_t *itr=NULL;
    kstring_t line = {0,0,0};
    int print_header=1;
    int print_body=1;
    struct tabix_callback_t handler;
    int http_status=OK;
    memset((void*)&handler,0,sizeof(struct tabix_callback_t));
    handler.r=r;
    handler.limit=DEFAULT_LIMIT_RECORDS;
    if (!r->handler || strcmp(r->handler, "tabix-handler")) return (DECLINED);
    if (strcmp(r->method, "GET")!=0) return DECLINED;
    if(r->canonical_filename==NULL) return DECLINED;
    /* file must be b-gzipped */
    if( !( str_ends_with(r->canonical_filename,".gz") )) return DECLINED;
    /* file must be indexed with tabix */
    if( !( fileExtExists(r->canonical_filename,".tbi") )) return HTTP_NOT_FOUND;  /* FIX: symbolic constant instead of magic 404 */
    handler.httParams = HttpParamParseGET(r);
    if(handler.httParams==NULL) return DECLINED;
    handler.file_format=E_FORMAT_UNDEFINED;
    if(str_ends_with(r->canonical_filename,".vcf.gz"))
    {
        handler.file_format=E_FORMAT_VCF;
    }
    else if(str_ends_with(r->canonical_filename,".bed.gz"))
    {
        handler.file_format=E_FORMAT_BED;
    }
    /* only one loop, we use this to cleanup the code, instead of using a goto statement */
    do {
        const char* format=HttpParamGet(handler.httParams,"format");
        const char* limit=HttpParamGet(handler.httParams,"limit");
        const char* region=HttpParamGet(handler.httParams,"region");
        int iterator_was_requested=FALSE;
        int have_pending_line=FALSE;    /* data line already read by the header scan */
        if(limit!=NULL) { handler.limit=atol(limit); }
        if(format==NULL) { http_status=DECLINED; break; }
        else if(strcmp(format,"xml")==0) { SETUP_HANDLER(xml); }
        else if(strcmp(format,"json")==0 || strcmp(format,"jsonp")==0)
        {
            handler.jsonp_callback=HttpParamGet(handler.httParams,"callback");
            SETUP_HANDLER(json);
        }
        else if(strcmp(format,"html")==0) { SETUP_HANDLER(html); }
        else { SETUP_HANDLER(plain); }
        fp=hts_open(r->canonical_filename,"r");
        if(fp==NULL) { http_status=HTTP_NOT_FOUND; break; }
        //read index
        handler.tbx = tbx_index_load(r->canonical_filename);
        if(handler.tbx==NULL) { http_status=HTTP_INTERNAL_SERVER_ERROR; break; }
        if(region!=NULL && !str_is_empty(region))
        {
            iterator_was_requested=TRUE;
            itr = tbx_itr_querys(handler.tbx,region);
        }
        handler.startdocument(&handler);
        if(print_header)
        {
            handler.startheader(&handler);
            while ( hts_getline(fp, KS_SEP_LINE, &line) >= 0 )
            {
                if ( !line.l || line.s[0]!=handler.tbx->conf.meta_char )
                {
                    /* FIX: this line is the first record, not a header; remember it
                     * so the whole-file body pass below does not silently drop it */
                    have_pending_line=TRUE;
                    break;
                }
                handler.header(&handler,&line);
                handler.count++;
            }
            handler.enddheader(&handler);
        }
        handler.count=0;//Reset
        if(print_body)
        {
            handler.startbody(&handler);
            if(iterator_was_requested)
            {
                /* region mode: the iterator re-seeks, so the pending line is irrelevant */
                if(itr!=NULL)
                {
                    while ((handler.limit==-1 || handler.count< handler.limit) && tbx_itr_next(fp, handler.tbx, itr, &line) >= 0)
                    {
                        if(handler.show(&handler,&line)<0) break;
                        handler.count++;
                    }
                }
            }
            else
            {
                int aborted=FALSE;
                /* FIX: first emit the record consumed by the header scan */
                if(have_pending_line)
                {
                    if( (handler.limit!=-1 && handler.count>=handler.limit) || handler.show(&handler,&line)<0 ) aborted=TRUE;
                    else handler.count++;
                }
                while (!aborted && (handler.limit==-1 || handler.count< handler.limit) && \
                        hts_getline(fp, KS_SEP_LINE, &line) >= 0)
                {
                    if(handler.show(&handler,&line)<0) break;
                    handler.count++;
                }
            }
            handler.endbody(&handler);
        }
        handler.enddocument(&handler);
    } while(0);/* always abort */
    //cleanup
    if(itr!=NULL) tbx_itr_destroy(itr);
    HttpParamFree(handler.httParams);
    free(line.s);
    if(fp!=NULL) hts_close(fp);
    if(handler.tbx!=NULL) tbx_destroy(handler.tbx);
    return http_status;
}