int main_vcfview(int argc, char *argv[]) { int c, clevel = -1, in_type = FT_BCF, out_type = FT_VCF; char *fname_out = NULL, moder[8], modew[8]; while ((c = getopt(argc, argv, "l:bvo:n:z?hu")) >= 0) { switch (c) { case 'o': switch (optarg[0]) { case 'b': out_type = FT_BCF_GZ; break; case 'u': out_type = FT_BCF; break; case 'z': out_type = FT_VCF_GZ; break; case 'v': out_type = FT_VCF; break; default: error("The output type \"%s\" not recognised\n", optarg); } break; case 'l': clevel = atoi(optarg); out_type |= FT_GZ; break; case 'v': in_type = FT_VCF; break; case 'b': out_type = FT_BCF_GZ; break; case 'u': out_type = FT_BCF; break; case 'z': out_type = FT_VCF_GZ; break; case 'n': fname_out = optarg; break; case '?': case 'h': usage(); return 1; break; } } if (argc!=optind+1) { usage(); return 1; } // Init reader strcpy(moder, "r"); if ( (!strcmp("-",argv[optind]) && (in_type & FT_BCF)) || (hts_file_type(argv[optind]) & FT_BCF)) strcat(moder, "b"); htsFile *fp_in = hts_open(argv[optind], moder, NULL); if ( !fp_in ) error("Fail to open: %s\n", argv[optind]); bcf_hdr_t *hdr = vcf_hdr_read(fp_in); if ( !hdr ) error("Fail to read VCF/BCF header: %s\n", argv[optind]); bcf1_t *rec = bcf_init1(); // Init writer strcpy(modew, "w"); if (clevel >= 0 && clevel <= 9) sprintf(modew + 1, "%d", clevel); if (out_type & FT_GZ) strcat(modew,"z"); if (out_type & FT_BCF) strcat(modew, "b"); if (out_type == FT_BCF) strcat(modew, "u"); // uncompressed BCF output htsFile *fp_out = hts_open(fname_out ? fname_out : "-", modew, NULL); vcf_hdr_write(fp_out, hdr); while ( vcf_read1(fp_in, hdr, rec) >= 0) vcf_write1(fp_out, hdr, rec); bcf_destroy1(rec); bcf_hdr_destroy(hdr); hts_close(fp_in); hts_close(fp_out); return 0; }
int bam_access_openhts(char *hts_file, char *ref_file){ assert(hts_file != NULL); //Assign memory for the file name etc holding struct fholder = malloc(sizeof(file_holder)); check_mem(fholder); //Beginning and end of tmp struct for bam access fholder->beg = 0; fholder->end = 0x7fffffff; // The max 32 bit integer. //Open a file for read from compressed bam. fholder->in = hts_open(hts_file, "r"); check(fholder->in != 0,"HTS file %s failed to open.",hts_file); fholder->idx = sam_index_load(fholder->in,hts_file); check(fholder->idx != 0,"HTS index file %s failed to open.",hts_file); if(ref_file){ hts_set_fai_filename(fholder->in, ref_file); }else{ if(fholder->in->format.format == cram) log_warn("No reference file provided for a cram input file, if the reference described in the cram header can't be located this script may fail."); } //Check for generic header read method. fholder->head = sam_hdr_read(fholder->in); return 0; error: if(fholder->in) hts_close(fholder->in); if(fholder) free(fholder); return -1; }
/**
 * Reads the header of a VCF/BCF file and returns the bcf header object.
 * This wraps the plain header read to allow an alternative header file,
 * saved as <filename>.hdr, to be used instead.
 *
 * When the alternative file exists and can be opened, its header is returned
 * and the header embedded in `fp` is still read (and discarded) so that the
 * stream is positioned after it. Otherwise the header of `fp` is returned.
 *
 * @fp - open file whose header (or alternative header) is wanted
 *
 * Returns whatever bcf_hdr_read() produced for the chosen source.
 */
bcf_hdr_t *bcf_alt_hdr_read(htsFile *fp)
{
    bcf_hdr_t *h = NULL;

    // build "<filename>.hdr"
    kstring_t alt_hdr_fn = {0, 0, 0};
    kputs(fp->fn, &alt_hdr_fn);
    kputs(".hdr", &alt_hdr_fn);

    // check for existence of alternative header, then open it through htslib
    htsFile *alt_hdr = NULL;
    FILE *file = fopen(alt_hdr_fn.s, "r");
    if (file)
    {
        fclose(file);
        alt_hdr = hts_open(alt_hdr_fn.s, "r");
    }

    if (!alt_hdr)
    {
        // no usable alternative header (missing, or hts_open failed):
        // fall back to the header stored in the file itself
        h = bcf_hdr_read(fp);
    }
    else
    {
        fprintf(stderr, "[I:%s:%d %s] read alternative header for %s\n", __FILE__, __LINE__, __FUNCTION__, fp->fn);
        h = bcf_hdr_read(alt_hdr);
        hts_close(alt_hdr);

        // helps move the pointer to the right place
        bcf_hdr_t *temp_h = bcf_hdr_read(fp);
        if (temp_h) bcf_hdr_destroy(temp_h);
    }

    if (alt_hdr_fn.m) free(alt_hdr_fn.s);

    return h;
}
/**
 * Opens a tabix-indexed file for ordered reading.
 *
 * @hts_file  - name of the file to read
 * @intervals - intervals to restrict reading to; may be empty
 *
 * The index is optional when no intervals are given. When intervals are
 * given and the index cannot be loaded, an error is printed and the process
 * exits. Random access is enabled only when intervals are present and the
 * index was loaded.
 */
TBXOrderedReader::TBXOrderedReader(std::string hts_file, std::vector<GenomeInterval>& intervals)
{
    this->hts_file = hts_file;
    this->intervals = intervals;
    interval_index = 0;

    // index_loaded was only ever set on success; give it a defined value
    index_loaded = false;

    hts = NULL;
    tbx = NULL;
    itr = NULL;
    s = {0, 0, 0};

    hts = hts_open(hts_file.c_str(), "r");
    if (hts == NULL)
    {
        // the handle used to be stored unchecked
        fprintf(stderr, "[E:%s] cannot open %s\n", __FUNCTION__, hts_file.c_str());
        exit(1);
    }

    intervals_present = intervals.size()!=0;

    if ((tbx = tbx_index_load(hts_file.c_str())))
    {
        index_loaded = true;
    }
    else if (intervals_present)
    {
        fprintf(stderr, "[E:%s] index cannot be loaded for %s\n", __FUNCTION__, hts_file.c_str());
        exit(1);
    }

    random_access_enabled = intervals_present && index_loaded;
}
static int query_chroms(char *fname) { const char **seq; int i, nseq, ftype = file_type(fname); if ( ftype & IS_TXT || !ftype ) { tbx_t *tbx = tbx_index_load(fname); if ( !tbx ) error("Could not load .tbi index of %s\n", fname); seq = tbx_seqnames(tbx, &nseq); for (i=0; i<nseq; i++) printf("%s\n", seq[i]); free(seq); tbx_destroy(tbx); } else if ( ftype==IS_BCF ) { htsFile *fp = hts_open(fname,"r"); if ( !fp ) error("Could not read %s\n", fname); bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Could not read the header: %s\n", fname); hts_close(fp); hts_idx_t *idx = bcf_index_load(fname); if ( !idx ) error("Could not load .csi index of %s\n", fname); seq = bcf_index_seqnames(idx, hdr, &nseq); for (i=0; i<nseq; i++) printf("%s\n", seq[i]); free(seq); bcf_hdr_destroy(hdr); hts_idx_destroy(idx); } else if ( ftype==IS_BAM ) // todo: BAM error("BAM: todo\n"); return 0; }
htsFile *read_vcf_file(char * fname) { htsFile *fp = hts_open(fname, "r"); if ( !fp ) error ("Could not read file %s : %s", fname, strerror(errno)); return fp; }
/** * Parse multiple files from command line unlabeled arguments or -L denoted file list. If both are defined, the files are merged. * * @files - file names are stored in this vector * @argument_files - vector of input files * @file_list - file names stored in a file * */ void Program::parse_files(std::vector<std::string>& files, const std::vector<std::string>& arg_files, std::string file_list) { files.clear(); if (arg_files.size()!=0) { files = arg_files; } if (file_list != "") { htsFile *file = hts_open(file_list.c_str(), "r"); if (file==NULL) { std::cerr << "cannot open " << file_list << "\n"; exit(1); } kstring_t *s = &file->line; while (hts_getline(file, '\n', s) >= 0) { if (s->s[0]!='#') { files.push_back(std::string(s->s)); } } hts_close(file); } }
/**
 * Opens a bgzip-compressed, tabix-indexed file.
 *
 * The constructor verifies that the file is bgzf, refuses an index that is
 * older than the data file, opens the data file, loads the index, collects
 * the sequence names into `chroms`, and positions the iterator on the first
 * sequence (or leaves it NULL when the index lists no sequences).
 * Any failure prints a message to cerr and exits with status 1.
 *
 * @file - path of the data file; the index is looked up as "<file>.tbi"
 *         for the timestamp comparison
 */
Tabix::Tabix(string& file)
{
    has_jumped = false;
    filename = file;
    const char* cfilename = file.c_str();
    const string index_name = file + ".tbi";

    if ( bgzf_is_bgzf(cfilename)!=1 )
    {
        cerr << "[tabix++] was bgzip used to compress this file? " << file << endl;
        exit(1);
    }

    // Common source of errors: new VCF is used with an old index.
    // Only compare timestamps when both stat() calls succeeded; previously the
    // results were ignored and uninitialised structs could be compared. A
    // missing index is reported by tbx_index_load() below.
    struct stat stat_tbi, stat_vcf;
    const bool have_tbi = stat(index_name.c_str(), &stat_tbi) == 0;
    const bool have_vcf = stat(cfilename, &stat_vcf) == 0;
    if ( have_tbi && have_vcf && stat_vcf.st_mtime > stat_tbi.st_mtime )
    {
        cerr << "[tabix++] the index file is older than the vcf file. Please use '-f' to overwrite or reindex." << endl;
        exit(1);
    }

    if ((fn = hts_open(cfilename, "r")) == 0)
    {
        cerr << "[tabix++] fail to open the data file." << endl;
        exit(1);
    }

    if ((tbx = tbx_index_load(cfilename)) == NULL)
    {
        cerr << "[tabix++] failed to load the index file." << endl;
        exit(1);
    }

    int nseq;
    const char** seq = tbx_seqnames(tbx, &nseq);
    for (int i=0; i<nseq; i++)
    {
        chroms.push_back(seq[i]);
    }
    free(seq);

    idxconf = &tbx_conf_vcf;

    // set up the iterator, defaults to the beginning
    if (nseq == 0)
    {
        // the vcf file contains only the header according to the index
        iter = NULL;
        current_chrom = chroms.end();
    }
    else
    {
        current_chrom = chroms.begin();
        iter = tbx_itr_querys(tbx, current_chrom->c_str());
    }
}
/*
 * Attaches a targets file to the synced reader.
 *
 * files  reader set receiving the targets
 * fname  path of the targets file; its tabix index is loaded as well
 *
 * Returns 1 on success and 0 on failure (allocation failure, the file cannot
 * be opened, or its index cannot be loaded). Nothing is leaked and
 * files->targets is left untouched on failure.
 */
int bcf_sr_set_targets(readers_t *files, const char *fname)
{
    regions_t *tgts = (regions_t *) calloc(1,sizeof(regions_t));
    if ( !tgts ) return 0;

    tgts->file = hts_open(fname, "rb", NULL);
    if ( !tgts->file )
    {
        free(tgts);     // was leaked before
        return 0;
    }

    tgts->tbx = tbx_index_load(fname);
    if ( !tgts->tbx )
    {
        // without an index there are no sequence names to query
        hts_close(tgts->file);
        free(tgts);
        return 0;
    }

    tgts->seq_names = (char**) tbx_seqnames(tgts->tbx, &tgts->nseqs);
    tgts->cseq = -1;    // no sequence selected yet
    files->targets = tgts;
    return 1;
}
/**
 * Reads the header of an alignment file.
 *
 * @filename - file to open
 * @header   - receives a copy of the header text
 *
 * Returns the parsed header. The header is no longer destroyed before being
 * returned (the previous code returned a dangling pointer); the caller now
 * owns it and should release it with bam_hdr_destroy().
 * Exits with status 1 when the file cannot be opened or has no readable header.
 */
bam_hdr_t* hts_file_header(string& filename, string& header)
{
    samFile *in = hts_open(filename.c_str(), "r");
    if (in == NULL)
    {
        cerr << "[vg::alignment] could not open " << filename << endl;
        exit(1);
    }

    bam_hdr_t *hdr = sam_hdr_read(in);
    if (hdr == NULL)
    {
        cerr << "[vg::alignment] could not read header of " << filename << endl;
        hts_close(in);
        exit(1);
    }

    header = hdr->text;
    hts_close(in);
    return hdr;
}
/*
 * Reads a file and outputs a new CRAM file to stdout with 'h'
 * replaced as the header. No checks are made to the validity.
 *
 * in        CRAM input positioned at its first container to copy
 * h         replacement header; its text is rewritten when add_PG is set
 * arg_list  command line recorded in the @PG line (may be NULL)
 * add_PG    non-zero to add a @PG line for samtools
 *
 * Returns 0 on success, -1 on failure (including a failure to open or close
 * the output).
 *
 * FIXME: error checking is still incomplete (e.g. the parsed header passed to
 * cram_fd_set_header is not checked).
 */
int cram_reheader(cram_fd *in, bam_hdr_t *h, const char *arg_list, int add_PG)
{
    htsFile *h_out = hts_open("-", "wc");
    if (!h_out)
        return -1;      // previously dereferenced unchecked

    cram_fd *out = h_out->fp.cram;
    cram_container *c = NULL;
    int ret = -1;

    // Attempt to fill out a cram->refs[] array from @SQ headers
    cram_fd_set_header(out, sam_hdr_parse_(h->text, h->l_text));
    if (add_PG) {
        if (sam_hdr_add_PG(cram_fd_get_header(out), "samtools",
                           "VN", samtools_version(),
                           arg_list ? "CL": NULL,
                           arg_list ? arg_list : NULL,
                           NULL) != 0)
            goto err;

        // Covert back to bam_hdr_t struct
        free(h->text);
        h->text = strdup(sam_hdr_str(cram_fd_get_header(out)));
        h->l_text = sam_hdr_length(cram_fd_get_header(out));
        if (!h->text)
            goto err;
    }

    if (sam_hdr_write(h_out, h) != 0)
        goto err;
    cram_set_option(out, CRAM_OPT_REFERENCE, NULL);

    // Copy every container and its blocks verbatim
    while ((c = cram_read_container(in))) {
        int32_t i, num_blocks = cram_container_get_num_blocks(c);
        if (cram_write_container(out, c) != 0)
            goto err;

        for (i = 0; i < num_blocks; i++) {
            cram_block *blk = cram_read_block(in);
            if (!blk || cram_write_block(out, blk) != 0) {
                if (blk) cram_free_block(blk);
                goto err;
            }
            cram_free_block(blk);
        }
        cram_free_container(c);
        c = NULL;       // nothing left for the error path to release
    }

    ret = 0;

 err:
    // A jump out of the copy loop leaves the current container allocated
    if (c)
        cram_free_container(c);
    if (hts_close(h_out) != 0)
        ret = -1;

    return ret;
}
/*
 * Loads the genetic map for the chromosome of `line` into args->genmap.
 *
 * The file name is args->genmap_fname with an optional "{CHROM}" placeholder
 * replaced by the sequence name of `line`. The first line must be the fixed
 * header; each following line supplies a position (column 1) and a genetic
 * map value (column 3). After loading, the values are scaled so that the
 * last entry equals 1.
 *
 * Returns 0 when no map is configured or the map was loaded, -1 when the
 * file cannot be opened (args->ngenmap is set to 0 in that case).
 *
 * NOTE(review): error() is used as if it does not return -- confirm.
 */
static int load_genmap(args_t *args, bcf1_t *line)
{
    if ( !args->genmap_fname ) { args->ngenmap = 0; return 0; }

    // The expanded file name and the line buffer are kept in separate
    // strings: sharing one buffer clobbered the name once reading started.
    kstring_t name = {0,0,0};
    kstring_t str = {0,0,0};

    char *fname = strstr(args->genmap_fname,"{CHROM}");
    if ( fname )
    {
        kputsn(args->genmap_fname, fname - args->genmap_fname, &name);
        kputs(bcf_seqname(args->hdr,line), &name);
        kputs(fname+7,&name);   // 7 == strlen("{CHROM}")
        fname = name.s;
    }
    else
        fname = args->genmap_fname;

    htsFile *fp = hts_open(fname, "rb");
    if ( !fp )
    {
        args->ngenmap = 0;
        free(name.s);           // was leaked on this path
        return -1;
    }

    // An unreadable first line is treated as an empty header
    int ret = hts_getline(fp, KS_SEP_LINE, &str);
    const char *found = (ret >= 0 && str.s) ? str.s : "";
    if ( strcmp(found,"position COMBINED_rate(cM/Mb) Genetic_Map(cM)") )
        error("Unexpected header, found:\n\t[%s], but expected:\n\t[position COMBINED_rate(cM/Mb) Genetic_Map(cM)]\n", found);

    args->ngenmap = args->igenmap = 0;
    while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 )
    {
        args->ngenmap++;
        hts_expand(genmap_t,args->ngenmap,args->mgenmap,args->genmap);
        genmap_t *gm = &args->genmap[args->ngenmap-1];

        char *tmp, *end;
        gm->pos = strtol(str.s, &tmp, 10);
        if ( str.s==tmp ) error("Could not parse %s: %s\n", fname, str.s);

        // skip second column; refuse lines that end early instead of
        // stepping past the terminating NUL
        if ( !*tmp ) error("Could not parse %s: %s\n", fname, str.s);
        tmp++;
        while ( *tmp && !isspace((unsigned char)*tmp) ) tmp++;
        if ( !*tmp ) error("Could not parse %s: %s\n", fname, str.s);

        // read the genetic map in cM
        gm->rate = strtod(tmp+1, &end);
        if ( tmp+1==end ) error("Could not parse %s: %s\n", fname, str.s);
    }
    if ( !args->ngenmap ) error("Genetic map empty?\n");

    int i;
    for (i=0; i<args->ngenmap; i++) args->genmap[i].rate /= args->genmap[args->ngenmap-1].rate; // scale to 1

    if ( hts_close(fp) ) error("Close failed\n");
    free(str.s);
    free(name.s);
    return 0;
}
bcf_sweep_t *bcf_sweep_init(const char *fname) { bcf_sweep_t *sw = (bcf_sweep_t*) calloc(1,sizeof(bcf_sweep_t)); sw->file = hts_open(fname, "r"); sw->fp = hts_get_bgzfp(sw->file); bgzf_index_build_init(sw->fp); sw->hdr = bcf_hdr_read(sw->file); sw->mrec = 1; sw->rec = (bcf1_t*) calloc(sw->mrec,(sizeof(bcf1_t))); sw->block_size = 1024*1024*3; sw->direction = SW_FWD; return sw; }
/*
 * Opens `path` with hts_open() in the given mode and reports a failure
 * through print_error().
 *
 * path  file to open
 * mode  htslib mode string; a leading 'w' is reported as "write", anything
 *       else as "read"
 *
 * Returns the handle from hts_open() (NULL on failure, if print_error()
 * returns).
 */
htsFile* safe_hts_open( char* path, char* mode)
{
    htsFile* bam_file;
    char err[500];

    bam_file = hts_open( path, mode);
    if( !bam_file)
    {
        /* snprintf: a long path must not overflow the fixed-size buffer */
        snprintf( err, sizeof( err), "[TARDIS INPUT ERROR] Unable to open file %s in %s mode.", path, mode[0]=='w' ? "write" : "read");
        print_error( err);
    }
    return bam_file;
}
samfile_t *samopen(const char *fn, const char *mode, const void *aux) { // hts_open() is really sam_open(), except for #define games samFile *hts_fp = hts_open(fn, mode); if (hts_fp == NULL) return NULL; samfile_t *fp = malloc(sizeof (samfile_t)); if (!fp) { sam_close(hts_fp); return NULL; } fp->file = hts_fp; fp->x.bam = hts_fp->fp.bgzf; if (strchr(mode, 'r')) { if (aux) { if (hts_set_fai_filename(fp->file, aux) != 0) { sam_close(hts_fp); free(fp); return NULL; } } fp->header = sam_hdr_read(fp->file); // samclose() will free this if (fp->header == NULL) { sam_close(hts_fp); free(fp); return NULL; } fp->is_write = 0; if (fp->header->n_targets == 0 && bam_verbose >= 1) fprintf(samtools_stderr, "[samopen] no @SQ lines in the header.\n"); } else { enum htsExactFormat fmt = hts_get_format(fp->file)->format; fp->header = (bam_hdr_t *)aux; // For writing, we won't free it fp->is_write = 1; if (!(fmt == text_format || fmt == sam) || strchr(mode, 'h')) { if (sam_hdr_write(fp->file, fp->header) < 0) { if (bam_verbose >= 1) fprintf(samtools_stderr, "[samopen] Couldn't write header\n"); sam_close(hts_fp); free(fp); return NULL; } } } return fp; }
/**
 * Detects the format of a genotype file and dispatches to the matching
 * scanner: BCF and VCF go to scanGenotypesVCF(), a file detected as `sam`
 * is reported and handled as BED via scanGenotypesBED(). Any other format
 * is reported as unsupported.
 *
 * @filename - genotype file to scan
 */
void union_data::scanGenotypes(string filename) {
	vrb.title("Scanning genotype data in [" + filename + "]");

	htsFile * fp = hts_open(filename.c_str(),"r");
	if (!fp) {
		// the handle used to be dereferenced without this check
		vrb.error("Cannot open file");
		return;
	}
	enum htsExactFormat fileformat = fp->format.format;
	hts_close(fp);

	if (fileformat == bcf) {
		vrb.bullet("File format detected: BCF");
		scanGenotypesVCF(filename);
	} else if (fileformat == vcf) {
		vrb.bullet("File format detected: VCF");
		scanGenotypesVCF(filename);
	} else if (fileformat == sam) {
		vrb.bullet("File format detected: BED");
		scanGenotypesBED(filename);
	} else vrb.error("File format not supported!");
}
/**
 * Scans a tabix-indexed BED file of phenotypes and records, for every
 * phenotype accepted by the filter, its id, chromosome, start (converted to
 * 1-based), end, optional group and strand.
 *
 * Column usage (0-based): 0 chromosome, 1 start, 2 end, 3 id, 4 group,
 * 5 strand. The filter is applied to the id, or to the group when grouping
 * is active.
 *
 * @fbed - BED file to scan; its tabix index must be loadable
 */
void cis_data::scanPhenotypes(string fbed) {
	int n_includedP = 0;
	int n_excludedP = 0;
	int n_negativeStrd = 0;

	//Open BED file
	vrb.title("Scanning phenotype data in [" + fbed + "]");
	htsFile *fp = hts_open(fbed.c_str(),"r");
	if (!fp) vrb.error("Cannot open file");
	tbx_t * tbx = tbx_index_load(fbed.c_str());
	if (!tbx) vrb.error("Cannot open index file");

	//Read header
	kstring_t str = {0,0,0};
	if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line");

	//Scan file
	vector < string > tokens;
	while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
		if (str.l && str.s[0] != tbx->conf.meta_char) {
			stb.split(string(str.s), tokens);
			// tokens[5] (strand) is read below, so six columns are required;
			// the previous limit of five allowed an out-of-range access
			if (tokens.size() < 6) vrb.error("Incorrect number of columns!");
			if ((grp_mode == GRP_NONE && filter_phenotype.check(tokens[3])) || (grp_mode != GRP_NONE && filter_phenotype.check(tokens[4]))) {
				phenotype_id.push_back(tokens[3]);
				phenotype_chr.push_back(tokens[0]);
				phenotype_start.push_back(atoi(tokens[1].c_str()) + 1);
				phenotype_end.push_back(atoi(tokens[2].c_str()));
				if (grp_mode > 0 && full_test) phenotype_grp.push_back("ALL_GENES");
				if (grp_mode > 0 && !full_test) phenotype_grp.push_back(tokens[4]);
				phenotype_neg.push_back(tokens[5] == "-");
				if (phenotype_neg.back()) n_negativeStrd ++;
				n_includedP++;
			} else n_excludedP ++;
		}
	}

	//Finalize & verbose
	free(str.s);	// line buffer was never released
	tbx_destroy(tbx);
	if (hts_close(fp)) vrb.error("Cannot properly close file");
	phenotype_count = phenotype_id.size();
	vrb.bullet(stb.str(n_includedP) + " phenotypes included");
	if (n_excludedP > 0) vrb.bullet(stb.str(n_excludedP) + " phenotypes excluded by user");
	if (n_negativeStrd > 0 ) vrb.bullet(stb.str(n_negativeStrd) + " phenotypes are on the negative strand");
	if (phenotype_count == 0) vrb.leave("Cannot find phenotypes in region!");
}
/*
 * Loads the text of `fname` into `hdr`, one '\n'-terminated line at a time.
 * Reading stops at the first line for which hts_getline() does not return a
 * positive value. Trailing whitespace is then stripped and a single newline
 * is appended, so the result always ends with exactly one '\n'.
 */
static void read_header_file(char *fname, kstring_t *hdr)
{
    kstring_t line = {0,0,0};
    hdr->l = 0;

    htsFile *in = hts_open(fname, "r");
    if ( in == NULL ) error("Could not read: %s\n", fname);

    for (;;)
    {
        if ( hts_getline(in, KS_SEP_LINE, &line) <= 0 ) break;
        kputsn(line.s, line.l, hdr);
        kputc('\n', hdr);
    }

    if ( hts_close(in) != 0 ) error("Close failed: %s\n", fname);
    free(line.s);

    // remove trailing newlines
    while ( hdr->l > 0 )
    {
        if ( !isspace(hdr->s[hdr->l-1]) ) break;
        hdr->l--;
    }
    kputc('\n', hdr);
}
/**
 * Calls `lambda` on every alignment record of an alignment file, in file order.
 *
 * @filename - file to read
 * @lambda   - callback receiving each record converted to an Alignment
 *
 * Returns 1 when the file was processed and 0 when it could not be opened
 * or its header could not be read.
 */
int hts_for_each(string& filename, function<void(Alignment&)> lambda)
{
    samFile *in = hts_open(filename.c_str(), "r");
    if (in == NULL) return 0;

    bam_hdr_t *hdr = sam_hdr_read(in);
    if (hdr == NULL)
    {
        // hdr->text was previously read without this check
        hts_close(in);
        return 0;
    }

    map<string, string> rg_sample;
    parse_rg_sample_map(hdr->text, rg_sample);

    bam1_t *b = bam_init1();
    while (sam_read1(in, hdr, b) >= 0)
    {
        Alignment a = bam_to_alignment(b, rg_sample);
        lambda(a);
    }

    bam_destroy1(b);
    bam_hdr_destroy(hdr);
    hts_close(in);
    return 1;
}
/*
 * Prepares the plugin run: takes the header of the first reader, duplicates
 * it for output, initialises the plugin and the optional filter, stamps the
 * output header with the command line and, unless the header is being
 * dropped, opens the output file and writes the header to it.
 */
static void init_data(args_t *args)
{
    bcf_hdr_t *in_hdr = args->files->readers[0].header;
    args->hdr = in_hdr;
    args->hdr_out = bcf_hdr_dup(args->hdr);

    init_plugin(args);

    if ( args->filter_str )
        args->filter = filter_init(args->hdr, args->filter_str);

    bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_plugin");

    if ( args->drop_header )
        return;

    args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
    if ( !args->out_fh )
        error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
    bcf_hdr_write(args->out_fh, args->hdr_out);
}
/*
 * Rewrites the header of an uncompressed VCF and streams the result to
 * standard output.
 *
 * The original header lines (those starting with '#') are collected first.
 * They are replaced by the contents of args->header_fname when given, and
 * the sample names are replaced from args->samples_fname when given. The new
 * header is written, followed by the first non-header line already read and
 * then the remainder of the file unchanged.
 */
static void reheader_vcf(args_t *args)
{
    kstring_t hdr = {0,0,0};
    htsFile *fp = hts_open(args->fname, "r");
    if ( !fp ) error("Failed to open: %s\n", args->fname);

    // Collect header lines; on exit fp->line holds the first data line
    // (or is empty when the file ended)
    while ( hts_getline(fp, KS_SEP_LINE, &fp->line) >=0 )
    {
        kputc('\n',&fp->line);  // hts_getline eats the newline character
        if ( fp->line.s[0]!='#' ) break;
        kputsn(fp->line.s,fp->line.l,&hdr);
    }

    int nsamples = 0;
    char **samples = NULL;
    if ( args->samples_fname )
        samples = hts_readlines(args->samples_fname, &nsamples);
    if ( args->header_fname )
    {
        free(hdr.s); hdr.s = NULL; hdr.l = hdr.m = 0;
        read_header_file(args->header_fname, &hdr);
    }
    if ( samples )
    {
        set_samples(samples, nsamples, &hdr);
        int i;
        for (i=0; i<nsamples; i++) free(samples[i]);
        free(samples);
    }

    // The lengths are size_t; cast for the %d conversions (passing size_t to
    // %d is a format/argument mismatch)
    int out = STDOUT_FILENO;
    if ( write(out, hdr.s, hdr.l)!=hdr.l ) error("Failed to write %d bytes\n", (int)hdr.l);
    free(hdr.s);
    if ( fp->line.l )
    {
        if ( write(out, fp->line.s, fp->line.l)!=fp->line.l ) error("Failed to write %d bytes\n", (int)fp->line.l);
    }
    while ( hts_getline(fp, KS_SEP_LINE, &fp->line) >=0 )   // uncompressed file implies small size, we don't worry about speed
    {
        kputc('\n',&fp->line);
        if ( write(out, fp->line.s, fp->line.l)!=fp->line.l ) error("Failed to write %d bytes\n", (int)fp->line.l);
    }
    hts_close(fp);
}
int file_type(const char *fname) { int l = strlen(fname); if (l>=7 && strcasecmp(fname+l-7, ".gff.gz") == 0) return IS_GFF; else if (l>=7 && strcasecmp(fname+l-7, ".bed.gz") == 0) return IS_BED; else if (l>=7 && strcasecmp(fname+l-7, ".sam.gz") == 0) return IS_SAM; else if (l>=7 && strcasecmp(fname+l-7, ".vcf.gz") == 0) return IS_VCF; else if (l>=4 && strcasecmp(fname+l-4, ".bcf") == 0) return IS_BCF; else if (l>=4 && strcasecmp(fname+l-4, ".bam") == 0) return IS_BAM; else if (l>=4 && strcasecmp(fname+l-5, ".cram") == 0) return IS_CRAM; htsFile *fp = hts_open(fname,"r"); enum htsExactFormat format = fp->format.format; hts_close(fp); if ( format == bcf ) return IS_BCF; if ( format == bam ) return IS_BAM; if ( format == cram ) return IS_CRAM; if ( format == vcf ) return IS_VCF; return 0; }
/// Opens `filename` for streaming and loads its index.
///
/// \param filename  file to open; must be neither a null pointer nor empty
/// \param region    optional region; when given, the stream is restricted
///                  to it via resetRegion()
///
/// Throws illumina::common::GeneralException when the file name is invalid
/// or the file cannot be opened.
hts_streamer::
hts_streamer(
    const char* filename,
    const char* region) :
    _is_record_set(false),
    _is_stream_end(false),
    _record_no(0),
    _stream_name(filename),
    _hfp(nullptr),
    _tidx(nullptr),
    _titr(nullptr),
    _kstr(kinit)
{
    if (filename == nullptr)
    {
        BOOST_THROW_EXCEPTION(illumina::common::GeneralException("hts filename is null ptr"));
    }

    if (filename[0] == '\0')
    {
        BOOST_THROW_EXCEPTION(illumina::common::GeneralException("hts filename is empty string"));
    }

    _hfp = hts_open(filename, "r");
    if (_hfp == nullptr)
    {
        std::ostringstream msg;
        msg << "Failed to open hts file: '" << filename << "'";
        BOOST_THROW_EXCEPTION(illumina::common::GeneralException(msg.str()));
    }

    _load_index();

    // read only a region of HTS file:
    if (region != nullptr)
    {
        resetRegion(region);
    }
}
/** * Parse intervals. Processes the interval list first followed by the interval string. Duplicates are dropped. * * @intervals - intervals stored in this vector * @interval_list - file containing intervals * @interval_string - comma delimited intervals in a string * * todo: merge overlapping sites? */ void Program::parse_intervals(std::vector<GenomeInterval>& intervals, std::string interval_list, std::string interval_string) { intervals.clear(); std::map<std::string, uint32_t> m; if (interval_list!="") { htsFile *file = hts_open(interval_list.c_str(), "r"); if (file) { kstring_t *s = &file->line; while (hts_getline(file, '\n', s)>=0) { std::string ss = std::string(s->s); if (m.find(ss)==m.end()) { m[ss] = 1; GenomeInterval interval(ss); intervals.push_back(interval); } } hts_close(file); } } std::vector<std::string> v; if (interval_string!="") split(v, ",", interval_string); for (uint32_t i=0; i<v.size(); ++i) { if (m.find(v[i])==m.end()) { m[v[i]] = 1; GenomeInterval interval(v[i]); intervals.push_back(interval); } } }
int main(int argc, char **argv) { if (argc != 3) { fprintf(stderr,"anno_setter <in.vcf.gz> <columns_string>\n"); return 1; } bcf_hdr_t *h = NULL; //bcf_hdr_init(); htsFile *fp = hts_open(argv[1], "r"); if (fp == NULL) error("%s : %s", argv[1], strerror(errno)); h = bcf_hdr_read(fp); if (h == NULL) error("failed to prase header"); bcf_hdr_t *out = bcf_hdr_dup(h); char *string = strdup(argv[2]); int ncols = 0; anno_col_t *cols = init_columns(string, h, out, &ncols, anno_is_vcf); print_anno_cols(cols, ncols); hts_close(fp); return 0; }
int hts_for_each_parallel(string& filename, function<void(Alignment&)> lambda) { samFile *in = hts_open(filename.c_str(), "r"); if (in == NULL) return 0; bam_hdr_t *hdr = sam_hdr_read(in); map<string, string> rg_sample; parse_rg_sample_map(hdr->text, rg_sample); int thread_count = get_thread_count(); vector<bam1_t*> bs; bs.resize(thread_count); for (auto& b : bs) { b = bam_init1(); } bool more_data = true; #pragma omp parallel shared(in, hdr, more_data, rg_sample) { int tid = omp_get_thread_num(); while (more_data) { bam1_t* b = bs[tid]; #pragma omp critical (hts_input) if (more_data) { more_data = sam_read1(in, hdr, b) >= 0; } if (more_data) { Alignment a = bam_to_alignment(b, rg_sample); lambda(a); } } } for (auto& b : bs) bam_destroy1(b); bam_hdr_destroy(hdr); hts_close(in); return 1; }
/** * Parse samples. Processes the sample list. Duplicates are dropped. * * @samples - samples stored in this vector * @sample_map - samples stored in this map * @sample_list - file containing sample names */ void Program::read_sample_list(std::vector<std::string>& samples, std::string sample_list) { samples.clear(); std::map<std::string, int32_t> map; if (sample_list!="") { htsFile *file = hts_open(sample_list.c_str(), "r"); if (file) { kstring_t *s = &file->line; while (hts_getline(file, '\n', s)>=0) { std::string ss = std::string(s->s); if (map.find(ss)==map.end()) { map[ss] = 1; samples.push_back(ss); } } hts_close(file); } } }
/*
 * Prepares a calling run: sets up the synced reader (targets, regions and
 * the input file), builds the output header (subset to the requested samples
 * when their number differs from the input), remaps family and ploidy
 * indexes to the output sample order, opens the output file and, unless the
 * qcall mode is selected, initialises the caller and writes the header.
 *
 * NOTE(review): error() is used as if it does not return -- confirm.
 */
static void init_data(args_t *args)
{
    args->aux.srs = bcf_sr_init();

    // Open files for input and output, initialize structures
    if ( args->targets )
    {
        if ( bcf_sr_set_targets(args->aux.srs, args->targets, args->targets_is_file, args->aux.flag&CALL_CONSTR_ALLELES ? 3 : 0)<0 )
            error("Failed to read the targets: %s\n", args->targets);

        if ( args->aux.flag&CALL_CONSTR_ALLELES && args->flag&CF_INS_MISSED )
        {
            args->aux.srs->targets->missed_reg_handler = print_missed_line;
            args->aux.srs->targets->missed_reg_data = args;
        }
    }
    if ( args->regions )
    {
        // this failure used to be reported as a problem with the targets
        if ( bcf_sr_set_regions(args->aux.srs, args->regions, args->regions_is_file)<0 )
            error("Failed to read the regions: %s\n", args->regions);
    }

    int i;
    if ( !bcf_sr_add_reader(args->aux.srs, args->bcf_fname) ) error("Failed to open: %s\n", args->bcf_fname);

    if ( args->nsamples && args->nsamples != bcf_hdr_nsamples(args->aux.srs->readers[0].header) )
    {
        // A different number of samples was requested: subset the header
        args->samples_map = (int *) malloc(sizeof(int)*args->nsamples);
        args->aux.hdr = bcf_hdr_subset(args->aux.srs->readers[0].header, args->nsamples, args->samples, args->samples_map);
        for (i=0; i<args->nsamples; i++)
            if ( args->samples_map[i]<0 ) error("No such sample: %s\n", args->samples[i]);
        if ( !bcf_hdr_nsamples(args->aux.hdr) ) error("No matching sample found\n");
    }
    else
    {
        args->aux.hdr = bcf_hdr_dup(args->aux.srs->readers[0].header);
        for (i=0; i<args->nsamples; i++)
            if ( bcf_hdr_id2int(args->aux.hdr,BCF_DT_SAMPLE,args->samples[i])<0 )
                error("No such sample: %s\n", args->samples[i]);
    }

    // Reorder ploidy and family indexes to match mpileup's output and exclude samples which are not available
    if ( args->aux.ploidy )
    {
        for (i=0; i<args->aux.nfams; i++)
        {
            int j;
            for (j=0; j<3; j++)
            {
                int k = bcf_hdr_id2int(args->aux.hdr, BCF_DT_SAMPLE, args->samples[ args->aux.fams[i].sample[j] ]);
                if ( k<0 ) error("No such sample: %s\n", args->samples[ args->aux.fams[i].sample[j] ]);
                args->aux.fams[i].sample[j] = k;
            }
        }
        uint8_t *ploidy = (uint8_t*) calloc(bcf_hdr_nsamples(args->aux.hdr), 1);
        for (i=0; i<args->nsamples; i++)    // i index in -s sample list
        {
            int j = bcf_hdr_id2int(args->aux.hdr, BCF_DT_SAMPLE, args->samples[i]);     // j index in the output VCF / subset VCF
            if ( j<0 )
            {
                fprintf(stderr,"Warning: no such sample: \"%s\"\n", args->samples[i]);
                continue;
            }
            ploidy[j] = args->aux.ploidy[i];
        }
        args->nsamples = bcf_hdr_nsamples(args->aux.hdr);
        for (i=0; i<args->nsamples; i++)
            assert( ploidy[i]==0 || ploidy[i]==1 || ploidy[i]==2 );
        free(args->aux.ploidy);
        args->aux.ploidy = ploidy;
    }

    args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
    if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));

    // qcall mode: nothing further is initialised and no header is written here
    if ( args->flag & CF_QCALL )
        return;

    if ( args->flag & CF_MCALL )
        mcall_init(&args->aux);

    if ( args->flag & CF_CCALL )
        ccall_init(&args->aux);

    if ( args->flag&CF_GVCF )
    {
        bcf_hdr_append(args->aux.hdr,"##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
        args->gvcf.rid  = -1;
        args->gvcf.line = bcf_init1();
        // two alleles per sample, both set to unphased allele 0
        args->gvcf.gt   = (int32_t*) malloc(2*sizeof(int32_t)*bcf_hdr_nsamples(args->aux.hdr));
        for (i=0; i<bcf_hdr_nsamples(args->aux.hdr); i++)
        {
            args->gvcf.gt[2*i+0] = bcf_gt_unphased(0);
            args->gvcf.gt[2*i+1] = bcf_gt_unphased(0);
        }
    }

    bcf_hdr_remove(args->aux.hdr, BCF_HL_INFO, "QS");
    bcf_hdr_remove(args->aux.hdr, BCF_HL_INFO, "I16");

    bcf_hdr_append_version(args->aux.hdr, args->argc, args->argv, "bcftools_call");
    bcf_hdr_write(args->out_fh, args->aux.hdr);

    if ( args->flag&CF_INS_MISSED ) init_missed_line(args);
}
/*
 * Walks the synced readers position by position and, for every position
 * accepted by the selected set operation, emits output:
 *   - the record of reader args->iwrite to a single output stream, when
 *     exactly one VCF is to be written without a prefix (or a targets list
 *     is used with one reader);
 *   - otherwise a line to the sites file (chromosome, 1-based position, REF,
 *     ALT list, and a 0/1 presence string with one character per reader)
 *     when args->fh_sites is set, and the records to the per-reader output
 *     files when a prefix is given.
 * Readers whose record fails their filter are treated as not having the line.
 */
void isec_vcf(args_t *args)
{
    bcf_srs_t *srs = args->files;
    kstring_t site = {0,0,0};
    htsFile *single_out = NULL;

    // When only one VCF is output, print VCF to pysam_stdout or -o file
    int to_single = (args->nwrite==1 && !args->prefix) ? 1 : 0;
    if ( args->targets_list && srs->nreaders==1 ) to_single = 1;

    if ( to_single )
    {
        single_out = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode(args->output_type));
        if ( single_out == NULL ) error("Can't write to %s: %s\n", args->output_fname? args->output_fname : "standard output", strerror(errno));
        if ( args->n_threads ) hts_set_threads(single_out, args->n_threads);
        if (args->record_cmd_line) bcf_hdr_append_version(srs->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec");
        bcf_hdr_write(single_out, srs->readers[args->iwrite].header);
    }
    if ( !args->nwrite && !to_single && !args->prefix )
        fprintf(pysam_stderr,"Note: -w option not given, printing list of sites...\n");

    int nhave;
    while ( (nhave = bcf_sr_next_line(srs)) )
    {
        bcf_sr_t *first_reader = NULL;
        bcf1_t *first_line = NULL;
        int k, mask = 0;

        // Apply the per-reader filters and remember the first surviving record
        for (k=0; k<srs->nreaders; k++)
        {
            if ( !bcf_sr_has_line(srs,k) ) continue;

            if ( args->nflt && args->flt[k] )
            {
                bcf1_t *candidate = bcf_sr_get_line(srs, k);
                int pass = filter_test(args->flt[k], candidate, NULL);
                if ( args->flt_logic[k] & FLT_EXCLUDE ) pass = !pass;
                if ( !pass )
                {
                    srs->has_line[k] = 0;
                    nhave--;
                    continue;
                }
            }

            if ( first_line == NULL )
            {
                first_line = srs->readers[k].buffer[0];
                first_reader = &srs->readers[k];
            }
            mask |= 1<<k;   // this may overflow for many files, but will be used only with two (OP_VENN)
        }

        // Decide whether the position satisfies the requested set operation
        int keep = 1;
        switch (args->isec_op)
        {
            case OP_COMPLEMENT:
                keep = ( nhave==1 && bcf_sr_has_line(srs,0) );
                break;
            case OP_EQUAL:
                keep = ( nhave == args->isec_n );
                break;
            case OP_PLUS:
                keep = ( nhave >= args->isec_n );
                break;
            case OP_MINUS:
                keep = ( nhave <= args->isec_n );
                break;
            case OP_EXACT:
                for (k=0; k<srs->nreaders; k++)
                    if ( srs->has_line[k] != args->isec_exact[k] ) break;
                keep = ( k >= srs->nreaders );
                break;
        }
        if ( !keep ) continue;

        if ( to_single )
        {
            if ( bcf_sr_has_line(srs,args->iwrite) )
                bcf_write1(single_out, srs->readers[args->iwrite].header, srs->readers[args->iwrite].buffer[0]);
            continue;
        }

        if ( args->fh_sites )
        {
            site.l = 0;
            kputs(first_reader->header->id[BCF_DT_CTG][first_line->rid].key, &site);
            kputc('\t', &site);
            kputw(first_line->pos+1, &site);
            kputc('\t', &site);
            if (first_line->n_allele > 0) kputs(first_line->d.allele[0], &site);
            else kputc('.', &site);
            kputc('\t', &site);
            if (first_line->n_allele > 1) kputs(first_line->d.allele[1], &site);
            else kputc('.', &site);
            for (k=2; k<first_line->n_allele; k++)
            {
                kputc(',', &site);
                kputs(first_line->d.allele[k], &site);
            }
            kputc('\t', &site);
            for (k=0; k<srs->nreaders; k++)
                kputc(bcf_sr_has_line(srs,k)?'1':'0', &site);
            kputc('\n', &site);
            fwrite(site.s,sizeof(char),site.l,args->fh_sites);
        }

        if ( !args->prefix ) continue;

        if ( args->isec_op==OP_VENN && mask==3 )
        {
            // shared by both files: written to the third and fourth outputs
            if ( !args->nwrite || args->write[0] )
                bcf_write1(args->fh_out[2], bcf_sr_get_header(srs,0), bcf_sr_get_line(srs,0));
            if ( !args->nwrite || args->write[1] )
                bcf_write1(args->fh_out[3], bcf_sr_get_header(srs,1), bcf_sr_get_line(srs,1));
        }
        else
        {
            for (k=0; k<srs->nreaders; k++)
            {
                if ( !bcf_sr_has_line(srs,k) ) continue;
                if ( args->write && !args->write[k] ) continue;
                bcf_write1(args->fh_out[k], srs->readers[k].header, srs->readers[k].buffer[0]);
            }
        }
    }

    if ( site.s ) free(site.s);
    if ( single_out ) hts_close(single_out);
}
/*
 * Prepares a concat run.
 *
 * Every input file is opened once to merge its header into args->out_hdr and
 * to verify that all files carry the same samples in the same order. The
 * output file is then opened and the merged header written. Depending on the
 * mode, the synced reader is set up for overlapping inputs, or the state for
 * phased concatenation is initialised.
 *
 * args->start_pos[i] (phased mode only) holds, per file: -2 for a file
 * without records, -1 when the first record starts a different chromosome
 * than the previous file's first record, otherwise the position of the
 * first record.
 *
 * NOTE(review): error() is used as if it does not return -- confirm.
 */
static void init_data(args_t *args)
{
    bcf1_t *line = NULL;

    // With phased concat, the chunks overlap and come in the right order.  To
    // avoid opening all files at once, store start positions to recognise need
    // for the next one. This way we can keep only two open chunks at once.
    if ( args->phased_concat )
    {
        args->start_pos = (int*) malloc(sizeof(int)*args->nfnames);
        line = bcf_init();
    }

    // (an unused kstring_t, shadowed further down, was removed here)
    int i, prev_chrid = -1;
    for (i=0; i<args->nfnames; i++)
    {
        htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("Failed to open: %s\n", args->fnames[i]);
        bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]);
        args->out_hdr = bcf_hdr_merge(args->out_hdr,hdr);
        if ( bcf_hdr_nsamples(hdr) != bcf_hdr_nsamples(args->out_hdr) )
            error("Different number of samples in %s. Perhaps \"bcftools merge\" is what you are looking for?\n", args->fnames[i]);

        int j;
        for (j=0; j<bcf_hdr_nsamples(hdr); j++)
            if ( strcmp(args->out_hdr->samples[j],hdr->samples[j]) )
                error("Different sample names in %s. Perhaps \"bcftools merge\" is what you are looking for?\n", args->fnames[i]);

        if ( args->phased_concat )
        {
            int ret = bcf_read(fp, hdr, line);
            if ( ret!=0 ) args->start_pos[i] = -2;  // empty file
            else
            {
                int chrid = bcf_hdr_id2int(args->out_hdr,BCF_DT_CTG,bcf_seqname(hdr,line));
                args->start_pos[i] = chrid==prev_chrid ? line->pos : -1;
                prev_chrid = chrid;
            }
        }
        bcf_hdr_destroy(hdr);
        hts_close(fp);
    }
    if ( line ) bcf_destroy(line);

    args->seen_seq = (int*) calloc(args->out_hdr->n[BCF_DT_CTG],sizeof(int));

    if ( args->phased_concat )
    {
        bcf_hdr_append(args->out_hdr,"##FORMAT=<ID=PQ,Number=1,Type=Integer,Description=\"Phasing Quality (bigger is better)\">");
        bcf_hdr_append(args->out_hdr,"##FORMAT=<ID=PS,Number=1,Type=Integer,Description=\"Phase Set\">");
    }
    if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_concat");
    args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
    if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
    if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);

    bcf_hdr_write(args->out_fh, args->out_hdr);

    if ( args->allow_overlaps )
    {
        args->files = bcf_sr_init();
        args->files->require_index = 1;
        if ( args->regions_list )
        {
            if ( bcf_sr_set_regions(args->files, args->regions_list, args->regions_is_file)<0 )
                error("Failed to read the regions: %s\n", args->regions_list);
        }
        if ( args->remove_dups )
        {
            if ( !strcmp(args->remove_dups,"snps") ) args->files->collapse |= COLLAPSE_SNPS;
            else if ( !strcmp(args->remove_dups,"indels") ) args->files->collapse |= COLLAPSE_INDELS;
            else if ( !strcmp(args->remove_dups,"both") ) args->files->collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS;
            else if ( !strcmp(args->remove_dups,"any") ) args->files->collapse |= COLLAPSE_ANY;
            else if ( !strcmp(args->remove_dups,"all") ) args->files->collapse |= COLLAPSE_ANY;
            else if ( !strcmp(args->remove_dups,"none") ) args->files->collapse = COLLAPSE_NONE;
            else error("The -D string \"%s\" not recognised.\n", args->remove_dups);
        }
        for (i=0; i<args->nfnames; i++)
            if ( !bcf_sr_add_reader(args->files,args->fnames[i]) ) error("Failed to open %s: %s\n", args->fnames[i],bcf_sr_strerror(args->files->errnum));
    }
    else if ( args->phased_concat )
    {
        // Remove empty files from the list: move each non-empty entry forward
        // over the empty ones, keeping the relative order of non-empty files
        int nok = 0;
        while (1)
        {
            while ( nok<args->nfnames && args->start_pos[nok]!=-2 ) nok++;
            if ( nok==args->nfnames ) break;

            i = nok;
            while ( i<args->nfnames && args->start_pos[i]==-2 ) i++;
            if ( i==args->nfnames ) break;

            int tmp = args->start_pos[nok]; args->start_pos[nok] = args->start_pos[i]; args->start_pos[i] = tmp;
            char *name = args->fnames[nok]; args->fnames[nok] = args->fnames[i]; args->fnames[i] = name;
        }
        for (i=nok; i<args->nfnames; i++) free(args->fnames[i]);
        args->nfnames = nok;

        for (i=1; i<args->nfnames; i++)
            if ( args->start_pos[i-1]!=-1 && args->start_pos[i]!=-1 && args->start_pos[i]<args->start_pos[i-1] )
                error("The files not in ascending order: %d in %s, %d in %s\n", args->start_pos[i-1]+1,args->fnames[i-1],args->start_pos[i]+1,args->fnames[i]);

        args->prev_chr = -1;
        args->swap_phase = (int*) calloc(bcf_hdr_nsamples(args->out_hdr),sizeof(int));
        args->nmatch = (int*) calloc(bcf_hdr_nsamples(args->out_hdr),sizeof(int));
        args->nmism  = (int*) calloc(bcf_hdr_nsamples(args->out_hdr),sizeof(int));
        args->phase_qual = (int32_t*) malloc(bcf_hdr_nsamples(args->out_hdr)*sizeof(int32_t));
        args->phase_set  = (int32_t*) malloc(bcf_hdr_nsamples(args->out_hdr)*sizeof(int32_t));
        args->files = bcf_sr_init();
        args->files->require_index = 1;
        args->ifname = 0;
    }
}