bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr, int ifrom, int ito) { bcf_sr_regions_t *reg; if ( !is_file ) return _regions_init_string(regions); reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t)); reg->start = reg->end = -1; reg->prev_start = reg->prev_seq = -1; reg->file = hts_open(regions, "rb"); if ( !reg->file ) { fprintf(stderr,"[%s:%d %s] Could not open file: %s\n", __FILE__,__LINE__,__FUNCTION__,regions); free(reg); return NULL; } reg->tbx = tbx_index_load(regions); if ( !reg->tbx ) { int len = strlen(regions); int is_bed = strcasecmp(".bed",regions+len-4) ? 0 : 1; if ( !is_bed && !strcasecmp(".bed.gz",regions+len-7) ) is_bed = 1; if ( reg->file->format.format==vcf ) ito = 1; // read the whole file, tabix index is not present while ( hts_getline(reg->file, KS_SEP_LINE, ®->line) > 0 ) { char *chr, *chr_end; int from, to, ret; ret = _regions_parse_line(reg->line.s, ichr,ifrom,abs(ito), &chr,&chr_end,&from,&to); if ( ret < 0 ) { if ( ito<0 ) ret = _regions_parse_line(reg->line.s, ichr,ifrom,ifrom, &chr,&chr_end,&from,&to); if ( ret<0 ) { fprintf(stderr,"[%s:%d] Could not parse the file %s, using the columns %d,%d[,%d]\n", __FILE__,__LINE__,regions,ichr+1,ifrom+1,ito+1); hts_close(reg->file); reg->file = NULL; free(reg); return NULL; } } if ( !ret ) continue; if ( is_bed ) from++; *chr_end = 0; _regions_add(reg, chr, from, to); *chr_end = '\t'; } hts_close(reg->file); reg->file = NULL; if ( !reg->nseqs ) { free(reg); return NULL; } return reg; } reg->seq_names = (char**) tbx_seqnames(reg->tbx, ®->nseqs); if ( !reg->seq_hash ) reg->seq_hash = khash_str2int_init(); int i; for (i=0; i<reg->nseqs; i++) { khash_str2int_set(reg->seq_hash,reg->seq_names[i],i); } reg->fname = strdup(regions); reg->is_bin = 1; return reg; }
/*
 * bcf_sr_add_reader() - open `fname` and append it to `files` as a new reader.
 *
 * Returns 1 on success and 0 on failure.  Failures store a reason in
 * files->errnum, with one exception: an allocation failure while growing the
 * reader arrays is reported on stderr only, because no errnum value for that
 * condition is visible in this part of the file.
 *
 * When the file cannot be opened or the arrays cannot be grown, `files` keeps
 * its previous readers.  Any later failure leaves the new, partly filled slot
 * in files->readers with nreaders already incremented; the opened file stays
 * referenced by reader->file.
 *
 * `fname` is stored by pointer (not copied), so it must outlive the reader.
 */
int bcf_sr_add_reader(bcf_srs_t *files, const char *fname)
{
    htsFile* file_ptr = hts_open(fname, "r");
    if ( ! file_ptr ) {
        files->errnum = open_failed;
        return 0;
    }

    // Grow both per-reader arrays through temporaries so that a failed
    // realloc neither loses the old array nor gets dereferenced.
    int *has_line = (int*) realloc(files->has_line, sizeof(int)*(files->nreaders+1));
    if ( !has_line )
    {
        fprintf(stderr,"[%s:%d %s] Error: could not allocate memory for %s\n", __FILE__,__LINE__,__FUNCTION__,fname);
        hts_close(file_ptr);
        return 0;
    }
    files->has_line = has_line;
    files->has_line[files->nreaders] = 0;

    bcf_sr_t *readers = (bcf_sr_t*) realloc(files->readers, sizeof(bcf_sr_t)*(files->nreaders+1));
    if ( !readers )
    {
        fprintf(stderr,"[%s:%d %s] Error: could not allocate memory for %s\n", __FILE__,__LINE__,__FUNCTION__,fname);
        hts_close(file_ptr);
        return 0;
    }
    files->readers = readers;

    bcf_sr_t *reader = &files->readers[files->nreaders++];
    memset(reader,0,sizeof(bcf_sr_t));

    reader->file = file_ptr;

    files->errnum = 0;

    if ( files->require_index )
    {
        if ( reader->file->format.format==vcf )
        {
            if ( reader->file->format.compression!=bgzf )
            {
                files->errnum = not_bgzf;
                return 0;
            }

            reader->tbx_idx = tbx_index_load(fname);
            if ( !reader->tbx_idx )
            {
                files->errnum = idx_load_failed;
                return 0;
            }

            reader->header = bcf_hdr_read(reader->file);
        }
        else if ( reader->file->format.format==bcf )
        {
            if ( reader->file->format.compression!=bgzf )
            {
                files->errnum = not_bgzf;
                return 0;
            }

            reader->header = bcf_hdr_read(reader->file);

            reader->bcf_idx = bcf_index_load(fname);
            if ( !reader->bcf_idx )
            {
                files->errnum = idx_load_failed;
                return 0;
            }
        }
        else
        {
            files->errnum = file_type_error;
            return 0;
        }
    }
    else
    {
        // No index required: only the header is read and the set switches to streaming mode
        if ( reader->file->format.format==bcf || reader->file->format.format==vcf )
        {
            reader->header = bcf_hdr_read(reader->file);
        }
        else
        {
            files->errnum = file_type_error;
            return 0;
        }
        files->streaming = 1;
    }

    if ( files->streaming && files->nreaders>1 )
    {
        files->errnum = api_usage_error;
        fprintf(stderr,"[%s:%d %s] Error: %d readers, yet require_index not set\n", __FILE__,__LINE__,__FUNCTION__,files->nreaders);
        return 0;
    }
    if ( files->streaming && files->regions )
    {
        files->errnum = api_usage_error;
        fprintf(stderr,"[%s:%d %s] Error: cannot tabix-jump in streaming mode\n", __FILE__,__LINE__,__FUNCTION__);
        return 0;
    }
    if ( !reader->header )
    {
        files->errnum = header_error;
        return 0;
    }

    reader->fname = fname;
    if ( files->apply_filters )
        reader->filter_ids = init_filters(reader->header, files->apply_filters, &reader->nfilter_ids);

    // Update list of chromosomes
    if ( !files->explicit_regs && !files->streaming )
    {
        int n,i;
        const char **names = reader->tbx_idx ? tbx_seqnames(reader->tbx_idx, &n) : bcf_hdr_seqnames(reader->header, &n);
        for (i=0; i<n; i++)
        {
            // The first name creates the regions list, later ones are appended with -1,-1
            if ( !files->regions )
                files->regions = _regions_init_string(names[i]);
            else
                _regions_add(files->regions, names[i], -1, -1);
        }
        free(names);
    }

    return 1;
}
/*
 * bcf_sr_add_reader() - open `fname` and append it to `files` as a new reader.
 *
 * Returns 1 on success and 0 on failure.  Most failures print a message to
 * stderr; a failed open and a missing header return 0 silently.
 *
 * The file is opened before the reader arrays are grown, so a file that
 * cannot be opened (or an allocation failure) leaves `files` with its
 * previous readers only.  Any later failure leaves the new, partly filled
 * slot in files->readers with nreaders already incremented.
 *
 * `fname` is stored by pointer (not copied), so it must outlive the reader.
 */
int bcf_sr_add_reader(bcf_srs_t *files, const char *fname)
{
    htsFile *file_ptr = hts_open(fname, "r");
    if ( !file_ptr ) return 0;

    // Grow both per-reader arrays through temporaries so that a failed
    // realloc neither loses the old array nor gets dereferenced.
    int *has_line = (int*) realloc(files->has_line, sizeof(int)*(files->nreaders+1));
    if ( !has_line )
    {
        fprintf(stderr,"[add_reader] Could not allocate memory for %s\n", fname);
        hts_close(file_ptr);
        return 0;
    }
    files->has_line = has_line;
    files->has_line[files->nreaders] = 0;

    bcf_sr_t *readers = (bcf_sr_t*) realloc(files->readers, sizeof(bcf_sr_t)*(files->nreaders+1));
    if ( !readers )
    {
        fprintf(stderr,"[add_reader] Could not allocate memory for %s\n", fname);
        hts_close(file_ptr);
        return 0;
    }
    files->readers = readers;

    bcf_sr_t *reader = &files->readers[files->nreaders++];
    memset(reader,0,sizeof(bcf_sr_t));

    reader->file = file_ptr;

    // File type: binary flag selects BCF vs VCF, compression adds the GZ bit
    reader->type = reader->file->is_bin? FT_BCF : FT_VCF;
    if (reader->file->is_compressed) reader->type |= FT_GZ;

    if ( files->require_index )
    {
        if ( reader->type==FT_VCF_GZ )
        {
            reader->tbx_idx = tbx_index_load(fname);
            if ( !reader->tbx_idx )
            {
                fprintf(stderr,"[add_reader] Could not load the index of %s\n", fname);
                return 0;
            }

            reader->header = bcf_hdr_read(reader->file);
        }
        else if ( reader->type==FT_BCF_GZ )
        {
            reader->header = bcf_hdr_read(reader->file);

            reader->bcf_idx = bcf_index_load(fname);
            if ( !reader->bcf_idx )
            {
                fprintf(stderr,"[add_reader] Could not load the index of %s\n", fname);
                return 0;   // not indexed..?
            }
        }
        else
        {
            fprintf(stderr,"Index required, expected .vcf.gz or .bcf file: %s\n", fname);
            return 0;
        }
    }
    else
    {
        // No index required: only the header is read and the set switches to streaming mode
        if ( reader->type & FT_BCF )
        {
            reader->header = bcf_hdr_read(reader->file);
        }
        else if ( reader->type & FT_VCF )
        {
            reader->header = bcf_hdr_read(reader->file);
        }
        else
        {
            fprintf(stderr,"File type not recognised: %s\n", fname);
            return 0;
        }
        files->streaming = 1;
    }

    if ( files->streaming && files->nreaders>1 )
    {
        fprintf(stderr,"[%s:%d %s] Error: %d readers, yet require_index not set\n", __FILE__,__LINE__,__FUNCTION__,files->nreaders);
        return 0;
    }
    if ( files->streaming && files->regions )
    {
        fprintf(stderr,"[%s:%d %s] Error: cannot tabix-jump in streaming mode\n", __FILE__,__LINE__,__FUNCTION__);
        return 0;
    }
    if ( !reader->header ) return 0;

    reader->fname = fname;
    if ( files->apply_filters )
        reader->filter_ids = init_filters(reader->header, files->apply_filters, &reader->nfilter_ids);

    // Update list of chromosomes
    if ( !files->explicit_regs && !files->streaming )
    {
        int n,i;
        const char **names = reader->tbx_idx ? tbx_seqnames(reader->tbx_idx, &n) : bcf_hdr_seqnames(reader->header, &n);
        for (i=0; i<n; i++)
        {
            // The first name creates the regions list, later ones are appended with -1,-1
            if ( !files->regions )
                files->regions = _regions_init_string(names[i]);
            else
                _regions_add(files->regions, names[i], -1, -1);
        }
        free(names);
    }

    return 1;
}
/*
 * query_regions() - print the records of `fname` that fall into the given
 * regions to standard output.
 *
 * args:   options read here: targets_fname (optional file used to build a
 *         region index that further filters the output), print_header and
 *         header_only.
 * fname:  indexed input file; BCF is handled through the BCF index, while
 *         VCF, SAM and unrecognised formats go through the tabix index.
 * regs:   array of `nregs` region strings.  The array and every string in it
 *         are freed before returning, so the caller must not reuse them.
 *
 * Returns 0.  All failures are reported through error().
 * NOTE(review): error() is assumed not to return (pointers are dereferenced
 * right after being checked with it) - confirm against its definition.
 */
static int query_regions(args_t *args, char *fname, char **regs, int nregs)
{
    int i;
    htsFile *fp = hts_open(fname,"r");
    if ( !fp ) error("Could not read %s\n", fname);
    enum htsExactFormat format = hts_get_format(fp)->format;

    // Optional targets file: records not overlapping it are skipped below
    regidx_t *reg_idx = NULL;
    if ( args->targets_fname )
    {
        reg_idx = regidx_init(args->targets_fname, NULL, NULL, 0, NULL);
        if ( !reg_idx ) error("Could not read %s\n", args->targets_fname);
    }

    if ( format == bcf )
    {
        htsFile *out = hts_open("-","w");
        if ( !out ) error("Could not open stdout\n");
        hts_idx_t *idx = bcf_index_load(fname);
        if ( !idx ) error("Could not load .csi index of %s\n", fname);
        bcf_hdr_t *hdr = bcf_hdr_read(fp);
        if ( !hdr ) error("Could not read the header: %s\n", fname);
        if ( args->print_header )
        {
            if ( bcf_hdr_write(out,hdr)!=0 ) error("Failed to write to stdout\n");
        }
        if ( !args->header_only )
        {
            bcf1_t *rec = bcf_init();
            for (i=0; i<nregs; i++)
            {
                hts_itr_t *itr = bcf_itr_querys(idx,hdr,regs[i]);
                if ( !itr ) continue;   // same handling as the tabix branch below
                while ( bcf_itr_next(fp, itr, rec) >=0 )
                {
                    if ( reg_idx && !regidx_overlap(reg_idx, bcf_seqname(hdr,rec),rec->pos,rec->pos+rec->rlen-1, NULL) ) continue;
                    if ( bcf_write(out,hdr,rec)!=0 ) error("Failed to write to stdout\n");
                }
                tbx_itr_destroy(itr);
            }
            bcf_destroy(rec);
        }
        if ( hts_close(out) ) error("hts_close returned non-zero status for stdout\n");
        bcf_hdr_destroy(hdr);
        hts_idx_destroy(idx);
    }
    else if ( format==vcf || format==sam || format==unknown_format )
    {
        tbx_t *tbx = tbx_index_load(fname);
        if ( !tbx ) error("Could not load .tbi/.csi index of %s\n", fname);
        kstring_t str = {0,0,0};
        if ( args->print_header )
        {
            // Header lines are the leading lines that start with the index's meta character
            while ( hts_getline(fp, KS_SEP_LINE, &str) >= 0 )
            {
                if ( !str.l || str.s[0]!=tbx->conf.meta_char ) break;
                if ( puts(str.s) < 0 ) error("Failed to write to stdout\n");
            }
        }
        if ( !args->header_only )
        {
            int nseq;
            const char **seq = NULL;
            // Sequence names are only needed to look records up in the targets index
            if ( reg_idx ) seq = tbx_seqnames(tbx, &nseq);
            for (i=0; i<nregs; i++)
            {
                hts_itr_t *itr = tbx_itr_querys(tbx, regs[i]);
                if ( !itr ) continue;
                while (tbx_itr_next(fp, tbx, itr, &str) >= 0)
                {
                    if ( reg_idx && !regidx_overlap(reg_idx,seq[itr->curr_tid],itr->curr_beg,itr->curr_end, NULL) ) continue;
                    if ( puts(str.s) < 0 ) error("Failed to write to stdout\n");
                }
                tbx_itr_destroy(itr);
            }
            free(seq);
        }
        free(str.s);
        tbx_destroy(tbx);
    }
    else if ( format==bam )
        error("Please use \"samtools view\" for querying BAM files.\n");

    if ( reg_idx ) regidx_destroy(reg_idx);
    if ( hts_close(fp) ) error("hts_close returned non-zero status: %s\n", fname);

    // The region strings are owned by this function from here on
    for (i=0; i<nregs; i++) free(regs[i]);
    free(regs);
    return 0;
}
int convert(int argc, char **argv) { if (argc < 2) return convert_help(); int c; char *in=NULL, *out=NULL, *bim=NULL, *vid=NULL, *tmp_dir=NULL, *ped=NULL; uint32_t num_fields, num_records, col = 2; int i_is_set = 0, o_is_set = 0, f_is_set = 0, b_is_set = 0, v_is_set = 0, t_is_set = 0, p_is_set = 0, r_is_set = 0; while((c = getopt (argc, argv, "hi:o:f:r:b:v:t:p:c:")) != -1) { switch (c) { case 'c': col = atoi(optarg); break; case 'p': p_is_set = 1; ped = optarg; break; case 't': t_is_set = 1; tmp_dir = optarg; break; case 'v': v_is_set = 1; vid = optarg; break; case 'b': b_is_set = 1; bim = optarg; break; case 'i': i_is_set = 1; in = optarg; break; case 'o': o_is_set = 1; out = optarg; break; case 'f': f_is_set = 1; num_fields = atoi(optarg); break; case 'r': r_is_set = 1; num_records = atoi(optarg); break; case 'h': convert_help(); return 1; case '?': if ( (optopt == 'i') || (optopt == 'f') || (optopt == 'r') || (optopt == 't') || (optopt == 's') || (optopt == 'p') || (optopt == 'c') || (optopt == 'o') ) fprintf (stderr, "Option -%c requires an argument.\n", optopt); else if (isprint (optopt)) fprintf (stderr, "Unknown option `-%c'.\n", optopt); else fprintf (stderr, "Unknown option character `\\x%x'.\n", optopt); default: convert_help(); return 1; } } char *type = argv[0]; if (i_is_set == 0) { printf("Input file is not set\n"); return convert_help(); } if (strcmp(type, "bcf") == 0) { if ( (f_is_set == 0) || (r_is_set == 0) ) { fprintf(stderr,"Attempting to autodetect num of records " "and fields from %s\n", in); //Try and auto detect the sizes, need the index tbx_t *tbx = NULL; hts_idx_t *idx = NULL; htsFile *fp = hts_open(in,"rb"); if ( !fp ) { fprintf(stderr,"Could not read %s\n", in); return 1; } bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) { fprintf(stderr,"Could not read the header: %s\n", in); return 1; } if (hts_get_format(fp)->format==vcf) { tbx = tbx_index_load(in); if ( !tbx ) { fprintf(stderr,"Could not load TBI index: %s\n", in); return 1; } } else 
if ( hts_get_format(fp)->format==bcf ) { idx = bcf_index_load(in); if ( !idx ) { fprintf(stderr,"Could not load CSI index: %s\n", in); return 1; } } else { fprintf(stderr, "Could not detect the file type as VCF or BCF: %s\n", in); return 1; } num_fields = hdr->n[BCF_DT_SAMPLE]; num_records = 0; const char **seq; int nseq; seq = tbx ? tbx_seqnames(tbx, &nseq) : bcf_index_seqnames(idx, hdr, &nseq); int i; uint32_t sum = 0; for (i = 0; i < nseq; ++i) { uint64_t records, v; hts_idx_get_stat(tbx ? tbx->idx: idx, i, &records, &v); num_records += records; } fprintf(stderr, "Number of records:%u\tNumber of fields:%u\n", num_records, num_fields); free(seq); hts_close(fp); bcf_hdr_destroy(hdr); if (idx) hts_idx_destroy(idx); if (tbx) tbx_destroy(tbx); } if (o_is_set == 0) { out = (char*)malloc(strlen(in) + 5); // 5 for ext and \0 strcpy(out,in); strcat(out, ".gqt"); } if (b_is_set == 0) { bim = (char*)malloc(strlen(in) + 5); // 5 for ext and \0 strcpy(bim,in); strcat(bim, ".bim"); } if (v_is_set == 0) { vid = (char*)malloc(strlen(in) + 5); // 5 for ext and \0 strcpy(vid,in); strcat(vid, ".vid"); } if (t_is_set == 0) { tmp_dir = (char*)malloc(3*sizeof(char)); // "./\0" strcpy(tmp_dir,"./"); } int r = bcf_wahbm(in, out, bim, vid, tmp_dir, num_fields, num_records); return r; } if (strcmp(type, "ped") == 0) { if (o_is_set == 0) { if (p_is_set == 1) { out = (char*)malloc(strlen(ped) + 4); // 4 for ext and \0 strcpy(out,ped); strcat(out, ".db"); } else { out = (char*)malloc(strlen(in) + 4); // 4 for ext and \0 strcpy(out,in); strcat(out, ".db"); } } fprintf(stderr, "Creating sample database %s\n", out); return ped_ped(in, ped, col, out); } return convert_help(); }