/*
 *  bcf_sweep_fwd() - read the next record while sweeping forward
 *
 *  Switches the sweep to the forward direction if needed, then reads one
 *  record into sw->rec[0]. Until the first forward pass has finished
 *  (sw->idx_done is unset) the file offset of a record is appended to
 *  sw->idx whenever it lies more than sw->block_size past the last saved
 *  offset (or when no offset has been saved yet).
 *
 *  Returns a pointer to sw->rec[0], or NULL when bcf_read1() reports a
 *  non-zero status; in that case the sweep is switched to the backward
 *  direction before returning.
 */
bcf1_t *bcf_sweep_fwd(bcf_sweep_t *sw)
{
    long offset;
    bcf1_t *record;

    if ( sw->direction==SW_BWD )
        sw_seek(sw, SW_FWD);

    // The offset is taken before the read, i.e. it is where this record starts
    offset = hts_utell(sw->file);
    record = &sw->rec[0];

    if ( bcf_read1(sw->file, sw->hdr, record)!=0 )
    {
        // last record, get ready for sweeping backwards
        sw->idx_done = 1;
        sw->fp->idx_build_otf = 0;
        sw_seek(sw, SW_BWD);
        return NULL;
    }

    // The offset table is only extended during the first forward pass
    if ( sw->idx_done )
        return record;

    // Still within block_size of the last saved offset: nothing to record
    if ( sw->nidx && offset - sw->idx[sw->nidx-1] <= sw->block_size )
        return record;

    sw->nidx++;
    hts_expand(uint64_t, sw->nidx, sw->midx, sw->idx);
    sw->idx[sw->nidx-1] = offset;

    return record;
}
int main_getalt(int argc, char *argv[]) { int c; char *fn; BGZF *fp; bcf1_t *b; bcf_hdr_t *h; kstring_t s = {0,0,0}; while ((c = getopt(argc, argv, "")) >= 0) { } if (argc - optind == 0) { fprintf(stderr, "Usage: bgt getalt <bgt-base>\n"); return 1; } fn = (char*)calloc(strlen(argv[optind]) + 5, 1); sprintf(fn, "%s.bcf", argv[optind]); fp = bgzf_open(fn, "r"); free(fn); assert(fp); h = bcf_hdr_read(fp); b = bcf_init1(); while (bcf_read1(fp, b) >= 0) { char *ref, *alt; int l_ref, l_alt, i, min_l; bcf_get_ref_alt1(b, &l_ref, &ref, &l_alt, &alt); min_l = l_ref < l_alt? l_ref : l_alt; for (i = 0; i < min_l && ref[i] == alt[i]; ++i); s.l = 0; kputs(h->id[BCF_DT_CTG][b->rid].key, &s); kputc(':', &s); kputw(b->pos + 1 + i, &s); kputc(':', &s); kputw(b->rlen - i, &s); kputc(':', &s); kputsn(alt + i, l_alt - i, &s); puts(s.s); } bcf_destroy1(b); bcf_hdr_destroy(h); bgzf_close(fp); free(s.s); return 0; }
/*
 *  sw_fill_buffer() - load the previous block of records into sw->rec
 *
 *  Steps one entry back in the saved offset table (sw->idx), seeks there and
 *  reads records into sw->rec[0..nrec-1], growing the array as needed.
 *  Reading stops when bcf_read1() returns non-zero or, for every block but
 *  the last one, when sw_rec_equal() matches the current record.
 *  Finally sw->rec[0] is passed to sw_rec_save().
 *
 *  Does nothing when sw->iidx is already 0.
 */
static void sw_fill_buffer(bcf_sweep_t *sw)
{
    if ( sw->iidx==0 ) return;
    sw->iidx--;

    int ret = hts_useek(sw->file, sw->idx[sw->iidx], 0);
    assert( ret==0 );

    sw->nrec = 0;
    for (;;)
    {
        // Re-derive the slot on every pass: hts_expand0() may move sw->rec
        bcf1_t *slot = &sw->rec[sw->nrec];

        ret = bcf_read1(sw->file, sw->hdr, slot);
        if ( ret!=0 ) break;

        bcf_unpack(slot, BCF_UN_STR);

        // if not in the last block, stop at the saved record
        if ( sw->iidx+1 < sw->nidx && sw_rec_equal(sw,slot) ) break;

        sw->nrec++;
        hts_expand0(bcf1_t, sw->nrec+1, sw->mrec, sw->rec);
    }

    sw_rec_save(sw, &sw->rec[0]);
}
int vcf_index_stats(char *fname, int stats) { char *fn_out = NULL; FILE *out; out = fn_out ? fopen(fn_out, "w") : stdout; const char **seq; int i, nseq; tbx_t *tbx = NULL; hts_idx_t *idx = NULL; htsFile *fp = hts_open(fname,"r"); if ( !fp ) { fprintf(stderr,"Could not read %s\n", fname); return 1; } bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) { fprintf(stderr,"Could not read the header: %s\n", fname); return 1; } if ( hts_get_format(fp)->format==vcf ) { tbx = tbx_index_load(fname); if ( !tbx ) { fprintf(stderr,"Could not load TBI index: %s\n", fname); return 1; } } else if ( hts_get_format(fp)->format==bcf ) { idx = bcf_index_load(fname); if ( !idx ) { fprintf(stderr,"Could not load CSI index: %s\n", fname); return 1; } } else { fprintf(stderr,"Could not detect the file type as VCF or BCF: %s\n", fname); return 1; } seq = tbx ? tbx_seqnames(tbx, &nseq) : bcf_index_seqnames(idx, hdr, &nseq); uint64_t sum = 0; for (i=0; i<nseq; i++) { uint64_t records, v; hts_idx_get_stat(tbx ? tbx->idx : idx, i, &records, &v); sum+=records; if (stats&2 || !records) continue; bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", seq[i], NULL); int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1; fprintf(out,"%s\t%s\t%" PRIu64 "\n", seq[i], hkey<0?".":hrec->vals[hkey], records); } if (!sum) { // No counts found. // Is this because index version has no stored count data, or no records? bcf1_t *rec = bcf_init1(); if (bcf_read1(fp, hdr, rec) >= 0) { fprintf(stderr,"%s index of %s does not contain any count metadata. Please re-index with a newer version of bcftools or tabix.\n", tbx ? "TBI" : "CSI", fname); return 1; } bcf_destroy1(rec); } if (stats&2) fprintf(out, "%" PRIu64 "\n", sum); free(seq); fclose(out); hts_close(fp); bcf_hdr_destroy(hdr); if (tbx) tbx_destroy(tbx); if (idx) hts_idx_destroy(idx); return 0; }
int main(int argc, char **argv) { int i, n; static struct option const long_opts[] = { {"out", required_argument, NULL, 1}, {"report", required_argument, NULL, 2}, {"dotasref", no_argument, NULL, 3}, {"help", no_argument, NULL, 0}, {"version", no_argument, NULL, 4}, {"export_uncov", no_argument, NULL, 5} }; bool help = FALSE; bool report_version = FALSE; while ((n = getopt_long(argc, argv, "1:2:304", long_opts, NULL)) >= 0) { switch (n) { case 1 : outfile = strdup(optarg); break; case 2 : report = strdup(optarg); break; case 3 : dotasref = TRUE; break; case 0 : help = TRUE; break; case 4 : report_version = TRUE; break; case 5 : export_uncover = TRUE; break; default : return 1; } if ( help ) return usage(); if ( report_version ) return show_version(); } n = argc - optind; if ( n > 1 ) errabort("only accept one input vcf"); if ( export_uncover == TRUE && outfile == FALSE) { warnings("export uncove region only used with option --out"); export_uncover = FALSE; } char * input; if ( n == 0 ) input = strdup("-"); else input = strdup(argv[optind]); htsFile * fp = read_vcf_file(input); enum htsExactFormat fmt = hts_get_format(fp)->format; if ( fmt != vcf && fmt != bcf ) errabort("This is not a VCF/BCF file : %s", input); bcf_hdr_t * hdr = bcf_hdr_read(fp); int n_samples = bcf_hdr_nsamples(hdr); if ( n_samples != 2 ) errabort("the input VCF/BCF file must contain only two samples! 
%d", n_samples); LOG("Using sample %s as ref ...", hdr->samples[0]); LOG("Using sample %s as test ...", hdr->samples[1]); uint32_t matrix[4][4] = { {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0} }; bcf1_t * v = bcf_init1(); kstring_t str = { 0, 0, 0 }; uint32_t line = 0; htsFile *out = NULL; if ( outfile && !check_filename(outfile) ) out = hts_open(outfile, mode); if ( out != NULL ) bcf_hdr_write(out, hdr); while ( bcf_read1(fp, hdr, v) >= 0 ) { bcf_unpack(v, BCF_UN_STR|BCF_UN_FMT); int k; str.l = 0; int tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, "GT"); if ( !bcf_hdr_idinfo_exists(hdr, BCF_HL_FMT, tag_id) ) warnings("There is no 'GT' in the header!"); for ( i = 0; i < v->n_fmt; ++i ) if ( v->d.fmt[i].id == tag_id ) break; if ( i == v->n_fmt ) { vcf_format1(hdr, v, &str); LOG("There is no tag GT in this line : %s", str.s); continue; } corr_t xy[2] = { {-1, -2, -2}, {-1, -2, -2} }; bcf_fmt_t * fmt = &v->d.fmt[i]; for ( i = 0; i < 2; ++i ) { int corr = i; if ( fmt == NULL ) { if ( dotasref == TRUE ) xy[corr].alt = ALT_IS_REF; else xy[corr].alt = ALT_IS_UNC; continue; } int last = -2; uint8_t *d = (uint8_t*)((char*)fmt->p + fmt->size*i); for ( k = 0; k < fmt->n && d[k] != (uint8_t)bcf_int8_vector_end; ++k ) { int curr = d[k]>>1; if ( last != curr ) { if ( curr ) { if ( last == -2 ) xy[corr].alt = curr > 1 ? ALT_IS_HOM : ALT_IS_REF; else xy[corr].alt = ALT_IS_HET; } else { xy[corr].alt = dotasref == TRUE ? ALT_IS_REF : ALT_IS_UNC; } } else { if ( curr ) { xy[corr].alt = curr > 1 ? ALT_IS_HOM : ALT_IS_REF; } else { xy[corr].alt = dotasref == TRUE ? 
ALT_IS_REF : ALT_IS_UNC; } } if (last == -2 ) { xy[corr].min = xy[corr].max = curr; } else { if ( curr < xy[corr].min ) xy[corr].min = curr; else if ( curr > xy[corr].max ) xy[corr].max = curr; } last = curr; } } matrix[xy[0].alt][xy[1].alt]++; if ( xy[0].alt != xy[1].alt && out != NULL) { if ( xy[0].alt == ALT_IS_UNC || xy[1].alt == ALT_IS_UNC ) { if ( export_uncover == TRUE ) { str.l = 0; vcf_format1(hdr, v, &str); vcf_write(out, hdr, v); } } else { str.l = 0; vcf_format1(hdr, v, &str); vcf_write(out, hdr, v); } } if ( xy[0].alt == ALT_IS_HET && xy[1].alt == ALT_IS_HET && (xy[0].min != xy[1].min || xy[0].max != xy[1].max ) ) { bias++; matrix[ALT_IS_HET][ALT_IS_HET]--; if ( out != NULL ) { str.l = 0; vcf_format1(hdr, v, &str); vcf_write(out, hdr, v); } } line++; } if ( out ) hts_close(out); if ( str.m ) free(str.s); write_report(matrix, hdr); bcf_hdr_destroy(hdr); free(input); bcf_destroy1(v); if ( outfile ) free(outfile); if ( report ) free(report); if ( hts_close(fp) ) warnings("hts_close returned non-zero status: %s", input); return 0; }
/*
 *  _reader_fill_buffer() - buffers all records with the same coordinate
 *
 *  Records are stored in reader->buffer[1..nbuffer]; slot nbuffer+1 is the
 *  scratch record the next line is read into.  Reading continues until a
 *  record with a position different from buffer[1] has been buffered, or the
 *  input is exhausted, in which case the iterator is destroyed.
 *
 *  The buffer grows in steps of 8 records.  Allocation failures terminate
 *  the program, in line with the other fatal error in this function.
 */
static void _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader)
{
    // Return if the buffer is full: the coordinate of the last buffered record differs
    if ( reader->nbuffer && reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) return;

    // No iterator (sequence not present in this file) and not streaming
    if ( !reader->itr && !files->streaming ) return;

    // Fill the buffer with records starting at the same position
    int i, ret = 0;
    while (1)
    {
        if ( reader->nbuffer+1 >= reader->mbuffer )
        {
            // Increase buffer size; keep the old pointer until realloc succeeds
            bcf1_t **grown = (bcf1_t**) realloc(reader->buffer, sizeof(bcf1_t*)*(reader->mbuffer+8));
            if ( !grown )
            {
                fprintf(stderr,"[%s:%d %s] could not allocate memory\n", __FILE__,__LINE__,__FUNCTION__);
                exit(1);
            }
            reader->buffer = grown;
            reader->mbuffer += 8;
            for (i=8; i>0; i--)     // initialize
            {
                bcf1_t *fresh = bcf_init1();
                if ( !fresh )
                {
                    fprintf(stderr,"[%s:%d %s] could not allocate memory\n", __FILE__,__LINE__,__FUNCTION__);
                    exit(1);
                }
                fresh->max_unpack = files->max_unpack;
                fresh->pos = -1;    // for rare cases when VCF starts from 1
                reader->buffer[reader->mbuffer-i] = fresh;
            }
        }

        // Scratch slot just past the last buffered record
        bcf1_t *next = reader->buffer[reader->nbuffer+1];

        if ( files->streaming )
        {
            if ( reader->file->format.format==vcf )
            {
                if ( (ret=hts_getline(reader->file, KS_SEP_LINE, &files->tmps)) < 0 ) break;   // no more lines
                // Kept separate from `ret` on purpose: a parse failure stops
                // the loop but leaves `ret` non-negative, so the
                // end-of-region handling below is not triggered by it.
                int parse_ret = vcf_parse1(&files->tmps, reader->header, next);
                if ( parse_ret<0 ) break;
            }
            else if ( reader->file->format.format==bcf )
            {
                if ( (ret=bcf_read1(reader->file, reader->header, next)) < 0 ) break; // no more lines
            }
            else
            {
                fprintf(stderr,"[%s:%d %s] fixme: not ready for this\n", __FILE__,__LINE__,__FUNCTION__);
                exit(1);
            }
        }
        else if ( reader->tbx_idx )
        {
            if ( (ret=tbx_itr_next(reader->file, reader->tbx_idx, reader->itr, &files->tmps)) < 0 ) break;  // no more lines
            vcf_parse1(&files->tmps, reader->header, next);
        }
        else
        {
            if ( (ret=bcf_itr_next(reader->file, reader->itr, next)) < 0 ) break; // no more lines
            bcf_subset_format(reader->header,next);
        }

        // apply filter
        if ( !reader->nfilter_ids )
            bcf_unpack(next, BCF_UN_STR);
        else
        {
            bcf_unpack(next, BCF_UN_STR|BCF_UN_FLT);
            // Rejected records are overwritten by the next read
            if ( !has_filter(reader, next) ) continue;
        }
        reader->nbuffer++;

        if ( reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) break;    // the buffer is full
    }
    if ( ret<0 )
    {
        // done for this region
        tbx_itr_destroy(reader->itr);
        reader->itr = NULL;
    }
    if ( files->collapse && reader->nbuffer>=2 && reader->buffer[1]->pos==reader->buffer[2]->pos )
        collapse_buffer(files, reader);
}