void remove_info(bcf1_t *line) { // remove all INFO fields if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO); int i; for (i=0; i<line->n_info; i++) { bcf_info_t *inf = &line->d.info[i]; if ( inf->vptr_free ) { free(inf->vptr - inf->vptr_off); inf->vptr_free = 0; } line->d.shared_dirty |= BCF1_DIRTY_INF; inf->vptr = NULL; } line->n_info=0; }
/** * Gets a string representation of the variant. */ std::string Variant::get_variant_string() { kstring_t var = {0,0,0}; bcf_unpack(v, BCF_UN_STR); var.l = 0; kputs(bcf_get_chrom(h, v), &var); kputc(':', &var); kputw(bcf_get_pos1(v), &var); kputc(':', &var); for (size_t i=0; i<bcf_get_n_allele(v); ++i) { if (i) kputc('/', &var); kputs(bcf_get_alt(v, i), &var); } std::string str(var.s); if (var.m) free(var.s); return str; }
// true if all samples are phased. // haploid genotypes are considered phased // ./. => not phased, .|. => phased int bcf_all_phased(const bcf_hdr_t *header, bcf1_t *line) { bcf_unpack(line, BCF_UN_FMT); bcf_fmt_t *fmt_ptr = bcf_get_fmt(header, line, "GT"); int all_phased = 1; if ( fmt_ptr ) { int i, isample; for (isample=0; isample<line->n_sample; isample++) { int sample_phased = 0; #define BRANCH_INT(type_t,vector_end) { \ type_t *p = (type_t*) (fmt_ptr->p + isample*fmt_ptr->size); \ for (i=0; i<fmt_ptr->n; i++) \ { \ if (fmt_ptr->n == 1 || (p[i] == vector_end && i == 1)) { sample_phased = 1; break; } /* haploid phased by definition */ \ if ( p[i] == vector_end ) { break; }; /* smaller ploidy */ \ if ( bcf_gt_is_missing(p[i]) ) continue; /* missing allele */ \ if ((p[i])&1) { \ sample_phased = 1; \ break; \ } \ } \ } switch (fmt_ptr->type) { case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break; case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break; case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break; default: fprintf(stderr, "[E::%s] todo: fmt_type %d\n", __func__, fmt_ptr->type); exit(1); break; } #undef BRANCH_INT if (!sample_phased) { all_phased = 0; break; } } } return all_phased; }
static void sw_fill_buffer(bcf_sweep_t *sw) { if ( !sw->iidx ) return; sw->iidx--; int ret = hts_useek(sw->file, sw->idx[sw->iidx], 0); assert( ret==0 ); sw->nrec = 0; bcf1_t *rec = &sw->rec[sw->nrec]; while ( (ret=bcf_read1(sw->file, sw->hdr, rec))==0 ) { bcf_unpack(rec, BCF_UN_STR); // if not in the last block, stop at the saved record if ( sw->iidx+1 < sw->nidx && sw_rec_equal(sw,rec) ) break; sw->nrec++; hts_expand0(bcf1_t, sw->nrec+1, sw->mrec, sw->rec); rec = &sw->rec[sw->nrec]; } sw_rec_save(sw, &sw->rec[0]); }
void merge_filter(args_t *args, bcf1_t *out) { bcf_srs_t *files = args->files; bcf_hdr_t *out_hdr = args->out_hdr; int i, ret; khiter_t kitr; strdict_t *tmph = args->tmph; kh_clear(strdict, tmph); maux_t *ma = args->maux; out->d.n_flt = 0; for (i=0; i<files->nreaders; i++) { if ( !ma->has_line[i]) continue; bcf_sr_t *reader = &files->readers[i]; bcf1_t *line = reader->buffer[0]; bcf_hdr_t *hdr = reader->header; bcf_unpack(line, BCF_UN_ALL); int k; for (k=0; k<line->d.n_flt; k++) { const char *flt = hdr->id[BCF_DT_ID][line->d.flt[k]].key; kitr = kh_get(strdict, tmph, flt); if ( kitr == kh_end(tmph) ) { int id = bcf_hdr_id2int(out_hdr, BCF_DT_ID, flt); if ( id==-1 ) error("The filter not defined: %s\n", flt); hts_expand(int,out->d.n_flt+1,ma->mflt,ma->flt); ma->flt[out->d.n_flt] = id; out->d.n_flt++; kh_put(strdict, tmph, flt, &ret); } } }
int calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which) { int i; for (i=0; i<line->n_allele; i++) ac[i]=0; // Use INFO/AC,AN field only when asked if ( which&BCF_UN_INFO ) { bcf_unpack(line, BCF_UN_INFO); int an_id = bcf_id2int(header, BCF_DT_ID, "AN"); int ac_id = bcf_id2int(header, BCF_DT_ID, "AC"); if ( an_id>=0 && ac_id>=0 ) { int i, an=0, ac_len=0, ac_type=0; uint8_t *ac_ptr=NULL; for (i=0; i<line->n_info; i++) { bcf_info_t *z = &line->d.info[i]; if ( z->key == an_id ) an = z->v1.i; else if ( z->key == ac_id ) { ac_ptr = z->vptr; ac_len = z->len; ac_type = z->type; } } int nac = 0; #define BRANCH_INT(type_t) { \ type_t *p = (type_t *) ac_ptr; \ for (i=0; i<ac_len; i++) \ { \ ac[i+1] = p[i]; \ nac += p[i]; \ } \ } if ( ac_type==BCF_BT_INT8 ) { BRANCH_INT(uint8_t) } else if ( ac_type==BCF_BT_INT16 ) { BRANCH_INT(uint16_t) } else if ( ac_type==BCF_BT_INT32 ) { BRANCH_INT(uint32_t) } #undef BRANCH_INT ac[0] = an - nac; return 1; }
void union_data::readGenotypesVCF(string fvcf,string region) { int n_includedG = 0; int n_excludedG_mult = 0; int n_excludedG_void = 0; int n_excludedG_user = 0; int n_includedS = 0; vector < int > mappingS; genotype_id.clear(); genotype_chr.clear(); genotype_start.clear(); genotype_end.clear(); genotype_val.clear(); genotype_count=0; genotype_id_to_idx.clear(); //Opening files bcf_srs_t * sr = bcf_sr_init(); //vrb.bullet("target region [" + regionGenotype.get() + "]"); //if (bcf_sr_set_regions(sr, regionGenotype.get().c_str(), 0) == -1) vrb.error("Cannot jump to region!"); bcf_sr_set_regions(sr, region.c_str(), 0); if(!(bcf_sr_add_reader (sr, fvcf.c_str()))) { switch (sr->errnum) { case not_bgzf: vrb.error("File not compressed with bgzip!"); case idx_load_failed: vrb.error("Impossible to load index file!"); case file_type_error: vrb.error("File format not detected by htslib!"); default : vrb.error("Unknown error!"); } } //Sample processing int n_samples = bcf_hdr_nsamples(sr->readers[0].header); for (int i0 = 0 ; i0 < n_samples ; i0 ++) { mappingS.push_back(findSample(string(sr->readers[0].header->samples[i0]))); if (mappingS.back() >= 0) n_includedS++; } //Read genotype data int ngt, ngt_arr = 0, nds, nds_arr = 0, * gt_arr = NULL, nsl, nsl_arr = 0, * sl_arr = NULL; float * ds_arr = NULL; bcf1_t * line; unsigned int linecount = 0; while(bcf_sr_next_line (sr)) { linecount ++; if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines"); line = bcf_sr_get_line(sr, 0); if (line->n_allele == 2) { ngt = bcf_get_genotypes(sr->readers[0].header, line, >_arr, &ngt_arr); nds = bcf_get_format_float(sr->readers[0].header, line,"DS", &ds_arr, &nds_arr); if (nds == n_samples || ngt == 2*n_samples) { bcf_unpack(line, BCF_UN_STR); string sid = string(line->d.id); if (filter_genotype.check(sid)) { genotype_id.push_back(sid); genotype_chr.push_back(string(bcf_hdr_id2name(sr->readers[0].header, line->rid))); string genotype_ref = string(line->d.allele[0]); genotype_start.push_back(line->pos + 1); nsl = bcf_get_info_int32(sr->readers[0].header, line, "END", &sl_arr, &nsl_arr); if (nsl >= 0 && nsl_arr == 1) genotype_end.push_back(sl_arr[0]); else genotype_end.push_back(genotype_start.back() + genotype_ref.size() - 1); genotype_val.push_back(vector < float > (sample_count, 0.0)); for(int i = 0 ; i < n_samples ; i ++) { if (mappingS[i] >= 0) { if (nds > 0) genotype_val.back()[mappingS[i]] = ds_arr[i]; else { if (gt_arr[2*i+0] == bcf_gt_missing || gt_arr[2*i+1] == bcf_gt_missing) genotype_val.back()[mappingS[i]] = bcf_float_missing; else genotype_val.back()[mappingS[i]] = bcf_gt_allele(gt_arr[2*i+0]) + bcf_gt_allele(gt_arr[2*i+1]); } } } pair < string, int > temp (sid,n_includedG); genotype_id_to_idx.insert(temp); n_includedG++; } else n_excludedG_user ++; } else n_excludedG_void ++; } else n_excludedG_mult ++; } //Finalize free(gt_arr); free(ds_arr); bcf_sr_destroy(sr); genotype_count = n_includedG; //vrb.bullet(stb.str(n_includedG) + " variants included"); //if (n_excludedG_user > 0) vrb.bullet(stb.str(n_excludedG_user) + " variants excluded by user"); //if (n_excludedG_mult > 0) vrb.bullet(stb.str(n_excludedG_mult) + " multi-allelic variants excluded"); //if (n_excludedG_void > 0) vrb.bullet(stb.str(n_excludedG_void) + " uninformative variants excluded [no GT/DS]"); //if (genotype_count == 0) vrb.leave("Cannot find genotypes in target region!"); }
//{{{ void get_bcf_query_result(uint32_t *mask, void get_bcf_query_result(uint32_t *mask, uint32_t mask_len, struct gqt_query *q, char **id_query_list, uint32_t *id_lens, uint32_t num_qs, uint32_t num_fields, char *vid_file_name, char *bcf_file_name, int bcf_output) { /* The VID file contains the line numbers of the variants after they have * been sorted. To reach back into the BCF file to print the metadata * associated with the variants marked in the mask, we need to create a * sorted list of line numbers we want. So first we intersect the VID file * and the mask, then sort it. */ /* FILE *vid_f = fopen(vid_file_name, "rb"); if (!vid_f) err(EX_NOINPUT, "Cannot read file\"%s\"", vid_file_name); uint32_t *vids = (uint32_t *) malloc(num_fields*sizeof(uint32_t)); if (!vids ) err(EX_OSERR, "malloc error"); size_t fr = fread(vids, sizeof(uint32_t), num_fields, vid_f); check_file_read(vid_file_name, vid_f, num_fields, fr); fclose(vid_f); */ struct vid_file *vid_f = open_vid_file(vid_file_name); load_vid_data(vid_f); uint32_t i, j, masked_vid_count = 0; for (i = 0; i < mask_len; ++i) masked_vid_count += popcount(mask[i]); uint32_t *masked_vids = (uint32_t *) malloc(masked_vid_count*sizeof(uint32_t)); if (!masked_vids ) err(EX_OSERR, "malloc error"); uint32_t masked_vid_i = 0; for (i = 0; i < mask_len; ++i) { uint32_t bytes = mask[i]; if (bytes == 0) continue; /* skip a bunch of ops if you can */ for (j = 0; j < 32; j++) { if (bytes & (1 << (31 - j))) { masked_vids[masked_vid_i] = vid_f->vids[i*32 + j]; masked_vid_i+=1; } } if (masked_vid_i == masked_vid_count) break; } destroy_vid_file(vid_f); qsort(masked_vids, masked_vid_count, sizeof(uint32_t), compare_uint32_t); htsFile *fp = hts_open(bcf_file_name,"rb"); bcf_hdr_t *hdr = bcf_hdr_read(fp); bcf1_t *line = bcf_init1(); //bcf_hdr_set_samples(hdr, print_name_csv, 0); htsFile *out; if (!bcf_output) out = hts_open("-", "w"); else out = hts_open("-", "wb"); int r = bcf_hdr_write(out, hdr); uint32_t bcf_line_i = 0; masked_vid_i = 0; while ( bcf_read(fp, hdr, line) != -1) { if (masked_vids[masked_vid_i] == bcf_line_i) { r = bcf_unpack(line, BCF_UN_ALL); r = bcf_write1(out, hdr, line); masked_vid_i+=1; } if (masked_vid_i == masked_vid_count) break; bcf_line_i += 1; } hts_close(out); hts_close(fp); }
int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which) { int i; for (i=0; i<line->n_allele; i++) ac[i]=0; // Use INFO/AC,AN field only when asked if ( which&BCF_UN_INFO ) { bcf_unpack(line, BCF_UN_INFO); int an_id = bcf_hdr_id2int(header, BCF_DT_ID, "AN"); int ac_id = bcf_hdr_id2int(header, BCF_DT_ID, "AC"); int i, an=-1, ac_len=0, ac_type=0; uint8_t *ac_ptr=NULL; if ( an_id>=0 && ac_id>=0 ) { for (i=0; i<line->n_info; i++) { bcf_info_t *z = &line->d.info[i]; if ( z->key == an_id ) an = z->v1.i; else if ( z->key == ac_id ) { ac_ptr = z->vptr; ac_len = z->len; ac_type = z->type; } } } if ( an>=0 && ac_ptr ) { int nac = 0; #define BRANCH_INT(type_t) { \ type_t *p = (type_t *) ac_ptr; \ for (i=0; i<ac_len; i++) \ { \ ac[i+1] = p[i]; \ nac += p[i]; \ } \ } switch (ac_type) { case BCF_BT_INT8: BRANCH_INT(int8_t); break; case BCF_BT_INT16: BRANCH_INT(int16_t); break; case BCF_BT_INT32: BRANCH_INT(int32_t); break; default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, ac_type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break; } #undef BRANCH_INT assert( an>=nac ); // sanity check for missing values ac[0] = an - nac; return 1; } } // Split genotype fields only when asked if ( which&BCF_UN_FMT ) { int i, gt_id = bcf_hdr_id2int(header,BCF_DT_ID,"GT"); if ( gt_id<0 ) return 0; bcf_unpack(line, BCF_UN_FMT); bcf_fmt_t *fmt_gt = NULL; for (i=0; i<(int)line->n_fmt; i++) if ( line->d.fmt[i].id==gt_id ) { fmt_gt = &line->d.fmt[i]; break; } if ( !fmt_gt ) return 0; #define BRANCH_INT(type_t,missing,vector_end) { \ for (i=0; i<line->n_sample; i++) \ { \ type_t *p = (type_t*) (fmt_gt->p + i*fmt_gt->size); \ int ial; \ for (ial=0; ial<fmt_gt->n; ial++) \ { \ if ( p[ial]==vector_end ) break; /* smaller ploidy */ \ if ( !(p[ial]>>1) || p[ial]==missing ) continue; /* missing allele */ \ ac[(p[ial]>>1)-1]++; \ } \ } \ } switch (fmt_gt->type) { case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, fmt_gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break; } #undef BRANCH_INT return 1; } return 0; }
int bcf_sr_next_line(readers_t *files) { int32_t min_pos = INT_MAX; int ret,i,j; kstring_t *str = &files->tmps; while ( min_pos==INT_MAX ) { // Need to open new chromosome? int eos = 0; for (i=0; i<files->nreaders; i++) if ( !files->readers[i].itr && !files->readers[i].nbuffer ) eos++; if ( eos==files->nreaders ) { const char *seq; if ( files->targets ) { seq = tgt_next_seq(files->targets); if ( !seq ) return 0; // all chroms scanned } else { if ( ++files->iseq >= files->nseqs ) return 0; // all chroms scanned seq = files->seqs[files->iseq]; } for (i=0; i<files->nreaders; i++) { reader_t *reader = &files->readers[i]; if ( reader->tbx ) reader->itr = tbx_itr_querys(reader->tbx,seq); else reader->itr = bcf_itr_querys(reader->bcf,reader->header,seq); } } // Find the smallest coordinate for (i=0; i<files->nreaders; i++) { reader_t *reader = &files->readers[i]; int buffer_full = ( reader->nbuffer && reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) ? 1 : 0; if ( reader->itr && !buffer_full ) { // Fill the buffer with records starting at the same position while (1) { if ( reader->nbuffer+1 >= reader->mbuffer ) { reader->mbuffer += 8; reader->buffer = (bcf1_t**) realloc(reader->buffer, sizeof(bcf1_t*)*reader->mbuffer); for (j=8; j>0; j--) reader->buffer[reader->mbuffer-j] = bcf_init1(); } if ( reader->tbx ) { ret = tbx_itr_next((BGZF*)reader->file->fp, reader->tbx, reader->itr, str); if ( ret<0 ) break; vcf_parse1(str, reader->header, reader->buffer[reader->nbuffer+1]); } else { ret = bcf_itr_next((BGZF*)reader->file->fp, reader->itr, reader->buffer[reader->nbuffer+1]); if ( ret<0 ) break; } bcf_unpack(reader->buffer[reader->nbuffer+1], BCF_UN_STR|BCF_UN_FLT); // apply filter if ( reader->filter_id!=-1 && reader->buffer[reader->nbuffer+1]->d.n_flt && reader->filter_id!=reader->buffer[reader->nbuffer+1]->d.flt[0] ) continue; set_variant_types(reader->buffer[reader->nbuffer+1]); reader->nbuffer++; if ( reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) break; } if ( ret<0 ) { tbx_itr_destroy(reader->itr); reader->itr = NULL; } // done for this chromosome } if ( reader->nbuffer ) { if ( min_pos > reader->buffer[1]->pos ) min_pos = reader->buffer[1]->pos; } // The buffer is full - either there is nothing else to read or the last record has a different coordinate if ( files->collapse && reader->nbuffer>2 && reader->buffer[1]->pos==reader->buffer[2]->pos ) { collapse_buffer(files, reader); } } if ( files->targets && min_pos!=INT_MAX ) { int ret = tgt_has_position(files->targets, min_pos); if ( ret==1 ) continue; // The position must be skipped if ( ret==-1 ) { // done for this chromosome, don't read the rest for (i=0; i<files->nreaders; i++) { files->readers[i].nbuffer = 0; if ( files->readers[i].itr ) { tbx_itr_destroy(files->readers[i].itr); files->readers[i].itr = NULL; } } min_pos = INT_MAX; continue; } // remove the active line, save the buffer line for (i=0; i<files->nreaders; i++) { reader_t *reader = &files->readers[i]; for (j=1; j<=reader->nbuffer; j++) if ( reader->buffer[j]->pos!=min_pos ) break; if ( j==1 ) continue; if ( j<=reader->nbuffer ) { bcf1_t *tmp = reader->buffer[1]; reader->buffer[1] = reader->buffer[j]; reader->buffer[j] = tmp; reader->nbuffer = 1; } else reader->nbuffer = 0; } min_pos = INT_MAX; } } //printf("[next_line] min_pos=%d\n", min_pos+1); //debug_buffers(files); // Set the current line ret = 0; bcf1_t *first = NULL; for (i=0; i<files->nreaders; i++) { reader_t *reader = &files->readers[i]; if ( !reader->nbuffer || reader->buffer[1]->pos!=min_pos ) continue; // Match the records by REF and ALT int j, irec = -1; if ( first ) { for (j=1; j<=reader->nbuffer; j++) { bcf1_t *line = reader->buffer[j]; if ( min_pos != line->pos ) break; // done with this buffer if ( files->collapse&COLLAPSE_ANY ) { irec=j; break; } // checking position only if ( files->collapse&COLLAPSE_SNPS && first->d.var_type&VCF_SNP && line->d.var_type&VCF_SNP ) { irec=j; break; } if ( files->collapse&COLLAPSE_INDELS && first->d.var_type&VCF_INDEL && line->d.var_type&VCF_INDEL ) { irec=j; break; } if ( first->rlen != line->rlen ) continue; // REFs do not match if ( strcmp(first->d.allele[0], line->d.allele[0]) ) continue; // REFs do not match int ial,jal; if ( files->collapse==COLLAPSE_NONE ) { // require exact match, all alleles must be identical if ( first->n_allele!=line->n_allele ) continue; // different number of alleles int nmatch = 1; // REF has been already checked for (ial=1; ial<first->n_allele; ial++) { for (jal=1; jal<line->n_allele; jal++) if ( !strcmp(first->d.allele[ial], line->d.allele[jal]) ) { nmatch++; break; } } if ( nmatch>=first->n_allele ) { irec=j; break; } } else { // thorough check: the REFs and some of the alleles have to be shared // (neglecting different representations of the same indel for now) for (ial=1; ial<first->n_allele; ial++) { for (jal=1; jal<line->n_allele; jal++) if ( !strcmp(first->d.allele[ial], line->d.allele[jal]) ) { irec=j; break; } if ( irec>=1 ) break; } } if ( irec>=1 ) break; } if ( irec==-1 ) continue; } else { first = reader->buffer[1]; irec = 1; } bcf1_t *tmp = reader->buffer[0]; reader->buffer[0] = reader->buffer[irec]; for (j=irec+1; j<=reader->nbuffer; j++) reader->buffer[j-1] = reader->buffer[j]; reader->buffer[ reader->nbuffer ] = tmp; reader->nbuffer--; ret |= 1<<i; } // fprintf(stdout,"[next_line] min_pos=%d mask=%d\n", min_pos+1, ret); // debug_buffers(stdout,files); return ret; }
int filter_test(filter_t *filter, bcf1_t *line) { bcf_unpack(line, BCF_UN_INFO); int i, nstack = 0; for (i=0; i<filter->nfilters; i++) { filter->filters[i].missing_value = 0; filter->filters[i].str_value = NULL; filter->filters[i].pass = -1; if ( filter->filters[i].tok_type == TOK_VAL ) { if ( filter->filters[i].setter ) filter->filters[i].setter(line, &filter->filters[i]); else if ( filter->filters[i].key ) { filter->filters[i].str_value = filter->filters[i].key; filter->filters[i].num_value = filter->filters[i].num_value; } else filter->filters[i].num_value = filter->filters[i].threshold; filter->flt_stack[nstack++] = &filter->filters[i]; continue; } if ( nstack<2 ) error("Error occurred while processing the filter \"%s\" (1:%d)\n", filter->str,nstack); // too few values left on the stack int is_str = (filter->flt_stack[nstack-1]->str_value ? 1 : 0) + (filter->flt_stack[nstack-2]->str_value ? 1 : 0 ); if ( filter->filters[i].tok_type == TOK_OR ) { if ( filter->flt_stack[nstack-1]->pass<0 || filter->flt_stack[nstack-2]->pass<0 ) error("Error occurred while processing the filter \"%s\" (%d %d OR)\n", filter->str,filter->flt_stack[nstack-2]->pass,filter->flt_stack[nstack-1]->pass); filter->flt_stack[nstack-2]->pass = filter->flt_stack[nstack-1]->pass + filter->flt_stack[nstack-2]->pass; nstack--; continue; } if ( filter->filters[i].tok_type == TOK_AND ) { if ( filter->flt_stack[nstack-1]->pass<0 || filter->flt_stack[nstack-2]->pass<0 ) error("Error occurred while processing the filter \"%s\" (%d %d AND)\n", filter->str,filter->flt_stack[nstack-2]->pass,filter->flt_stack[nstack-1]->pass); filter->flt_stack[nstack-2]->pass = filter->flt_stack[nstack-1]->pass * filter->flt_stack[nstack-2]->pass; nstack--; continue; } if ( filter->filters[i].tok_type == TOK_ADD ) { filter->flt_stack[nstack-2]->num_value += filter->flt_stack[nstack-1]->num_value; nstack--; continue; } else if ( filter->filters[i].tok_type == TOK_SUB ) { filter->flt_stack[nstack-2]->num_value -= filter->flt_stack[nstack-1]->num_value; nstack--; continue; } else if ( filter->filters[i].tok_type == TOK_MULT ) { filter->flt_stack[nstack-2]->num_value *= filter->flt_stack[nstack-1]->num_value; nstack--; continue; } else if ( filter->filters[i].tok_type == TOK_DIV ) { filter->flt_stack[nstack-2]->num_value /= filter->flt_stack[nstack-1]->num_value; nstack--; continue; } int is_true = 0; if ( filter->flt_stack[nstack-1]->missing_value || filter->flt_stack[nstack-2]->missing_value ) is_true = 0; else if ( filter->filters[i].tok_type == TOK_EQ ) { if ( filter->flt_stack[nstack-1]->comparator ) is_true = filter->flt_stack[nstack-1]->comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],TOK_EQ,line); else if ( filter->flt_stack[nstack-2]->comparator ) is_true = filter->flt_stack[nstack-2]->comparator(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],TOK_EQ,line); else if ( is_str==2 ) { int ncmp = filter->flt_stack[nstack-1]->num_value > filter->flt_stack[nstack-2]->num_value ? filter->flt_stack[nstack-1]->num_value : filter->flt_stack[nstack-2]->num_value; is_true = strncmp(filter->flt_stack[nstack-1]->str_value,filter->flt_stack[nstack-2]->str_value, ncmp) ? 0 : 1; } else if ( is_str==1 ) error("Comparing string to numeric value: %s\n", filter->str); else is_true = (filter->flt_stack[nstack-1]->num_value == filter->flt_stack[nstack-2]->num_value) ? 1 : 0; } else if ( filter->filters[i].tok_type == TOK_NE ) { if ( filter->flt_stack[nstack-1]->comparator ) is_true = filter->flt_stack[nstack-1]->comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],TOK_NE,line); else if ( filter->flt_stack[nstack-2]->comparator ) is_true = filter->flt_stack[nstack-2]->comparator(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],TOK_NE,line); else if ( is_str==2 ) { int ncmp = filter->flt_stack[nstack-1]->num_value > filter->flt_stack[nstack-2]->num_value ? filter->flt_stack[nstack-1]->num_value : filter->flt_stack[nstack-2]->num_value; is_true = strncmp(filter->flt_stack[nstack-1]->str_value,filter->flt_stack[nstack-2]->str_value, ncmp) ? 1 : 0; } else if ( is_str==1 ) error("Comparing string to numeric value: %s\n", filter->str); else is_true = ( filter->flt_stack[nstack-2]->num_value != filter->flt_stack[nstack-1]->num_value ) ? 1 : 0; } else if ( is_str>0 ) error("Wrong operator in string comparison: %s [%s,%s]\n", filter->str, filter->flt_stack[nstack-1]->str_value, filter->flt_stack[nstack-2]->str_value); else if ( filter->filters[i].tok_type == TOK_LE ) is_true = ( filter->flt_stack[nstack-2]->num_value <= filter->flt_stack[nstack-1]->num_value ) ? 1 : 0; else if ( filter->filters[i].tok_type == TOK_LT ) is_true = ( filter->flt_stack[nstack-2]->num_value < filter->flt_stack[nstack-1]->num_value ) ? 1 : 0; else if ( filter->filters[i].tok_type == TOK_EQ ) is_true = ( filter->flt_stack[nstack-2]->num_value == filter->flt_stack[nstack-1]->num_value ) ? 1 : 0; else if ( filter->filters[i].tok_type == TOK_BT ) is_true = ( filter->flt_stack[nstack-2]->num_value > filter->flt_stack[nstack-1]->num_value ) ? 1 : 0; else if ( filter->filters[i].tok_type == TOK_BE ) is_true = ( filter->flt_stack[nstack-2]->num_value >= filter->flt_stack[nstack-1]->num_value ) ? 1 : 0; else error("FIXME: did not expect this .. tok_type %d = %d\n", i, filter->filters[i].tok_type); filter->flt_stack[nstack-2]->pass = is_true; nstack--; } if ( nstack>1 ) error("Error occurred while processing the filter \"%s\" (2:%d)\n", filter->str,nstack); // too few values left on the stack return filter->flt_stack[0]->pass; }
int main(int argc, char **argv) { if (argc < 3) { fprintf(stderr,"%s <bcf file> <num vars>\n", argv[0]); return 1; } char *fname = argv[1]; uint32_t num_vars = atoi(argv[2]); htsFile *fp = hts_open(fname,"rb"); bcf_hdr_t *hdr = bcf_hdr_read(fp); bcf1_t *line = bcf_init1(); int32_t *gt_p = NULL; uint32_t num_inds = bcf_hdr_nsamples(hdr); int32_t i, j, k, ntmp = 0, int_i = 0, two_bit_i = 0, sum, t_sum = 0; uint32_t num_ind_ints = 1 + ((num_inds - 1) / 16); pri_queue q = priq_new(0); priority p; uint32_t *packed_ints = (uint32_t *) calloc(num_ind_ints, sizeof(uint32_t)); FILE *gt_of = fopen("gt.tmp.packed","wb"); FILE *md_of = fopen("md.tmp.packed","w"); uint32_t *md_index = (uint32_t *) malloc(num_vars * sizeof(uint32_t)); uint32_t md_i = 0; unsigned long t_bcf_read = 0, t_bcf_dup = 0, t_bcf_unpack = 0, t_bcf_get_genotypes = 0, t_bcf_hdr_nsamples = 0, t_q = 0, t_write = 0, t_get_md = 0, t_md_write = 0, t_pack = 0; for (i = 0; i < num_vars; ++i) { sum = 0; int_i = 0; two_bit_i = 0; int r = bcf_read(fp, hdr, line); // Copy bcf1_t *t_line = bcf_dup(line); // Unpack bcf_unpack(t_line, BCF_UN_ALL); // Get metadata size_t len = strlen(bcf_hdr_id2name(hdr, t_line->rid)) + 10 + // max length of pos strlen(t_line->d.id) + strlen(t_line->d.allele[0]) + strlen(t_line->d.allele[1]) + 4; //tabs char *md = (char *) malloc(len * sizeof(char)); sprintf(md, "%s\t%d\t%s\t%s\t%s", bcf_hdr_id2name(hdr, t_line->rid), t_line->pos + 1, t_line->d.id, t_line->d.allele[0], t_line->d.allele[1]); // Write metadata md_i += strlen(md); md_index[i] = md_i; fprintf(md_of, "%s", md); // Get gentotypes uint32_t num_gts_per_sample = bcf_get_genotypes(hdr, t_line, >_p, &ntmp); num_gts_per_sample /= num_inds; int32_t *gt_i = gt_p; // Pack genotypes for (j = 0; j < num_inds; ++j) { uint32_t gt = 0; for (k = 0; k < num_gts_per_sample; ++k) { gt += bcf_gt_allele(gt_i[k]); } packed_ints[int_i] += gt << (30 - 2*two_bit_i); two_bit_i += 1; if (two_bit_i == 16) { two_bit_i = 0; int_i += 1; } sum += gt; gt_i += num_gts_per_sample; } // Get a priority for the variant based on the sum and number of // leading zeros p.sum = sum; uint32_t prefix_len = 0; j = 0; while ((j < num_ind_ints) && (packed_ints[j] == 0)){ prefix_len += 32; j += 1; } if (j < num_ind_ints) prefix_len += nlz1(packed_ints[j]); // Push it into the q p.len = prefix_len; int *j = (int *) malloc (sizeof(int)); j[0] = i; priq_push(q, j, p); // Write to file fwrite(packed_ints, sizeof(uint32_t), num_ind_ints,gt_of); memset(packed_ints, 0, num_ind_ints*sizeof(uint32_t)); t_sum += sum; bcf_destroy(t_line); free(md); } fclose(gt_of); fclose(md_of); md_of = fopen("md.tmp.packed","r"); FILE *md_out = fopen("md.bim","w"); gt_of = fopen("gt.tmp.packed","rb"); FILE *s_gt_of = fopen("s.gt.tmp.packed","wb"); // Get variants in order and rewrite a variant-major sorted matrix while ( priq_top(q, &p) != NULL ) { int *d = priq_pop(q, &p); uint32_t start = 0; if (*d != 0) start = md_index[*d - 1]; uint32_t len = md_index[*d] - start; fseek(md_of, start*sizeof(char), SEEK_SET); char buf[len+1]; fread(buf, sizeof(char), len, md_of); buf[len] = '\0'; fseek(gt_of, (*d)*num_ind_ints*sizeof(uint32_t), SEEK_SET); fread(packed_ints, sizeof(uint32_t), num_ind_ints, gt_of); fwrite(packed_ints, sizeof(uint32_t), num_ind_ints,s_gt_of); fprintf(md_out, "%s\n", buf); } fclose(md_out); fclose(md_of); fclose(gt_of); fclose(s_gt_of); /* * In a packed-int variant-major matrix there will be a num_vars * number of rows, and a num_inds number of values packed into * num_inds_ints number of intergers. For examples, 16 rows of 16 values * will be 16 ints, where each int encodes 16 values. * */ uint32_t num_var_ints = 1 + ((num_vars - 1) / 16); uint32_t *I_data = (uint32_t *)calloc(num_var_ints*16,sizeof(uint32_t)); uint32_t **I = (uint32_t **)malloc(16*sizeof(uint32_t*)); for (i = 0; i < 16; ++i) I[i] = I_data + i*num_var_ints; uint32_t I_i = 0, I_int_i = 0; uint32_t v; s_gt_of = fopen("s.gt.tmp.packed","rb"); FILE *rs_gt_of = fopen("r.s.gt.tmp.packed","wb"); // Write these to values to that this is a well-formed uncompressed // packed int binary file (ubin) file fwrite(&num_vars, sizeof(uint32_t), 1, rs_gt_of); fwrite(&num_inds, sizeof(uint32_t), 1, rs_gt_of); /* * we need to loop over the columns in the v-major file. * There are num_vars rows, and num_ind_ints 16-ind packed columns * * In this loop : * i: cols in var-major form, rows in ind-major form * j: rows in var-major form, cols in ind-major form */ uint32_t num_inds_to_write = num_inds; for (i = 0; i < num_ind_ints; ++i) { // loop over each int col for (j = 0; j < num_vars; ++j) { // loop over head row in that col // skip to the value at the row/col fseek(s_gt_of, j*num_ind_ints*sizeof(uint32_t) + //row i*sizeof(uint32_t), //col SEEK_SET); fread(&v, sizeof(uint32_t), 1, s_gt_of); // one int corresponds to a col of 16 two-bit values // two_bit_i will move across the cols for (two_bit_i = 0; two_bit_i < 16; ++two_bit_i) { I[two_bit_i][I_i] += ((v >> (30 - 2*two_bit_i)) & 3) << (30 - 2*I_int_i); } I_int_i += 1; if (I_int_i == 16) { I_i += 1; I_int_i = 0; } } // When we are at the end of the file, and the number of lines // is not a factor of 16, only write out the lines that contain values if (num_inds_to_write >= 16) { fwrite(I_data, sizeof(uint32_t), num_var_ints*16, rs_gt_of); num_inds_to_write -= 16; } else { fwrite(I_data, sizeof(uint32_t), num_var_ints*num_inds_to_write, rs_gt_of); } memset(I_data, 0, num_var_ints*16*sizeof(uint32_t)); I_int_i = 0; I_i = 0; } fclose(s_gt_of); fclose(rs_gt_of); free(md_index); free(packed_ints); }
void genrich_data::readReferenceGenotypes(string fvcf) { vector < int > mappingS; //Opening files vrb.title("Reading variant list in [" + fvcf + "] MAF=" + stb.str(threshold_maf)); bcf_srs_t * sr = bcf_sr_init(); if(!(bcf_sr_add_reader (sr, fvcf.c_str()))) { switch (sr->errnum) { case not_bgzf: vrb.error("File not compressed with bgzip!"); case idx_load_failed: vrb.error("Impossible to load index file!"); case file_type_error: vrb.error("File format not detected by htslib!"); default : vrb.error("Unknown error!"); } } //Sample processing int included_sample = 0; int n_samples = bcf_hdr_nsamples(sr->readers[0].header); for (int i = 0 ; i < n_samples ; i ++) { mappingS.push_back(findSample(string(sr->readers[0].header->samples[i]))); if (mappingS.back() >= 0) included_sample ++; } vrb.bullet("#samples = " + stb.str(included_sample)); //Variant processing unsigned int n_excludedV_mult = 0, n_excludedV_void = 0, n_excludedV_rare = 0, n_excludedV_uchr = 0, n_line = 0, n_excludedV_toofar = 0; int ngt, ngt_arr = 0, *gt_arr = NULL; bcf1_t * line; while(bcf_sr_next_line (sr)) { line = bcf_sr_get_line(sr, 0); if (line->n_allele == 2) { bcf_unpack(line, BCF_UN_STR); string sid = string(line->d.id); string chr = string(bcf_hdr_id2name(sr->readers[0].header, line->rid)); int chr_idx = findCHR(chr); if (chr_idx >= 0) { unsigned int pos = line->pos + 1; ngt = bcf_get_genotypes(sr->readers[0].header, line, >_arr, &ngt_arr); if (ngt == 2*n_samples) { double freq = 0.0, tot = 0.0; for(int i = 0 ; i < n_samples ; i ++) { assert(gt_arr[2*i+0] != bcf_gt_missing && gt_arr[2*i+1] != bcf_gt_missing); if (mappingS[i] >= 0) { freq += bcf_gt_allele(gt_arr[2*i+0]) + bcf_gt_allele(gt_arr[2*i+1]); tot += 2.0; } } double maf = freq / tot; if (maf > 0.5) maf = 1.0 - maf; if (maf >= threshold_maf) { int dist_tss = getDistance(chr_idx, pos); if (dist_tss < 1e6) { string tmp_id = chr + "_" + stb.str(pos); genotype_uuid.insert(pair < string, unsigned int > (tmp_id, genotype_pos.size())); genotype_chr.push_back(chr_idx); genotype_pos.push_back(pos); genotype_maf.push_back(maf); genotype_dist.push_back(dist_tss); genotype_haps.push_back(vector < bool > (2 * included_sample, false)); for(int i = 0 ; i < n_samples ; i ++) { if (mappingS[i] >= 0) { genotype_haps.back()[2 * mappingS[i] + 0] = bcf_gt_allele(gt_arr[2 * i + 0]); genotype_haps.back()[2 * mappingS[i] + 1] = bcf_gt_allele(gt_arr[2 * i + 1]); } } } else n_excludedV_toofar++; } else n_excludedV_rare ++; } else n_excludedV_void ++; } else n_excludedV_uchr ++; } else n_excludedV_mult ++; if (n_line % 100000 == 0) vrb.bullet("#lines = " + stb.str(n_line)); n_line ++; } genotype_qtl = vector < bool > (genotype_pos.size(), false); genotype_gwas = vector < bool > (genotype_pos.size(), false); genotype_bin = vector < int > (genotype_pos.size(), -1); //Finalize bcf_sr_destroy(sr); vrb.bullet(stb.str(genotype_pos.size()) + " variants included"); if (n_excludedV_mult > 0) vrb.bullet(stb.str(n_excludedV_mult) + " multi-allelic variants excluded"); if (n_excludedV_uchr > 0) vrb.bullet(stb.str(n_excludedV_uchr) + " variants with unreferenced chromosome in --tss"); if (n_excludedV_rare > 0) vrb.bullet(stb.str(n_excludedV_rare) + " maf filtered variants"); if (n_excludedV_toofar > 0) vrb.bullet(stb.str(n_excludedV_toofar) + " too far variants"); }
// add ROC decision point void BlockQuantify::addROCValue(std::string const & roc_identifier, roc::DecisionType dt, double level, uint64_t n, bcf1_t * v) { // add observation to a roc auto observe = [this, level, dt, n, v](std::string const & name, bool f) { roc::DecisionType final_dt = dt; if(f) { if(dt == roc::DecisionType::TP) { // filter-failed TPs become FNs final_dt = roc::DecisionType::FN; } else if(dt == roc::DecisionType::TP2) { // filter-failed TPs become FNs final_dt = roc::DecisionType::FN2; } else if(dt != roc::DecisionType::FN) { // filter-failed FPs / UNKs become Ns final_dt = roc::DecisionType::N; } } uint64_t flags = 0; switch(final_dt) { case roc::DecisionType::FN: case roc::DecisionType::TP: { const std::string bk_truth = bcfhelpers::getFormatString(_impl->hdr, v, "BK", 0, "."); const std::string bi_truth = bcfhelpers::getFormatString(_impl->hdr, v, "BI", 0, "."); const std::string blt_truth = bcfhelpers::getFormatString(_impl->hdr, v, "BLT", 0, "."); flags = roc::makeObservationFlags(bk_truth, bi_truth, blt_truth); break; } case roc::DecisionType::FP: case roc::DecisionType::UNK: case roc::DecisionType::TP2: case roc::DecisionType::FN2: { const std::string bk_query = bcfhelpers::getFormatString(_impl->hdr, v, "BK", 1, "."); const std::string bi_query = bcfhelpers::getFormatString(_impl->hdr, v, "BI", 1, "."); const std::string blt_query = bcfhelpers::getFormatString(_impl->hdr, v, "BLT", 1, "."); flags = roc::makeObservationFlags(bk_query, bi_query, blt_query); break; } default: break; } auto it = _impl->rocs.find(name); if(it == _impl->rocs.end()) { it = _impl->rocs.insert(std::make_pair(name, roc::Roc())).first; } // make sure FN and N always come first if( ( final_dt == roc::DecisionType::FN || final_dt == roc::DecisionType::FN2 || final_dt == roc::DecisionType::N ) && level == 0 ) { it->second.add(roc::Observation{std::numeric_limits<double>::min(), final_dt, n, flags}); } else { it->second.add(roc::Observation{level, final_dt, n, flags}); } }; bcf_unpack(v, BCF_UN_FLT); bool fail = false; // fails any of the non-blocked filters bool fail_any = false; // fails any filter for(int j = 0; j < v->d.n_flt; ++j) { std::string filter = "PASS"; int k = v->d.flt[j]; if(k >= 0) { filter = bcf_hdr_int2id(_impl->hdr, BCF_DT_ID, v->d.flt[j]); } if(filter != "PASS" && filter != "") { if(!_impl->filters_to_ignore.count("*") && !_impl->filters_to_ignore.count(filter)) { fail = true; observe("f:" + roc_identifier + ":" + filter, false); } else { observe("f:" + roc_identifier + ":SEL_IGN_" + filter, false); } fail_any = true; } } observe("a:" + roc_identifier + ":PASS", fail_any); // selectively-filtered ROCs if(!_impl->filters_to_ignore.empty()) { observe("a:" + roc_identifier + ":SEL", fail); } observe("a:" + roc_identifier + ":ALL", false); const std::string regions = bcfhelpers::getInfoString(_impl->hdr, v, "Regions", ""); if(!regions.empty() && regions != "CONF") { std::vector<std::string> rs; stringutil::split(regions, rs, ","); for(auto const & r : rs) { if(r == "CONF") { continue; } observe("s|" + r + ":" + roc_identifier + ":PASS", fail_any); if(!_impl->filters_to_ignore.empty()) { observe("s|" + r + ":" + roc_identifier + ":SEL", fail); } observe("s|" + r + ":" + roc_identifier + ":ALL", false); } } }
static void cross_check_gts(args_t *args) { int nsamples = bcf_hdr_nsamples(args->sm_hdr), ndp_arr = 0; unsigned int *dp = (unsigned int*) calloc(nsamples,sizeof(unsigned int)), *ndp = (unsigned int*) calloc(nsamples,sizeof(unsigned int)); // this will overflow one day... int fake_pls = args->no_PLs, ignore_dp = 0; int i,j,k,idx, pl_warned = 0, dp_warned = 0; int32_t *dp_arr = NULL; int *is_hom = args->hom_only ? (int*) malloc(sizeof(int)*nsamples) : NULL; if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 ) { if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 ) error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname); fprintf(stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname); fake_pls = 1; } if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "DP")<0 ) ignore_dp = 1; FILE *fp = args->plot ? open_file(NULL, "w", "%s.tab", args->plot) : stdout; print_header(args, fp); if ( args->all_sites ) fprintf(fp,"# [1]SD, Average Site Discordance\t[2]Chromosome\t[3]Position\t[4]Number of available pairs\t[5]Average discordance\n"); while ( bcf_sr_next_line(args->files) ) { bcf1_t *line = args->files->readers[0].buffer[0]; bcf_unpack(line, BCF_UN_FMT); int npl; if ( !fake_pls ) { npl = bcf_get_format_int32(args->sm_hdr, line, "PL", &args->pl_arr, &args->npl_arr); if ( npl<=0 ) { pl_warned++; continue; } npl /= nsamples; } else npl = fake_PLs(args, args->sm_hdr, line); if ( !ignore_dp && bcf_get_format_int32(args->sm_hdr, line, "DP", &dp_arr, &ndp_arr) <= 0 ) { dp_warned++; continue; } if ( args->hom_only ) { for (i=0; i<nsamples; i++) is_hom[i] = is_hom_most_likely(line->n_allele, args->pl_arr+i*npl); } double sum = 0; int nsum = 0; idx = 0; for (i=0; i<nsamples; i++) { int *ipl = &args->pl_arr[i*npl]; if ( *ipl==-1 ) { idx += i; continue; } // missing genotype if ( !ignore_dp && (dp_arr[i]==bcf_int32_missing || !dp_arr[i]) ) { idx += i; continue; } if ( args->hom_only && !is_hom[i] ) { idx += i; continue; } for (j=0; j<i; j++) { int *jpl = &args->pl_arr[j*npl]; if ( *jpl==-1 ) { idx++; continue; } // missing genotype if ( !ignore_dp && (dp_arr[j]==bcf_int32_missing || !dp_arr[j]) ) { idx++; continue; } if ( args->hom_only && !is_hom[j] ) { idx++; continue; } int min_pl = INT_MAX; for (k=0; k<npl; k++) { if ( ipl[k]==bcf_int32_missing || jpl[k]==bcf_int32_missing ) break; if ( ipl[k]==bcf_int32_vector_end || jpl[k]==bcf_int32_vector_end ) { k = npl; break; } if ( min_pl > ipl[k]+jpl[k] ) min_pl = ipl[k]+jpl[k]; } if ( k!=npl ) { idx++; continue; } if ( args->all_sites ) { sum += min_pl; nsum++; } args->lks[idx] += min_pl; args->cnts[idx]++; if ( !ignore_dp ) { args->dps[idx] += dp_arr[i] < dp_arr[j] ? dp_arr[i] : dp_arr[j]; dp[i] += dp_arr[i]; ndp[i]++; dp[j] += dp_arr[j]; ndp[j]++; } else { args->dps[idx]++; dp[i]++; ndp[i]++; dp[j]++; ndp[j]++; } idx++; } } if ( args->all_sites ) fprintf(fp,"SD\t%s\t%d\t%d\t%.0f\n", args->sm_hdr->id[BCF_DT_CTG][line->rid].key, line->pos+1, nsum, nsum?sum/nsum:0); } if ( dp_arr ) free(dp_arr); if ( args->pl_arr ) free(args->pl_arr); if ( args->tmp_arr ) free(args->tmp_arr); if ( is_hom ) free(is_hom); if ( pl_warned ) fprintf(stderr, "[W::%s] PL was not found at %d site(s)\n", __func__, pl_warned); if ( dp_warned ) fprintf(stderr, "[W::%s] DP was not found at %d site(s)\n", __func__, dp_warned); // Output samples sorted by average discordance double *score = (double*) calloc(nsamples,sizeof(double)); args->sites = (double*) calloc(nsamples,sizeof(double)); idx = 0; for (i=0; i<nsamples; i++) { for (j=0; j<i; j++) { score[i] += args->lks[idx]; score[j] += args->lks[idx]; args->sites[i] += args->cnts[idx]; args->sites[j] += args->cnts[idx]; idx++; } } for (i=0; i<nsamples; i++) if ( args->sites[i] ) score[i] /= args->sites[i]; double **p = (double**) malloc(sizeof(double*)*nsamples), avg_score = 0; for (i=0; i<nsamples; i++) p[i] = &score[i]; qsort(p, nsamples, sizeof(int*), cmp_doubleptr); // The average discordance gives the number of differing sites in % with -G1 fprintf(fp, "# [1]SM\t[2]Average Discordance\t[3]Average depth\t[4]Average number of sites\t[5]Sample\t[6]Sample ID\n"); for (i=0; i<nsamples; i++) { idx = p[i] - score; double adp = ndp[idx] ? (double)dp[idx]/ndp[idx] : 0; double nsites = args->sites[idx]/(nsamples-1); avg_score += score[idx]; fprintf(fp, "SM\t%f\t%.2lf\t%.0lf\t%s\t%d\n", score[idx]*100., adp, nsites, args->sm_hdr->samples[idx],i); } // Overall score: maximum absolute deviation from the average score fprintf(fp, "# [1] MD\t[2]Maximum deviation\t[3]The culprit\n"); fprintf(fp, "MD\t%f\t%s\n", (score[idx] - avg_score/nsamples)*100., args->sm_hdr->samples[idx]); // idx still set free(p); free(score); free(dp); free(ndp); // Pairwise discordances fprintf(fp, "# [1]CN\t[2]Discordance\t[3]Number of sites\t[4]Average minimum depth\t[5]Sample i\t[6]Sample j\n"); idx = 0; for (i=0; i<nsamples; i++) { for (j=0; j<i; j++) { fprintf(fp, "CN\t%.0f\t%d\t%.2f\t%s\t%s\n", args->lks[idx], args->cnts[idx], args->cnts[idx]?(double)args->dps[idx]/args->cnts[idx]:0.0, args->sm_hdr->samples[i],args->sm_hdr->samples[j]); idx++; } } fclose(fp); if ( args->plot ) plot_cross_check(args); }
static void reheader_bcf(args_t *args, int is_compressed) { htsFile *fp = hts_open(args->fname, "r"); if ( !fp ) error("Failed to open: %s\n", args->fname); bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to read the header: %s\n", args->fname); kstring_t htxt = {0,0,0}; int hlen; htxt.s = bcf_hdr_fmt_text(hdr, 1, &hlen); htxt.l = hlen; int i, nsamples = 0; char **samples = NULL; if ( args->samples_fname ) samples = hts_readlines(args->samples_fname, &nsamples); if ( args->header_fname ) { free(htxt.s); htxt.s = NULL; htxt.l = htxt.m = 0; read_header_file(args->header_fname, &htxt); } if ( samples ) { set_samples(samples, nsamples, &htxt); for (i=0; i<nsamples; i++) free(samples[i]); free(samples); } bcf_hdr_t *hdr_out = bcf_hdr_init("r"); bcf_hdr_parse(hdr_out, htxt.s); if ( args->header_fname ) hdr_out = strip_header(hdr, hdr_out); // write the header and the body htsFile *fp_out = hts_open("-",is_compressed ? "wb" : "wbu"); bcf_hdr_write(fp_out, hdr_out); bcf1_t *rec = bcf_init(); while ( bcf_read(fp, hdr, rec)==0 ) { // sanity checking, this slows things down. Make it optional? bcf_unpack(rec, BCF_UN_ALL); if ( rec->rid >= hdr_out->n[BCF_DT_CTG] || strcmp(bcf_hdr_int2id(hdr,BCF_DT_CTG,rec->rid),bcf_hdr_int2id(hdr_out,BCF_DT_CTG,rec->rid)) ) error("The CHROM is not defined: \"%s\"\n", bcf_hdr_int2id(hdr,BCF_DT_CTG,rec->rid)); for (i=0; i<rec->d.n_flt; i++) { int id = rec->d.flt[i]; if ( id >= hdr_out->n[BCF_DT_ID] ) break; if ( !bcf_hdr_idinfo_exists(hdr_out,BCF_HL_FLT,id) ) break; if ( strcmp(hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key) ) error("FIXME: Broken FILTER ids: %s vs %s\n", hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key); } if ( i!=rec->d.n_flt ) error("The FILTER is not defined: \"%s\"\n", bcf_hdr_int2id(hdr,BCF_DT_ID,rec->d.flt[i])); for (i=0; i<rec->n_info; i++) { int id = rec->d.info[i].key; if ( id >= hdr_out->n[BCF_DT_ID] ) break; if ( !hdr_out->id[BCF_DT_ID][id].key ) break; if ( !bcf_hdr_idinfo_exists(hdr_out,BCF_HL_INFO,id) ) break; if ( strcmp(hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key) ) error("FIXME: Broken INFO ids: %s vs %s\n", hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key); } if ( i!=rec->n_info ) error("The INFO tag is not defined: \"%s\"\n", bcf_hdr_int2id(hdr,BCF_DT_ID,rec->d.info[i].key)); for (i=0; i<rec->n_fmt; i++) { int id = rec->d.fmt[i].id; if ( id >= hdr_out->n[BCF_DT_ID] ) break; if ( !hdr_out->id[BCF_DT_ID][id].key ) break; if ( !bcf_hdr_idinfo_exists(hdr_out,BCF_HL_FMT,id) ) break; if ( strcmp(hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key) ) error("FIXME: Broken FORMAT ids: %s vs %s\n", hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key); } if ( i!=rec->n_fmt ) error("The FORMAT tag is not defined: \"%s\"\n", bcf_hdr_int2id(hdr,BCF_DT_ID,rec->d.fmt[i].id)); bcf_write(fp_out,hdr_out,rec); } bcf_destroy(rec); free(htxt.s); hts_close(fp_out); hts_close(fp); bcf_hdr_destroy(hdr_out); bcf_hdr_destroy(hdr); }
/* * GT field (genotype) comparison function. */ bcf1_t *process(bcf1_t *rec) { uint64_t i; bcf_unpack(rec, BCF_UN_FMT); // unpack the Format fields, including the GT field int gte_smp = 0; // number GT array entries per sample (should be 2, one entry per allele) args.ngt_arr = 0; /*! hold the number of current GT array entries */ if ( (gte_smp = bcf_get_genotypes(args.hdr, rec, &(args.gt_arr), &(args.ngt_arr) ) ) <= 0 ) { error("GT not present at %s: %d\n", args.hdr->id[BCF_DT_CTG][rec->rid].key, rec->pos+1); } gte_smp /= args.nsmp; // divide total number of genotypes array entries (= args.ngt_arr) by number of samples // initialize with missing genotype int a1 = 0; int a2 = 0; // initialize with first selected sample genotype that is not missing int gt = -1; while ( (a1 == 0) || (a2 == 0) ) { gt++; if (gt == args.nsmp) break; if (args.selected_smps[gt] == 0) continue; a1 = (args.gt_arr + gte_smp * gt)[0]; if ( gte_smp == 2 ) a2 = (args.gt_arr + gte_smp * gt)[1]; else if ( gte_smp == 1 ) a2 = bcf_int32_vector_end; else error("GTsubset does not support ploidy higher than 2.\n"); } // fprintf(stderr, "a1: %i a2: %i\n", a1, a2); // check all genotypes if they match (for included samples) or disagree (for samples not included) gt = 0; for ( i = 0; i < args.nsmp; i++ ) { int *gt_ptr = args.gt_arr + gte_smp * i; int b1 = gt_ptr[0]; int b2; if ( gte_smp == 2 ) // two entries available per sample, padded with missing values for haploid genotypes { b2 = gt_ptr[1]; } else if (gte_smp == 1 ) // use vector end value for second entry, if only one is available { b2 = bcf_int32_vector_end; } else { error("GTsubset does not support ploidy higher than 2.\n"); } // fprintf(stderr, "b1: %i b2: %i\n", b1, b2); /* missing genotypes are counted as always passing, as they neither * mismatch the initial selected genotype for a selected sample, nor * do they match the initial selected genotype for an excluded sample's * genotype */ if ( (b1 == 0) || (b2 == 0) ) { gt++; // fprintf(stderr, "missing => pass\n"); continue; } else if ( args.selected_smps[i] == 1 ) { if ( (b1 == a1) && (b2 == a2) ) { gt++; // fprintf(stderr, "match => pass\n"); continue; } else { // fprintf(stderr, "no match => fail\n"); break; } } else if ( args.selected_smps[i] == 0 ) { if ( (b1 != a1 ) || (b2 != a2) ) { gt++; // fprintf(stderr, "no match => pass\n"); continue; } else { // fprintf(stderr, "match => fail\n"); break; } } } if ( gt == args.nsmp ) { return rec; } else { return NULL; } }
bcf1_t *process(bcf1_t *rec) { int i, ns = 0; bcf_unpack(rec, BCF_UN_FMT); bcf_fmt_t *fmt_gt = NULL; for (i=0; i<rec->n_fmt; i++) if ( rec->d.fmt[i].id==args.gt_id ) { fmt_gt = &rec->d.fmt[i]; break; } if ( !fmt_gt ) return rec; // no GT tag hts_expand(int32_t,rec->n_allele,args.marr,args.arr); hts_expand(float,rec->n_allele,args.mfarr,args.farr); hts_expand(counts_t,rec->n_allele,args.mcounts,args.counts); memset(args.arr,0,sizeof(*args.arr)*rec->n_allele); memset(args.counts,0,sizeof(*args.counts)*rec->n_allele); #define BRANCH_INT(type_t,vector_end) { \ for (i=0; i<rec->n_sample; i++) \ { \ type_t *p = (type_t*) (fmt_gt->p + i*fmt_gt->size); \ int ial, als = 0; \ for (ial=0; ial<fmt_gt->n; ial++) \ { \ if ( p[ial]==vector_end ) break; /* smaller ploidy */ \ if ( bcf_gt_is_missing(p[ial]) ) break; /* missing allele */ \ int idx = bcf_gt_allele(p[ial]); \ \ if ( idx >= rec->n_allele ) \ error("Incorrect allele (\"%d\") in %s at %s:%d\n",idx,args.in_hdr->samples[i],bcf_seqname(args.in_hdr,rec),rec->pos+1); \ als |= (1<<idx); /* this breaks with too many alleles */ \ } \ if ( ial==0 ) continue; /* missing alleles */ \ ns++; \ int is_hom = als && !(als & (als-1)); /* only one bit is set */ \ int is_hemi = ial==1; \ for (ial=0; als; ial++) \ { \ if ( als&1 ) \ { \ if ( !is_hom ) \ args.counts[ial].nhet++; \ else if ( !is_hemi ) \ args.counts[ial].nhom += 2; \ else \ args.counts[ial].nhemi++; \ } \ als >>= 1; \ } \ } \ } switch (fmt_gt->type) { case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break; case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break; case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break; default: error("The GT type is not recognised: %d at %s:%d\n",fmt_gt->type, bcf_seqname(args.in_hdr,rec),rec->pos+1); break; } #undef BRANCH_INT if ( args.tags&SET_NS ) { if ( bcf_update_info_int32(args.out_hdr,rec,"NS",&ns,1)!=0 ) error("Error occurred while updating NS at %s:%d\n", bcf_seqname(args.in_hdr,rec),rec->pos+1); } if ( args.tags&SET_AN ) { args.arr[0] = 0; for (i=0; i<rec->n_allele; i++) args.arr[0] += args.counts[i].nhet + args.counts[i].nhom + args.counts[i].nhemi; if ( bcf_update_info_int32(args.out_hdr,rec,"AN",args.arr,1)!=0 ) error("Error occurred while updating AN at %s:%d\n", bcf_seqname(args.in_hdr,rec),rec->pos+1); } if ( args.tags&SET_AF ) { int n = rec->n_allele-1; if ( n>0 ) { args.arr[0] = 0; for (i=0; i<rec->n_allele; i++) args.arr[0] += args.counts[i].nhet + args.counts[i].nhom + args.counts[i].nhemi; for (i=1; i<rec->n_allele; i++) args.farr[i] = (args.counts[i].nhet + args.counts[i].nhom + args.counts[i].nhemi)*1.0/args.arr[0]; } if ( args.arr[0] ) { if ( bcf_update_info_float(args.out_hdr,rec,"AF",args.farr+1,n)!=0 ) error("Error occurred while updating AF at %s:%d\n", bcf_seqname(args.in_hdr,rec),rec->pos+1); } } if ( args.tags&SET_AC ) { int n = rec->n_allele-1; if ( n>0 ) { memset(args.arr,0,sizeof(*args.arr)*rec->n_allele); for (i=1; i<rec->n_allele; i++) args.arr[i] = args.counts[i].nhet + args.counts[i].nhom + args.counts[i].nhemi; } if ( bcf_update_info_int32(args.out_hdr,rec,"AC",args.arr+1,n)!=0 ) error("Error occurred while updating AC at %s:%d\n", bcf_seqname(args.in_hdr,rec),rec->pos+1); } if ( args.tags&SET_AC_Het ) { int n = rec->n_allele-1; if ( n>0 ) { memset(args.arr,0,sizeof(*args.arr)*rec->n_allele); for (i=1; i<rec->n_allele; i++) args.arr[i] += args.counts[i].nhet; } if ( bcf_update_info_int32(args.out_hdr,rec,"AC_Het",args.arr+1,n)!=0 ) error("Error occurred while updating AC_Het at %s:%d\n", bcf_seqname(args.in_hdr,rec),rec->pos+1); } if ( args.tags&SET_AC_Hom ) { int n = rec->n_allele-1; if ( n>0 ) { memset(args.arr,0,sizeof(*args.arr)*rec->n_allele); for (i=1; i<rec->n_allele; i++) args.arr[i] += args.counts[i].nhom; } if ( bcf_update_info_int32(args.out_hdr,rec,"AC_Hom",args.arr+1,n)!=0 ) error("Error occurred while updating AC_Hom at %s:%d\n", bcf_seqname(args.in_hdr,rec),rec->pos+1); } if ( args.tags&SET_AC_Hemi ) { int n = rec->n_allele-1; if ( n>0 ) { memset(args.arr,0,sizeof(*args.arr)*rec->n_allele); for (i=1; i<rec->n_allele; i++) args.arr[i] += args.counts[i].nhemi; } if ( bcf_update_info_int32(args.out_hdr,rec,"AC_Hemi",args.arr+1,n)!=0 ) error("Error occurred while updating AC_Hemi at %s:%d\n", bcf_seqname(args.in_hdr,rec),rec->pos+1); } return rec; }
/** * Gets records for the most recent position and fills up the buffer from file i. * returns true if buffer is filled or it is not necessary to fill buffer. * returns false if no more records are found to fill buffer */ void BCFSyncedReader::fill_buffer(int32_t i) { if (buffer[i].size()>=2) return; if (random_access) { int32_t pos1 = buffer[i].size()==0 ? 0 : bcf_get_pos1(buffer[i].front()); if (ftypes[i].format==bcf) { bcf1_t *v = get_bcf1_from_pool(); bool populated = false; while (itrs[i] && bcf_itr_next(files[i], itrs[i], v)>=0) { populated = true; bcf_unpack(v, BCF_UN_STR); //check to ensure order if (!buffer[i].empty()) { if (!bcf_is_in_order(buffer[i].back(), v)) { fprintf(stderr, "[E:%s:%d %s] VCF file not in order: %s\n", __FILE__, __LINE__, __FUNCTION__, file_names[i].c_str()); exit(1); } } buffer[i].push_back(v); insert_into_pq(i, v); if (pos1==0) { pos1 = bcf_get_pos1(v); } if (bcf_get_pos1(v)!=pos1) { break; } v = get_bcf1_from_pool(); populated = false; } if (!populated) store_bcf1_into_pool(v); } else if (ftypes[i].format==vcf) { while (itrs[i] && tbx_itr_next(files[i], tbxs[i], itrs[i], &s)>=0) { bcf1_t *v = get_bcf1_from_pool(); vcf_parse(&s, hdrs[i], v); bcf_unpack(v, BCF_UN_STR); //check to ensure order if (!buffer[i].empty()) { if (!bcf_is_in_order(buffer[i].back(), v)) { fprintf(stderr, "[E:%s:%d %s] VCF file not in order: %s\n", __FILE__, __LINE__, __FUNCTION__, file_names[i].c_str()); exit(1); } } buffer[i].push_back(v); insert_into_pq(i, v); if (pos1==0) { pos1 = bcf_get_pos1(v); } if (bcf_get_pos1(v)!=pos1) { break; } } } } else { int32_t rid = buffer[i].size()==0 ? -1 : bcf_get_rid(buffer[i].front()); int32_t pos1 = buffer[i].size()==0 ? 0 : bcf_get_pos1(buffer[i].front()); bcf1_t *v = get_bcf1_from_pool(); bool populated = false; while (bcf_read(files[i], hdrs[i], v)>=0) { populated = true; bcf_unpack(v, BCF_UN_STR); //check to ensure order if (!buffer[i].empty()) { if (!bcf_is_in_order(buffer[i].back(), v)) { fprintf(stderr, "[E:%s:%d %s] VCF file not in order: %s\n", __FILE__, __LINE__, __FUNCTION__, file_names[i].c_str()); exit(1); } } buffer[i].push_back(v); insert_into_pq(i, v); if (rid==-1) { rid = bcf_get_rid(v); pos1 = bcf_get_pos1(v); } if (bcf_get_rid(v)!=rid || bcf_get_pos1(v)!=pos1) { break; } v = get_bcf1_from_pool(); populated = false; } if (!populated) store_bcf1_into_pool(v); } }
/* For unphased genotypes D is approximated as suggested in https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2710162/ D =~ (GT correlation) * sqrt(Pa*(1-Pa)*Pb*(1-Pb)) */ static double _calc_ld(vcfbuf_t *buf, bcf1_t *arec, bcf1_t *brec) { if ( arec->n_sample!=brec->n_sample ) error("Different number of samples: %d vs %d\n",arec->n_sample,brec->n_sample); assert( arec->n_sample ); int i,j,igt = bcf_hdr_id2int(buf->hdr, BCF_DT_ID, "GT"); bcf_unpack(arec, BCF_UN_FMT); bcf_unpack(brec, BCF_UN_FMT); bcf_fmt_t *afmt = NULL, *bfmt = NULL; for (i=0; i<arec->n_fmt; i++) if ( arec->d.fmt[i].id==igt ) { afmt = &arec->d.fmt[i]; break; } if ( !afmt ) return -1; // no GT tag for (i=0; i<brec->n_fmt; i++) if ( brec->d.fmt[i].id==igt ) { bfmt = &brec->d.fmt[i]; break; } if ( !bfmt ) return -1; // no GT tag if ( afmt->n==0 ) return -1; // empty?! if ( bfmt->n==0 ) return -1; // empty?! if ( afmt->type!=BCF_BT_INT8 ) error("TODO: the GT fmt_type is not int8!\n"); if ( bfmt->type!=BCF_BT_INT8 ) error("TODO: the GT fmt_type is not int8!\n"); // Determine allele frequencies, this is to sample randomly missing genotypes double aaf = 0, baf = 0; if ( buf->ld.rand_missing ) { aaf = _estimate_af((int8_t*)afmt->p, afmt->size, afmt->n, arec->n_sample); baf = _estimate_af((int8_t*)bfmt->p, bfmt->size, bfmt->n, brec->n_sample); } // Calculate correlation double ab = 0, aa = 0, bb = 0, a = 0, b = 0; int nab = 0, na = 0, nb = 0, ndiff = 0; for (i=0; i<arec->n_sample; i++) { int8_t *aptr = (int8_t*) (afmt->p + i*afmt->size); int8_t *bptr = (int8_t*) (bfmt->p + i*bfmt->size); int adsg = 0, bdsg = 0, an = 0, bn = 0; for (j=0; j<afmt->n; j++) { if ( aptr[j]==bcf_int8_vector_end ) break; if ( aptr[j]==bcf_gt_missing ) { if ( !buf->ld.rand_missing ) break; if ( rand()/RAND_MAX >= aaf ) adsg += 1; } else if ( bcf_gt_allele(aptr[j]) ) adsg += 1; an++; } for (j=0; j<bfmt->n; j++) { if ( bptr[j]==bcf_int8_vector_end ) break; if ( bptr[j]==bcf_gt_missing ) { if ( !buf->ld.rand_missing ) break; if ( rand()/RAND_MAX >= baf ) bdsg += 1; } else if ( bcf_gt_allele(bptr[j]) ) bdsg += 1; bn++; } if ( an ) { aa += adsg*adsg; a += adsg; na++; } if ( bn ) { bb += bdsg*bdsg; b += bdsg; nb++; } if ( an && bn ) { if ( adsg!=bdsg ) ndiff++; ab += adsg*bdsg; nab++; } } if ( !nab ) return -1; double cor; if ( !ndiff ) cor = 1; else { // Don't know how to deal with zero variance. Since this the purpose is filtering, // it is not enough to say the value is undefined. Therefore an artificial noise is // added to make the denominator non-zero. if ( aa == a*a/na || bb == b*b/nb ) { aa += 3*3; bb += 3*3; ab += 3*3; a += 3; b += 3; na++; nb++; nab++; } cor = (ab/nab - a/na*b/nb) / sqrt(aa/na - a/na*a/na) / sqrt(bb/nb - b/nb*b/nb); } return cor*cor; }
static void check_gt(args_t *args) { int i,ret, *gt2ipl = NULL, m_gt2ipl = 0, *gt_arr = NULL, ngt_arr = 0; int fake_pls = args->no_PLs; // Initialize things: check which tags are defined in the header, sample names etc. if ( bcf_hdr_id2int(args->gt_hdr, BCF_DT_ID, "GT")<0 ) error("[E::%s] GT not present in the header of %s?\n", __func__, args->files->readers[1].fname); if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 ) { if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 ) error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname); fprintf(stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname); fake_pls = 1; } FILE *fp = args->plot ? open_file(NULL, "w", "%s.tab", args->plot) : stdout; print_header(args, fp); int tgt_isample = -1, query_isample = 0; if ( args->target_sample ) { tgt_isample = bcf_hdr_id2int(args->gt_hdr, BCF_DT_SAMPLE, args->target_sample); if ( tgt_isample<0 ) error("No such sample in %s: [%s]\n", args->files->readers[1].fname, args->target_sample); } if ( args->all_sites ) { if ( tgt_isample==-1 ) { fprintf(stderr,"No target sample selected for comparison, using the first sample in %s: %s\n", args->gt_fname,args->gt_hdr->samples[0]); tgt_isample = 0; } } if ( args->query_sample ) { query_isample = bcf_hdr_id2int(args->sm_hdr, BCF_DT_SAMPLE, args->query_sample); if ( query_isample<0 ) error("No such sample in %s: [%s]\n", args->files->readers[0].fname, args->query_sample); } if ( args->all_sites ) fprintf(fp, "# [1]SC, Site by Site Comparison\t[2]Chromosome\t[3]Position\t[4]-g alleles\t[5]-g GT (%s)\t[6]Coverage\t[7]Query alleles\t[8-]Query PLs (%s)\n", args->gt_hdr->samples[tgt_isample],args->sm_hdr->samples[query_isample]); // Main loop while ( (ret=bcf_sr_next_line(args->files)) ) { if ( ret!=2 ) continue; bcf1_t *sm_line = args->files->readers[0].buffer[0]; // the query file bcf1_t *gt_line = args->files->readers[1].buffer[0]; // the -g target file bcf_unpack(sm_line, BCF_UN_FMT); bcf_unpack(gt_line, BCF_UN_FMT); // Init mapping from target genotype index to the sample's PL fields int n_gt2ipl = gt_line->n_allele*(gt_line->n_allele + 1)/2; if ( n_gt2ipl > m_gt2ipl ) { m_gt2ipl = n_gt2ipl; gt2ipl = (int*) realloc(gt2ipl, sizeof(int)*m_gt2ipl); } if ( !init_gt2ipl(args, gt_line, sm_line, gt2ipl, n_gt2ipl) ) continue; // Target genotypes int ngt, npl; if ( (ngt=bcf_get_genotypes(args->gt_hdr, gt_line, >_arr, &ngt_arr)) <= 0 ) error("GT not present at %s:%d?", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, gt_line->pos+1); ngt /= bcf_hdr_nsamples(args->gt_hdr); if ( ngt!=2 ) continue; // checking only diploid genotypes // Sample PLs if ( !fake_pls ) { if ( (npl=bcf_get_format_int32(args->sm_hdr, sm_line, "PL", &args->pl_arr, &args->npl_arr)) <= 0 ) error("PL not present at %s:%d?", args->sm_hdr->id[BCF_DT_CTG][sm_line->rid].key, sm_line->pos+1); npl /= bcf_hdr_nsamples(args->sm_hdr); } else npl = fake_PLs(args, args->sm_hdr, sm_line); // Calculate likelihoods for all samples, assuming diploid genotypes // For faster access to genotype likelihoods (PLs) of the query sample int max_ipl, *pl_ptr = args->pl_arr + query_isample*npl; double sum_pl = 0; // for converting PLs to probs for (max_ipl=0; max_ipl<npl; max_ipl++) { if ( pl_ptr[max_ipl]==bcf_int32_vector_end ) break; if ( pl_ptr[max_ipl]==bcf_int32_missing ) continue; sum_pl += pow(10, -0.1*pl_ptr[max_ipl]); } if ( sum_pl==0 ) continue; // no PLs present // The main stats: concordance of the query sample with the target -g samples for (i=0; i<bcf_hdr_nsamples(args->gt_hdr); i++) { int *gt_ptr = gt_arr + i*ngt; if ( gt_ptr[1]==bcf_int32_vector_end ) continue; // skip haploid genotypes int a = bcf_gt_allele(gt_ptr[0]); int b = bcf_gt_allele(gt_ptr[1]); if ( a<0 || b<0 ) continue; // missing genotypes if ( args->hom_only && a!=b ) continue; // heterozygous genotype int igt_tgt = igt_tgt = bcf_alleles2gt(a,b); // genotype index in the target file int igt_qry = gt2ipl[igt_tgt]; // corresponding genotype in query file if ( igt_qry>=max_ipl || pl_ptr[igt_qry]<0 ) continue; // genotype not present in query sample: haploid or missing args->lks[i] += log(pow(10, -0.1*pl_ptr[igt_qry])/sum_pl); args->sites[i]++; } if ( args->all_sites ) { // Print LKs at all sites for debugging int *gt_ptr = gt_arr + tgt_isample*ngt; if ( gt_ptr[1]==bcf_int32_vector_end ) continue; // skip haploid genotypes int a = bcf_gt_allele(gt_ptr[0]); int b = bcf_gt_allele(gt_ptr[1]); if ( args->hom_only && a!=b ) continue; // heterozygous genotype fprintf(fp, "SC\t%s\t%d", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, gt_line->pos+1); for (i=0; i<gt_line->n_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', gt_line->d.allele[i]); fprintf(fp, "\t%s/%s", a>=0 ? gt_line->d.allele[a] : ".", b>=0 ? gt_line->d.allele[b] : "."); int igt, *pl_ptr = args->pl_arr + query_isample*npl; // PLs of the query sample for (i=0; i<sm_line->n_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', sm_line->d.allele[i]); for (igt=0; igt<npl; igt++) if ( pl_ptr[igt]==bcf_int32_vector_end ) break; else if ( pl_ptr[igt]==bcf_int32_missing ) fprintf(fp, "."); else fprintf(fp, "\t%d", pl_ptr[igt]); fprintf(fp, "\n"); } } free(gt2ipl); free(gt_arr); free(args->pl_arr); free(args->tmp_arr); // Scale LKs and certainties int nsamples = bcf_hdr_nsamples(args->gt_hdr); double min = args->lks[0]; for (i=1; i<nsamples; i++) if ( min>args->lks[i] ) min = args->lks[i]; for (i=0; i<nsamples; i++) args->lks[i] = min ? args->lks[i] / min : 0; double max_avg = args->sites[0] ? args->lks[0]/args->sites[0] : 0; for (i=1; i<nsamples; i++) { double val = args->sites[i] ? args->lks[i]/args->sites[i] : 0; if ( max_avg<val ) max_avg = val; } // Sorted output double **p = (double**) malloc(sizeof(double*)*nsamples); for (i=0; i<nsamples; i++) p[i] = &args->lks[i]; qsort(p, nsamples, sizeof(int*), cmp_doubleptr); fprintf(fp, "# [1]CN\t[2]Concordance with %s (total)\t[3]Concordance (average)\t[4]Number of sites compared\t[5]Sample\t[6]Sample ID\n", args->sm_hdr->samples[query_isample]); for (i=0; i<nsamples; i++) { int idx = p[i] - args->lks; double avg = args->sites[idx] ? args->lks[idx]/args->sites[idx] : 0; fprintf(fp, "CN\t%e\t%e\t%.0f\t%s\t%d\n", 1-args->lks[idx], 1-avg/max_avg, args->sites[idx], args->gt_hdr->samples[idx], i); } if ( args->plot ) { fclose(fp); plot_check(args, args->target_sample ? args->target_sample : "", args->sm_hdr->samples[query_isample]); } }
/** * Classifies variants. */ int32_t Variant::classify(bcf_hdr_t *h, bcf1_t *v) { clear(); this->h = h; this->v = v; bcf_unpack(v, BCF_UN_STR); chrom.assign(bcf_get_chrom(h, v)); rid = bcf_get_rid(v); pos1 = bcf_get_pos1(v); end1 = bcf_get_end1(v); char** allele = bcf_get_allele(v); int32_t n_allele = bcf_get_n_allele(v); int32_t pos0 = pos1-1; bool homogeneous_length = true; char* ref = allele[0]; int32_t rlen = strlen(ref); if (strchr(ref, 'N')) { contains_N = true; } //if only ref allele, skip this entire for loop for (size_t i=1; i<n_allele; ++i) { int32_t allele_type = VT_REF; //check for symbolic alternative alleles if (strchr(allele[i],'<')) { size_t len = strlen(allele[i]); if (len>=5) { //VN/d+ if (allele[i][0]=='<' && allele[i][1]=='V' && allele[i][2]=='N' && allele[i][len-1]=='>' ) { for (size_t j=3; j<len-1; ++j) { if (allele[i][j]<'0' || allele[i][j]>'9') { allele_type = VT_VNTR; } } } //VNTR else if (len==6 && allele[i][0]=='<' && allele[i][1]=='V' && allele[i][2]=='N' && allele[i][3]=='T' && allele[i][4]=='R' && allele[i][5]=='>' ) { allele_type = VT_VNTR; } //STR else if (len==5 && allele[i][0]=='<' && allele[i][1]=='S' && allele[i][2]=='T' && allele[i][3]=='R' && allele[i][4]=='>' ) { allele_type = VT_VNTR; } //ST/d+ else if (allele[i][0]=='<' && allele[i][1]=='S' && allele[i][2]=='T' && allele[i][len-1]=='>' ) { type = VT_VNTR; for (size_t j=3; j<len-1; ++j) { if ((allele[i][j]<'0' || allele[i][j]>'9') && allele[i][j]!='.') { type = VT_SV; } } } } if (allele_type==VT_VNTR) { allele_type = VT_VNTR; type |= allele_type; alleles.push_back(Allele(allele_type)); } else { allele_type = VT_SV; type |= allele_type; std::string sv_type(allele[i]); alleles.push_back(Allele(allele_type, sv_type)); } } //checks for chromosomal breakpoints else if (strchr(allele[i],'[')||strchr(allele[i],']')) { allele_type = VT_SV; type |= allele_type; std::string sv_type("<BND>"); alleles.push_back(Allele(allele_type, sv_type)); } //non variant record else if (allele[i][0]=='.' || strcmp(allele[i],allele[0])==0) { type = VT_REF; } //explicit sequence of bases else { kstring_t REF = {0,0,0}; kstring_t ALT = {0,0,0}; ref = allele[0]; char* alt = allele[i]; int32_t alen = strlen(alt); if (strchr(alt, 'N')) { contains_N = true; } if (rlen!=alen) { homogeneous_length = false; } //trimming //this is required in particular for the //characterization of multiallelics and //in general, any unnormalized variant int32_t rl = rlen; int32_t al = alen; //trim right while (rl!=1 && al!=1) { if (ref[rl-1]==alt[al-1]) { --rl; --al; } else { break; } } //trim left while (rl !=1 && al!=1) { if (ref[0]==alt[0]) { ++ref; ++alt; --rl; --al; } else { break; } } kputsn(ref, rl, &REF); kputsn(alt, al, &ALT); ref = REF.s; alt = ALT.s; int32_t mlen = std::min(rl, al); int32_t dlen = al-rl; int32_t diff = 0; int32_t ts = 0; int32_t tv = 0; if (mlen==1 && dlen) { char ls, le, ss; if (rl>al) { ls = ref[0]; le = ref[rl-1]; ss = alt[0]; } else { ls = alt[0]; le = alt[al-1]; ss = ref[0]; } if (ls!=ss && le!=ss) { ++diff; if ((ls=='G' && ss=='A') || (ls=='A' && ss=='G') || (ls=='C' && ss=='T') || (ls=='T' && ss=='C')) { ++ts; } else { ++tv; } } } else { for (int32_t j=0; j<mlen; ++j) { if (ref[j]!=alt[j]) { ++diff; if ((ref[j]=='G' && alt[j]=='A') || (ref[j]=='A' && alt[j]=='G') || (ref[j]=='C' && alt[j]=='T') || (ref[j]=='T' && alt[j]=='C')) { ++ts; } else { ++tv; } } } } //substitution variants if (mlen==diff) { allele_type |= mlen==1 ? VT_SNP : VT_MNP; } //indel variants if (dlen) { allele_type |= VT_INDEL; } //clumped SNPs and MNPs if (diff && diff < mlen) //internal gaps { allele_type |= VT_CLUMPED; } type |= allele_type; alleles.push_back(Allele(type, diff, alen, dlen, mlen, ts, tv)); ts += ts; tv += tv; ins = dlen>0?1:0; del = dlen<0?1:0; if (REF.m) free(REF.s); if (ALT.m) free(ALT.s); } } if (type==VT_VNTR) { update_vntr_from_info_fields(h, v); } //additionally define MNPs by length of all alleles if (!(type&(VT_VNTR|VT_SV)) && type!=VT_REF) { if (homogeneous_length && rlen>1 && n_allele>1) { type |= VT_MNP; } } return type; }
/** * Classifies variants. */ int32_t VariantManip::classify_variant(bcf_hdr_t *h, bcf1_t *v, Variant& var) { bcf_unpack(v, BCF_UN_STR); const char* chrom = bcf_get_chrom(h, v); uint32_t pos1 = bcf_get_pos1(v); char** allele = bcf_get_allele(v); int32_t n_allele = bcf_get_n_allele(v); int32_t pos0 = pos1-1; var.ts = 0; var.tv = 0; var.ins = 0; var.del = 0; var.clear(); // this sets the type to VT_REF by default. bool homogeneous_length = true; char* ref = allele[0]; int32_t rlen = strlen(ref); //if only ref allele, skip this entire for loop for (size_t i=1; i<n_allele; ++i) { int32_t type = VT_REF; //check for tags if (strchr(allele[i],'<')) { size_t len = strlen(allele[i]); if (len>=5) { //VN/d+ if (allele[i][0]=='<' && allele[i][1]=='V' && allele[i][2]=='N' && allele[i][len-1]=='>' ) { for (size_t j=3; j<len-1; ++j) { if (allele[i][j]<'0' || allele[i][j]>'9') { type = VT_VNTR; } } } //VNTR else if (len==6 && allele[i][0]=='<' && allele[i][1]=='V' && allele[i][2]=='N' && allele[i][3]=='T' && allele[i][4]=='R' && allele[i][5]=='>' ) { type = VT_VNTR; } //ST/d+ else if (allele[i][0]=='<' && allele[i][1]=='S' && allele[i][2]=='T' && allele[i][len-1]=='>' ) { for (size_t j=3; j<len-1; ++j) { if (allele[i][j]<'0' || allele[i][j]>'9') { type = VT_VNTR; } } } //STR else if (len==5 && allele[i][0]=='<' && allele[i][1]=='S' && allele[i][2]=='T' && allele[i][3]=='R' && allele[i][4]=='>' ) { type = VT_VNTR; } } if (type==VT_VNTR) { type = VT_VNTR; var.type |= type; var.alleles.push_back(Allele(type)); } else { type = VT_SV; var.type |= type; std::string sv_type(allele[i]); var.alleles.push_back(Allele(type, sv_type)); } } else if (allele[i][0]=='.' || strcmp(allele[i],allele[0])==0) { type = VT_REF; } else { kstring_t REF = {0,0,0}; kstring_t ALT = {0,0,0}; ref = allele[0]; char* alt = allele[i]; int32_t alen = strlen(alt); if (rlen!=alen) { homogeneous_length = false; } //trimming //this is required in particular for the //characterization of multiallelics and //in general, any unnormalized variant int32_t rl = rlen; int32_t al = alen; //trim right while (rl!=1 && al!=1) { if (ref[rl-1]==alt[al-1]) { --rl; --al; } else { break; } } //trim left while (rl !=1 && al!=1) { if (ref[0]==alt[0]) { ++ref; ++alt; --rl; --al; } else { break; } } kputsn(ref, rl, &REF); kputsn(alt, al, &ALT); ref = REF.s; alt = ALT.s; int32_t mlen = std::min(rl, al); int32_t dlen = al-rl; int32_t diff = 0; int32_t ts = 0; int32_t tv = 0; if (mlen==1 && dlen) { char ls, le, ss; if (rl>al) { ls = ref[0]; le = ref[rl-1]; ss = alt[0]; } else { ls = alt[0]; le = alt[al-1]; ss = ref[0]; } if (ls!=ss && le!=ss) { ++diff; if ((ls=='G' && ss=='A') || (ls=='A' && ss=='G') || (ls=='C' && ss=='T') || (ls=='T' && ss=='C')) { ++ts; } else { ++tv; } } } else { for (int32_t j=0; j<mlen; ++j) { if (ref[j]!=alt[j]) { ++diff; if ((ref[j]=='G' && alt[j]=='A') || (ref[j]=='A' && alt[j]=='G') || (ref[j]=='C' && alt[j]=='T') || (ref[j]=='T' && alt[j]=='C')) { ++ts; } else { ++tv; } } } } //substitution variants if (mlen==diff) { type |= mlen==1 ? VT_SNP : VT_MNP; } //indel variants if (dlen) { type |= VT_INDEL; } //clumped SNPs and MNPs if (diff && diff < mlen) //internal gaps { type |= VT_CLUMPED; } var.type |= type; var.alleles.push_back(Allele(type, diff, alen, dlen, mlen, ts, tv)); var.ts += ts; var.tv += tv; var.ins = dlen>0?1:0; var.del = dlen<0?1:0; if (REF.m) free(REF.s); if (ALT.m) free(ALT.s); } } if (var.type==VT_VNTR) { bcf_unpack(v, BCF_UN_INFO); //populate motif, motif len etc. etc. // char* str = NULL; // int32_t n = 0; // int32_t ret = bcf_get_info_string(h, v, "MOTIF", &str, &n); // if (ret>0) // { // var.motif = std::string(str); // var.mlen = var.motif.size(); // } // ret = bcf_get_info_string(h, v, "RU", &str, &n); // if (ret>0) // { // var.ru = std::string(str); // var.mlen = var.ru.size(); // } // if (n) free(str); // // int32_t* no = NULL; // n = 0; // ret = bcf_get_info_int32(h, v, "RL", &no, &n); // if (ret>0) var.rlen = *no; // if (n) free(no); // // int32_t* fl = NULL; // n = 0; // ret = bcf_get_info_int32(h, v, "REF", &fl, &n); // if (ret>0) var.rcn = *fl; // if (n) free(fl); } //additionally define MNPs by length of all alleles if (!(var.type&(VT_VNTR|VT_SV)) && var.type!=VT_REF) { if (homogeneous_length && rlen>1 && n_allele>1) { var.type |= VT_MNP; } } return var.type; }
/* * _reader_fill_buffer() - buffers all records with the same coordinate */ static void _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) { // Return if the buffer is full: the coordinate of the last buffered record differs if ( reader->nbuffer && reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) return; // No iterator (sequence not present in this file) and not streaming if ( !reader->itr && !files->streaming ) return; // Fill the buffer with records starting at the same position int i, ret = 0; while (1) { if ( reader->nbuffer+1 >= reader->mbuffer ) { // Increase buffer size reader->mbuffer += 8; reader->buffer = (bcf1_t**) realloc(reader->buffer, sizeof(bcf1_t*)*reader->mbuffer); for (i=8; i>0; i--) // initialize { reader->buffer[reader->mbuffer-i] = bcf_init1(); reader->buffer[reader->mbuffer-i]->max_unpack = files->max_unpack; reader->buffer[reader->mbuffer-i]->pos = -1; // for rare cases when VCF starts from 1 } } if ( files->streaming ) { if ( reader->file->format.format==vcf ) { if ( (ret=hts_getline(reader->file, KS_SEP_LINE, &files->tmps)) < 0 ) break; // no more lines int ret = vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1]); if ( ret<0 ) break; } else if ( reader->file->format.format==bcf ) { if ( (ret=bcf_read1(reader->file, reader->header, reader->buffer[reader->nbuffer+1])) < 0 ) break; // no more lines } else { fprintf(stderr,"[%s:%d %s] fixme: not ready for this\n", __FILE__,__LINE__,__FUNCTION__); exit(1); } } else if ( reader->tbx_idx ) { if ( (ret=tbx_itr_next(reader->file, reader->tbx_idx, reader->itr, &files->tmps)) < 0 ) break; // no more lines vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1]); } else { if ( (ret=bcf_itr_next(reader->file, reader->itr, reader->buffer[reader->nbuffer+1])) < 0 ) break; // no more lines bcf_subset_format(reader->header,reader->buffer[reader->nbuffer+1]); } // apply filter if ( !reader->nfilter_ids ) bcf_unpack(reader->buffer[reader->nbuffer+1], BCF_UN_STR); else { bcf_unpack(reader->buffer[reader->nbuffer+1], BCF_UN_STR|BCF_UN_FLT); if ( !has_filter(reader, reader->buffer[reader->nbuffer+1]) ) continue; } reader->nbuffer++; if ( reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) break; // the buffer is full } if ( ret<0 ) { // done for this region tbx_itr_destroy(reader->itr); reader->itr = NULL; } if ( files->collapse && reader->nbuffer>=2 && reader->buffer[1]->pos==reader->buffer[2]->pos ) collapse_buffer(files, reader); }
int main_vcfcall(int argc, char *argv[]) { char *samples_fname = NULL; args_t args; memset(&args, 0, sizeof(args_t)); args.argc = argc; args.argv = argv; args.aux.prior_type = -1; args.aux.indel_frac = -1; args.aux.theta = 1e-3; args.aux.pref = 0.5; args.aux.min_perm_p = 0.01; args.aux.min_lrt = 1; args.flag = CF_ACGT_ONLY; args.output_fname = "-"; args.output_type = FT_VCF; args.aux.trio_Pm_SNPs = 1 - 1e-8; args.aux.trio_Pm_ins = args.aux.trio_Pm_del = 1 - 1e-9; int i, c, samples_is_file = 0; static struct option loptions[] = { {"help",0,0,'h'}, {"gvcf",1,0,'g'}, {"format-fields",1,0,'f'}, {"output",1,0,'o'}, {"output-type",1,0,'O'}, {"regions",1,0,'r'}, {"regions-file",1,0,'R'}, {"samples",1,0,'s'}, {"samples-file",1,0,'S'}, {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, {"keep-alts",0,0,'A'}, {"insert-missed",0,0,'i'}, {"skip-Ns",0,0,'N'}, // now the new default {"keep-masked-refs",0,0,'M'}, {"skip-variants",1,0,'V'}, {"variants-only",0,0,'v'}, {"consensus-caller",0,0,'c'}, {"constrain",1,0,'C'}, {"multiallelic-caller",0,0,'m'}, {"pval-threshold",1,0,'p'}, {"prior",1,0,'P'}, {"chromosome-X",0,0,'X'}, {"chromosome-Y",0,0,'Y'}, {"novel-rate",1,0,'n'}, {0,0,0,0} }; char *tmp = NULL; while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:XYn:P:f:ig:", loptions, NULL)) >= 0) { switch (c) { case 'g': args.flag |= CF_GVCF; args.gvcf.min_dp = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse, expected integer argument: -g %s\n", optarg); break; case 'f': args.aux.output_tags |= parse_format_flag(optarg); break; case 'M': args.flag &= ~CF_ACGT_ONLY; break; // keep sites where REF is N case 'N': args.flag |= CF_ACGT_ONLY; break; // omit sites where first base in REF is N (the new default) case 'A': args.aux.flag |= CALL_KEEPALT; break; case 'c': args.flag |= CF_CCALL; break; // the original EM based calling method case 'i': args.flag |= CF_INS_MISSED; break; case 'v': args.aux.flag |= CALL_VARONLY; break; case 'o': args.output_fname = optarg; break; case 'O': switch (optarg[0]) { case 'b': args.output_type = FT_BCF_GZ; break; case 'u': args.output_type = FT_BCF; break; case 'z': args.output_type = FT_VCF_GZ; break; case 'v': args.output_type = FT_VCF; break; default: error("The output type \"%s\" not recognised\n", optarg); } break; case 'C': if ( !strcasecmp(optarg,"alleles") ) args.aux.flag |= CALL_CONSTR_ALLELES; else if ( !strcasecmp(optarg,"trio") ) args.aux.flag |= CALL_CONSTR_TRIO; else error("Unknown argument to -C: \"%s\"\n", optarg); break; case 'X': args.aux.flag |= CALL_CHR_X; break; case 'Y': args.aux.flag |= CALL_CHR_Y; break; case 'V': if ( !strcasecmp(optarg,"snps") ) args.flag |= CF_INDEL_ONLY; else if ( !strcasecmp(optarg,"indels") ) args.flag |= CF_NO_INDEL; else error("Unknown skip category \"%s\" (-S argument must be \"snps\" or \"indels\")\n", optarg); break; case 'm': args.flag |= CF_MCALL; break; // multiallelic calling method case 'p': args.aux.pref = atof(optarg); break; case 'P': args.aux.theta = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse, expected float argument: -P %s\n", optarg); break; case 'n': parse_novel_rate(&args,optarg); break; case 'r': args.regions = optarg; break; case 'R': args.regions = optarg; args.regions_is_file = 1; break; case 't': args.targets = optarg; break; case 'T': args.targets = optarg; args.targets_is_file = 1; break; case 's': samples_fname = optarg; break; case 'S': samples_fname = optarg; samples_is_file = 1; break; default: usage(&args); } } if ( optind>=argc ) { if ( !isatty(fileno((FILE *)stdin)) ) args.bcf_fname = "-"; // reading from stdin else usage(&args); } else args.bcf_fname = argv[optind++]; // Sanity check options and initialize if ( samples_fname ) { args.samples = read_samples(&args.aux, samples_fname, samples_is_file, &args.nsamples); args.aux.ploidy = (uint8_t*) calloc(args.nsamples+1, 1); args.aux.all_diploid = 1; for (i=0; i<args.nsamples; i++) { args.aux.ploidy[i] = args.samples[i][strlen(args.samples[i]) + 1]; if ( args.aux.ploidy[i]!=2 ) args.aux.all_diploid = 0; } } if ( args.flag & CF_GVCF ) { // Force some flags to avoid unnecessary branching args.aux.flag &= ~CALL_KEEPALT; args.aux.flag |= CALL_VARONLY; } if ( (args.flag & CF_CCALL ? 1 : 0) + (args.flag & CF_MCALL ? 1 : 0) + (args.flag & CF_QCALL ? 1 : 0) > 1 ) error("Only one of -c or -m options can be given\n"); if ( !(args.flag & CF_CCALL) && !(args.flag & CF_MCALL) && !(args.flag & CF_QCALL) ) error("Expected -c or -m option\n"); if ( args.aux.n_perm && args.aux.ngrp1_samples<=0 ) error("Expected -1 with -U\n"); // not sure about this, please fix if ( args.aux.flag & CALL_CONSTR_ALLELES ) { if ( !args.targets ) error("Expected -t or -T with \"-C alleles\"\n"); if ( !(args.flag & CF_MCALL) ) error("The \"-C alleles\" mode requires -m\n"); } if ( args.aux.flag & CALL_CHR_X && args.aux.flag & CALL_CHR_Y ) error("Only one of -X or -Y should be given\n"); if ( args.flag & CF_INS_MISSED && !(args.aux.flag&CALL_CONSTR_ALLELES) ) error("The -i option requires -C alleles\n"); init_data(&args); while ( bcf_sr_next_line(args.aux.srs) ) { bcf1_t *bcf_rec = args.aux.srs->readers[0].buffer[0]; if ( args.samples_map ) bcf_subset(args.aux.hdr, bcf_rec, args.nsamples, args.samples_map); bcf_unpack(bcf_rec, BCF_UN_STR); // Skip unwanted sites if ( args.aux.flag & CALL_VARONLY ) { int is_ref = 0; if ( bcf_rec->n_allele==1 ) is_ref = 1; // not a variant else if ( bcf_rec->n_allele==2 ) { // second allele is mpileup's X, not a variant if ( bcf_rec->d.allele[1][0]=='X' ) is_ref = 1; else if ( bcf_rec->d.allele[1][0]=='<' && bcf_rec->d.allele[1][1]=='X' && bcf_rec->d.allele[1][2]=='>' ) is_ref = 1; } if ( is_ref ) { // gVCF output if ( args.flag & CF_GVCF ) gvcf_write(args.out_fh, &args.gvcf, args.aux.hdr, bcf_rec, 1); continue; } } if ( (args.flag & CF_INDEL_ONLY) && bcf_is_snp(bcf_rec) ) continue; // not an indel if ( (args.flag & CF_NO_INDEL) && !bcf_is_snp(bcf_rec) ) continue; // not a SNP if ( (args.flag & CF_ACGT_ONLY) && (bcf_rec->d.allele[0][0]=='N' || bcf_rec->d.allele[0][0]=='n') ) continue; // REF[0] is 'N' bcf_unpack(bcf_rec, BCF_UN_ALL); // Various output modes: QCall output (todo) if ( args.flag & CF_QCALL ) { qcall(&args.aux, bcf_rec); continue; } // Calling modes which output VCFs int ret; if ( args.flag & CF_MCALL ) ret = mcall(&args.aux, bcf_rec); else ret = ccall(&args.aux, bcf_rec); if ( ret==-1 ) error("Something is wrong\n"); // gVCF output if ( args.flag & CF_GVCF ) { gvcf_write(args.out_fh, &args.gvcf, args.aux.hdr, bcf_rec, ret?0:1); continue; } // Normal output if ( (args.aux.flag & CALL_VARONLY) && ret==0 ) continue; // not a variant bcf_write1(args.out_fh, args.aux.hdr, bcf_rec); } if ( args.flag & CF_GVCF ) gvcf_write(args.out_fh, &args.gvcf, args.aux.hdr, NULL, 0); if ( args.flag & CF_INS_MISSED ) bcf_sr_regions_flush(args.aux.srs->targets); destroy_data(&args); return 0; }
int main(int argc, char **argv) { int i, n; static struct option const long_opts[] = { {"out", required_argument, NULL, 1}, {"report", required_argument, NULL, 2}, {"dotasref", no_argument, NULL, 3}, {"help", no_argument, NULL, 0}, {"version", no_argument, NULL, 4}, {"export_uncov", no_argument, NULL, 5} }; bool help = FALSE; bool report_version = FALSE; while ((n = getopt_long(argc, argv, "1:2:304", long_opts, NULL)) >= 0) { switch (n) { case 1 : outfile = strdup(optarg); break; case 2 : report = strdup(optarg); break; case 3 : dotasref = TRUE; break; case 0 : help = TRUE; break; case 4 : report_version = TRUE; break; case 5 : export_uncover = TRUE; break; default : return 1; } if ( help ) return usage(); if ( report_version ) return show_version(); } n = argc - optind; if ( n > 1 ) errabort("only accept one input vcf"); if ( export_uncover == TRUE && outfile == FALSE) { warnings("export uncove region only used with option --out"); export_uncover = FALSE; } char * input; if ( n == 0 ) input = strdup("-"); else input = strdup(argv[optind]); htsFile * fp = read_vcf_file(input); enum htsExactFormat fmt = hts_get_format(fp)->format; if ( fmt != vcf && fmt != bcf ) errabort("This is not a VCF/BCF file : %s", input); bcf_hdr_t * hdr = bcf_hdr_read(fp); int n_samples = bcf_hdr_nsamples(hdr); if ( n_samples != 2 ) errabort("the input VCF/BCF file must contain only two samples! %d", n_samples); LOG("Using sample %s as ref ...", hdr->samples[0]); LOG("Using sample %s as test ...", hdr->samples[1]); uint32_t matrix[4][4] = { {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0} }; bcf1_t * v = bcf_init1(); kstring_t str = { 0, 0, 0 }; uint32_t line = 0; htsFile *out = NULL; if ( outfile && !check_filename(outfile) ) out = hts_open(outfile, mode); if ( out != NULL ) bcf_hdr_write(out, hdr); while ( bcf_read1(fp, hdr, v) >= 0 ) { bcf_unpack(v, BCF_UN_STR|BCF_UN_FMT); int k; str.l = 0; int tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, "GT"); if ( !bcf_hdr_idinfo_exists(hdr, BCF_HL_FMT, tag_id) ) warnings("There is no 'GT' in the header!"); for ( i = 0; i < v->n_fmt; ++i ) if ( v->d.fmt[i].id == tag_id ) break; if ( i == v->n_fmt ) { vcf_format1(hdr, v, &str); LOG("There is no tag GT in this line : %s", str.s); continue; } corr_t xy[2] = { {-1, -2, -2}, {-1, -2, -2} }; bcf_fmt_t * fmt = &v->d.fmt[i]; for ( i = 0; i < 2; ++i ) { int corr = i; if ( fmt == NULL ) { if ( dotasref == TRUE ) xy[corr].alt = ALT_IS_REF; else xy[corr].alt = ALT_IS_UNC; continue; } int last = -2; uint8_t *d = (uint8_t*)((char*)fmt->p + fmt->size*i); for ( k = 0; k < fmt->n && d[k] != (uint8_t)bcf_int8_vector_end; ++k ) { int curr = d[k]>>1; if ( last != curr ) { if ( curr ) { if ( last == -2 ) xy[corr].alt = curr > 1 ? ALT_IS_HOM : ALT_IS_REF; else xy[corr].alt = ALT_IS_HET; } else { xy[corr].alt = dotasref == TRUE ? ALT_IS_REF : ALT_IS_UNC; } } else { if ( curr ) { xy[corr].alt = curr > 1 ? ALT_IS_HOM : ALT_IS_REF; } else { xy[corr].alt = dotasref == TRUE ? ALT_IS_REF : ALT_IS_UNC; } } if (last == -2 ) { xy[corr].min = xy[corr].max = curr; } else { if ( curr < xy[corr].min ) xy[corr].min = curr; else if ( curr > xy[corr].max ) xy[corr].max = curr; } last = curr; } } matrix[xy[0].alt][xy[1].alt]++; if ( xy[0].alt != xy[1].alt && out != NULL) { if ( xy[0].alt == ALT_IS_UNC || xy[1].alt == ALT_IS_UNC ) { if ( export_uncover == TRUE ) { str.l = 0; vcf_format1(hdr, v, &str); vcf_write(out, hdr, v); } } else { str.l = 0; vcf_format1(hdr, v, &str); vcf_write(out, hdr, v); } } if ( xy[0].alt == ALT_IS_HET && xy[1].alt == ALT_IS_HET && (xy[0].min != xy[1].min || xy[0].max != xy[1].max ) ) { bias++; matrix[ALT_IS_HET][ALT_IS_HET]--; if ( out != NULL ) { str.l = 0; vcf_format1(hdr, v, &str); vcf_write(out, hdr, v); } } line++; } if ( out ) hts_close(out); if ( str.m ) free(str.s); write_report(matrix, hdr); bcf_hdr_destroy(hdr); free(input); bcf_destroy1(v); if ( outfile ) free(outfile); if ( report ) free(report); if ( hts_close(fp) ) warnings("hts_close returned non-zero status: %s", input); return 0; }