static void query_vcf(args_t *args) { kstring_t str = {0,0,0}; if ( args->print_header ) { convert_header(args->convert,&str); fwrite(str.s, str.l, 1, args->out); } while ( bcf_sr_next_line(args->files) ) { if ( !bcf_sr_has_line(args->files,0) ) continue; bcf1_t *line = args->files->readers[0].buffer[0]; bcf_unpack(line, args->files->max_unpack); if ( args->filter ) { int pass = filter_test(args->filter, line, NULL); if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1; if ( !pass ) continue; } str.l = 0; convert_line(args->convert, line, &str); if ( str.l ) fwrite(str.s, str.l, 1, args->out); } if ( str.m ) free(str.s); }
int process_region_precise(args_t *args, char *seq, regitr_t *itr) { int k = 1; uint32_t start = itr->reg[itr->i].start, end = itr->reg[itr->i].end; while ( itr->i+k<itr->n && start==itr->reg[itr->i+k].start && end==itr->reg[itr->i+k].end ) k++; int ret = ploidy_query(args->ploidy, seq, start, args->sex2ploidy, NULL, NULL); assert(ret); memset(args->counts,0,args->ncounts*sizeof(int)); // Select 'nsites' sites spaced so that they evenly cover the whole region // to get a representative sample. We index-jump as we should be checking // a few sites only. int i, rid = -1, pos, prev_pos = -1, ismpl; for (i=0; i<args->nsites; i++) { rid = -1; pos = ((i+1.0)/(args->nsites+1))*(end - start) + start; if ( i>0 && pos <= prev_pos ) continue; // the vcf is too sparse if ( bcf_sr_seek(args->sr,seq,pos)!=0 ) return k; // sequence not present if ( !bcf_sr_next_line(args->sr) ) return k; // no sites found bcf1_t *rec = bcf_sr_get_line(args->sr,0); if ( rid==-1 ) rid = rec->rid; if ( rid!=rec->rid || rec->pos > end ) break; prev_pos = rec->pos; int ngts = bcf_get_genotypes(args->hdr,rec,&args->gts,&args->ngts); ngts /= args->nsample; for (ismpl=0; ismpl<args->nsample; ismpl++) { int32_t *gts = args->gts + ngts*ismpl; int igt, ploidy = 0; for (igt=0; igt<ngts; igt++) { if ( gts[igt]==bcf_int32_vector_end || bcf_gt_is_missing(gts[igt]) ) break; else ploidy++; } args->counts[ismpl*(args->max_ploidy+1) + ploidy]++; if ( args->verbose ) fprintf(stderr,"%s:%d\t%s\tploidy=%d\n", seq,rec->pos+1,args->hdr->samples[ismpl],ploidy); } } for (ismpl=0; ismpl<args->nsample; ismpl++) { float sum = 0, *probs = args->sex2prob + ismpl*args->nsex; int *counts = args->counts + ismpl*(args->max_ploidy+1); for (i=0; i<args->max_ploidy+1; i++) sum += counts[i]; if ( !sum ) continue; for (i=0; i<args->nsex; i++) { int ploidy = args->sex2ploidy[i]; probs[i] *= counts[ploidy]/sum; } } return k; }
int run(int argc, char **argv) { args_t *args = (args_t*) calloc(1,sizeof(args_t)); args->argc = argc; args->argv = argv; args->output_type = FT_VCF; static struct option loptions[] = { {"keep-tags",required_argument,NULL,'k'}, {"exclude",required_argument,NULL,'e'}, {"include",required_argument,NULL,'i'}, {"regions",required_argument,NULL,'r'}, {"regions-file",required_argument,NULL,'R'}, {"samples-file",required_argument,NULL,'S'}, {"output",required_argument,NULL,'o'}, {"output-type",required_argument,NULL,'O'}, {NULL,0,NULL,0} }; int c; while ((c = getopt_long(argc, argv, "vr:R:t:T:o:O:i:e:k:S:",loptions,NULL)) >= 0) { switch (c) { case 'k': args->keep_tags = optarg; break; case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'T': args->target = optarg; args->target_is_file = 1; break; case 't': args->target = optarg; break; case 'R': args->region = optarg; args->region_is_file = 1; break; case 'S': args->samples_fname = optarg; break; case 'r': args->region = optarg; break; case 'o': args->output_dir = optarg; break; case 'O': switch (optarg[0]) { case 'b': args->output_type = FT_BCF_GZ; break; case 'u': args->output_type = FT_BCF; break; case 'z': args->output_type = FT_VCF_GZ; break; case 'v': args->output_type = FT_VCF; break; default: error("The output type \"%s\" not recognised\n", optarg); } break; case 'h': case '?': default: error("%s", usage_text()); break; } } if ( optind==argc ) { if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-"; // reading from stdin else { error("%s", usage_text()); } } else if ( optind+1!=argc ) error("%s", usage_text()); else args->fname = argv[optind]; if ( !args->output_dir ) error("Missing the -o option\n"); if ( args->filter_logic == (FLT_EXCLUDE|FLT_INCLUDE) ) error("Only one of -i or -e can be given.\n"); init_data(args); while ( bcf_sr_next_line(args->sr) ) process(args); destroy_data(args); return 0; }
static void cross_check_gts(args_t *args) { int nsamples = bcf_hdr_nsamples(args->sm_hdr), ndp_arr = 0; unsigned int *dp = (unsigned int*) calloc(nsamples,sizeof(unsigned int)), *ndp = (unsigned int*) calloc(nsamples,sizeof(unsigned int)); // this will overflow one day... int fake_pls = args->no_PLs, ignore_dp = 0; int i,j,k,idx, pl_warned = 0, dp_warned = 0; int32_t *dp_arr = NULL; int *is_hom = args->hom_only ? (int*) malloc(sizeof(int)*nsamples) : NULL; if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 ) { if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 ) error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname); fprintf(stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname); fake_pls = 1; } if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "DP")<0 ) ignore_dp = 1; FILE *fp = args->plot ? open_file(NULL, "w", "%s.tab", args->plot) : stdout; print_header(args, fp); if ( args->all_sites ) fprintf(fp,"# [1]SD, Average Site Discordance\t[2]Chromosome\t[3]Position\t[4]Number of available pairs\t[5]Average discordance\n"); while ( bcf_sr_next_line(args->files) ) { bcf1_t *line = args->files->readers[0].buffer[0]; bcf_unpack(line, BCF_UN_FMT); int npl; if ( !fake_pls ) { npl = bcf_get_format_int32(args->sm_hdr, line, "PL", &args->pl_arr, &args->npl_arr); if ( npl<=0 ) { pl_warned++; continue; } npl /= nsamples; } else npl = fake_PLs(args, args->sm_hdr, line); if ( !ignore_dp && bcf_get_format_int32(args->sm_hdr, line, "DP", &dp_arr, &ndp_arr) <= 0 ) { dp_warned++; continue; } if ( args->hom_only ) { for (i=0; i<nsamples; i++) is_hom[i] = is_hom_most_likely(line->n_allele, args->pl_arr+i*npl); } double sum = 0; int nsum = 0; idx = 0; for (i=0; i<nsamples; i++) { int *ipl = &args->pl_arr[i*npl]; if ( *ipl==-1 ) { idx += i; continue; } // missing genotype if ( !ignore_dp && (dp_arr[i]==bcf_int32_missing || !dp_arr[i]) ) { idx += i; continue; } if ( args->hom_only && !is_hom[i] ) { idx += i; continue; } for (j=0; j<i; j++) { int *jpl = &args->pl_arr[j*npl]; if ( *jpl==-1 ) { idx++; continue; } // missing genotype if ( !ignore_dp && (dp_arr[j]==bcf_int32_missing || !dp_arr[j]) ) { idx++; continue; } if ( args->hom_only && !is_hom[j] ) { idx++; continue; } int min_pl = INT_MAX; for (k=0; k<npl; k++) { if ( ipl[k]==bcf_int32_missing || jpl[k]==bcf_int32_missing ) break; if ( ipl[k]==bcf_int32_vector_end || jpl[k]==bcf_int32_vector_end ) { k = npl; break; } if ( min_pl > ipl[k]+jpl[k] ) min_pl = ipl[k]+jpl[k]; } if ( k!=npl ) { idx++; continue; } if ( args->all_sites ) { sum += min_pl; nsum++; } args->lks[idx] += min_pl; args->cnts[idx]++; if ( !ignore_dp ) { args->dps[idx] += dp_arr[i] < dp_arr[j] ? dp_arr[i] : dp_arr[j]; dp[i] += dp_arr[i]; ndp[i]++; dp[j] += dp_arr[j]; ndp[j]++; } else { args->dps[idx]++; dp[i]++; ndp[i]++; dp[j]++; ndp[j]++; } idx++; } } if ( args->all_sites ) fprintf(fp,"SD\t%s\t%d\t%d\t%.0f\n", args->sm_hdr->id[BCF_DT_CTG][line->rid].key, line->pos+1, nsum, nsum?sum/nsum:0); } if ( dp_arr ) free(dp_arr); if ( args->pl_arr ) free(args->pl_arr); if ( args->tmp_arr ) free(args->tmp_arr); if ( is_hom ) free(is_hom); if ( pl_warned ) fprintf(stderr, "[W::%s] PL was not found at %d site(s)\n", __func__, pl_warned); if ( dp_warned ) fprintf(stderr, "[W::%s] DP was not found at %d site(s)\n", __func__, dp_warned); // Output samples sorted by average discordance double *score = (double*) calloc(nsamples,sizeof(double)); args->sites = (double*) calloc(nsamples,sizeof(double)); idx = 0; for (i=0; i<nsamples; i++) { for (j=0; j<i; j++) { score[i] += args->lks[idx]; score[j] += args->lks[idx]; args->sites[i] += args->cnts[idx]; args->sites[j] += args->cnts[idx]; idx++; } } for (i=0; i<nsamples; i++) if ( args->sites[i] ) score[i] /= args->sites[i]; double **p = (double**) malloc(sizeof(double*)*nsamples), avg_score = 0; for (i=0; i<nsamples; i++) p[i] = &score[i]; qsort(p, nsamples, sizeof(int*), cmp_doubleptr); // The average discordance gives the number of differing sites in % with -G1 fprintf(fp, "# [1]SM\t[2]Average Discordance\t[3]Average depth\t[4]Average number of sites\t[5]Sample\t[6]Sample ID\n"); for (i=0; i<nsamples; i++) { idx = p[i] - score; double adp = ndp[idx] ? (double)dp[idx]/ndp[idx] : 0; double nsites = args->sites[idx]/(nsamples-1); avg_score += score[idx]; fprintf(fp, "SM\t%f\t%.2lf\t%.0lf\t%s\t%d\n", score[idx]*100., adp, nsites, args->sm_hdr->samples[idx],i); } // Overall score: maximum absolute deviation from the average score fprintf(fp, "# [1] MD\t[2]Maximum deviation\t[3]The culprit\n"); fprintf(fp, "MD\t%f\t%s\n", (score[idx] - avg_score/nsamples)*100., args->sm_hdr->samples[idx]); // idx still set free(p); free(score); free(dp); free(ndp); // Pairwise discordances fprintf(fp, "# [1]CN\t[2]Discordance\t[3]Number of sites\t[4]Average minimum depth\t[5]Sample i\t[6]Sample j\n"); idx = 0; for (i=0; i<nsamples; i++) { for (j=0; j<i; j++) { fprintf(fp, "CN\t%.0f\t%d\t%.2f\t%s\t%s\n", args->lks[idx], args->cnts[idx], args->cnts[idx]?(double)args->dps[idx]/args->cnts[idx]:0.0, args->sm_hdr->samples[i],args->sm_hdr->samples[j]); idx++; } } fclose(fp); if ( args->plot ) plot_cross_check(args); }
static void check_gt(args_t *args) { int i,ret, *gt2ipl = NULL, m_gt2ipl = 0, *gt_arr = NULL, ngt_arr = 0; int fake_pls = args->no_PLs; // Initialize things: check which tags are defined in the header, sample names etc. if ( bcf_hdr_id2int(args->gt_hdr, BCF_DT_ID, "GT")<0 ) error("[E::%s] GT not present in the header of %s?\n", __func__, args->files->readers[1].fname); if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 ) { if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 ) error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname); fprintf(stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname); fake_pls = 1; } FILE *fp = args->plot ? open_file(NULL, "w", "%s.tab", args->plot) : stdout; print_header(args, fp); int tgt_isample = -1, query_isample = 0; if ( args->target_sample ) { tgt_isample = bcf_hdr_id2int(args->gt_hdr, BCF_DT_SAMPLE, args->target_sample); if ( tgt_isample<0 ) error("No such sample in %s: [%s]\n", args->files->readers[1].fname, args->target_sample); } if ( args->all_sites ) { if ( tgt_isample==-1 ) { fprintf(stderr,"No target sample selected for comparison, using the first sample in %s: %s\n", args->gt_fname,args->gt_hdr->samples[0]); tgt_isample = 0; } } if ( args->query_sample ) { query_isample = bcf_hdr_id2int(args->sm_hdr, BCF_DT_SAMPLE, args->query_sample); if ( query_isample<0 ) error("No such sample in %s: [%s]\n", args->files->readers[0].fname, args->query_sample); } if ( args->all_sites ) fprintf(fp, "# [1]SC, Site by Site Comparison\t[2]Chromosome\t[3]Position\t[4]-g alleles\t[5]-g GT (%s)\t[6]Coverage\t[7]Query alleles\t[8-]Query PLs (%s)\n", args->gt_hdr->samples[tgt_isample],args->sm_hdr->samples[query_isample]); // Main loop while ( (ret=bcf_sr_next_line(args->files)) ) { if ( ret!=2 ) continue; bcf1_t *sm_line = args->files->readers[0].buffer[0]; // the query file bcf1_t *gt_line = args->files->readers[1].buffer[0]; // the -g target file bcf_unpack(sm_line, BCF_UN_FMT); bcf_unpack(gt_line, BCF_UN_FMT); // Init mapping from target genotype index to the sample's PL fields int n_gt2ipl = gt_line->n_allele*(gt_line->n_allele + 1)/2; if ( n_gt2ipl > m_gt2ipl ) { m_gt2ipl = n_gt2ipl; gt2ipl = (int*) realloc(gt2ipl, sizeof(int)*m_gt2ipl); } if ( !init_gt2ipl(args, gt_line, sm_line, gt2ipl, n_gt2ipl) ) continue; // Target genotypes int ngt, npl; if ( (ngt=bcf_get_genotypes(args->gt_hdr, gt_line, >_arr, &ngt_arr)) <= 0 ) error("GT not present at %s:%d?", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, gt_line->pos+1); ngt /= bcf_hdr_nsamples(args->gt_hdr); if ( ngt!=2 ) continue; // checking only diploid genotypes // Sample PLs if ( !fake_pls ) { if ( (npl=bcf_get_format_int32(args->sm_hdr, sm_line, "PL", &args->pl_arr, &args->npl_arr)) <= 0 ) error("PL not present at %s:%d?", args->sm_hdr->id[BCF_DT_CTG][sm_line->rid].key, sm_line->pos+1); npl /= bcf_hdr_nsamples(args->sm_hdr); } else npl = fake_PLs(args, args->sm_hdr, sm_line); // Calculate likelihoods for all samples, assuming diploid genotypes // For faster access to genotype likelihoods (PLs) of the query sample int max_ipl, *pl_ptr = args->pl_arr + query_isample*npl; double sum_pl = 0; // for converting PLs to probs for (max_ipl=0; max_ipl<npl; max_ipl++) { if ( pl_ptr[max_ipl]==bcf_int32_vector_end ) break; if ( pl_ptr[max_ipl]==bcf_int32_missing ) continue; sum_pl += pow(10, -0.1*pl_ptr[max_ipl]); } if ( sum_pl==0 ) continue; // no PLs present // The main stats: concordance of the query sample with the target -g samples for (i=0; i<bcf_hdr_nsamples(args->gt_hdr); i++) { int *gt_ptr = gt_arr + i*ngt; if ( gt_ptr[1]==bcf_int32_vector_end ) continue; // skip haploid genotypes int a = bcf_gt_allele(gt_ptr[0]); int b = bcf_gt_allele(gt_ptr[1]); if ( a<0 || b<0 ) continue; // missing genotypes if ( args->hom_only && a!=b ) continue; // heterozygous genotype int igt_tgt = igt_tgt = bcf_alleles2gt(a,b); // genotype index in the target file int igt_qry = gt2ipl[igt_tgt]; // corresponding genotype in query file if ( igt_qry>=max_ipl || pl_ptr[igt_qry]<0 ) continue; // genotype not present in query sample: haploid or missing args->lks[i] += log(pow(10, -0.1*pl_ptr[igt_qry])/sum_pl); args->sites[i]++; } if ( args->all_sites ) { // Print LKs at all sites for debugging int *gt_ptr = gt_arr + tgt_isample*ngt; if ( gt_ptr[1]==bcf_int32_vector_end ) continue; // skip haploid genotypes int a = bcf_gt_allele(gt_ptr[0]); int b = bcf_gt_allele(gt_ptr[1]); if ( args->hom_only && a!=b ) continue; // heterozygous genotype fprintf(fp, "SC\t%s\t%d", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, gt_line->pos+1); for (i=0; i<gt_line->n_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', gt_line->d.allele[i]); fprintf(fp, "\t%s/%s", a>=0 ? gt_line->d.allele[a] : ".", b>=0 ? gt_line->d.allele[b] : "."); int igt, *pl_ptr = args->pl_arr + query_isample*npl; // PLs of the query sample for (i=0; i<sm_line->n_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', sm_line->d.allele[i]); for (igt=0; igt<npl; igt++) if ( pl_ptr[igt]==bcf_int32_vector_end ) break; else if ( pl_ptr[igt]==bcf_int32_missing ) fprintf(fp, "."); else fprintf(fp, "\t%d", pl_ptr[igt]); fprintf(fp, "\n"); } } free(gt2ipl); free(gt_arr); free(args->pl_arr); free(args->tmp_arr); // Scale LKs and certainties int nsamples = bcf_hdr_nsamples(args->gt_hdr); double min = args->lks[0]; for (i=1; i<nsamples; i++) if ( min>args->lks[i] ) min = args->lks[i]; for (i=0; i<nsamples; i++) args->lks[i] = min ? args->lks[i] / min : 0; double max_avg = args->sites[0] ? args->lks[0]/args->sites[0] : 0; for (i=1; i<nsamples; i++) { double val = args->sites[i] ? args->lks[i]/args->sites[i] : 0; if ( max_avg<val ) max_avg = val; } // Sorted output double **p = (double**) malloc(sizeof(double*)*nsamples); for (i=0; i<nsamples; i++) p[i] = &args->lks[i]; qsort(p, nsamples, sizeof(int*), cmp_doubleptr); fprintf(fp, "# [1]CN\t[2]Concordance with %s (total)\t[3]Concordance (average)\t[4]Number of sites compared\t[5]Sample\t[6]Sample ID\n", args->sm_hdr->samples[query_isample]); for (i=0; i<nsamples; i++) { int idx = p[i] - args->lks; double avg = args->sites[idx] ? args->lks[idx]/args->sites[idx] : 0; fprintf(fp, "CN\t%e\t%e\t%.0f\t%s\t%d\n", 1-args->lks[idx], 1-avg/max_avg, args->sites[idx], args->gt_hdr->samples[idx], i); } if ( args->plot ) { fclose(fp); plot_check(args, args->target_sample ? args->target_sample : "", args->sm_hdr->samples[query_isample]); } }
void isec_vcf(args_t *args) { bcf_srs_t *files = args->files; kstring_t str = {0,0,0}; htsFile *out_fh = NULL; // When only one VCF is output, print VCF to stdout int out_std = 0; if ( args->nwrite==1 ) out_std = 1; if ( args->targets_list && files->nreaders==1 ) out_std = 1; if ( out_std ) { out_fh = hts_open("-",hts_bcf_wmode(args->output_type)); bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec"); bcf_hdr_write(out_fh, files->readers[args->iwrite].header); } if ( !args->nwrite && !out_std && !args->prefix ) fprintf(stderr,"Note: -w option not given, printing list of sites...\n"); int n; while ( (n=bcf_sr_next_line(files)) ) { bcf_sr_t *reader = NULL; bcf1_t *line = NULL; int i, ret = 0; for (i=0; i<files->nreaders; i++) { if ( !bcf_sr_has_line(files,i) ) continue; if ( !line ) { line = files->readers[i].buffer[0]; reader = &files->readers[i]; } ret |= 1<<i; // this may overflow for many files, but will be used only with two (OP_VENN) } switch (args->isec_op) { case OP_COMPLEMENT: if ( n!=1 || !bcf_sr_has_line(files,0) ) continue; break; case OP_EQUAL: if ( n != args->isec_n ) continue; break; case OP_PLUS: if ( n < args->isec_n ) continue; break; case OP_MINUS: if ( n > args->isec_n ) continue; } if ( out_std ) { if ( bcf_sr_has_line(files,args->iwrite) ) bcf_write1(out_fh, files->readers[args->iwrite].header, files->readers[args->iwrite].buffer[0]); continue; } else if ( args->fh_sites ) { str.l = 0; kputs(reader->header->id[BCF_DT_CTG][line->rid].key, &str); kputc('\t', &str); kputw(line->pos+1, &str); kputc('\t', &str); if (line->n_allele > 0) kputs(line->d.allele[0], &str); else kputc('.', &str); kputc('\t', &str); if (line->n_allele > 1) kputs(line->d.allele[1], &str); else kputc('.', &str); for (i=2; i<line->n_allele; i++) { kputc(',', &str); kputs(line->d.allele[i], &str); } kputc('\t', &str); for (i=0; i<files->nreaders; i++) kputc(bcf_sr_has_line(files,i)?'1':'0', &str); kputc('\n', &str); fwrite(str.s,sizeof(char),str.l,args->fh_sites); } if ( args->prefix ) { if ( args->isec_op==OP_VENN ) bcf_write1(args->fh_out[ret-1], reader->header, line); else { for (i=0; i<files->nreaders; i++) { if ( !bcf_sr_has_line(files,i) ) continue; if ( args->write && !args->write[i] ) continue; bcf_write1(args->fh_out[i], files->readers[i].header, files->readers[i].buffer[0]); } } } } if ( str.s ) free(str.s); if ( out_fh ) hts_close(out_fh); }
void union_data::readGenotypesVCF(string fvcf,string region) { int n_includedG = 0; int n_excludedG_mult = 0; int n_excludedG_void = 0; int n_excludedG_user = 0; int n_includedS = 0; vector < int > mappingS; genotype_id.clear(); genotype_chr.clear(); genotype_start.clear(); genotype_end.clear(); genotype_val.clear(); genotype_count=0; genotype_id_to_idx.clear(); //Opening files bcf_srs_t * sr = bcf_sr_init(); //vrb.bullet("target region [" + regionGenotype.get() + "]"); //if (bcf_sr_set_regions(sr, regionGenotype.get().c_str(), 0) == -1) vrb.error("Cannot jump to region!"); bcf_sr_set_regions(sr, region.c_str(), 0); if(!(bcf_sr_add_reader (sr, fvcf.c_str()))) { switch (sr->errnum) { case not_bgzf: vrb.error("File not compressed with bgzip!"); case idx_load_failed: vrb.error("Impossible to load index file!"); case file_type_error: vrb.error("File format not detected by htslib!"); default : vrb.error("Unknown error!"); } } //Sample processing int n_samples = bcf_hdr_nsamples(sr->readers[0].header); for (int i0 = 0 ; i0 < n_samples ; i0 ++) { mappingS.push_back(findSample(string(sr->readers[0].header->samples[i0]))); if (mappingS.back() >= 0) n_includedS++; } //Read genotype data int ngt, ngt_arr = 0, nds, nds_arr = 0, * gt_arr = NULL, nsl, nsl_arr = 0, * sl_arr = NULL; float * ds_arr = NULL; bcf1_t * line; unsigned int linecount = 0; while(bcf_sr_next_line (sr)) { linecount ++; if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines"); line = bcf_sr_get_line(sr, 0); if (line->n_allele == 2) { ngt = bcf_get_genotypes(sr->readers[0].header, line, >_arr, &ngt_arr); nds = bcf_get_format_float(sr->readers[0].header, line,"DS", &ds_arr, &nds_arr); if (nds == n_samples || ngt == 2*n_samples) { bcf_unpack(line, BCF_UN_STR); string sid = string(line->d.id); if (filter_genotype.check(sid)) { genotype_id.push_back(sid); genotype_chr.push_back(string(bcf_hdr_id2name(sr->readers[0].header, line->rid))); string genotype_ref = string(line->d.allele[0]); genotype_start.push_back(line->pos + 1); nsl = bcf_get_info_int32(sr->readers[0].header, line, "END", &sl_arr, &nsl_arr); if (nsl >= 0 && nsl_arr == 1) genotype_end.push_back(sl_arr[0]); else genotype_end.push_back(genotype_start.back() + genotype_ref.size() - 1); genotype_val.push_back(vector < float > (sample_count, 0.0)); for(int i = 0 ; i < n_samples ; i ++) { if (mappingS[i] >= 0) { if (nds > 0) genotype_val.back()[mappingS[i]] = ds_arr[i]; else { if (gt_arr[2*i+0] == bcf_gt_missing || gt_arr[2*i+1] == bcf_gt_missing) genotype_val.back()[mappingS[i]] = bcf_float_missing; else genotype_val.back()[mappingS[i]] = bcf_gt_allele(gt_arr[2*i+0]) + bcf_gt_allele(gt_arr[2*i+1]); } } } pair < string, int > temp (sid,n_includedG); genotype_id_to_idx.insert(temp); n_includedG++; } else n_excludedG_user ++; } else n_excludedG_void ++; } else n_excludedG_mult ++; } //Finalize free(gt_arr); free(ds_arr); bcf_sr_destroy(sr); genotype_count = n_includedG; //vrb.bullet(stb.str(n_includedG) + " variants included"); //if (n_excludedG_user > 0) vrb.bullet(stb.str(n_excludedG_user) + " variants excluded by user"); //if (n_excludedG_mult > 0) vrb.bullet(stb.str(n_excludedG_mult) + " multi-allelic variants excluded"); //if (n_excludedG_void > 0) vrb.bullet(stb.str(n_excludedG_void) + " uninformative variants excluded [no GT/DS]"); //if (genotype_count == 0) vrb.leave("Cannot find genotypes in target region!"); }
void isec_vcf(args_t *args) { bcf_srs_t *files = args->files; kstring_t str = {0,0,0}; htsFile *out_fh = NULL; // When only one VCF is output, print VCF to pysam_stdout or -o file int out_std = 0; if ( args->nwrite==1 && !args->prefix ) out_std = 1; if ( args->targets_list && files->nreaders==1 ) out_std = 1; if ( out_std ) { out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode(args->output_type)); if ( out_fh == NULL ) error("Can't write to %s: %s\n", args->output_fname? args->output_fname : "standard output", strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if (args->record_cmd_line) bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec"); bcf_hdr_write(out_fh, files->readers[args->iwrite].header); } if ( !args->nwrite && !out_std && !args->prefix ) fprintf(pysam_stderr,"Note: -w option not given, printing list of sites...\n"); int n; while ( (n=bcf_sr_next_line(files)) ) { bcf_sr_t *reader = NULL; bcf1_t *line = NULL; int i, ret = 0; for (i=0; i<files->nreaders; i++) { if ( !bcf_sr_has_line(files,i) ) continue; if ( args->nflt && args->flt[i] ) { bcf1_t *rec = bcf_sr_get_line(files, i); int pass = filter_test(args->flt[i], rec, NULL); if ( args->flt_logic[i] & FLT_EXCLUDE ) pass = pass ? 0 : 1; if ( !pass ) { files->has_line[i] = 0; n--; continue; } } if ( !line ) { line = files->readers[i].buffer[0]; reader = &files->readers[i]; } ret |= 1<<i; // this may overflow for many files, but will be used only with two (OP_VENN) } switch (args->isec_op) { case OP_COMPLEMENT: if ( n!=1 || !bcf_sr_has_line(files,0) ) continue; break; case OP_EQUAL: if ( n != args->isec_n ) continue; break; case OP_PLUS: if ( n < args->isec_n ) continue; break; case OP_MINUS: if ( n > args->isec_n ) continue; break; case OP_EXACT: for (i=0; i<files->nreaders; i++) if ( files->has_line[i] != args->isec_exact[i] ) break; if ( i<files->nreaders ) continue; break; } if ( out_std ) { if ( bcf_sr_has_line(files,args->iwrite) ) bcf_write1(out_fh, files->readers[args->iwrite].header, files->readers[args->iwrite].buffer[0]); continue; } else if ( args->fh_sites ) { str.l = 0; kputs(reader->header->id[BCF_DT_CTG][line->rid].key, &str); kputc('\t', &str); kputw(line->pos+1, &str); kputc('\t', &str); if (line->n_allele > 0) kputs(line->d.allele[0], &str); else kputc('.', &str); kputc('\t', &str); if (line->n_allele > 1) kputs(line->d.allele[1], &str); else kputc('.', &str); for (i=2; i<line->n_allele; i++) { kputc(',', &str); kputs(line->d.allele[i], &str); } kputc('\t', &str); for (i=0; i<files->nreaders; i++) kputc(bcf_sr_has_line(files,i)?'1':'0', &str); kputc('\n', &str); fwrite(str.s,sizeof(char),str.l,args->fh_sites); } if ( args->prefix ) { if ( args->isec_op==OP_VENN && ret==3 ) { if ( !args->nwrite || args->write[0] ) bcf_write1(args->fh_out[2], bcf_sr_get_header(files,0), bcf_sr_get_line(files,0)); if ( !args->nwrite || args->write[1] ) bcf_write1(args->fh_out[3], bcf_sr_get_header(files,1), bcf_sr_get_line(files,1)); } else { for (i=0; i<files->nreaders; i++) { if ( !bcf_sr_has_line(files,i) ) continue; if ( args->write && !args->write[i] ) continue; bcf_write1(args->fh_out[i], files->readers[i].header, files->readers[i].buffer[0]); } } } } if ( str.s ) free(str.s); if ( out_fh ) hts_close(out_fh); }
int main_vcfview(int argc, char *argv[]) { int c; args_t *args = (args_t*) calloc(1,sizeof(args_t)); args->argc = argc; args->argv = argv; args->files = bcf_sr_init(); args->clevel = -1; args->print_header = 1; args->update_info = 1; args->output_type = FT_VCF; int targets_is_file = 0, regions_is_file = 0; static struct option loptions[] = { {"genotype",1,0,'g'}, {"compression-level",1,0,'l'}, {"header-only",0,0,'h'}, {"no-header",0,0,'H'}, {"exclude",1,0,'e'}, {"include",1,0,'i'}, {"trim-alt-alleles",0,0,'a'}, {"no-update",0,0,'I'}, {"drop-genotypes",0,0,'G'}, {"private",0,0,'x'}, {"exclude-private",0,0,'X'}, {"uncalled",0,0,'u'}, {"exclude-uncalled",0,0,'U'}, {"apply-filters",1,0,'f'}, {"known",0,0,'k'}, {"novel",0,0,'n'}, {"min-alleles",1,0,'m'}, {"max-alleles",1,0,'M'}, {"samples",1,0,'s'}, {"samples-file",1,0,'S'}, {"force-samples",0,0,1}, {"output-type",1,0,'O'}, {"output-file",1,0,'o'}, {"types",1,0,'v'}, {"exclude-types",1,0,'V'}, {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, {"regions",1,0,'r'}, {"regions-file",1,0,'R'}, {"min-ac",1,0,'c'}, {"max-ac",1,0,'C'}, {"min-af",1,0,'q'}, {"max-af",1,0,'Q'}, {"phased",0,0,'p'}, {"exclude-phased",0,0,'P'}, {0,0,0,0} }; char *tmp; while ((c = getopt_long(argc, argv, "l:t:T:r:R:o:O:s:S:Gf:knv:V:m:M:auUhHc:C:Ii:e:xXpPq:Q:g:",loptions,NULL)) >= 0) { char allele_type[8] = "nref"; switch (c) { case 'O': switch (optarg[0]) { case 'b': args->output_type = FT_BCF_GZ; break; case 'u': args->output_type = FT_BCF; break; case 'z': args->output_type = FT_VCF_GZ; break; case 'v': args->output_type = FT_VCF; break; default: error("The output type \"%s\" not recognised\n", optarg); }; break; case 'l': args->clevel = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --compression-level %s\n", optarg); args->output_type |= FT_GZ; break; case 'o': args->fn_out = optarg; break; case 'H': args->print_header = 0; break; case 'h': args->header_only = 1; break; case 't': args->targets_list = optarg; break; case 'T': args->targets_list = optarg; targets_is_file = 1; break; case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; case 's': args->sample_names = optarg; break; case 'S': args->sample_names = optarg; args->sample_is_file = 1; break; case 1 : args->force_samples = 1; break; case 'a': args->trim_alts = 1; args->calc_ac = 1; break; case 'I': args->update_info = 0; break; case 'G': args->sites_only = 1; break; case 'f': args->files->apply_filters = optarg; break; case 'k': args->known = 1; break; case 'n': args->novel = 1; break; case 'm': args->min_alleles = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --min-alleles %s\n", optarg); break; case 'M': args->max_alleles = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --max-alleles %s\n", optarg); break; case 'v': args->include_types = optarg; break; case 'V': args->exclude_types = optarg; break; case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'c': { args->min_ac_type = ALLELE_NONREF; if ( sscanf(optarg,"%d:%s",&args->min_ac, allele_type)!=2 && sscanf(optarg,"%d",&args->min_ac)!=1 ) error("Error: Could not parse --min-ac %s\n", optarg); set_allele_type(&args->min_ac_type, allele_type); args->calc_ac = 1; break; } case 'C': { args->max_ac_type = ALLELE_NONREF; if ( sscanf(optarg,"%d:%s",&args->max_ac, allele_type)!=2 && sscanf(optarg,"%d",&args->max_ac)!=1 ) error("Error: Could not parse --max-ac %s\n", optarg); set_allele_type(&args->max_ac_type, allele_type); args->calc_ac = 1; break; } case 'q': { args->min_af_type = ALLELE_NONREF; if ( sscanf(optarg,"%f:%s",&args->min_af, allele_type)!=2 && sscanf(optarg,"%f",&args->min_af)!=1 ) error("Error: Could not parse --min_af %s\n", optarg); set_allele_type(&args->min_af_type, allele_type); args->calc_ac = 1; break; } case 'Q': { args->max_af_type = ALLELE_NONREF; if ( sscanf(optarg,"%f:%s",&args->max_af, allele_type)!=2 && sscanf(optarg,"%f",&args->max_af)!=1 ) error("Error: Could not parse --min_af %s\n", optarg); set_allele_type(&args->max_af_type, allele_type); args->calc_ac = 1; break; } case 'x': args->private_vars |= FLT_INCLUDE; args->calc_ac = 1; break; case 'X': args->private_vars |= FLT_EXCLUDE; args->calc_ac = 1; break; case 'u': args->uncalled |= FLT_INCLUDE; args->calc_ac = 1; break; case 'U': args->uncalled |= FLT_EXCLUDE; args->calc_ac = 1; break; case 'p': args->phased |= FLT_INCLUDE; break; // phased case 'P': args->phased |= FLT_EXCLUDE; break; // exclude-phased case 'g': { if ( !strcasecmp(optarg,"hom") ) args->gt_type = GT_NEED_HOM; else if ( !strcasecmp(optarg,"het") ) args->gt_type = GT_NEED_HET; else if ( !strcasecmp(optarg,"miss") ) args->gt_type = GT_NEED_MISSING; else if ( !strcasecmp(optarg,"^hom") ) args->gt_type = GT_NO_HOM; else if ( !strcasecmp(optarg,"^het") ) args->gt_type = GT_NO_HET; else if ( !strcasecmp(optarg,"^miss") ) args->gt_type = GT_NO_MISSING; else error("The argument to -g not recognised. Expected one of hom/het/miss/^hom/^het/^miss, got \"%s\".\n", optarg); break; } case '?': usage(args); default: error("Unknown argument: %s\n", optarg); } } if ( args->filter_logic == (FLT_EXCLUDE|FLT_INCLUDE) ) error("Only one of -i or -e can be given.\n"); if ( args->private_vars > FLT_EXCLUDE ) error("Only one of -x or -X can be given.\n"); if ( args->uncalled > FLT_EXCLUDE ) error("Only one of -u or -U can be given.\n"); if ( args->phased > FLT_EXCLUDE ) error("Only one of -p or -P can be given.\n"); if ( args->sample_names && args->update_info) args->calc_ac = 1; char *fname = NULL; if ( optind>=argc ) { if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin else usage(args); } else fname = argv[optind]; // read in the regions from the command line if ( args->regions_list ) { if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); } else if ( optind+1 < argc ) { int i; kstring_t tmp = {0,0,0}; kputs(argv[optind+1],&tmp); for (i=optind+2; i<argc; i++) { kputc(',',&tmp); kputs(argv[i],&tmp); } if ( bcf_sr_set_regions(args->files, tmp.s, 0)<0 ) error("Failed to read the regions: %s\n", tmp.s); free(tmp.s); } if ( args->targets_list ) { if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets_list); } if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); init_data(args); bcf_hdr_t *out_hdr = args->hnull ? args->hnull : (args->hsub ? args->hsub : args->hdr); if (args->print_header) bcf_hdr_write(args->out, out_hdr); else if ( args->output_type & FT_BCF ) error("BCF output requires header, cannot proceed with -H\n"); if (!args->header_only) { while ( bcf_sr_next_line(args->files) ) { bcf1_t *line = args->files->readers[0].buffer[0]; if ( line->errcode && out_hdr!=args->hdr ) error("Undefined tags in the header, cannot proceed in the sample subset mode.\n"); if ( subset_vcf(args, line) ) bcf_write1(args->out, out_hdr, line); } } hts_close(args->out); destroy_data(args); bcf_sr_destroy(args->files); free(args); return 0; }
int main_vcfroh(int argc, char *argv[]) { int c; args_t *args = (args_t*) calloc(1,sizeof(args_t)); args->argc = argc; args->argv = argv; args->files = bcf_sr_init(); args->t2AZ = 6.7e-8; args->t2HW = 5e-9; args->rec_rate = 0; int regions_is_file = 0, targets_is_file = 0; static struct option loptions[] = { {"AF-tag",1,0,0}, {"AF-file",1,0,1}, {"AF-dflt",1,0,2}, {"estimate-AF",1,0,'e'}, {"GTs-only",1,0,'G'}, {"sample",1,0,'s'}, {"hw-to-az",1,0,'a'}, {"az-to-hw",1,0,'H'}, {"viterbi-training",0,0,'V'}, {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, {"regions",1,0,'r'}, {"regions-file",1,0,'R'}, {"genetic-map",1,0,'m'}, {"rec-rate",1,0,'M'}, {"skip-indels",0,0,'I'}, {0,0,0,0} }; int naf_opts = 0; char *tmp; while ((c = getopt_long(argc, argv, "h?r:R:t:T:H:a:s:m:M:G:Ia:e:V",loptions,NULL)) >= 0) { switch (c) { case 0: args->af_tag = optarg; naf_opts++; break; case 1: args->af_fname = optarg; naf_opts++; break; case 2: args->dflt_AF = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: --AF-dflt %s\n", optarg); break; case 'e': args->estimate_AF = optarg; naf_opts++; break; case 'I': args->snps_only = 1; break; case 'G': args->fake_PLs = 1; args->unseen_PL = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: -G %s\n", optarg); args->unseen_PL = pow(10,-args->unseen_PL/10.); break; case 'm': args->genmap_fname = optarg; break; case 'M': args->rec_rate = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: -M %s\n", optarg); break; case 's': args->sample = strdup(optarg); break; case 'a': args->t2AZ = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: -a %s\n", optarg); break; case 'H': args->t2HW = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: -H %s\n", optarg); break; case 't': args->targets_list = optarg; break; case 'T': args->targets_list = optarg; targets_is_file = 1; break; case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; case 'V': args->vi_training = 1; break; case 'h': case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); } } if ( argc<optind+1 ) usage(args); if ( args->t2AZ<0 || args->t2AZ>1 ) error("Error: The parameter --hw-to-az is not in [0,1]\n", args->t2AZ); if ( args->t2HW<0 || args->t2HW>1 ) error("Error: The parameter --az-to-hw is not in [0,1]\n", args->t2HW); if ( naf_opts>1 ) error("Error: The options --AF-tag, --AF-file and -e are mutually exclusive\n"); if ( args->af_fname && args->targets_list ) error("Error: The options --AF-file and -t are mutually exclusive\n"); if ( args->regions_list ) { if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); } if ( args->targets_list ) { if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets_list); } if ( args->af_fname ) { if ( bcf_sr_set_targets(args->files, args->af_fname, 1, 3)<0 ) error("Failed to read the targets: %s\n", args->af_fname); } if ( !bcf_sr_add_reader(args->files, argv[optind]) ) error("Failed to open %s: %s\n", argv[optind],bcf_sr_strerror(args->files->errnum)); init_data(args); while ( bcf_sr_next_line(args->files) ) { vcfroh(args, args->files->readers[0].buffer[0]); } vcfroh(args, NULL); fprintf(pysamerr,"Number of lines: total/processed: %d/%d\n", args->ntot,args->nused); destroy_data(args); free(args); return 0; }
void genrich_data::readReferenceGenotypes(string fvcf) { vector < int > mappingS; //Opening files vrb.title("Reading variant list in [" + fvcf + "] MAF=" + stb.str(threshold_maf)); bcf_srs_t * sr = bcf_sr_init(); if(!(bcf_sr_add_reader (sr, fvcf.c_str()))) { switch (sr->errnum) { case not_bgzf: vrb.error("File not compressed with bgzip!"); case idx_load_failed: vrb.error("Impossible to load index file!"); case file_type_error: vrb.error("File format not detected by htslib!"); default : vrb.error("Unknown error!"); } } //Sample processing int included_sample = 0; int n_samples = bcf_hdr_nsamples(sr->readers[0].header); for (int i = 0 ; i < n_samples ; i ++) { mappingS.push_back(findSample(string(sr->readers[0].header->samples[i]))); if (mappingS.back() >= 0) included_sample ++; } vrb.bullet("#samples = " + stb.str(included_sample)); //Variant processing unsigned int n_excludedV_mult = 0, n_excludedV_void = 0, n_excludedV_rare = 0, n_excludedV_uchr = 0, n_line = 0, n_excludedV_toofar = 0; int ngt, ngt_arr = 0, *gt_arr = NULL; bcf1_t * line; while(bcf_sr_next_line (sr)) { line = bcf_sr_get_line(sr, 0); if (line->n_allele == 2) { bcf_unpack(line, BCF_UN_STR); string sid = string(line->d.id); string chr = string(bcf_hdr_id2name(sr->readers[0].header, line->rid)); int chr_idx = findCHR(chr); if (chr_idx >= 0) { unsigned int pos = line->pos + 1; ngt = bcf_get_genotypes(sr->readers[0].header, line, >_arr, &ngt_arr); if (ngt == 2*n_samples) { double freq = 0.0, tot = 0.0; for(int i = 0 ; i < n_samples ; i ++) { assert(gt_arr[2*i+0] != bcf_gt_missing && gt_arr[2*i+1] != bcf_gt_missing); if (mappingS[i] >= 0) { freq += bcf_gt_allele(gt_arr[2*i+0]) + bcf_gt_allele(gt_arr[2*i+1]); tot += 2.0; } } double maf = freq / tot; if (maf > 0.5) maf = 1.0 - maf; if (maf >= threshold_maf) { int dist_tss = getDistance(chr_idx, pos); if (dist_tss < 1e6) { string tmp_id = chr + "_" + stb.str(pos); genotype_uuid.insert(pair < string, unsigned int > (tmp_id, genotype_pos.size())); genotype_chr.push_back(chr_idx); genotype_pos.push_back(pos); genotype_maf.push_back(maf); genotype_dist.push_back(dist_tss); genotype_haps.push_back(vector < bool > (2 * included_sample, false)); for(int i = 0 ; i < n_samples ; i ++) { if (mappingS[i] >= 0) { genotype_haps.back()[2 * mappingS[i] + 0] = bcf_gt_allele(gt_arr[2 * i + 0]); genotype_haps.back()[2 * mappingS[i] + 1] = bcf_gt_allele(gt_arr[2 * i + 1]); } } } else n_excludedV_toofar++; } else n_excludedV_rare ++; } else n_excludedV_void ++; } else n_excludedV_uchr ++; } else n_excludedV_mult ++; if (n_line % 100000 == 0) vrb.bullet("#lines = " + stb.str(n_line)); n_line ++; } genotype_qtl = vector < bool > (genotype_pos.size(), false); genotype_gwas = vector < bool > (genotype_pos.size(), false); genotype_bin = vector < int > (genotype_pos.size(), -1); //Finalize bcf_sr_destroy(sr); vrb.bullet(stb.str(genotype_pos.size()) + " variants included"); if (n_excludedV_mult > 0) vrb.bullet(stb.str(n_excludedV_mult) + " multi-allelic variants excluded"); if (n_excludedV_uchr > 0) vrb.bullet(stb.str(n_excludedV_uchr) + " variants with unreferenced chromosome in --tss"); if (n_excludedV_rare > 0) vrb.bullet(stb.str(n_excludedV_rare) + " maf filtered variants"); if (n_excludedV_toofar > 0) vrb.bullet(stb.str(n_excludedV_toofar) + " too far variants"); }
static void cross_check_gts(args_t *args) { // Initialize things: check which tags are defined in the header, sample names etc. if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 ) { if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 ) error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname); if ( !args->no_PLs ) { fprintf(stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname); args->no_PLs = 99; } } args->nsmpl = bcf_hdr_nsamples(args->sm_hdr); args->narr = (args->nsmpl-1)*args->nsmpl/2; uint32_t *ndif = (uint32_t*) calloc(args->narr,4); uint32_t *ntot = (uint32_t*) calloc(args->narr,4); while ( bcf_sr_next_line(args->files) ) { bcf1_t *line = bcf_sr_get_line(args->files,0); // use PLs unless no_PLs is set and GT exists if ( args->no_PLs ) { if ( process_GT(args,line,ntot,ndif)==0 ) continue; } process_PL(args,line,ntot,ndif); } FILE *fp = stdout; print_header(args, fp); float *tmp = (float*)malloc(sizeof(float)*args->nsmpl*(args->nsmpl-1)/2); // Output pairwise distances fprintf(fp, "# ERR, error rate\t[2]Pairwise error rate\t[3]Number of sites compared\t[4]Sample i\t[5]Sample j\n"); int i,j, idx = 0; for (i=0; i<args->nsmpl; i++) { for (j=0; j<i; j++) { float err = ntot[idx] ? (float)ndif[idx]/ntot[idx] : 1e-10; fprintf(fp, "ERR\t%f\t%"PRId32"\t%s\t%s\n", err, ntot[idx],args->sm_hdr->samples[i],args->sm_hdr->samples[j]); PDIST(tmp,i,j) = err; idx++; } } // Cluster samples int nlist; float clust_max_err = args->max_intra_err; hclust_t *clust = hclust_init(args->nsmpl,tmp); cluster_t *list = hclust_create_list(clust,args->min_inter_err,&clust_max_err,&nlist); fprintf(fp, "# CLUSTER\t[2]Maximum inter-cluster ERR\t[3-]List of samples\n"); for (i=0; i<nlist; i++) { fprintf(fp,"CLUSTER\t%f", list[i].dist); for (j=0; j<list[i].nmemb; j++) fprintf(fp,"\t%s",args->sm_hdr->samples[list[i].memb[j]]); fprintf(fp,"\n"); } hclust_destroy_list(list,nlist); // Debugging output: the cluster graph and data used for deciding char **dbg = hclust_explain(clust,&nlist); for (i=0; i<nlist; i++) fprintf(fp,"DBG\t%s\n", dbg[i]); fprintf(fp, "# TH, clustering threshold\t[2]Value\nTH\t%f\n",clust_max_err); fprintf(fp, "# DOT\t[2]Cluster graph, visualize e.g. as \"this-output.txt | grep ^DOT | cut -f2- | dot -Tsvg -o graph.svg\"\n"); fprintf(fp, "DOT\t%s\n", hclust_create_dot(clust,args->sm_hdr->samples,clust_max_err)); hclust_destroy(clust); free(tmp); // Deprecated output for temporary backward compatibility fprintf(fp, "# Warning: The CN block is deprecated and will be removed in future releases. Use ERR instead.\n"); fprintf(fp, "# [1]CN\t[2]Discordance\t[3]Number of sites\t[4]Average minimum depth\t[5]Sample i\t[6]Sample j\n"); idx = 0; for (i=0; i<args->nsmpl; i++) { for (j=0; j<i; j++) { fprintf(fp, "CN\t%"PRId32"\t%"PRId32"\t0\t%s\t%s\n", ndif[idx], ntot[idx],args->sm_hdr->samples[i],args->sm_hdr->samples[j]); idx++; } } free(ndif); free(ntot); free(args->tmp_arr); }
int process_region_guess(args_t *args, char *seq, regitr_t *itr) { int kitr = 1; uint32_t start = 0, end = INT_MAX; reg_stats_t *stats = NULL; // set the start and the end position if ( itr ) { start = itr->reg[itr->i].start; end = itr->reg[itr->i].end; // flush all records with the same coordinates while ( itr->i+kitr<itr->n && start==itr->reg[itr->i+kitr].start && end==itr->reg[itr->i+kitr].end ) kitr++; int min,max,ret = ploidy_query(args->ploidy, seq, start, args->sex2ploidy, &min, &max); assert(ret); stats = expand_regs(args, seq,start,end); } else { // background region int spos, epos; const char *ptr = hts_parse_reg(args->background, &spos, &epos); if ( !ptr ) error("Could not parse the region: %s\n", args->background); seq = (char*) malloc(ptr - args->background + 1); memcpy(seq,args->background,ptr-args->background); seq[ptr-args->background] = 0; start = spos; end = epos; } if ( bcf_sr_seek(args->sr,seq,start)!=0 ) { // sequence not present if ( !itr ) free(seq); return kitr; } int ismpl, rid = bcf_hdr_name2id(args->hdr,seq); if ( !itr ) free(seq); while ( bcf_sr_next_line(args->sr) ) { bcf1_t *rec = bcf_sr_get_line(args->sr,0); if ( rec->rid!=rid || rec->pos > end ) break; if ( args->guess & GUESS_GT ) // use GTs to guess the ploidy { bcf_fmt_t *fmt = bcf_get_fmt(args->hdr, rec, "GT"); if ( !fmt ) continue; for (ismpl=0; ismpl<args->nsample; ismpl++) { count_t *counts = stats ? &stats->counts[ismpl] : &args->bg_counts[ismpl]; int gt = bcf_gt_type(fmt, ismpl, NULL,NULL); if ( gt==GT_UNKN ) counts->nmiss++; else if ( gt==GT_HET_RA || gt==GT_HET_AA ) counts->nhet++; else counts->nhom++; } } else // use PLs to guess the ploidy { int gl2pl = args->guess & GUESS_PL ? 1 : -1; int npl = bcf_get_format_int32(args->hdr,rec,args->guess&GUESS_PL?"PL":"GL",&args->pls,&args->npls); if ( npl<=0 ) continue; npl /= args->nsample; for (ismpl=0; ismpl<args->nsample; ismpl++) { int32_t *ptr = args->pls + ismpl*npl; int phom = INT_MAX, phet = INT_MAX, ial, jal, k = 0; for (ial=0; ial<rec->n_allele; ial++) { for (jal=0; jal<ial; jal++) { if ( ptr[k] == bcf_int32_missing || ptr[k] == bcf_int32_vector_end ) break; ptr[k] *= gl2pl; if ( phet > ptr[k] ) phet = ptr[k]; k++; } if ( ptr[k] == bcf_int32_missing || ptr[k] == bcf_int32_vector_end ) break; ptr[k] *= gl2pl; if ( phom > ptr[k] ) phom = ptr[k]; k++; } count_t *counts = stats ? &stats->counts[ismpl] : &args->bg_counts[ismpl]; if ( k == rec->n_allele ) counts->nhom++; // haploid else if ( phet == phom || k != rec->n_allele*(rec->n_allele+1)/2 ) counts->nmiss++; else if ( phet < phom ) counts->nhet++; else counts->nhom++; } } } return kitr; }
static void concat(args_t *args) { int i; if ( args->phased_concat ) // phased concat { // keep only two open files at a time while ( args->ifname < args->nfnames ) { int new_file = 0; while ( args->files->nreaders < 2 && args->ifname < args->nfnames ) { if ( !bcf_sr_add_reader(args->files,args->fnames[args->ifname]) ) error("Failed to open %s: %s\n", args->fnames[args->ifname],bcf_sr_strerror(args->files->errnum)); new_file = 1; args->ifname++; if ( args->start_pos[args->ifname-1]==-1 ) break; // new chromosome, start with only one file open if ( args->ifname < args->nfnames && args->start_pos[args->ifname]==-1 ) break; // next file starts on a different chromosome } // is there a line from the previous run? Seek the newly opened reader to that position int seek_pos = -1; int seek_chr = -1; if ( bcf_sr_has_line(args->files,0) ) { bcf1_t *line = bcf_sr_get_line(args->files,0); bcf_sr_seek(args->files, bcf_seqname(args->files->readers[0].header,line), line->pos); seek_pos = line->pos; seek_chr = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[0].header,line)); } else if ( new_file ) bcf_sr_seek(args->files,NULL,0); // set to start int nret; while ( (nret = bcf_sr_next_line(args->files)) ) { if ( !bcf_sr_has_line(args->files,0) ) // no input from the first reader { // We are assuming that there is a perfect overlap, sites which are not present in both files are dropped if ( ! bcf_sr_region_done(args->files,0) ) continue; phased_flush(args); bcf_sr_remove_reader(args->files, 0); } // Get a line to learn about current position for (i=0; i<args->files->nreaders; i++) if ( bcf_sr_has_line(args->files,i) ) break; bcf1_t *line = bcf_sr_get_line(args->files,i); // This can happen after bcf_sr_seek: indel may start before the coordinate which we seek to. if ( seek_chr>=0 && seek_pos>line->pos && seek_chr==bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[i].header,line)) ) continue; seek_pos = seek_chr = -1; // Check if the position overlaps with the next, yet unopened, reader int must_seek = 0; while ( args->ifname < args->nfnames && args->start_pos[args->ifname]!=-1 && line->pos >= args->start_pos[args->ifname] ) { must_seek = 1; if ( !bcf_sr_add_reader(args->files,args->fnames[args->ifname]) ) error("Failed to open %s: %s\n", args->fnames[args->ifname],bcf_sr_strerror(args->files->errnum)); args->ifname++; } if ( must_seek ) { bcf_sr_seek(args->files, bcf_seqname(args->files->readers[i].header,line), line->pos); seek_pos = line->pos; seek_chr = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[i].header,line)); continue; } // We are assuming that there is a perfect overlap, sites which are not present in both files are dropped if ( args->files->nreaders>1 && !bcf_sr_has_line(args->files,1) && !bcf_sr_region_done(args->files,1) ) continue; phased_push(args, bcf_sr_get_line(args->files,0), args->files->nreaders>1 ? bcf_sr_get_line(args->files,1) : NULL); } if ( args->files->nreaders ) { phased_flush(args); while ( args->files->nreaders ) bcf_sr_remove_reader(args->files, 0); } } } else if ( args->files ) // combining overlapping files, using synced reader { while ( bcf_sr_next_line(args->files) ) { for (i=0; i<args->files->nreaders; i++) { bcf1_t *line = bcf_sr_get_line(args->files,i); if ( !line ) continue; bcf_translate(args->out_hdr, args->files->readers[i].header, line); bcf_write1(args->out_fh, args->out_hdr, line); if ( args->remove_dups ) break; } } } else // concatenating { kstring_t tmp = {0,0,0}; int prev_chr_id = -1, prev_pos; bcf1_t *line = bcf_init(); for (i=0; i<args->nfnames; i++) { htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("Failed to open: %s\n", args->fnames[i]); bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]); if ( !fp->is_bin && args->output_type&FT_VCF ) { line->max_unpack = BCF_UN_STR; // if VCF is on both input and output, avoid VCF to BCF conversion while ( hts_getline(fp, KS_SEP_LINE, &fp->line) >=0 ) { char *str = fp->line.s; while ( *str && *str!='\t' ) str++; tmp.l = 0; kputsn(fp->line.s,str-fp->line.s,&tmp); int chr_id = bcf_hdr_name2id(args->out_hdr, tmp.s); if ( chr_id<0 ) error("The sequence \"%s\" not defined in the header: %s\n(Quick workaround: index the file.)\n", tmp.s, args->fnames[i]); if ( prev_chr_id!=chr_id ) { prev_pos = -1; if ( args->seen_seq[chr_id] ) error("\nThe chromosome block %s is not contiguous, consider running with -a.\n", tmp.s); } char *end; int pos = strtol(str+1,&end,10) - 1; if ( end==str+1 ) error("Could not parse line: %s\n", fp->line.s); if ( prev_pos > pos ) error("The chromosome block %s is not sorted, consider running with -a.\n", tmp.s); args->seen_seq[chr_id] = 1; prev_chr_id = chr_id; if ( vcf_write_line(args->out_fh, &fp->line)!=0 ) error("Failed to write %d bytes\n", fp->line.l); } } else { // BCF conversion is required line->max_unpack = 0; while ( bcf_read(fp, hdr, line)==0 ) { bcf_translate(args->out_hdr, hdr, line); if ( prev_chr_id!=line->rid ) { prev_pos = -1; if ( args->seen_seq[line->rid] ) error("\nThe chromosome block %s is not contiguous, consider running with -a.\n", bcf_seqname(args->out_hdr, line)); } if ( prev_pos > line->pos ) error("The chromosome block %s is not sorted, consider running with -a.\n", bcf_seqname(args->out_hdr, line)); args->seen_seq[line->rid] = 1; prev_chr_id = line->rid; if ( bcf_write(args->out_fh, args->out_hdr, line)!=0 ) error("Failed to write\n"); } } bcf_hdr_destroy(hdr); hts_close(fp); } bcf_destroy(line); free(tmp.s); } }
int run(int argc, char **argv) { char *trio_samples = NULL, *trio_file = NULL, *rules_fname = NULL, *rules_string = NULL; memset(&args,0,sizeof(args_t)); args.mode = 0; args.output_fname = "-"; static struct option loptions[] = { {"trio",1,0,'t'}, {"trio-file",1,0,'T'}, {"delete",0,0,'d'}, {"list",1,0,'l'}, {"count",0,0,'c'}, {"rules",1,0,'r'}, {"rules-file",1,0,'R'}, {"output",required_argument,NULL,'o'}, {"output-type",required_argument,NULL,'O'}, {0,0,0,0} }; int c; while ((c = getopt_long(argc, argv, "?ht:T:l:cdr:R:o:O:",loptions,NULL)) >= 0) { switch (c) { case 'o': args.output_fname = optarg; break; case 'O': switch (optarg[0]) { case 'b': args.output_type = FT_BCF_GZ; break; case 'u': args.output_type = FT_BCF; break; case 'z': args.output_type = FT_VCF_GZ; break; case 'v': args.output_type = FT_VCF; break; default: error("The output type \"%s\" not recognised\n", optarg); }; break; case 'R': rules_fname = optarg; break; case 'r': rules_string = optarg; break; case 'd': args.mode |= MODE_DELETE; break; case 'c': args.mode |= MODE_COUNT; break; case 'l': if ( !strcmp("+",optarg) ) args.mode |= MODE_LIST_GOOD; else if ( !strcmp("x",optarg) ) args.mode |= MODE_LIST_BAD; else error("The argument not recognised: --list %s\n", optarg); break; case 't': trio_samples = optarg; break; case 'T': trio_file = optarg; break; case 'h': case '?': default: error("%s",usage()); break; } } if ( rules_fname ) args.rules = regidx_init(rules_fname, parse_rules, NULL, sizeof(rule_t), &args); else args.rules = init_rules(&args, rules_string); if ( !args.rules ) return -1; args.itr = regitr_init(args.rules); args.itr_ori = regitr_init(args.rules); char *fname = NULL; if ( optind>=argc || argv[optind][0]=='-' ) { if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin else error("%s",usage()); } else fname = argv[optind]; if ( !trio_samples && !trio_file ) error("Expected the -t/T option\n"); if ( !args.mode ) error("Expected one of the -c, -d or -l options\n"); if ( args.mode&MODE_DELETE && !(args.mode&(MODE_LIST_GOOD|MODE_LIST_BAD)) ) args.mode |= MODE_LIST_GOOD|MODE_LIST_BAD; args.sr = bcf_sr_init(); if ( !bcf_sr_add_reader(args.sr, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args.sr->errnum)); args.hdr = bcf_sr_get_header(args.sr, 0); args.out_fh = hts_open(args.output_fname,hts_bcf_wmode(args.output_type)); if ( args.out_fh == NULL ) error("Can't write to \"%s\": %s\n", args.output_fname, strerror(errno)); bcf_hdr_write(args.out_fh, args.hdr); int i, n = 0; char **list; if ( trio_samples ) { args.ntrios = 1; args.trios = (trio_t*) calloc(1,sizeof(trio_t)); list = hts_readlist(trio_samples, 0, &n); if ( n!=3 ) error("Expected three sample names with -t\n"); args.trios[0].imother = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[0]); args.trios[0].ifather = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[1]); args.trios[0].ichild = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[2]); for (i=0; i<n; i++) free(list[i]); free(list); } if ( trio_file ) { list = hts_readlist(trio_file, 1, &n); args.ntrios = n; args.trios = (trio_t*) calloc(n,sizeof(trio_t)); for (i=0; i<n; i++) { char *ss = list[i], *se; se = strchr(ss, ','); if ( !se ) error("Could not parse %s: %s\n",trio_file, ss); *se = 0; args.trios[i].imother = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, ss); if ( args.trios[i].imother<0 ) error("No such sample: \"%s\"\n", ss); ss = ++se; se = strchr(ss, ','); if ( !se ) error("Could not parse %s\n",trio_file); *se = 0; args.trios[i].ifather = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, ss); if ( args.trios[i].ifather<0 ) error("No such sample: \"%s\"\n", ss); ss = ++se; if ( *ss=='\0' ) error("Could not parse %s\n",trio_file); args.trios[i].ichild = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, ss); if ( args.trios[i].ichild<0 ) error("No such sample: \"%s\"\n", ss); free(list[i]); } free(list); } while ( bcf_sr_next_line(args.sr) ) { bcf1_t *line = bcf_sr_get_line(args.sr,0); line = process(line); if ( line ) { if ( line->errcode ) error("TODO: Unchecked error (%d), exiting\n",line->errcode); bcf_write1(args.out_fh, args.hdr, line); } } fprintf(stderr,"# [1]nOK\t[2]nBad\t[3]nSkipped\t[4]Trio\n"); for (i=0; i<args.ntrios; i++) { trio_t *trio = &args.trios[i]; fprintf(stderr,"%d\t%d\t%d\t%s,%s,%s\n", trio->nok,trio->nbad,args.nrec-(trio->nok+trio->nbad), bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->imother), bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->ifather), bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->ichild) ); } free(args.gt_arr); free(args.trios); regitr_destroy(args.itr); regitr_destroy(args.itr_ori); regidx_destroy(args.rules); bcf_sr_destroy(args.sr); if ( hts_close(args.out_fh)!=0 ) error("Error: close failed\n"); return 0; }
static void init_data(args_t *args) { bcf_srs_t *files = bcf_sr_init(); if ( args->regions_list ) { if ( bcf_sr_set_regions(files, args->regions_list, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); } if ( args->targets_list ) { if ( bcf_sr_set_targets(files, args->targets_list, args->targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets_list); } if ( !bcf_sr_add_reader(files, args->fname) ) error("Failed to open %s: %s\n", args->fname,bcf_sr_strerror(files->errnum)); bcf_hdr_t *hdr = files->readers[0].header; if ( !args->sample ) { if ( bcf_hdr_nsamples(hdr)>1 ) error("Missing the option -s, --sample\n"); args->sample = hdr->samples[0]; } else if ( bcf_hdr_id2int(hdr,BCF_DT_SAMPLE,args->sample)<0 ) error("No such sample: %s\n", args->sample); int ret = bcf_hdr_set_samples(hdr, args->sample, 0); if ( ret<0 ) error("Error setting the sample: %s\n", args->sample); if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,bcf_hdr_id2int(hdr,BCF_DT_ID,"BAF")) ) error("The tag FORMAT/BAF is not present in the VCF: %s\n", args->fname); int i; args->xvals = (double*) calloc(args->nbins,sizeof(double)); for (i=0; i<args->nbins; i++) args->xvals[i] = 1.0*i/(args->nbins-1); // collect BAF distributions for all chromosomes int idist = -1, nbaf = 0, nprocessed = 0, ntotal = 0, prev_chr = -1; float *baf = NULL; while ( bcf_sr_next_line(files) ) { ntotal++; bcf1_t *line = bcf_sr_get_line(files,0); if ( bcf_get_format_float(hdr,line,"BAF",&baf,&nbaf) != 1 ) continue; if ( bcf_float_is_missing(baf[0]) ) continue; nprocessed++; if ( prev_chr==-1 || prev_chr!=line->rid ) { // new chromosome idist = args->ndist++; args->dist = (dist_t*) realloc(args->dist, sizeof(dist_t)*args->ndist); memset(&args->dist[idist],0,sizeof(dist_t)); args->dist[idist].chr = strdup(bcf_seqname(hdr,line)); args->dist[idist].yvals = (double*) calloc(args->nbins,sizeof(double)); args->dist[idist].xvals = args->xvals; args->dist[idist].nvals = args->nbins; prev_chr = line->rid; } int bin = baf[0]*(args->nbins-1); args->dist[idist].yvals[bin]++; // the distribution } free(baf); bcf_sr_destroy(files); for (idist=0; idist<args->ndist; idist++) { #if 0 int j; for (j=0; j<args->nbins; j++) { double x = args->dist[idist].xvals[j]; args->dist[idist].yvals[j] = exp(-(x-0.5)*(x-0.5)/1e-3); } #endif init_dist(args, &args->dist[idist],args->verbose); } args->dat_fp = open_file(&args->dat_fname,"w","%s/dist.dat", args->output_dir); fprintf(args->dat_fp, "# This file was produced by: bcftools polysomy(%s+htslib-%s), the command line was:\n", bcftools_version(),hts_version()); fprintf(args->dat_fp, "# \t bcftools %s ", args->argv[0]); for (i=1; i<args->argc; i++) fprintf(args->dat_fp, " %s",args->argv[i]); fprintf(args->dat_fp,"\n#\n"); fprintf(args->dat_fp,"# DIST\t[2]Chrom\t[3]BAF\t[4]Normalized Count\n"); fprintf(args->dat_fp,"# FIT\t[2]Goodness of Fit\t[3]iFrom\t[4]iTo\t[5]The Fitted Function\n"); fprintf(args->dat_fp,"# CN\t[2]Chrom\t[3]Estimated Copy Number\t[4]Absolute fit deviation\n"); char *fname = NULL; FILE *fp = open_file(&fname,"w","%s/dist.py", args->output_dir); //-------- matplotlib script -------------- fprintf(fp, "#!/usr/bin/env python\n" "#\n" "import matplotlib as mpl\n" "mpl.use('Agg')\n" "import matplotlib.pyplot as plt\n" "import csv,sys,argparse\n" "from math import exp\n" "\n" "outdir = '%s'\n" "\n" "def read_dat(dat,fit,cn):\n" " csv.register_dialect('tab', delimiter='\t', quoting=csv.QUOTE_NONE)\n" " with open(outdir+'/dist.dat', 'rb') as f:\n" " reader = csv.reader(f, 'tab')\n" " for row in reader:\n" " if row[0][0]=='#': continue\n" " type = row[0]\n" " chr = row[1]\n" " if type=='DIST':\n" " if chr not in dat: dat[chr] = []\n" " dat[chr].append(row)\n" " elif type=='FIT':\n" " if chr not in fit: fit[chr] = []\n" " fit[chr].append(row)\n" " elif type=='CN':\n" " cn[chr] = row[2]\n" "\n" "def plot_dist(dat,fit,chr):\n" " fig, ax = plt.subplots(1, 1, figsize=(7,5))\n" " ax.plot([x[2] for x in dat[chr]],[x[3] for x in dat[chr]],'k-',label='Distribution')\n" " if chr in fit:\n" " for i in range(len(fit[chr])):\n" " pfit = fit[chr][i]\n" " exec('def xfit(x): return '+pfit[5])\n" " istart = int(pfit[3])\n" " iend = int(pfit[4])+1\n" " vals = dat[chr][istart:iend]\n" " args = {}\n" " if i==0: args = {'label':'Target to Fit'}\n" " ax.plot([x[2] for x in vals],[x[3] for x in vals],'r-',**args)\n" " if i==0: args = {'label':'Best Fit'}\n" " ax.plot([x[2] for x in vals],[xfit(float(x[2])) for x in vals],'g-',**args)\n" " ax.set_title('BAF distribution, chr'+chr)\n" " ax.set_xlabel('BAF')\n" " ax.set_ylabel('Frequency')\n" " ax.legend(loc='best',prop={'size':7},frameon=False)\n" " plt.savefig(outdir+'/dist.chr'+chr+'.png')\n" " plt.close()\n" "\n" "def plot_copy_number(cn):\n" " fig, ax = plt.subplots(1, 1, figsize=(7,5))\n" " xlabels = sorted(cn.keys())\n" " xvals = range(len(xlabels))\n" " yvals = [float(cn[x]) for x in xlabels]\n" " ax.plot(xvals,yvals,'o',color='red')\n" " for i in range(len(xvals)):\n" " if yvals[i]==-1: ax.annotate('?', xy=(xvals[i],0.5),va='center',ha='center',color='red',fontweight='bold')\n" " ax.tick_params(axis='both', which='major', labelsize=9)\n" " ax.set_xticks(xvals)\n" " ax.set_xticklabels(xlabels,rotation=45)\n" " ax.set_xlim(-1,len(xlabels))\n" " ax.set_ylim(0,5.0)\n" " ax.set_yticks([1.0,2.0,3.0,4.0])\n" " ax.set_xlabel('Chromosome')\n" " ax.set_ylabel('Copy Number')\n" " plt.savefig(outdir+'/copy-number.png')\n" " plt.close()\n" "\n" "class myParser(argparse.ArgumentParser):\n" " def error(self, message):\n" " self.print_help()\n" " sys.stderr.write('error: %%s\\n' %% message)\n" " sys.exit(2)\n" "\n" "def main():\n" " parser = myParser()\n" " parser.add_argument('-a', '--all', action='store_true', help='Create all plots')\n" " parser.add_argument('-c', '--copy-number', action='store_true', help='Create copy-number plot')\n" " parser.add_argument('-d', '--distrib', metavar='CHR', help='Plot BAF distribution of a single chromosome')\n" " args = parser.parse_args()\n" " dat = {}; fit = {}; cn = {}\n" " read_dat(dat,fit,cn)\n" " if args.distrib!=None:\n" " plot_dist(dat,fit,args.distrib)\n" " if args.all:\n" " for chr in dat: plot_dist(dat,fit,chr)\n" " plot_copy_number(cn)\n" " elif args.copy_number:\n" " plot_copy_number(cn)\n" " else:\n" " for chr in dat: plot_dist(dat,fit,chr)\n" "\n" "if __name__ == '__main__':\n" " main()\n", args->output_dir); //--------------------------------------- chmod(fname, S_IWUSR|S_IRUSR|S_IRGRP|S_IROTH|S_IXUSR|S_IXGRP|S_IXOTH); free(fname); fclose(fp); }
int main_plugin(int argc, char *argv[]) { int c; args_t *args = (args_t*) calloc(1,sizeof(args_t)); args->argc = argc; args->argv = argv; args->files = bcf_sr_init(); args->output_fname = "-"; args->output_type = FT_VCF; args->nplugin_paths = -1; int regions_is_file = 0, targets_is_file = 0, plist_only = 0; if ( argc==1 ) usage(args); char *plugin_name = NULL; if ( argv[1][0]!='-' ) { plugin_name = argv[1]; argc--; argv++; } static struct option loptions[] = { {"verbose",0,0,'v'}, {"help",0,0,'h'}, {"list-plugins",0,0,'l'}, {"output",1,0,'o'}, {"output-type",1,0,'O'}, {"include",1,0,'i'}, {"exclude",1,0,'e'}, {"regions",1,0,'r'}, {"regions-file",1,0,'R'}, {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, {0,0,0,0} }; while ((c = getopt_long(argc, argv, "h?o:O:r:R:li:e:v",loptions,NULL)) >= 0) { switch (c) { case 'v': args->verbose = 1; break; case 'o': args->output_fname = optarg; break; case 'O': switch (optarg[0]) { case 'b': args->output_type = FT_BCF_GZ; break; case 'u': args->output_type = FT_BCF; break; case 'z': args->output_type = FT_VCF_GZ; break; case 'v': args->output_type = FT_VCF; break; default: error("The output type \"%s\" not recognised\n", optarg); }; break; case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; case 't': args->targets_list = optarg; break; case 'T': args->targets_list = optarg; targets_is_file = 1; break; case 'l': plist_only = 1; break; case '?': case 'h': load_plugin(args, plugin_name, 1, &args->plugin); fprintf(stderr,"%s",args->plugin.usage()); return 0; break; default: error("Unknown argument: %s\n", optarg); } } if ( plist_only ) return list_plugins(args); char *fname = NULL; if ( optind>=argc || argv[optind][0]=='-' ) { if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin else usage(args); args->plugin.argc = argc - optind + 1; args->plugin.argv = argv + optind - 1; } else { fname = argv[optind]; args->plugin.argc = argc - optind; args->plugin.argv = argv + optind; } optind = 0; args->plugin.argv[0] = plugin_name; load_plugin(args, plugin_name, 1, &args->plugin); if ( args->regions_list ) { if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); } if ( args->targets_list ) { if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets_list); args->files->collapse |= COLLAPSE_SOME; } if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open or the file not indexed: %s\n", fname); init_data(args); while ( bcf_sr_next_line(args->files) ) { bcf1_t *line = bcf_sr_get_line(args->files,0); if ( args->filter ) { int pass = filter_test(args->filter, line, NULL); if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1; if ( !pass ) continue; } line = args->plugin.process(line); if ( line ) bcf_write1(args->out_fh, args->hdr_out, line); } destroy_data(args); bcf_sr_destroy(args->files); free(args); return 0; }
int main_vcfcall(int argc, char *argv[]) { char *samples_fname = NULL; args_t args; memset(&args, 0, sizeof(args_t)); args.argc = argc; args.argv = argv; args.aux.prior_type = -1; args.aux.indel_frac = -1; args.aux.theta = 1e-3; args.aux.pref = 0.5; args.aux.min_perm_p = 0.01; args.aux.min_lrt = 1; args.flag = CF_ACGT_ONLY; args.output_fname = "-"; args.output_type = FT_VCF; args.aux.trio_Pm_SNPs = 1 - 1e-8; args.aux.trio_Pm_ins = args.aux.trio_Pm_del = 1 - 1e-9; int i, c, samples_is_file = 0; static struct option loptions[] = { {"help",0,0,'h'}, {"gvcf",1,0,'g'}, {"format-fields",1,0,'f'}, {"output",1,0,'o'}, {"output-type",1,0,'O'}, {"regions",1,0,'r'}, {"regions-file",1,0,'R'}, {"samples",1,0,'s'}, {"samples-file",1,0,'S'}, {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, {"keep-alts",0,0,'A'}, {"insert-missed",0,0,'i'}, {"skip-Ns",0,0,'N'}, // now the new default {"keep-masked-refs",0,0,'M'}, {"skip-variants",1,0,'V'}, {"variants-only",0,0,'v'}, {"consensus-caller",0,0,'c'}, {"constrain",1,0,'C'}, {"multiallelic-caller",0,0,'m'}, {"pval-threshold",1,0,'p'}, {"prior",1,0,'P'}, {"chromosome-X",0,0,'X'}, {"chromosome-Y",0,0,'Y'}, {"novel-rate",1,0,'n'}, {0,0,0,0} }; char *tmp = NULL; while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:XYn:P:f:ig:", loptions, NULL)) >= 0) { switch (c) { case 'g': args.flag |= CF_GVCF; args.gvcf.min_dp = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse, expected integer argument: -g %s\n", optarg); break; case 'f': args.aux.output_tags |= parse_format_flag(optarg); break; case 'M': args.flag &= ~CF_ACGT_ONLY; break; // keep sites where REF is N case 'N': args.flag |= CF_ACGT_ONLY; break; // omit sites where first base in REF is N (the new default) case 'A': args.aux.flag |= CALL_KEEPALT; break; case 'c': args.flag |= CF_CCALL; break; // the original EM based calling method case 'i': args.flag |= CF_INS_MISSED; break; case 'v': args.aux.flag |= CALL_VARONLY; break; case 'o': args.output_fname = optarg; break; case 'O': switch (optarg[0]) { case 'b': args.output_type = FT_BCF_GZ; break; case 'u': args.output_type = FT_BCF; break; case 'z': args.output_type = FT_VCF_GZ; break; case 'v': args.output_type = FT_VCF; break; default: error("The output type \"%s\" not recognised\n", optarg); } break; case 'C': if ( !strcasecmp(optarg,"alleles") ) args.aux.flag |= CALL_CONSTR_ALLELES; else if ( !strcasecmp(optarg,"trio") ) args.aux.flag |= CALL_CONSTR_TRIO; else error("Unknown argument to -C: \"%s\"\n", optarg); break; case 'X': args.aux.flag |= CALL_CHR_X; break; case 'Y': args.aux.flag |= CALL_CHR_Y; break; case 'V': if ( !strcasecmp(optarg,"snps") ) args.flag |= CF_INDEL_ONLY; else if ( !strcasecmp(optarg,"indels") ) args.flag |= CF_NO_INDEL; else error("Unknown skip category \"%s\" (-S argument must be \"snps\" or \"indels\")\n", optarg); break; case 'm': args.flag |= CF_MCALL; break; // multiallelic calling method case 'p': args.aux.pref = atof(optarg); break; case 'P': args.aux.theta = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse, expected float argument: -P %s\n", optarg); break; case 'n': parse_novel_rate(&args,optarg); break; case 'r': args.regions = optarg; break; case 'R': args.regions = optarg; args.regions_is_file = 1; break; case 't': args.targets = optarg; break; case 'T': args.targets = optarg; args.targets_is_file = 1; break; case 's': samples_fname = optarg; break; case 'S': samples_fname = optarg; samples_is_file = 1; break; default: usage(&args); } } if ( optind>=argc ) { if ( !isatty(fileno((FILE *)stdin)) ) args.bcf_fname = "-"; // reading from stdin else usage(&args); } else args.bcf_fname = argv[optind++]; // Sanity check options and initialize if ( samples_fname ) { args.samples = read_samples(&args.aux, samples_fname, samples_is_file, &args.nsamples); args.aux.ploidy = (uint8_t*) calloc(args.nsamples+1, 1); args.aux.all_diploid = 1; for (i=0; i<args.nsamples; i++) { args.aux.ploidy[i] = args.samples[i][strlen(args.samples[i]) + 1]; if ( args.aux.ploidy[i]!=2 ) args.aux.all_diploid = 0; } } if ( args.flag & CF_GVCF ) { // Force some flags to avoid unnecessary branching args.aux.flag &= ~CALL_KEEPALT; args.aux.flag |= CALL_VARONLY; } if ( (args.flag & CF_CCALL ? 1 : 0) + (args.flag & CF_MCALL ? 1 : 0) + (args.flag & CF_QCALL ? 1 : 0) > 1 ) error("Only one of -c or -m options can be given\n"); if ( !(args.flag & CF_CCALL) && !(args.flag & CF_MCALL) && !(args.flag & CF_QCALL) ) error("Expected -c or -m option\n"); if ( args.aux.n_perm && args.aux.ngrp1_samples<=0 ) error("Expected -1 with -U\n"); // not sure about this, please fix if ( args.aux.flag & CALL_CONSTR_ALLELES ) { if ( !args.targets ) error("Expected -t or -T with \"-C alleles\"\n"); if ( !(args.flag & CF_MCALL) ) error("The \"-C alleles\" mode requires -m\n"); } if ( args.aux.flag & CALL_CHR_X && args.aux.flag & CALL_CHR_Y ) error("Only one of -X or -Y should be given\n"); if ( args.flag & CF_INS_MISSED && !(args.aux.flag&CALL_CONSTR_ALLELES) ) error("The -i option requires -C alleles\n"); init_data(&args); while ( bcf_sr_next_line(args.aux.srs) ) { bcf1_t *bcf_rec = args.aux.srs->readers[0].buffer[0]; if ( args.samples_map ) bcf_subset(args.aux.hdr, bcf_rec, args.nsamples, args.samples_map); bcf_unpack(bcf_rec, BCF_UN_STR); // Skip unwanted sites if ( args.aux.flag & CALL_VARONLY ) { int is_ref = 0; if ( bcf_rec->n_allele==1 ) is_ref = 1; // not a variant else if ( bcf_rec->n_allele==2 ) { // second allele is mpileup's X, not a variant if ( bcf_rec->d.allele[1][0]=='X' ) is_ref = 1; else if ( bcf_rec->d.allele[1][0]=='<' && bcf_rec->d.allele[1][1]=='X' && bcf_rec->d.allele[1][2]=='>' ) is_ref = 1; } if ( is_ref ) { // gVCF output if ( args.flag & CF_GVCF ) gvcf_write(args.out_fh, &args.gvcf, args.aux.hdr, bcf_rec, 1); continue; } } if ( (args.flag & CF_INDEL_ONLY) && bcf_is_snp(bcf_rec) ) continue; // not an indel if ( (args.flag & CF_NO_INDEL) && !bcf_is_snp(bcf_rec) ) continue; // not a SNP if ( (args.flag & CF_ACGT_ONLY) && (bcf_rec->d.allele[0][0]=='N' || bcf_rec->d.allele[0][0]=='n') ) continue; // REF[0] is 'N' bcf_unpack(bcf_rec, BCF_UN_ALL); // Various output modes: QCall output (todo) if ( args.flag & CF_QCALL ) { qcall(&args.aux, bcf_rec); continue; } // Calling modes which output VCFs int ret; if ( args.flag & CF_MCALL ) ret = mcall(&args.aux, bcf_rec); else ret = ccall(&args.aux, bcf_rec); if ( ret==-1 ) error("Something is wrong\n"); // gVCF output if ( args.flag & CF_GVCF ) { gvcf_write(args.out_fh, &args.gvcf, args.aux.hdr, bcf_rec, ret?0:1); continue; } // Normal output if ( (args.aux.flag & CALL_VARONLY) && ret==0 ) continue; // not a variant bcf_write1(args.out_fh, args.aux.hdr, bcf_rec); } if ( args.flag & CF_GVCF ) gvcf_write(args.out_fh, &args.gvcf, args.aux.hdr, NULL, 0); if ( args.flag & CF_INS_MISSED ) bcf_sr_regions_flush(args.aux.srs->targets); destroy_data(&args); return 0; }
int next_line() { return bcf_sr_next_line(m_sr); }