int main_vcfcall(int argc, char *argv[]) { char *samples_fname = NULL; args_t args; memset(&args, 0, sizeof(args_t)); args.argc = argc; args.argv = argv; args.aux.prior_type = -1; args.aux.indel_frac = -1; args.aux.theta = 1e-3; args.aux.pref = 0.5; args.aux.min_perm_p = 0.01; args.aux.min_lrt = 1; args.flag = CF_ACGT_ONLY; args.output_fname = "-"; args.output_type = FT_VCF; args.aux.trio_Pm_SNPs = 1 - 1e-8; args.aux.trio_Pm_ins = args.aux.trio_Pm_del = 1 - 1e-9; int i, c, samples_is_file = 0; static struct option loptions[] = { {"help",0,0,'h'}, {"gvcf",1,0,'g'}, {"format-fields",1,0,'f'}, {"output",1,0,'o'}, {"output-type",1,0,'O'}, {"regions",1,0,'r'}, {"regions-file",1,0,'R'}, {"samples",1,0,'s'}, {"samples-file",1,0,'S'}, {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, {"keep-alts",0,0,'A'}, {"insert-missed",0,0,'i'}, {"skip-Ns",0,0,'N'}, // now the new default {"keep-masked-refs",0,0,'M'}, {"skip-variants",1,0,'V'}, {"variants-only",0,0,'v'}, {"consensus-caller",0,0,'c'}, {"constrain",1,0,'C'}, {"multiallelic-caller",0,0,'m'}, {"pval-threshold",1,0,'p'}, {"prior",1,0,'P'}, {"chromosome-X",0,0,'X'}, {"chromosome-Y",0,0,'Y'}, {"novel-rate",1,0,'n'}, {0,0,0,0} }; char *tmp = NULL; while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:XYn:P:f:ig:", loptions, NULL)) >= 0) { switch (c) { case 'g': args.flag |= CF_GVCF; args.gvcf.min_dp = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse, expected integer argument: -g %s\n", optarg); break; case 'f': args.aux.output_tags |= parse_format_flag(optarg); break; case 'M': args.flag &= ~CF_ACGT_ONLY; break; // keep sites where REF is N case 'N': args.flag |= CF_ACGT_ONLY; break; // omit sites where first base in REF is N (the new default) case 'A': args.aux.flag |= CALL_KEEPALT; break; case 'c': args.flag |= CF_CCALL; break; // the original EM based calling method case 'i': args.flag |= CF_INS_MISSED; break; case 'v': args.aux.flag |= CALL_VARONLY; break; case 'o': args.output_fname = optarg; break; 
case 'O': switch (optarg[0]) { case 'b': args.output_type = FT_BCF_GZ; break; case 'u': args.output_type = FT_BCF; break; case 'z': args.output_type = FT_VCF_GZ; break; case 'v': args.output_type = FT_VCF; break; default: error("The output type \"%s\" not recognised\n", optarg); } break; case 'C': if ( !strcasecmp(optarg,"alleles") ) args.aux.flag |= CALL_CONSTR_ALLELES; else if ( !strcasecmp(optarg,"trio") ) args.aux.flag |= CALL_CONSTR_TRIO; else error("Unknown argument to -C: \"%s\"\n", optarg); break; case 'X': args.aux.flag |= CALL_CHR_X; break; case 'Y': args.aux.flag |= CALL_CHR_Y; break; case 'V': if ( !strcasecmp(optarg,"snps") ) args.flag |= CF_INDEL_ONLY; else if ( !strcasecmp(optarg,"indels") ) args.flag |= CF_NO_INDEL; else error("Unknown skip category \"%s\" (-S argument must be \"snps\" or \"indels\")\n", optarg); break; case 'm': args.flag |= CF_MCALL; break; // multiallelic calling method case 'p': args.aux.pref = atof(optarg); break; case 'P': args.aux.theta = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse, expected float argument: -P %s\n", optarg); break; case 'n': parse_novel_rate(&args,optarg); break; case 'r': args.regions = optarg; break; case 'R': args.regions = optarg; args.regions_is_file = 1; break; case 't': args.targets = optarg; break; case 'T': args.targets = optarg; args.targets_is_file = 1; break; case 's': samples_fname = optarg; break; case 'S': samples_fname = optarg; samples_is_file = 1; break; default: usage(&args); } } if ( optind>=argc ) { if ( !isatty(fileno((FILE *)stdin)) ) args.bcf_fname = "-"; // reading from stdin else usage(&args); } else args.bcf_fname = argv[optind++]; // Sanity check options and initialize if ( samples_fname ) { args.samples = read_samples(&args.aux, samples_fname, samples_is_file, &args.nsamples); args.aux.ploidy = (uint8_t*) calloc(args.nsamples+1, 1); args.aux.all_diploid = 1; for (i=0; i<args.nsamples; i++) { args.aux.ploidy[i] = args.samples[i][strlen(args.samples[i]) + 1]; if ( 
args.aux.ploidy[i]!=2 ) args.aux.all_diploid = 0; } } if ( args.flag & CF_GVCF ) { // Force some flags to avoid unnecessary branching args.aux.flag &= ~CALL_KEEPALT; args.aux.flag |= CALL_VARONLY; } if ( (args.flag & CF_CCALL ? 1 : 0) + (args.flag & CF_MCALL ? 1 : 0) + (args.flag & CF_QCALL ? 1 : 0) > 1 ) error("Only one of -c or -m options can be given\n"); if ( !(args.flag & CF_CCALL) && !(args.flag & CF_MCALL) && !(args.flag & CF_QCALL) ) error("Expected -c or -m option\n"); if ( args.aux.n_perm && args.aux.ngrp1_samples<=0 ) error("Expected -1 with -U\n"); // not sure about this, please fix if ( args.aux.flag & CALL_CONSTR_ALLELES ) { if ( !args.targets ) error("Expected -t or -T with \"-C alleles\"\n"); if ( !(args.flag & CF_MCALL) ) error("The \"-C alleles\" mode requires -m\n"); } if ( args.aux.flag & CALL_CHR_X && args.aux.flag & CALL_CHR_Y ) error("Only one of -X or -Y should be given\n"); if ( args.flag & CF_INS_MISSED && !(args.aux.flag&CALL_CONSTR_ALLELES) ) error("The -i option requires -C alleles\n"); init_data(&args); while ( bcf_sr_next_line(args.aux.srs) ) { bcf1_t *bcf_rec = args.aux.srs->readers[0].buffer[0]; if ( args.samples_map ) bcf_subset(args.aux.hdr, bcf_rec, args.nsamples, args.samples_map); bcf_unpack(bcf_rec, BCF_UN_STR); // Skip unwanted sites if ( args.aux.flag & CALL_VARONLY ) { int is_ref = 0; if ( bcf_rec->n_allele==1 ) is_ref = 1; // not a variant else if ( bcf_rec->n_allele==2 ) { // second allele is mpileup's X, not a variant if ( bcf_rec->d.allele[1][0]=='X' ) is_ref = 1; else if ( bcf_rec->d.allele[1][0]=='<' && bcf_rec->d.allele[1][1]=='X' && bcf_rec->d.allele[1][2]=='>' ) is_ref = 1; } if ( is_ref ) { // gVCF output if ( args.flag & CF_GVCF ) gvcf_write(args.out_fh, &args.gvcf, args.aux.hdr, bcf_rec, 1); continue; } } if ( (args.flag & CF_INDEL_ONLY) && bcf_is_snp(bcf_rec) ) continue; // not an indel if ( (args.flag & CF_NO_INDEL) && !bcf_is_snp(bcf_rec) ) continue; // not a SNP if ( (args.flag & CF_ACGT_ONLY) && 
(bcf_rec->d.allele[0][0]=='N' || bcf_rec->d.allele[0][0]=='n') ) continue; // REF[0] is 'N' bcf_unpack(bcf_rec, BCF_UN_ALL); // Various output modes: QCall output (todo) if ( args.flag & CF_QCALL ) { qcall(&args.aux, bcf_rec); continue; } // Calling modes which output VCFs int ret; if ( args.flag & CF_MCALL ) ret = mcall(&args.aux, bcf_rec); else ret = ccall(&args.aux, bcf_rec); if ( ret==-1 ) error("Something is wrong\n"); // gVCF output if ( args.flag & CF_GVCF ) { gvcf_write(args.out_fh, &args.gvcf, args.aux.hdr, bcf_rec, ret?0:1); continue; } // Normal output if ( (args.aux.flag & CALL_VARONLY) && ret==0 ) continue; // not a variant bcf_write1(args.out_fh, args.aux.hdr, bcf_rec); } if ( args.flag & CF_GVCF ) gvcf_write(args.out_fh, &args.gvcf, args.aux.hdr, NULL, 0); if ( args.flag & CF_INS_MISSED ) bcf_sr_regions_flush(args.aux.srs->targets); destroy_data(&args); return 0; }
/*
 * Feed one VCF/BCF record into the model, or flush when `line` is NULL.
 *
 * Every non-NULL record bumps args->ntot. Records that are on a skipped
 * sequence, are not biallelic, or (with snps_only) are not SNPs are ignored.
 * For the remaining records the genetic map is (re)loaded on the first record
 * and on every sequence change, the site buffers are grown as needed, and the
 * two emission probabilities for the site are stored in args->eprob.
 */
static void vcfroh(args_t *args, bcf1_t *line)
{
    // End of input: flush whatever is buffered
    if ( !line )
    {
        flush_viterbi(args);
        return;
    }
    args->ntot++;

    // Site filters
    if ( line->rid == args->skip_rid ) return;
    if ( line->n_allele != 2 ) return;      // no ALT allele, or not biallelic
    if ( args->snps_only && !bcf_is_snp(line) ) return;

    int no_genmap = 0;

    // Very first usable record: remember where we are and load the genetic map
    if ( args->prev_rid < 0 )
    {
        args->prev_rid = line->rid;
        args->prev_pos = line->pos;
        no_genmap = load_genmap(args, line);
        if ( !no_genmap && args->vi_training )
            push_rid(args, line->rid);
    }

    // Sequence changed since the previous record
    if ( args->prev_rid != line->rid )
    {
        no_genmap = load_genmap(args, line);
        if ( !args->vi_training )
        {
            flush_viterbi(args);
            args->nsites = 0;
        }
        else if ( !no_genmap )
            push_rid(args, line->rid);
        args->prev_rid = line->rid;
        args->prev_pos = line->pos;
    }

    if ( no_genmap )
    {
        fprintf(pysamerr,"Skipping the sequence, no genmap for %s\n", bcf_seqname(args->hdr,line));
        args->skip_rid = line->rid;
        return;
    }

    if ( line->pos < args->prev_pos )
        error("The file is not sorted?!\n");

    args->prev_rid = line->rid;
    args->prev_pos = line->pos;

    // Make room for one more site; eprob holds two values per site and is
    // resized only when hts_expand changed the capacity
    int old_capacity = args->msites;
    hts_expand(uint32_t,args->nsites+1,args->msites,args->sites);
    if ( old_capacity != args->msites )
        args->eprob = (double*) realloc(args->eprob,sizeof(double)*args->msites*2);

    // Likelihoods and alternate allele frequency for this record
    double alt_freq, pdg[3];
    if ( parse_line(args, line, &alt_freq, pdg) < 0 )
        return;     // something went wrong

    args->nused++;

    // Emission probabilities P(D|AZ) and P(D|HW)
    const double ref_freq = 1-alt_freq;
    double *site_eprob = &args->eprob[2*args->nsites];
    site_eprob[STATE_AZ] = pdg[0]*ref_freq + pdg[2]*alt_freq;
    site_eprob[STATE_HW] = pdg[0]*ref_freq*ref_freq + 2*pdg[1]*ref_freq*alt_freq + pdg[2]*alt_freq*alt_freq;

    args->sites[args->nsites] = line->pos;
    args->nsites++;
}
int main_vcfview(int argc, char *argv[]) { int i, c, clevel = -1, flag = 0, n_samples = -1, *imap = 0, excl_snp = 0, excl_indel = 0; char *fn_ref = 0, *fn_out = 0, moder[8], **samples = 0; bcf_hdr_t *h, *hsub = 0; htsFile *in; bcf1_t *b; while ((c = getopt(argc, argv, "l:bSt:o:T:s:GNI")) >= 0) { switch (c) { case 'l': clevel = atoi(optarg); flag |= 2; break; case 'S': flag |= 1; break; case 'b': flag |= 2; break; case 'G': n_samples = 0; break; case 't': fn_ref = optarg; flag |= 1; break; case 'o': fn_out = optarg; break; case 's': samples = hts_readlines(optarg, &n_samples); break; case 'N': excl_snp = 1; break; case 'I': excl_indel = 1; break; } } if (argc == optind) { fprintf(stderr, "\nUsage: vcfview [options] <in.bcf>|<in.vcf>|<in.vcf.gz>\n\n"); fprintf(stderr, "Options: -b output in BCF\n"); fprintf(stderr, " -S input is VCF\n"); fprintf(stderr, " -o FILE output file name [stdout]\n"); fprintf(stderr, " -l INT compression level [%d]\n", clevel); fprintf(stderr, " -t FILE list of reference names and lengths [null]\n"); fprintf(stderr, " -s FILE/STR list of samples (STR if started with ':'; FILE otherwise) [null]\n"); fprintf(stderr, " -G drop individual genotype information\n"); fprintf(stderr, " -N exclude SNPs\n"); fprintf(stderr, " -I exclude INDELs\n"); fprintf(stderr, "\n"); return 1; } strcpy(moder, "r"); if ((flag&1) == 0 && !(file_type(argv[optind])&(IS_VCF|IS_VCF_GZ))) strcat(moder, "b"); in = hts_open(argv[optind], moder, fn_ref); h = vcf_hdr_read(in); if (h == 0) { fprintf(stderr, "[E::%s] fail to read the VCF/BCF2 header\n", __func__); hts_close(in); return 1; } if (n_samples >= 0) { if (n_samples) imap = (int*)malloc(n_samples * sizeof(int)); hsub = bcf_hdr_subset(h, n_samples, samples, imap); } b = bcf_init1(); if ((flag&4) == 0) { // VCF/BCF output htsFile *out; char modew[8]; strcpy(modew, "w"); if (clevel >= 0 && clevel <= 9) sprintf(modew + 1, "%d", clevel); if (flag&2) strcat(modew, "b"); out = hts_open(fn_out? 
fn_out : "-", modew, 0); vcf_hdr_write(out, hsub? hsub : h); if (optind + 1 < argc && !(flag&1)) { // BAM input and has a region hts_idx_t *idx; if ((idx = bcf_index_load(argv[optind])) == 0) { fprintf(stderr, "[E::%s] fail to load the BCF index\n", __func__); return 1; } for (i = optind + 1; i < argc; ++i) { hts_itr_t *iter; if ((iter = bcf_itr_querys(idx, h, argv[i])) == 0) { fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, argv[i]); continue; } while (bcf_itr_next((BGZF*)in->fp, iter, b) >= 0) { if (excl_snp && bcf_is_snp(b)) continue; if (excl_indel && !bcf_is_snp(b)) continue; if (n_samples >= 0) { bcf_subset(h, b, n_samples, imap); vcf_write1(out, hsub, b); } else vcf_write1(out, h, b); } hts_itr_destroy(iter); } hts_idx_destroy(idx); } else { while (vcf_read1(in, h, b) >= 0) { if (excl_snp && bcf_is_snp(b)) continue; if (excl_indel && !bcf_is_snp(b)) continue; if (n_samples >= 0) { bcf_subset(h, b, n_samples, imap); vcf_write1(out, hsub, b); } else vcf_write1(out, h, b); } } hts_close(out); } bcf_destroy1(b); if (n_samples > 0) { for (i = 0; i < n_samples; ++i) free(samples[i]); free(samples); bcf_hdr_destroy(hsub); free(imap); } bcf_hdr_destroy(h); hts_close(in); return 0; }
/*
 * Fill `rst` with the per-site statistics derived from `ma` for record `b`.
 *
 * Sets ma->is_indel, ma->PL and ma->PL_len from the inputs, then (for records
 * with at least two alleles) runs cal_pdg() and mc_cal_afs() and derives
 * p_ref, p_var, the allele count, f_flat, the equal-tail credible interval,
 * the LRT value (only when ma->n1 > 0, otherwise -1) and, if `do_contrast` is
 * set and p_var > 0.5, the contrast statistics.
 *
 * Returns -1 when the record has fewer than two alleles, 0 otherwise.
 */
int bcf_p1_cal(call_t *call, bcf1_t *b, int do_contrast, bcf_p1aux_t *ma, bcf_p1rst_t *rst)
{
    int k;
    long double acc = 0.;

    ma->is_indel = !bcf_is_snp(b);
    rst->perm_rank = -1;

    ma->PL = call->PLs;
    ma->PL_len = call->nPLs / b->n_sample;
    if (b->n_allele < 2)
        return -1; // FIXME: find a better solution

    rst->rank0 = cal_pdg(b, ma);
    rst->f_exp = mc_cal_afs(ma, &rst->p_ref_folded, &rst->p_var_folded);
    rst->p_ref = ma->afs1[ma->M];

    // p_var: sum of afs1 over every index below M
    acc = 0.;
    k = 0;
    while (k < ma->M) {
        acc += ma->afs1[k];
        ++k;
    }
    rst->p_var = (double)acc;

    // allele count: M minus the first index at which z is largest
    {
        double best = -1;
        rst->ac = -1;
        for (k = 0; k <= ma->M; ++k) {
            if (best < ma->z[k]) {
                best = ma->z[k];
                rst->ac = k;
            }
        }
        rst->ac = ma->M - rst->ac;
    }

    // f_flat: mean index under z normalised by its total, scaled by M
    acc = 0.;
    k = 0;
    while (k <= ma->M) {
        acc += (long double)ma->z[k];
        ++k;
    }
    rst->f_flat = 0.;
    for (k = 0; k <= ma->M; ++k) {
        double weight = ma->z[k] / acc;
        rst->f_flat += k * weight;
    }
    rst->f_flat /= ma->M;

    // equal-tail credible interval (95% level): walk in from both ends of
    // afs1 until adding the next entry would push the tail mass above 0.025
    {
        int lo = 0, hi = ma->M;
        double tail;

        tail = 0.;
        while (lo <= ma->M && !(tail + ma->afs1[lo] > 0.025)) {
            tail += ma->afs1[lo];
            ++lo;
        }

        tail = 0.;
        while (hi >= 0 && !(tail + ma->afs1[hi] > 0.025)) {
            tail += ma->afs1[hi];
            --hi;
        }

        rst->cil = (double)(ma->M - hi) / ma->M;
        rst->cih = (double)(ma->M - lo) / ma->M;
    }

    // LRT from the maxima of z, z1 and z2
    if (ma->n1 > 0) {
        double top_all = -1, top_grp1 = -1, top_grp2 = -1;

        for (k = 0; k <= ma->M; ++k) {
            if (top_all < ma->z[k]) top_all = ma->z[k];
        }
        for (k = 0; k <= ma->n1 * 2; ++k) {
            if (top_grp1 < ma->z1[k]) top_grp1 = ma->z1[k];
        }
        for (k = 0; k <= ma->M - ma->n1 * 2; ++k) {
            if (top_grp2 < ma->z2[k]) top_grp2 = ma->z2[k];
        }

        rst->lrt = log(top_grp1 * top_grp2 / top_all);
        if (rst->lrt < 0)
            rst->lrt = 1;
        else
            rst->lrt = kf_gammaq(.5, rst->lrt);
    } else {
        rst->lrt = -1.0;
    }

    rst->cmp[0] = rst->cmp[1] = rst->cmp[2] = rst->p_chi2 = -1.0;
    // skip contrast2() if the locus is a strong non-variant
    if (do_contrast && rst->p_var > 0.5)
        rst->p_chi2 = contrast2(ma, rst->cmp);
    return 0;
}