int main_vcfcall(int argc, char *argv[])
{
    char *samples_fname = NULL;
    args_t args;
    memset(&args, 0, sizeof(args_t));
    args.argc = argc; args.argv = argv;
    args.aux.prior_type = -1;
    args.aux.indel_frac = -1;
    args.aux.theta      = 1e-3;
    args.aux.pref       = 0.5;
    args.aux.min_perm_p = 0.01;
    args.aux.min_lrt    = 1;
    args.flag           = CF_ACGT_ONLY;
    args.output_fname   = "-";
    args.output_type    = FT_VCF;
    args.aux.trio_Pm_SNPs = 1 - 1e-8;
    args.aux.trio_Pm_ins  = args.aux.trio_Pm_del  = 1 - 1e-9;

    int i, c, samples_is_file = 0;

    static struct option loptions[] =
    {
        {"help",0,0,'h'},
        {"gvcf",1,0,'g'},
        {"format-fields",1,0,'f'},
        {"output",1,0,'o'},
        {"output-type",1,0,'O'},
        {"regions",1,0,'r'},
        {"regions-file",1,0,'R'},
        {"samples",1,0,'s'},
        {"samples-file",1,0,'S'},
        {"targets",1,0,'t'},
        {"targets-file",1,0,'T'},
        {"keep-alts",0,0,'A'},
        {"insert-missed",0,0,'i'},
        {"skip-Ns",0,0,'N'},            // now the new default
        {"keep-masked-refs",0,0,'M'},
        {"skip-variants",1,0,'V'},
        {"variants-only",0,0,'v'},
        {"consensus-caller",0,0,'c'},
        {"constrain",1,0,'C'},
        {"multiallelic-caller",0,0,'m'},
        {"pval-threshold",1,0,'p'},
        {"prior",1,0,'P'},
        {"chromosome-X",0,0,'X'},
        {"chromosome-Y",0,0,'Y'},
        {"novel-rate",1,0,'n'},
        {0,0,0,0}
    };

    char *tmp = NULL;
    while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:XYn:P:f:ig:", loptions, NULL)) >= 0)
    {
        switch (c)
        {
            case 'g':
                args.flag |= CF_GVCF;
                args.gvcf.min_dp = strtol(optarg,&tmp,10);
                if ( *tmp ) error("Could not parse, expected integer argument: -g %s\n", optarg);
                break;
            case 'f': args.aux.output_tags |= parse_format_flag(optarg); break;
            case 'M': args.flag &= ~CF_ACGT_ONLY; break;     // keep sites where REF is N
            case 'N': args.flag |= CF_ACGT_ONLY; break;      // omit sites where first base in REF is N (the new default)
            case 'A': args.aux.flag |= CALL_KEEPALT; break;
            case 'c': args.flag |= CF_CCALL; break;          // the original EM based calling method
            case 'i': args.flag |= CF_INS_MISSED; break;
            case 'v': args.aux.flag |= CALL_VARONLY; break;
            case 'o': args.output_fname = optarg; break;
            case 'O':
                      switch (optarg[0]) {
                          case 'b': args.output_type = FT_BCF_GZ; break;
                          case 'u': args.output_type = FT_BCF; break;
                          case 'z': args.output_type = FT_VCF_GZ; break;
                          case 'v': args.output_type = FT_VCF; break;
                          default: error("The output type \"%s\" not recognised\n", optarg);
                      }
                      break;
            case 'C':
                      if ( !strcasecmp(optarg,"alleles") ) args.aux.flag |= CALL_CONSTR_ALLELES;
                      else if ( !strcasecmp(optarg,"trio") ) args.aux.flag |= CALL_CONSTR_TRIO;
                      else error("Unknown argument to -C: \"%s\"\n", optarg);
                      break;
            case 'X': args.aux.flag |= CALL_CHR_X; break;
            case 'Y': args.aux.flag |= CALL_CHR_Y; break;
            case 'V':
                      if ( !strcasecmp(optarg,"snps") ) args.flag |= CF_INDEL_ONLY;
                      else if ( !strcasecmp(optarg,"indels") ) args.flag |= CF_NO_INDEL;
                      else error("Unknown skip category \"%s\" (-S argument must be \"snps\" or \"indels\")\n", optarg);
                      break;
            case 'm': args.flag |= CF_MCALL; break;         // multiallelic calling method
            case 'p': args.aux.pref = atof(optarg); break;
            case 'P': args.aux.theta = strtod(optarg,&tmp);
                      if ( *tmp ) error("Could not parse, expected float argument: -P %s\n", optarg);
                      break;
            case 'n': parse_novel_rate(&args,optarg); break;
            case 'r': args.regions = optarg; break;
            case 'R': args.regions = optarg; args.regions_is_file = 1; break;
            case 't': args.targets = optarg; break;
            case 'T': args.targets = optarg; args.targets_is_file = 1; break;
            case 's': samples_fname = optarg; break;
            case 'S': samples_fname = optarg; samples_is_file = 1; break;
            default: usage(&args);
        }
    }
    if ( optind>=argc )
    {
        if ( !isatty(fileno((FILE *)stdin)) ) args.bcf_fname = "-";  // reading from stdin
        else usage(&args);
    }
    else args.bcf_fname = argv[optind++];

    // Sanity check options and initialize
    if ( samples_fname )
    {
        args.samples = read_samples(&args.aux, samples_fname, samples_is_file, &args.nsamples);
        args.aux.ploidy = (uint8_t*) calloc(args.nsamples+1, 1);
        args.aux.all_diploid = 1;
        for (i=0; i<args.nsamples; i++)
        {
            args.aux.ploidy[i] = args.samples[i][strlen(args.samples[i]) + 1];
            if ( args.aux.ploidy[i]!=2 ) args.aux.all_diploid = 0;
        }
    }
    if ( args.flag & CF_GVCF )
    {
        // Force some flags to avoid unnecessary branching
        args.aux.flag &= ~CALL_KEEPALT;
        args.aux.flag |= CALL_VARONLY;
    }
    if ( (args.flag & CF_CCALL ? 1 : 0) + (args.flag & CF_MCALL ? 1 : 0) + (args.flag & CF_QCALL ? 1 : 0) > 1 ) error("Only one of -c or -m options can be given\n");
    if ( !(args.flag & CF_CCALL) && !(args.flag & CF_MCALL) && !(args.flag & CF_QCALL) ) error("Expected -c or -m option\n");
    if ( args.aux.n_perm && args.aux.ngrp1_samples<=0 ) error("Expected -1 with -U\n");    // not sure about this, please fix
    if ( args.aux.flag & CALL_CONSTR_ALLELES )
    {
        if ( !args.targets ) error("Expected -t or -T with \"-C alleles\"\n");
        if ( !(args.flag & CF_MCALL) ) error("The \"-C alleles\" mode requires -m\n");
    }
    if ( args.aux.flag & CALL_CHR_X && args.aux.flag & CALL_CHR_Y ) error("Only one of -X or -Y should be given\n");
    if ( args.flag & CF_INS_MISSED && !(args.aux.flag&CALL_CONSTR_ALLELES) ) error("The -i option requires -C alleles\n");
    init_data(&args);

    while ( bcf_sr_next_line(args.aux.srs) )
    {
        bcf1_t *bcf_rec = args.aux.srs->readers[0].buffer[0];
        if ( args.samples_map ) bcf_subset(args.aux.hdr, bcf_rec, args.nsamples, args.samples_map);
        bcf_unpack(bcf_rec, BCF_UN_STR);

        // Skip unwanted sites
        if ( args.aux.flag & CALL_VARONLY )
        {
            int is_ref = 0;
            if ( bcf_rec->n_allele==1 ) is_ref = 1;     // not a variant
            else if ( bcf_rec->n_allele==2 )
            {
                // second allele is mpileup's X, not a variant
                if ( bcf_rec->d.allele[1][0]=='X' ) is_ref = 1;
                else if ( bcf_rec->d.allele[1][0]=='<' && bcf_rec->d.allele[1][1]=='X' && bcf_rec->d.allele[1][2]=='>' ) is_ref = 1;
            }
            if ( is_ref )
            {
                // gVCF output
                if ( args.flag & CF_GVCF ) gvcf_write(args.out_fh, &args.gvcf, args.aux.hdr, bcf_rec, 1);
                continue;
            }
        }
        if ( (args.flag & CF_INDEL_ONLY) && bcf_is_snp(bcf_rec) ) continue;    // not an indel
        if ( (args.flag & CF_NO_INDEL) && !bcf_is_snp(bcf_rec) ) continue;     // not a SNP
        if ( (args.flag & CF_ACGT_ONLY) && (bcf_rec->d.allele[0][0]=='N' || bcf_rec->d.allele[0][0]=='n') ) continue;   // REF[0] is 'N'

        bcf_unpack(bcf_rec, BCF_UN_ALL);

        // Various output modes: QCall output (todo)
        if ( args.flag & CF_QCALL )
        {
            qcall(&args.aux, bcf_rec);
            continue;
        }

        // Calling modes which output VCFs
        int ret;
        if ( args.flag & CF_MCALL )
            ret = mcall(&args.aux, bcf_rec);
        else
            ret = ccall(&args.aux, bcf_rec);
        if ( ret==-1 ) error("Something is wrong\n");

        // gVCF output
        if ( args.flag & CF_GVCF )
        {
            gvcf_write(args.out_fh, &args.gvcf, args.aux.hdr, bcf_rec, ret?0:1);
            continue;
        }

        // Normal output
        if ( (args.aux.flag & CALL_VARONLY) && ret==0 ) continue;     // not a variant
        bcf_write1(args.out_fh, args.aux.hdr, bcf_rec);
    }
    if ( args.flag & CF_GVCF ) gvcf_write(args.out_fh, &args.gvcf, args.aux.hdr, NULL, 0);
    if ( args.flag & CF_INS_MISSED ) bcf_sr_regions_flush(args.aux.srs->targets);
    destroy_data(&args);
    return 0;
}
Exemple #2
0
static void vcfroh(args_t *args, bcf1_t *line)
{
    // Are we done?
    if ( !line )
    { 
        flush_viterbi(args);
        return; 
    }
    args->ntot++;

    // Skip unwanted lines
    if ( line->rid == args->skip_rid ) return;
    if ( line->n_allele==1 ) return;    // no ALT allele
    if ( line->n_allele!=2 ) return;    // only biallelic sites
    if ( args->snps_only && !bcf_is_snp(line) ) return;

    // Initialize genetic map
    int skip_rid = 0;
    if ( args->prev_rid<0 )
    {
        args->prev_rid = line->rid;
        args->prev_pos = line->pos;
        skip_rid = load_genmap(args, line);
        if ( !skip_rid && args->vi_training ) push_rid(args, line->rid);
    }

    // New chromosome?
    if ( args->prev_rid!=line->rid )
    {
        skip_rid = load_genmap(args, line);
        if ( args->vi_training )
        {
            if ( !skip_rid ) push_rid(args, line->rid);
        }
        else
        {
            flush_viterbi(args);
            args->nsites = 0;
        }
        args->prev_rid = line->rid;
        args->prev_pos = line->pos;
    }

    if ( skip_rid )
    {
        fprintf(pysamerr,"Skipping the sequence, no genmap for %s\n", bcf_seqname(args->hdr,line));
        args->skip_rid = line->rid;
        return;
    }
    if ( args->prev_pos > line->pos ) error("The file is not sorted?!\n");

    args->prev_rid = line->rid;
    args->prev_pos = line->pos;


    // Ready for the new site
    int m = args->msites;
    hts_expand(uint32_t,args->nsites+1,args->msites,args->sites);
    if ( args->msites!=m )
        args->eprob = (double*) realloc(args->eprob,sizeof(double)*args->msites*2);

    // Set likelihoods and alternate allele frequencies
    double alt_freq, pdg[3];
    if ( parse_line(args, line, &alt_freq, pdg)<0 ) return; // something went wrong

    args->nused++;

    // Calculate emission probabilities P(D|AZ) and P(D|HW)
    double *eprob = &args->eprob[2*args->nsites];
    eprob[STATE_AZ] = pdg[0]*(1-alt_freq) + pdg[2]*alt_freq;
    eprob[STATE_HW] = pdg[0]*(1-alt_freq)*(1-alt_freq) + 2*pdg[1]*(1-alt_freq)*alt_freq + pdg[2]*alt_freq*alt_freq;

    args->sites[args->nsites] = line->pos;
    args->nsites++;
}
Exemple #3
0
int main_vcfview(int argc, char *argv[])
{
	int i, c, clevel = -1, flag = 0, n_samples = -1, *imap = 0, excl_snp = 0, excl_indel = 0;
	char *fn_ref = 0, *fn_out = 0, moder[8], **samples = 0;
	bcf_hdr_t *h, *hsub = 0;
	htsFile *in;
	bcf1_t *b;

	while ((c = getopt(argc, argv, "l:bSt:o:T:s:GNI")) >= 0) {
		switch (c) {
		case 'l': clevel = atoi(optarg); flag |= 2; break;
		case 'S': flag |= 1; break;
		case 'b': flag |= 2; break;
		case 'G': n_samples = 0; break;
		case 't': fn_ref = optarg; flag |= 1; break;
		case 'o': fn_out = optarg; break;
		case 's': samples = hts_readlines(optarg, &n_samples); break;
		case 'N': excl_snp = 1; break;
		case 'I': excl_indel = 1; break;
		}
	}
	if (argc == optind) {
		fprintf(stderr, "\nUsage:   vcfview [options] <in.bcf>|<in.vcf>|<in.vcf.gz>\n\n");
		fprintf(stderr, "Options: -b           output in BCF\n");
		fprintf(stderr, "         -S           input is VCF\n");
		fprintf(stderr, "         -o FILE      output file name [stdout]\n");
		fprintf(stderr, "         -l INT       compression level [%d]\n", clevel);
		fprintf(stderr, "         -t FILE      list of reference names and lengths [null]\n");
		fprintf(stderr, "         -s FILE/STR  list of samples (STR if started with ':'; FILE otherwise) [null]\n");
		fprintf(stderr, "         -G           drop individual genotype information\n");
		fprintf(stderr, "         -N           exclude SNPs\n");
		fprintf(stderr, "         -I           exclude INDELs\n");
		fprintf(stderr, "\n");
		return 1;
	}
	strcpy(moder, "r");
	if ((flag&1) == 0 && !(file_type(argv[optind])&(IS_VCF|IS_VCF_GZ))) strcat(moder, "b");

	in = hts_open(argv[optind], moder, fn_ref);
	h = vcf_hdr_read(in);
	if (h == 0) {
		fprintf(stderr, "[E::%s] fail to read the VCF/BCF2 header\n", __func__);
		hts_close(in);
		return 1;
	}
	if (n_samples >= 0) {
		if (n_samples) imap = (int*)malloc(n_samples * sizeof(int));
		hsub = bcf_hdr_subset(h, n_samples, samples, imap);
	}
	b = bcf_init1();

	if ((flag&4) == 0) { // VCF/BCF output
		htsFile *out;
		char modew[8];
		strcpy(modew, "w");
		if (clevel >= 0 && clevel <= 9) sprintf(modew + 1, "%d", clevel);
		if (flag&2) strcat(modew, "b");
		out = hts_open(fn_out? fn_out : "-", modew, 0);
		vcf_hdr_write(out, hsub? hsub : h);
		if (optind + 1 < argc && !(flag&1)) { // BAM input and has a region
			hts_idx_t *idx;
			if ((idx = bcf_index_load(argv[optind])) == 0) {
				fprintf(stderr, "[E::%s] fail to load the BCF index\n", __func__);
				return 1;
			}
			for (i = optind + 1; i < argc; ++i) {
				hts_itr_t *iter;
				if ((iter = bcf_itr_querys(idx, h, argv[i])) == 0) {
					fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, argv[i]);
					continue;
				}
				while (bcf_itr_next((BGZF*)in->fp, iter, b) >= 0) {
					if (excl_snp && bcf_is_snp(b)) continue;
					if (excl_indel && !bcf_is_snp(b)) continue;
					if (n_samples >= 0) {
						bcf_subset(h, b, n_samples, imap);
						vcf_write1(out, hsub, b);
					} else vcf_write1(out, h, b);
				}
				hts_itr_destroy(iter);
			}
			hts_idx_destroy(idx);
		} else {
			while (vcf_read1(in, h, b) >= 0) {
				if (excl_snp && bcf_is_snp(b)) continue;
				if (excl_indel && !bcf_is_snp(b)) continue;
				if (n_samples >= 0) {
					bcf_subset(h, b, n_samples, imap);
					vcf_write1(out, hsub, b);
				} else vcf_write1(out, h, b);
			}
		}
		hts_close(out);
	}

	bcf_destroy1(b);
	if (n_samples > 0) {
		for (i = 0; i < n_samples; ++i) free(samples[i]);
		free(samples);
		bcf_hdr_destroy(hsub);
		free(imap);
	}
	bcf_hdr_destroy(h);
	hts_close(in);
	return 0;
}
Exemple #4
0
int bcf_p1_cal(call_t *call, bcf1_t *b, int do_contrast, bcf_p1aux_t *ma, bcf_p1rst_t *rst)
{
    int i, k;
    long double sum = 0.;
    ma->is_indel = bcf_is_snp(b) ? 0 : 1;
    rst->perm_rank = -1;

    ma->PL = call->PLs;
    ma->PL_len = call->nPLs / b->n_sample;
    if (b->n_allele < 2) return -1; // FIXME: find a better solution

    rst->rank0 = cal_pdg(b, ma);
    rst->f_exp = mc_cal_afs(ma, &rst->p_ref_folded, &rst->p_var_folded);
    rst->p_ref = ma->afs1[ma->M];
    for (k = 0, sum = 0.; k < ma->M; ++k)
        sum += ma->afs1[k];
    rst->p_var = (double)sum;
    { // compute the allele count
        double max = -1;
        rst->ac = -1;
        for (k = 0; k <= ma->M; ++k)
            if (max < ma->z[k]) max = ma->z[k], rst->ac = k;
        rst->ac = ma->M - rst->ac;
    }
    // calculate f_flat and f_em
    for (k = 0, sum = 0.; k <= ma->M; ++k)
        sum += (long double)ma->z[k];
    rst->f_flat = 0.;
    for (k = 0; k <= ma->M; ++k) {
        double p = ma->z[k] / sum;
        rst->f_flat += k * p;
    }
    rst->f_flat /= ma->M;
    { // estimate equal-tail credible interval (95% level)
        int l, h;
        double p;
        for (i = 0, p = 0.; i <= ma->M; ++i)
            if (p + ma->afs1[i] > 0.025) break;
            else p += ma->afs1[i];
        l = i;
        for (i = ma->M, p = 0.; i >= 0; --i)
            if (p + ma->afs1[i] > 0.025) break;
            else p += ma->afs1[i];
        h = i;
        rst->cil = (double)(ma->M - h) / ma->M; rst->cih = (double)(ma->M - l) / ma->M;
    }
    if (ma->n1 > 0) { // compute LRT
        double max0, max1, max2;
        for (k = 0, max0 = -1; k <= ma->M; ++k)
            if (max0 < ma->z[k]) max0 = ma->z[k];
        for (k = 0, max1 = -1; k <= ma->n1 * 2; ++k)
            if (max1 < ma->z1[k]) max1 = ma->z1[k];
        for (k = 0, max2 = -1; k <= ma->M - ma->n1 * 2; ++k)
            if (max2 < ma->z2[k]) max2 = ma->z2[k];
        rst->lrt = log(max1 * max2 / max0);
        rst->lrt = rst->lrt < 0? 1 : kf_gammaq(.5, rst->lrt);
    } else rst->lrt = -1.0;
    rst->cmp[0] = rst->cmp[1] = rst->cmp[2] = rst->p_chi2 = -1.0;
    if (do_contrast && rst->p_var > 0.5) // skip contrast2() if the locus is a strong non-variant
        rst->p_chi2 = contrast2(ma, rst->cmp);
    return 0;
}