Ejemplo n.º 1
0
int process_PL(args_t *args, bcf1_t *line, uint32_t *ntot, uint32_t *ndif)
{
    int npl = bcf_get_format_int32(args->sm_hdr, line, "PL", &args->tmp_arr, &args->ntmp_arr);

    if ( npl<=0 ) return 1;                 // PL not present
    npl /= args->nsmpl;
    
    int i,j,k, idx = 0;
    for (i=1; i<args->nsmpl; i++)
    {
        int32_t *a = args->tmp_arr + i*npl;
        int imin = -1;
        for (k=0; k<npl; k++)
        {
            if ( a[k]==bcf_int32_vector_end ) break;
            if ( a[k]==bcf_int32_missing ) continue;
            if ( imin==-1 || a[imin] > a[k] ) imin = k;
        }
        if ( imin<0 ) { idx+=i; continue; }

        for (j=0; j<i; j++)
        {
            int32_t *b = args->tmp_arr + j*npl;
            int jmin = -1;
            for (k=0; k<npl; k++)
            {
                if ( b[k]==bcf_int32_vector_end ) break;
                if ( b[k]==bcf_int32_missing ) continue;
                if ( jmin==-1 || b[jmin] > b[k] ) jmin = k;
            }
            if ( jmin<0 ) { idx++; continue; }

            ntot[idx]++;
            if ( imin!=jmin ) ndif[idx]++;
            idx++;
        }
    }
    return 0;
}
Ejemplo n.º 2
0
/*
 * Pairwise comparison of all samples of the query file (args->sm_hdr).
 *
 * For every site returned by bcf_sr_next_line() and every sample pair (i,j)
 * with j<i, the smallest sum of the two samples' PLs over all genotypes is
 * added to args->lks[], the pair's site counter args->cnts[] is incremented
 * and the smaller of the two depths (or 1 when DP is not defined in the
 * header) is added to args->dps[]. Pairs are stored consecutively in the
 * order (1,0), (2,0), (2,1), ...
 *
 * When PL is not defined in the header, or args->no_PLs is set, the PLs are
 * produced by fake_PLs() from GT instead.
 *
 * The report (optional per-site SD lines, per-sample SM lines, the MD line
 * and per-pair CN lines) is written to "<args->plot>.tab" when args->plot is
 * set and to stdout otherwise. The stream is closed before returning.
 *
 * Side effects visible in this function: args->pl_arr and args->tmp_arr are
 * freed, args->sites is newly allocated and left for the caller.
 */
static void cross_check_gts(args_t *args)
{
    int nsamples = bcf_hdr_nsamples(args->sm_hdr), ndp_arr = 0;
    unsigned int *dp = (unsigned int*) calloc(nsamples,sizeof(unsigned int)), *ndp = (unsigned int*) calloc(nsamples,sizeof(unsigned int)); // this will overflow one day...
    int fake_pls = args->no_PLs, ignore_dp = 0;

    int i,j,k,idx, pl_warned = 0, dp_warned = 0;
    int32_t *dp_arr = NULL;
    int *is_hom = args->hom_only ? (int*) malloc(sizeof(int)*nsamples) : NULL;
    if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 ) 
    {
        if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 )
            error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname);
        fprintf(stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname);
        fake_pls = 1;
    }
    if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "DP")<0 ) ignore_dp = 1;

    FILE *fp = args->plot ? open_file(NULL, "w", "%s.tab", args->plot) : stdout;
    print_header(args, fp);
    if ( args->all_sites ) fprintf(fp,"# [1]SD, Average Site Discordance\t[2]Chromosome\t[3]Position\t[4]Number of available pairs\t[5]Average discordance\n");

    while ( bcf_sr_next_line(args->files) )
    {
        bcf1_t *line = args->files->readers[0].buffer[0];
        bcf_unpack(line, BCF_UN_FMT);

        // Number of PL values per sample at this site
        int npl;
        if ( !fake_pls )
        {
            npl = bcf_get_format_int32(args->sm_hdr, line, "PL", &args->pl_arr, &args->npl_arr);
            if ( npl<=0 ) { pl_warned++; continue; }
            npl /= nsamples;
        }
        else
            npl = fake_PLs(args, args->sm_hdr, line);
        if ( !ignore_dp && bcf_get_format_int32(args->sm_hdr, line, "DP", &dp_arr, &ndp_arr) <= 0 ) { dp_warned++; continue; }

        if ( args->hom_only )
        {
            for (i=0; i<nsamples; i++)
                is_hom[i] = is_hom_most_likely(line->n_allele, args->pl_arr+i*npl);
        }

        double sum = 0; int nsum = 0;
        idx = 0;
        for (i=0; i<nsamples; i++)
        {
            // A skipped sample i still owns i slots in the pair arrays, hence idx += i
            int *ipl = &args->pl_arr[i*npl];
            if ( *ipl==-1 ) { idx += i; continue; } // missing genotype
            if ( !ignore_dp && (dp_arr[i]==bcf_int32_missing || !dp_arr[i]) ) { idx += i; continue; }
            if ( args->hom_only && !is_hom[i] ) { idx += i; continue; }

            for (j=0; j<i; j++)
            {
                int *jpl = &args->pl_arr[j*npl];
                if ( *jpl==-1 ) { idx++; continue; } // missing genotype
                if ( !ignore_dp && (dp_arr[j]==bcf_int32_missing || !dp_arr[j]) ) { idx++; continue; }
                if ( args->hom_only && !is_hom[j] ) { idx++; continue; }

                // A missing value leaves k<npl and the pair is skipped; a
                // vector end forces k==npl, so the shorter vector is accepted
                int min_pl = INT_MAX;
                for (k=0; k<npl; k++)
                {
                    if ( ipl[k]==bcf_int32_missing || jpl[k]==bcf_int32_missing ) break;
                    if ( ipl[k]==bcf_int32_vector_end || jpl[k]==bcf_int32_vector_end ) { k = npl; break; }
                    if ( min_pl > ipl[k]+jpl[k] ) min_pl = ipl[k]+jpl[k];
                }
                if ( k!=npl ) { idx++; continue; }

                if ( args->all_sites ) { sum += min_pl; nsum++; }
                args->lks[idx] += min_pl;
                args->cnts[idx]++;

                if ( !ignore_dp )
                {
                    args->dps[idx] += dp_arr[i] < dp_arr[j] ? dp_arr[i] : dp_arr[j];
                    dp[i] += dp_arr[i]; ndp[i]++;
                    dp[j] += dp_arr[j]; ndp[j]++;
                }
                else
                {
                    args->dps[idx]++;
                    dp[i]++; ndp[i]++;
                    dp[j]++; ndp[j]++;
                }
                idx++;
            }
        }
        if ( args->all_sites ) 
            fprintf(fp,"SD\t%s\t%d\t%d\t%.0f\n", args->sm_hdr->id[BCF_DT_CTG][line->rid].key, line->pos+1, nsum, nsum?sum/nsum:0);
    }
    // free(NULL) is a no-op, no guards needed
    free(dp_arr);
    free(args->pl_arr);
    free(args->tmp_arr);
    free(is_hom);

    if ( pl_warned ) fprintf(stderr, "[W::%s] PL was not found at %d site(s)\n", __func__, pl_warned);
    if ( dp_warned ) fprintf(stderr, "[W::%s] DP was not found at %d site(s)\n", __func__, dp_warned);

    // Output samples sorted by average discordance
    double *score  = (double*) calloc(nsamples,sizeof(double));
    args->sites = (double*) calloc(nsamples,sizeof(double));
    idx = 0;
    for (i=0; i<nsamples; i++)
    {
        for (j=0; j<i; j++)
        {
            score[i] += args->lks[idx];
            score[j] += args->lks[idx];
            args->sites[i] += args->cnts[idx];
            args->sites[j] += args->cnts[idx];
            idx++;
        }
    }
    for (i=0; i<nsamples; i++) 
        if ( args->sites[i] ) score[i] /= args->sites[i];
    double **p = (double**) malloc(sizeof(double*)*nsamples), avg_score = 0;
    for (i=0; i<nsamples; i++) p[i] = &score[i];
    // The sorted elements are double*; take the element size from the array itself
    qsort(p, nsamples, sizeof(*p), cmp_doubleptr);
    // The average discordance gives the number of differing sites in % with -G1
    fprintf(fp, "# [1]SM\t[2]Average Discordance\t[3]Average depth\t[4]Average number of sites\t[5]Sample\t[6]Sample ID\n");
    for (i=0; i<nsamples; i++)
    {
        idx = p[i] - score;     // original sample index of the i-th sorted entry
        double adp = ndp[idx] ? (double)dp[idx]/ndp[idx] : 0;
        double nsites = args->sites[idx]/(nsamples-1);
        avg_score += score[idx];
        fprintf(fp, "SM\t%f\t%.2lf\t%.0lf\t%s\t%d\n", score[idx]*100., adp, nsites, args->sm_hdr->samples[idx],i);
    }

    // Overall score: maximum absolute deviation from the average score
    fprintf(fp, "# [1] MD\t[2]Maximum deviation\t[3]The culprit\n");
    // idx still refers to the last sample of the sorted list; without any
    // sample there is no such entry and score[]/samples[] must not be read
    if ( nsamples>0 )
        fprintf(fp, "MD\t%f\t%s\n", (score[idx] - avg_score/nsamples)*100., args->sm_hdr->samples[idx]);
    free(p);
    free(score);
    free(dp);
    free(ndp);

    // Pairwise discordances
    fprintf(fp, "# [1]CN\t[2]Discordance\t[3]Number of sites\t[4]Average minimum depth\t[5]Sample i\t[6]Sample j\n");
    idx = 0;
    for (i=0; i<nsamples; i++)
    {
        for (j=0; j<i; j++)
        {
            fprintf(fp, "CN\t%.0f\t%d\t%.2f\t%s\t%s\n", args->lks[idx], args->cnts[idx], args->cnts[idx]?(double)args->dps[idx]/args->cnts[idx]:0.0, 
                    args->sm_hdr->samples[i],args->sm_hdr->samples[j]);
            idx++;
        }
    }
    fclose(fp);
    if ( args->plot )
        plot_cross_check(args);
}
Ejemplo n.º 3
0
/*
 * Compare one sample of the query file with every sample of the -g file.
 *
 * The query sample is args->query_sample, or the first sample of the query
 * file when unset. At each site present in both readers, and for each target
 * sample i with a diploid non-missing genotype, the log of the query sample's
 * normalized likelihood of that genotype is added to args->lks[i] and
 * args->sites[i] is incremented. With args->all_sites, one "SC" line is
 * printed per site for the target sample (args->target_sample, or the first
 * -g sample when unset).
 *
 * After the scan the likelihoods are scaled by the smallest total and the
 * samples are printed as "CN" lines in sorted order. Output goes to
 * "<args->plot>.tab" when args->plot is set, otherwise to stdout.
 *
 * args->pl_arr and args->tmp_arr are freed here.
 */
static void check_gt(args_t *args)
{
    int i,ret, *gt2ipl = NULL, m_gt2ipl = 0, *gt_arr = NULL, ngt_arr = 0;
    int fake_pls = args->no_PLs;

    // Initialize things: check which tags are defined in the header, sample names etc.
    if ( bcf_hdr_id2int(args->gt_hdr, BCF_DT_ID, "GT")<0 ) error("[E::%s] GT not present in the header of %s?\n", __func__, args->files->readers[1].fname);
    if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 ) 
    {
        if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 )
            error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname);
        fprintf(stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname);
        fake_pls = 1;
    }

    FILE *fp = args->plot ? open_file(NULL, "w", "%s.tab", args->plot) : stdout;
    print_header(args, fp);

    int tgt_isample = -1, query_isample = 0;
    if ( args->target_sample ) 
    {
        tgt_isample = bcf_hdr_id2int(args->gt_hdr, BCF_DT_SAMPLE, args->target_sample);
        if ( tgt_isample<0 ) error("No such sample in %s: [%s]\n", args->files->readers[1].fname, args->target_sample);
    }
    if ( args->all_sites )
    {
        if ( tgt_isample==-1 ) 
        {
            fprintf(stderr,"No target sample selected for comparison, using the first sample in %s: %s\n", args->gt_fname,args->gt_hdr->samples[0]);
            tgt_isample = 0;
        }
    }
    if ( args->query_sample )
    {
        query_isample = bcf_hdr_id2int(args->sm_hdr, BCF_DT_SAMPLE, args->query_sample);
        if ( query_isample<0 ) error("No such sample in %s: [%s]\n", args->files->readers[0].fname, args->query_sample);
    }
    if ( args->all_sites )
        fprintf(fp, "# [1]SC, Site by Site Comparison\t[2]Chromosome\t[3]Position\t[4]-g alleles\t[5]-g GT (%s)\t[6]Coverage\t[7]Query alleles\t[8-]Query PLs (%s)\n", args->gt_hdr->samples[tgt_isample],args->sm_hdr->samples[query_isample]);

    // Main loop
    while ( (ret=bcf_sr_next_line(args->files)) )
    {
        if ( ret!=2 ) continue;     // the site must be present in both files
        bcf1_t *sm_line = args->files->readers[0].buffer[0];    // the query file
        bcf1_t *gt_line = args->files->readers[1].buffer[0];    // the -g target file
        bcf_unpack(sm_line, BCF_UN_FMT);
        bcf_unpack(gt_line, BCF_UN_FMT);

        // Init mapping from target genotype index to the sample's PL fields
        int n_gt2ipl = gt_line->n_allele*(gt_line->n_allele + 1)/2;
        if ( n_gt2ipl > m_gt2ipl )
        {
            // Grow through a temporary so the old buffer is not lost on failure
            int *grown = (int*) realloc(gt2ipl, sizeof(int)*n_gt2ipl);
            if ( !grown ) error("[E::%s] Could not allocate %d genotype indexes\n", __func__, n_gt2ipl);
            gt2ipl   = grown;
            m_gt2ipl = n_gt2ipl;
        }
        if ( !init_gt2ipl(args, gt_line, sm_line, gt2ipl, n_gt2ipl) ) continue;

        // Target genotypes
        int ngt, npl;
        if ( (ngt=bcf_get_genotypes(args->gt_hdr, gt_line, &gt_arr, &ngt_arr)) <= 0 ) 
            error("GT not present at %s:%d?", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, gt_line->pos+1);
        ngt /= bcf_hdr_nsamples(args->gt_hdr);
        if ( ngt!=2 ) continue; // checking only diploid genotypes

        // Sample PLs
        if ( !fake_pls )
        {
            if ( (npl=bcf_get_format_int32(args->sm_hdr, sm_line, "PL", &args->pl_arr, &args->npl_arr)) <= 0 )
                error("PL not present at %s:%d?", args->sm_hdr->id[BCF_DT_CTG][sm_line->rid].key, sm_line->pos+1);
            npl /= bcf_hdr_nsamples(args->sm_hdr);
        }
        else
            npl = fake_PLs(args, args->sm_hdr, sm_line);

        // Calculate likelihoods for all samples, assuming diploid genotypes

        // For faster access to genotype likelihoods (PLs) of the query sample.
        // After the loop max_ipl is the number of PL slots before the vector end.
        int max_ipl, *pl_ptr = args->pl_arr + query_isample*npl;
        double sum_pl = 0;  // for converting PLs to probs
        for (max_ipl=0; max_ipl<npl; max_ipl++) 
        {
            if ( pl_ptr[max_ipl]==bcf_int32_vector_end ) break;
            if ( pl_ptr[max_ipl]==bcf_int32_missing ) continue;
            sum_pl += pow(10, -0.1*pl_ptr[max_ipl]);
        }
        if ( sum_pl==0 ) continue; // no PLs present

        // The main stats: concordance of the query sample with the target -g samples
        for (i=0; i<bcf_hdr_nsamples(args->gt_hdr); i++)
        {
            int *gt_ptr = gt_arr + i*ngt;
            if ( gt_ptr[1]==bcf_int32_vector_end ) continue;    // skip haploid genotypes
            int a = bcf_gt_allele(gt_ptr[0]);
            int b = bcf_gt_allele(gt_ptr[1]);
            if ( a<0 || b<0 ) continue; // missing genotypes
            if ( args->hom_only && a!=b ) continue; // heterozygous genotype
            int igt_tgt = bcf_alleles2gt(a,b); // genotype index in the target file
            int igt_qry = gt2ipl[igt_tgt];  // corresponding genotype in query file
            // genotype not present in query sample: unmapped, haploid or missing
            if ( igt_qry<0 || igt_qry>=max_ipl || pl_ptr[igt_qry]<0 ) continue;
            args->lks[i] += log(pow(10, -0.1*pl_ptr[igt_qry])/sum_pl); 
            args->sites[i]++; 
        }
        if ( args->all_sites )
        {
            // Print LKs at all sites for debugging
            int *gt_ptr = gt_arr + tgt_isample*ngt;
            if ( gt_ptr[1]==bcf_int32_vector_end ) continue;    // skip haploid genotypes
            int a = bcf_gt_allele(gt_ptr[0]);
            int b = bcf_gt_allele(gt_ptr[1]);
            if ( args->hom_only && a!=b ) continue; // heterozygous genotype
            fprintf(fp, "SC\t%s\t%d", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, gt_line->pos+1);
            for (i=0; i<gt_line->n_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', gt_line->d.allele[i]);
            fprintf(fp, "\t%s/%s", a>=0 ? gt_line->d.allele[a] : ".", b>=0 ? gt_line->d.allele[b] : ".");

            // Query alleles followed by one tab-separated column per PL of the query sample
            int igt;
            for (i=0; i<sm_line->n_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', sm_line->d.allele[i]); 
            for (igt=0; igt<npl; igt++)
            {
                if ( pl_ptr[igt]==bcf_int32_vector_end ) break;
                // a missing value gets its own column, like any other PL
                if ( pl_ptr[igt]==bcf_int32_missing ) fprintf(fp, "\t.");
                else fprintf(fp, "\t%d", pl_ptr[igt]); 
            }
            fprintf(fp, "\n"); 
        }
    }
    free(gt2ipl);
    free(gt_arr);
    free(args->pl_arr);
    free(args->tmp_arr);

    // Scale LKs and certainties
    int nsamples = bcf_hdr_nsamples(args->gt_hdr);
    double min = args->lks[0];
    for (i=1; i<nsamples; i++) if ( min>args->lks[i] ) min = args->lks[i];
    for (i=0; i<nsamples; i++) args->lks[i] = min ? args->lks[i] / min : 0;
    double max_avg = args->sites[0] ? args->lks[0]/args->sites[0] : 0;
    for (i=1; i<nsamples; i++) 
    {
        double val = args->sites[i] ? args->lks[i]/args->sites[i] : 0;
        if ( max_avg<val ) max_avg = val;
    }

    // Sorted output
    double **p = (double**) malloc(sizeof(double*)*nsamples);
    for (i=0; i<nsamples; i++) p[i] = &args->lks[i];
    // The sorted elements are double*; take the element size from the array itself
    qsort(p, nsamples, sizeof(*p), cmp_doubleptr);

    fprintf(fp, "# [1]CN\t[2]Concordance with %s (total)\t[3]Concordance (average)\t[4]Number of sites compared\t[5]Sample\t[6]Sample ID\n", args->sm_hdr->samples[query_isample]);
    for (i=0; i<nsamples; i++)
    {
        int idx = p[i] - args->lks;     // original sample index of the i-th sorted entry
        double avg = args->sites[idx] ? args->lks[idx]/args->sites[idx] : 0;
        fprintf(fp, "CN\t%e\t%e\t%.0f\t%s\t%d\n", 1-args->lks[idx], 1-avg/max_avg, args->sites[idx], args->gt_hdr->samples[idx], i);
    }
    free(p);

    if ( args->plot )
    {
        fclose(fp);
        plot_check(args, args->target_sample ? args->target_sample : "", args->sm_hdr->samples[query_isample]);
    }
}
Ejemplo n.º 4
0
/*
 * Sum of all PL values of one record.
 *
 * PLs/mPLs are the reusable buffer and its capacity handed to
 * bcf_get_format_int32(). Vector-end markers terminate a sample's values
 * (haploid genotypes amongst diploids) and missing values are skipped.
 * Returns 0 when no PL values could be read for the record.
 */
static int sum_PLs(bcf_hdr_t *hdr, bcf1_t *rec, int32_t **PLs, int *mPLs)
{
    // get copy of the PL vectors; zero or a negative value means no PLs
    int nPLs = bcf_get_format_int32(hdr, rec, "PL", PLs, mPLs);
    if ( nPLs<=0 ) return 0;

    // how many values are there per sample
    int nsmpl = bcf_hdr_nsamples(hdr);
    int nvals = nPLs / nsmpl;

    int32_t *ptr = *PLs;
    int i, j, sum = 0;
    for (i=0; i<nsmpl; i++)
    {
        for (j=0; j<nvals; j++)
        {
            // check for shorter vectors (haploid genotypes amongst diploids)
            if ( ptr[j]==bcf_int32_vector_end ) break;

            // skip missing values
            if ( ptr[j]==bcf_int32_missing ) continue;

            sum += ptr[j];
        }
        ptr += nvals;
    }
    return sum;
}

/*
 * Test program for the sweep reader: reads the file forward and backward
 * twice and prints a checksum of positions and of PL values for each pass,
 * so that the forward and backward passes can be compared.
 */
int main(int argc, char **argv)
{
    if ( argc!=2 ) 
    {
        fprintf(stderr,"Usage: test-vcf-sweep <file.bcf|file.vcf>\n");
        return 1;
    }

    // Init variables. The checksum is just for this test program to output
    // something and verify that all sites are read in both passes - fwd and
    // bwd.
    bcf_sweep_t *sw = bcf_sweep_init(argv[1]);
    if ( !sw )
    {
        fprintf(stderr,"Could not open %s\n", argv[1]);
        return 1;
    }
    bcf_hdr_t *hdr  = bcf_sweep_hdr(sw);
    if ( !hdr )
    {
        fprintf(stderr,"Could not read the header of %s\n", argv[1]);
        bcf_sweep_destroy(sw);
        return 1;
    }
    int chksum = 0;

    // First we must sweep forward and read the whole file to build an index.
    // If this is undesirable, we can require the presence of a .gzi index
    // which can be created with `bgzip -r` from the samtools/htslib package
    bcf1_t *rec;
    while ( (rec = bcf_sweep_fwd(sw)) ) chksum += rec->pos+1;
    printf("fwd position chksum: %d\n", chksum);

    // Now sweep backward. 
    chksum = 0;
    while ( (rec = bcf_sweep_bwd(sw)) ) chksum += rec->pos+1;
    printf("bwd position chksum: %d\n", chksum);

    // And forward and backward again, this time summing the PL vectors
    int mPLs = 0;
    int32_t *PLs = NULL;
    chksum = 0;
    while ( (rec = bcf_sweep_fwd(sw)) ) chksum += sum_PLs(hdr, rec, &PLs, &mPLs);
    printf("fwd PL chksum: %d\n", chksum);

    // And the same backwards..
    chksum = 0;
    while ( (rec = bcf_sweep_bwd(sw)) ) chksum += sum_PLs(hdr, rec, &PLs, &mPLs);
    printf("bwd PL chksum: %d\n", chksum);

    // Clean up
    free(PLs);
    bcf_sweep_destroy(sw);
    return 0;
}
Ejemplo n.º 5
0
/**************************
 * PROCESS INPUT VCF FILE *
 **************************/
void vcf2raw(char **filename, char **out_filename, char **cross, int *n_parent1,
             char **parent1, int *n_parent2, char **parent2, double *min_class) {
  // We assume the input file exists (checked in R)
  bcf_sweep_t *in_vcf = bcf_sweep_init(*filename);
  if (in_vcf == NULL) {
    bcf_sweep_destroy(in_vcf);
    error("Could not parse input VCF file.");
  }
  bcf_hdr_t *vcf_hdr = bcf_sweep_hdr(in_vcf);

  // Get reference sequence IDs
  int n_seq = 0;
  const char **seq_names = NULL;
  seq_names = bcf_hdr_seqnames(vcf_hdr, &n_seq);
  if (seq_names == NULL || n_seq == 0) {
    free(seq_names);
    error("Could not correctly parse sequence names in VCF file. Is the input file tabix indexed?\n");
  }
  
  // Map parent names to sample indices
  int idx_parent1[*n_parent1];
  int idx_parent2[*n_parent2];
  get_parents_idx(*n_parent1, idx_parent1, *n_parent2, idx_parent2, vcf_hdr, parent1, parent2);

  // Get progeny sample indices (all samples that are not set as parents)
  int n_samples = bcf_hdr_nsamples(vcf_hdr);
  int n_progeny = n_samples - *n_parent1 - *n_parent2;
  if (n_progeny == 0) {
    error("Input file must contain at least one progeny individual.");
  }
  int idx_progeny[n_progeny];
  int i = 0, s;
  for (s = 0; s < n_samples; s++) {
    if (!is_val_in_arr(s, idx_parent1, *n_parent1)) {
      if (!is_val_in_arr(s, idx_parent2, *n_parent2)) {
        idx_progeny[i++] = s;
      }
    }
  }
  
  // Minimum count to assign parent genotype
  int min_class_parent1 = (int)ceil(*min_class * *n_parent1);
  int min_class_parent2 = (int)ceil(*min_class * *n_parent2);

  // Convert cross type
  int cross_type = get_cross_type(cross);

  // We need to write to a temporary file, because the number of markers in the header is unknown
  FILE *temp_f;
  char temp_filename[] = "tmp_raw_XXXXXX";
  int temp_fd;
  temp_fd = mkstemp(temp_filename);
  if (temp_fd == -1) {
    error("Could not open temporary output file.\n");
  }
  unlink(temp_filename);
  temp_f = fdopen(temp_fd, "w+");
  if (temp_f == NULL) {
    error("Could not open temporary output file.\n");
  }

  // CHROM and POS fields will be placed at the end of the output file
  int marker_count = 0;
  int * chrom = malloc(MAX_VARIANTS * sizeof(int));
  if (chrom == NULL) {
    error("Could not allocate vector.\n");
  }
  int * pos = malloc(MAX_VARIANTS * sizeof(int));
  if (pos == NULL) {
    error("Could not allocate vector.\n");
  }

  // Mapping of VCF genotypes to ONEMAP genotypes
  const char * const D_BC_ref[GT_TYPES_LEN] = { "a", "-", "ab", "-", "-", "-", "-" };
  const char * const D_BC_alt[GT_TYPES_LEN] = { "-", "a", "ab", "-", "-", "-", "-" };
  const char * const RI_ref[GT_TYPES_LEN] = { "a", "b", "-", "-", "-", "-", "-" };
  const char * const RI_alt[GT_TYPES_LEN] = { "b", "a", "-", "-", "-", "-", "-" };
  const char * const B3_F2_ref[GT_TYPES_LEN] = { "a", "b", "ab", "-", "-", "-", "-" };
  const char * const B3_F2_alt[GT_TYPES_LEN] = { "b", "a", "ab", "-", "-", "-", "-" };

  // Scan all records in VCF file and print valid markers to output
  bcf1_t *record;
  int32_t *GTs = NULL;
  int nGT_arr = 0;

  while ((record = bcf_sweep_fwd(in_vcf)) && marker_count < MAX_VARIANTS) {
    // We only consider biallelic SNP and INDEL markers
    int var_type = bcf_get_variant_types(record);
    if ((var_type == VCF_SNP || var_type == VCF_INDEL) && record->n_allele == 2) {
      int nGTs = bcf_get_format_int32(vcf_hdr, record, "GT", &GTs, &nGT_arr);
      // We only consider diploid variants (number of alleles in genotypes == 2)
      nGTs /= n_samples;
      if (nGTs == 2) {

        bcf_fmt_t *fmt_ptr = bcf_get_fmt(vcf_hdr, record, "GT");

        // First, check which parents are heterozygous or homozygous (REF or ALT allele)
        bool is_het_parent1 = false, is_hom_ref_parent1 = false, is_hom_alt_parent1 = false;
        get_consensus_parent_gt(fmt_ptr, *n_parent1, idx_parent1, min_class_parent1, &is_het_parent1,
                                &is_hom_ref_parent1, &is_hom_alt_parent1);
        bool is_het_parent2 = false, is_hom_ref_parent2 = false, is_hom_alt_parent2 = false;
        get_consensus_parent_gt(fmt_ptr, *n_parent2, idx_parent2, min_class_parent2, &is_het_parent2,
                                &is_hom_ref_parent2, &is_hom_alt_parent2);

        // Convert to appropriate marker type
        char marker_type[MARKER_TYPE_LEN];
        int type = get_marker_type(marker_type, cross_type,
                                   is_het_parent1, is_hom_ref_parent1, is_hom_alt_parent1,
                                   is_het_parent2, is_hom_ref_parent2, is_hom_alt_parent2);

        const char * const(*type_ptr)[GT_TYPES_LEN];
        bool valid_marker = true;
        switch(type)
        {
        case marker_B3:
        case marker_F2_ref:
          type_ptr = &B3_F2_ref;
          break;
        case marker_F2_alt:
          type_ptr = &B3_F2_alt;
          break;
        case marker_D_ref:
        case marker_BC_ref:
          type_ptr = &D_BC_ref;
          break;
        case marker_D_alt:
        case marker_BC_alt:
          type_ptr = &D_BC_alt;
          break;
        case marker_RI_ref:
          type_ptr = &RI_ref;
          break;
        case marker_RI_alt:
          type_ptr = &RI_alt;
          break;
        default:
          valid_marker = false;
        }

        if (valid_marker) {
          // Store CHROM and POS fields for valid markers
          chrom[marker_count] = record->rid;
          pos[marker_count] = record->pos + 1;

          // Check if marker name exists; if negative, create one
          char *marker_name = record->d.id;
          if (!strcmp(marker_name, ".")) {
            sprintf(marker_name, "%s.%d", seq_names[chrom[marker_count]], pos[marker_count]);
          }

          // Output variant in ONEMAP format to temporary file
          print_record(temp_f, marker_name, marker_type, fmt_ptr, n_progeny, idx_progeny, type_ptr);

          marker_count++;
        }
      }
    }
  }

  // Write final output file header
  FILE *final_f = fopen(*out_filename, "w");
  if (final_f == NULL) {
    error("Could not open output file.\n");
  }
  fprintf(final_f, "data type %s\n", *cross);
  // The next header line contains the following information: number of individuals, number of markers, 1 for the presence of CHROM information, 1 for the presence of POS information and 0 for the absence of phenotypes (these need to be manually included later)
  fprintf(final_f, "%d %d 1 1 0\n", n_progeny, marker_count);
  // The next header line contains the sample names
  char *cur_sample_name = vcf_hdr->samples[idx_progeny[0]];
  fprintf(final_f, "%s", cur_sample_name);
  for (i = 1; i < n_progeny; i++) {
    cur_sample_name = vcf_hdr->samples[idx_progeny[i]];
    fprintf(final_f, "\t%s", cur_sample_name);
  }
  fprintf(final_f, "\n");
  
  // Copy marker data from temporary file to final file
  rewind(temp_f);
  char buf[BUFSIZ];
  size_t size;
  while ((size = fread(buf, 1, BUFSIZ, temp_f))) {
    fwrite(buf, 1, size, final_f);
  }

  // Write CHROM and POS data to output file
  if (marker_count) {
    fprintf(final_f, "*CHROM\t");
    fprintf(final_f, "%s", seq_names[chrom[0]]);
    for (i = 1; i < marker_count; i++) {
      fprintf(final_f, " %s", seq_names[chrom[i]]);
    }
    fprintf(final_f, "\n*POS\t");
    fprintf(final_f, "%d", pos[0]);
    for (i = 1; i < marker_count; i++) {
      fprintf(final_f, " %d", pos[i]);
    }
  }

  // Clean-up
  free(chrom);
  free(pos);

  free(GTs);
  bcf_sweep_destroy(in_vcf);

  fclose(temp_f);
  close(temp_fd);
  fclose(final_f);
}
Ejemplo n.º 6
0
/*
 * Extract the alternate allele frequency and the genotype likelihoods
 * P(D|G) of sample args->ismpl from one record.
 *
 *   alt_freq  set to the ALT allele frequency, taken from the INFO tag
 *             args->af_tag, from the file args->af_fname via read_AF(), from
 *             INFO/AC and INFO/AN, or from estimate_AF(), in this order of
 *             preference. A frequency of zero is replaced by args->dflt_AF.
 *   pdg       three values for the genotypes RR, RA and AA. Derived from GT
 *             with args->fake_PLs, otherwise from the first three PL values
 *             and normalized to sum to one.
 *
 * Returns 0 on success and a negative value when the site cannot be used.
 */
int parse_line(args_t *args, bcf1_t *line, double *alt_freq, double *pdg)
{
    args->nitmp = 0;

    // Set allele frequency
    int ret;
    if ( args->af_tag )
    {
        // Use an INFO tag provided by the user
        ret = bcf_get_info_float(args->hdr, line, args->af_tag, &args->AFs, &args->mAFs);
        if ( ret==-2 )
            error("Type mismatch for INFO/%s tag at %s:%d\n", args->af_tag, bcf_seqname(args->hdr,line), line->pos+1);
        // Without any value *alt_freq would be left unset, skip the site
        if ( ret==0 ) return -1;
        // Take the first value whenever one is present, as the AC branch does
        if ( ret>0 )
            *alt_freq = args->AFs[0];
    }
    else if ( args->af_fname ) 
    {
        // Read AF from a file
        ret = read_AF(args->files->targets, line, alt_freq);
    }
    else
    {
        // Use GTs or AC/AN: GTs when AC/AN not present or when GTs explicitly requested by --estimate-AF
        ret = -1;
        if ( !args->estimate_AF )
        {
            int AC = -1, AN = 0;
            ret = bcf_get_info_int32(args->hdr, line, "AN", &args->itmp, &args->mitmp);
            if ( ret==1 )
            {
                AN = args->itmp[0];
                ret = bcf_get_info_int32(args->hdr, line, "AC", &args->itmp, &args->mitmp);
                if ( ret>0 )
                    AC = args->itmp[0];
            }
            if ( AN<=0 || AC<0 ) 
                ret = -1;
            else 
                *alt_freq = (double) AC/AN;
        }
        if ( ret==-1 )
            ret = estimate_AF(args, line, alt_freq);    // reads GTs into args->itmp
    }

    if ( ret<0 ) return ret;
    if ( *alt_freq==0.0 )
    {
        if ( args->dflt_AF==0 ) return -1;       // we skip sites with AF=0
        *alt_freq = args->dflt_AF;
    }

    // Set P(D|G)
    if ( args->fake_PLs )
    {
        if ( !args->nitmp )
        {
            args->nitmp = bcf_get_genotypes(args->hdr, line, &args->itmp, &args->mitmp);
            if ( args->nitmp != 2*args->nsmpl ) return -1;     // not diploid?
            args->nitmp /= args->nsmpl;
        }

        int32_t *gt = &args->itmp[args->ismpl*args->nitmp];
        // A haploid genotype padded with a vector-end marker has no second allele
        if ( gt[1]==bcf_int32_vector_end ) return -1;
        if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) return -1;

        int a = bcf_gt_allele(gt[0]);
        int b = bcf_gt_allele(gt[1]);
        if ( a!=b )
        {
            pdg[0] = pdg[2] = args->unseen_PL;
            pdg[1] = 1 - 2*args->unseen_PL;
        }
        else if ( a==0 )
        {
            pdg[0] = 1 - 2*args->unseen_PL;
            pdg[1] = pdg[2] = args->unseen_PL;
        }
        else
        {
            pdg[0] = pdg[1] = args->unseen_PL;
            pdg[2] = 1 - 2*args->unseen_PL;
        }
    }
    else
    {
        args->nitmp = bcf_get_format_int32(args->hdr, line, "PL", &args->itmp, &args->mitmp);
        // n*(n+1) is always even, so the expected count is an exact integer
        if ( args->nitmp != args->nsmpl*line->n_allele*(line->n_allele+1)/2 ) return -1;     // not diploid?
        args->nitmp /= args->nsmpl;
        if ( args->nitmp < 3 ) return -1;       // three values per sample are read below

        int32_t *pl = &args->itmp[args->ismpl*args->nitmp];
        int k;
        for (k=0; k<3; k++)
        {
            // Missing and vector-end markers are negative and must never be
            // used as an index into the lookup table
            if ( pl[k]<0 ) return -1;

            // The table is indexed by PL values below 256; larger PLs use the
            // last entry instead of being turned into probability 1.
            // NOTE(review): assumes pl2p[] decreases with PL - confirm.
            pdg[k] = args->pl2p[ pl[k] < 256 ? pl[k] : 255 ];
        }

        double sum = pdg[0] + pdg[1] + pdg[2];
        if ( !sum ) return -1;
        pdg[0] /= sum;
        pdg[1] /= sum;
        pdg[2] /= sum;
    }

    return 0;
}
Ejemplo n.º 7
0
/*
 * Collect per-sample genotype counts for one region, to be used for guessing
 * the ploidy.
 *
 * With `itr` set, the region is itr->reg[itr->i] on sequence `seq` and the
 * counts go to the structure returned by expand_regs(). With `itr` NULL, the
 * region is parsed from args->background (the `seq` argument is not used) and
 * the counts go to args->bg_counts[].
 *
 * Depending on args->guess, each record contributes either its GT (missing,
 * het or hom) or its PL/GL values: the smallest homozygous and the smallest
 * heterozygous value of each sample are compared.
 *
 * Returns the number of consecutive entries of `itr`, starting at itr->i,
 * that share the same start and end; 1 when `itr` is NULL.
 */
int process_region_guess(args_t *args, char *seq, regitr_t *itr)
{
    int kitr = 1;
    uint32_t start = 0, end = INT_MAX;
    reg_stats_t *stats = NULL;

    // set the start and the end position
    if ( itr )
    {
        start = itr->reg[itr->i].start;
        end   = itr->reg[itr->i].end;

        // flush all records with the same coordinates
        while ( itr->i+kitr<itr->n && start==itr->reg[itr->i+kitr].start && end==itr->reg[itr->i+kitr].end ) kitr++;

        int min,max,ret = ploidy_query(args->ploidy, seq, start, args->sex2ploidy, &min, &max);
        assert(ret);
        stats = expand_regs(args, seq,start,end);
    }
    else
    {
        // background region
        int spos, epos;
        const char *ptr = hts_parse_reg(args->background, &spos, &epos);
        if ( !ptr )
            error("Could not parse the region: %s\n", args->background);
        // private copy of the sequence name, freed below
        seq = (char*) malloc(ptr - args->background + 1);
        memcpy(seq,args->background,ptr-args->background);
        seq[ptr-args->background] = 0;
        start = spos;
        end   = epos;
    }

    if ( bcf_sr_seek(args->sr,seq,start)!=0 ) 
    {
        // sequence not present
        if ( !itr ) free(seq);
        return kitr;
    }

    int ismpl, rid = bcf_hdr_name2id(args->hdr,seq);
    if ( !itr ) free(seq);

    while ( bcf_sr_next_line(args->sr) )
    {
        bcf1_t *rec = bcf_sr_get_line(args->sr,0);
        if ( rec->rid!=rid || rec->pos > end ) break;

        if ( args->guess & GUESS_GT )   // use GTs to guess the ploidy
        {
            bcf_fmt_t *fmt = bcf_get_fmt(args->hdr, rec, "GT");
            if ( !fmt ) continue;
            for (ismpl=0; ismpl<args->nsample; ismpl++)
            {
                count_t *counts = stats ? &stats->counts[ismpl] : &args->bg_counts[ismpl];
                int gt = bcf_gt_type(fmt, ismpl, NULL,NULL);
                if ( gt==GT_UNKN ) counts->nmiss++;
                else if ( gt==GT_HET_RA || gt==GT_HET_AA ) counts->nhet++;
                else counts->nhom++;
            }
        }
        else    // use PLs to guess the ploidy
        {
            // GL values are negated so that, like PLs, smaller means more likely.
            // NOTE(review): GL is fetched as int32 here - confirm the tag's type in the input.
            int gl2pl = args->guess & GUESS_PL ? 1 : -1;
            int npl = bcf_get_format_int32(args->hdr,rec,args->guess&GUESS_PL?"PL":"GL",&args->pls,&args->npls);
            if ( npl<=0 ) continue;
            npl /= args->nsample;   // number of values per sample
            for (ismpl=0; ismpl<args->nsample; ismpl++)
            {
                int32_t *ptr = args->pls + ismpl*npl;
                int phom = INT_MAX, phet = INT_MAX, ial, jal, k = 0;

                // Walk the genotypes in diploid order: for each allele first
                // the hets with the preceding alleles, then the hom. k is
                // also bounded by npl: a vector shorter than the diploid size
                // need not be terminated by a vector-end marker, and reading
                // on would run into the next sample or past the buffer.
                for (ial=0; ial<rec->n_allele; ial++)
                {
                    for (jal=0; jal<ial; jal++)
                    {
                        if ( k>=npl || ptr[k] == bcf_int32_missing || ptr[k] == bcf_int32_vector_end )  break;
                        ptr[k] *= gl2pl;
                        if ( phet > ptr[k] ) phet = ptr[k];
                        k++;
                    }
                    if ( k>=npl || ptr[k] == bcf_int32_missing || ptr[k] == bcf_int32_vector_end )  break;
                    ptr[k] *= gl2pl;
                    if ( phom > ptr[k] ) phom = ptr[k];
                    k++;
                }
                count_t *counts = stats ? &stats->counts[ismpl] : &args->bg_counts[ismpl];
                if ( k == rec->n_allele ) counts->nhom++;   // haploid
                else if ( phet == phom || k != rec->n_allele*(rec->n_allele+1)/2 ) counts->nmiss++;
                else if ( phet < phom ) counts->nhet++;
                else counts->nhom++;
            }
        }
    }
    return kitr;
}