int process_PL(args_t *args, bcf1_t *line, uint32_t *ntot, uint32_t *ndif) { int npl = bcf_get_format_int32(args->sm_hdr, line, "PL", &args->tmp_arr, &args->ntmp_arr); if ( npl<=0 ) return 1; // PL not present npl /= args->nsmpl; int i,j,k, idx = 0; for (i=1; i<args->nsmpl; i++) { int32_t *a = args->tmp_arr + i*npl; int imin = -1; for (k=0; k<npl; k++) { if ( a[k]==bcf_int32_vector_end ) break; if ( a[k]==bcf_int32_missing ) continue; if ( imin==-1 || a[imin] > a[k] ) imin = k; } if ( imin<0 ) { idx+=i; continue; } for (j=0; j<i; j++) { int32_t *b = args->tmp_arr + j*npl; int jmin = -1; for (k=0; k<npl; k++) { if ( b[k]==bcf_int32_vector_end ) break; if ( b[k]==bcf_int32_missing ) continue; if ( jmin==-1 || b[jmin] > b[k] ) jmin = k; } if ( jmin<0 ) { idx++; continue; } ntot[idx]++; if ( imin!=jmin ) ndif[idx]++; idx++; } } return 0; }
/*
 *  cross_check_gts() - pairwise discordance between all samples of the query file
 *
 *  Reads every record from args->files and, for each pair of samples (i,j)
 *  with j<i, accumulates into arrays indexed by the running pair index
 *  (pairs are numbered (1,0), (2,0), (2,1), (3,0), ...):
 *      args->lks[idx]   .. sum over sites of min_k( PL_i[k] + PL_j[k] )
 *      args->cnts[idx]  .. number of sites which contributed
 *      args->dps[idx]   .. sum of min(DP_i,DP_j), or a site count when DP is not in the header
 *  PLs are faked from GTs via fake_PLs() when args->no_PLs is set or PL is not
 *  defined in the header.
 *
 *  Output goes to "<plot>.tab" when args->plot is set, otherwise to stdout:
 *  optional per-site SD lines (args->all_sites), per-sample SM lines, one MD
 *  line and per-pair CN lines. args->sites is allocated here and left for the
 *  caller; args->pl_arr and args->tmp_arr are released.
 */
static void cross_check_gts(args_t *args)
{
    int nsamples = bcf_hdr_nsamples(args->sm_hdr), ndp_arr = 0;
    unsigned int *dp  = (unsigned int*) calloc(nsamples,sizeof(*dp));
    unsigned int *ndp = (unsigned int*) calloc(nsamples,sizeof(*ndp)); // this will overflow one day...
    int fake_pls = args->no_PLs, ignore_dp = 0;
    int i,j,k,idx, pl_warned = 0, dp_warned = 0;
    int32_t *dp_arr = NULL;
    int *is_hom = args->hom_only ? (int*) malloc(sizeof(int)*nsamples) : NULL;

    if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 )
    {
        if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 )
            error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname);
        fprintf(stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname);
        fake_pls = 1;
    }
    if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "DP")<0 ) ignore_dp = 1;

    FILE *fp = args->plot ? open_file(NULL, "w", "%s.tab", args->plot) : stdout;
    print_header(args, fp);
    if ( args->all_sites ) fprintf(fp,"# [1]SD, Average Site Discordance\t[2]Chromosome\t[3]Position\t[4]Number of available pairs\t[5]Average discordance\n");

    while ( bcf_sr_next_line(args->files) )
    {
        bcf1_t *line = args->files->readers[0].buffer[0];
        bcf_unpack(line, BCF_UN_FMT);

        // npl: number of PL values per sample at this site
        int npl;
        if ( !fake_pls )
        {
            npl = bcf_get_format_int32(args->sm_hdr, line, "PL", &args->pl_arr, &args->npl_arr);
            if ( npl<=0 ) { pl_warned++; continue; }
            npl /= nsamples;
        }
        else
            npl = fake_PLs(args, args->sm_hdr, line);

        if ( !ignore_dp && bcf_get_format_int32(args->sm_hdr, line, "DP", &dp_arr, &ndp_arr) <= 0 ) { dp_warned++; continue; }

        if ( args->hom_only )
        {
            for (i=0; i<nsamples; i++)
                is_hom[i] = is_hom_most_likely(line->n_allele, args->pl_arr+i*npl);
        }

        double sum = 0; int nsum = 0;
        idx = 0;
        for (i=0; i<nsamples; i++)
        {
            // When sample i is unusable, all of its i pairs are skipped at once
            int *ipl = &args->pl_arr[i*npl];
            if ( *ipl==-1 ) { idx += i; continue; } // missing genotype
            if ( !ignore_dp && (dp_arr[i]==bcf_int32_missing || !dp_arr[i]) ) { idx += i; continue; }
            if ( args->hom_only && !is_hom[i] ) { idx += i; continue; }

            for (j=0; j<i; j++)
            {
                int *jpl = &args->pl_arr[j*npl];
                if ( *jpl==-1 ) { idx++; continue; } // missing genotype
                if ( !ignore_dp && (dp_arr[j]==bcf_int32_missing || !dp_arr[j]) ) { idx++; continue; }
                if ( args->hom_only && !is_hom[j] ) { idx++; continue; }

                // A missing value aborts the pair (k stays below npl); a vector-end
                // marker ends the scan normally (k is forced to npl).
                int min_pl = INT_MAX;
                for (k=0; k<npl; k++)
                {
                    if ( ipl[k]==bcf_int32_missing || jpl[k]==bcf_int32_missing ) break;
                    if ( ipl[k]==bcf_int32_vector_end || jpl[k]==bcf_int32_vector_end ) { k = npl; break; }
                    if ( min_pl > ipl[k]+jpl[k] ) min_pl = ipl[k]+jpl[k];
                }
                // Also skip the pair when not a single value was compared, so that
                // the INT_MAX sentinel never gets added to the totals.
                if ( k!=npl || min_pl==INT_MAX ) { idx++; continue; }

                if ( args->all_sites ) { sum += min_pl; nsum++; }
                args->lks[idx] += min_pl;
                args->cnts[idx]++;

                if ( !ignore_dp )
                {
                    args->dps[idx] += dp_arr[i] < dp_arr[j] ? dp_arr[i] : dp_arr[j];
                    dp[i] += dp_arr[i]; ndp[i]++;
                    dp[j] += dp_arr[j]; ndp[j]++;
                }
                else
                {
                    args->dps[idx]++;
                    dp[i]++; ndp[i]++;
                    dp[j]++; ndp[j]++;
                }
                idx++;
            }
        }
        if ( args->all_sites )
            fprintf(fp,"SD\t%s\t%d\t%d\t%.0f\n", args->sm_hdr->id[BCF_DT_CTG][line->rid].key, line->pos+1, nsum, nsum?sum/nsum:0);
    }

    // Release the per-site buffers. The pointers living in args are reset together
    // with their capacities so that nothing can reuse or free them again.
    free(dp_arr);
    free(args->pl_arr);  args->pl_arr  = NULL; args->npl_arr  = 0;
    free(args->tmp_arr); args->tmp_arr = NULL; args->ntmp_arr = 0;
    free(is_hom);

    if ( pl_warned ) fprintf(stderr, "[W::%s] PL was not found at %d site(s)\n", __func__, pl_warned);
    if ( dp_warned ) fprintf(stderr, "[W::%s] DP was not found at %d site(s)\n", __func__, dp_warned);

    // Output samples sorted by average discordance
    double *score = (double*) calloc(nsamples,sizeof(double));
    args->sites = (double*) calloc(nsamples,sizeof(double));
    idx = 0;
    for (i=0; i<nsamples; i++)
    {
        for (j=0; j<i; j++)
        {
            score[i] += args->lks[idx];
            score[j] += args->lks[idx];
            args->sites[i] += args->cnts[idx];
            args->sites[j] += args->cnts[idx];
            idx++;
        }
    }
    for (i=0; i<nsamples; i++)
        if ( args->sites[i] ) score[i] /= args->sites[i];

    // Sort pointers into score[] so that the original sample index can be
    // recovered as p[i]-score. The elements being sorted are double pointers.
    double **p = (double**) malloc(sizeof(*p)*nsamples), avg_score = 0;
    for (i=0; i<nsamples; i++) p[i] = &score[i];
    qsort(p, nsamples, sizeof(*p), cmp_doubleptr);

    // The average discordance gives the number of differing sites in % with -G1
    fprintf(fp, "# [1]SM\t[2]Average Discordance\t[3]Average depth\t[4]Average number of sites\t[5]Sample\t[6]Sample ID\n");
    for (i=0; i<nsamples; i++)
    {
        idx = p[i] - score;
        double adp = ndp[idx] ? (double)dp[idx]/ndp[idx] : 0;
        // with a single sample there are no pairs; avoid 0/0
        double nsites = nsamples>1 ? args->sites[idx]/(nsamples-1) : 0;
        avg_score += score[idx];
        fprintf(fp, "SM\t%f\t%.2lf\t%.0lf\t%s\t%d\n", score[idx]*100., adp, nsites, args->sm_hdr->samples[idx],i);
    }

    // Overall score: deviation of the last sample in the sorted order from the
    // average score. idx is still set by the loop above, which requires at
    // least one sample.
    fprintf(fp, "# [1] MD\t[2]Maximum deviation\t[3]The culprit\n");
    if ( nsamples>0 )
        fprintf(fp, "MD\t%f\t%s\n", (score[idx] - avg_score/nsamples)*100., args->sm_hdr->samples[idx]);

    free(p);
    free(score);
    free(dp);
    free(ndp);

    // Pairwise discordances
    fprintf(fp, "# [1]CN\t[2]Discordance\t[3]Number of sites\t[4]Average minimum depth\t[5]Sample i\t[6]Sample j\n");
    idx = 0;
    for (i=0; i<nsamples; i++)
    {
        for (j=0; j<i; j++)
        {
            fprintf(fp, "CN\t%.0f\t%d\t%.2f\t%s\t%s\n", args->lks[idx], args->cnts[idx], args->cnts[idx]?(double)args->dps[idx]/args->cnts[idx]:0.0,
                    args->sm_hdr->samples[i],args->sm_hdr->samples[j]);
            idx++;
        }
    }

    // Close only a file opened here (the plot script needs it complete on disk);
    // stdout is flushed and left open, as check_gt() does.
    if ( fp!=stdout ) fclose(fp);
    else fflush(fp);

    if ( args->plot )
        plot_cross_check(args);
}
/*
 *  check_gt() - concordance of one query sample with every sample of the -g file
 *
 *  Walks the sites present in both readers (query = readers[0], -g target =
 *  readers[1]). For each target sample with a diploid, non-missing genotype
 *  the query sample's PL of the same genotype is converted to a normalized
 *  probability and its log is added to args->lks[i]; args->sites[i] counts
 *  the contributing sites. PLs are faked from GTs via fake_PLs() when
 *  args->no_PLs is set or PL is not defined in the query header.
 *
 *  Output goes to "<plot>.tab" when args->plot is set, otherwise to stdout:
 *  optional per-site SC lines (args->all_sites) followed by one CN line per
 *  target sample, sorted with cmp_doubleptr. args->pl_arr and args->tmp_arr
 *  are released.
 */
static void check_gt(args_t *args)
{
    int i,ret, *gt2ipl = NULL, m_gt2ipl = 0, *gt_arr = NULL, ngt_arr = 0;
    int fake_pls = args->no_PLs;

    // Initialize things: check which tags are defined in the header, sample names etc.
    if ( bcf_hdr_id2int(args->gt_hdr, BCF_DT_ID, "GT")<0 ) error("[E::%s] GT not present in the header of %s?\n", __func__, args->files->readers[1].fname);
    if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 )
    {
        if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 )
            error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname);
        fprintf(stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname);
        fake_pls = 1;
    }

    FILE *fp = args->plot ? open_file(NULL, "w", "%s.tab", args->plot) : stdout;
    print_header(args, fp);

    int tgt_isample = -1, query_isample = 0;
    if ( args->target_sample )
    {
        tgt_isample = bcf_hdr_id2int(args->gt_hdr, BCF_DT_SAMPLE, args->target_sample);
        if ( tgt_isample<0 ) error("No such sample in %s: [%s]\n", args->files->readers[1].fname, args->target_sample);
    }
    if ( args->all_sites )
    {
        if ( tgt_isample==-1 )
        {
            fprintf(stderr,"No target sample selected for comparison, using the first sample in %s: %s\n", args->gt_fname,args->gt_hdr->samples[0]);
            tgt_isample = 0;
        }
    }
    if ( args->query_sample )
    {
        query_isample = bcf_hdr_id2int(args->sm_hdr, BCF_DT_SAMPLE, args->query_sample);
        if ( query_isample<0 ) error("No such sample in %s: [%s]\n", args->files->readers[0].fname, args->query_sample);
    }
    if ( args->all_sites )
        fprintf(fp, "# [1]SC, Site by Site Comparison\t[2]Chromosome\t[3]Position\t[4]-g alleles\t[5]-g GT (%s)\t[6]Coverage\t[7]Query alleles\t[8-]Query PLs (%s)\n",
                args->gt_hdr->samples[tgt_isample],args->sm_hdr->samples[query_isample]);

    // Main loop
    while ( (ret=bcf_sr_next_line(args->files)) )
    {
        if ( ret!=2 ) continue;     // the site must be present in both files
        bcf1_t *sm_line = args->files->readers[0].buffer[0];    // the query file
        bcf1_t *gt_line = args->files->readers[1].buffer[0];    // the -g target file
        bcf_unpack(sm_line, BCF_UN_FMT);
        bcf_unpack(gt_line, BCF_UN_FMT);

        // Init mapping from target genotype index to the sample's PL fields
        int n_gt2ipl = gt_line->n_allele*(gt_line->n_allele + 1)/2;
        if ( n_gt2ipl > m_gt2ipl )
        {
            m_gt2ipl = n_gt2ipl;
            gt2ipl   = (int*) realloc(gt2ipl, sizeof(int)*m_gt2ipl);
        }
        if ( !init_gt2ipl(args, gt_line, sm_line, gt2ipl, n_gt2ipl) ) continue;

        // Target genotypes
        int ngt, npl;
        if ( (ngt=bcf_get_genotypes(args->gt_hdr, gt_line, &gt_arr, &ngt_arr)) <= 0 )
            error("GT not present at %s:%d?", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, gt_line->pos+1);
        ngt /= bcf_hdr_nsamples(args->gt_hdr);
        if ( ngt!=2 ) continue; // checking only diploid genotypes

        // Sample PLs
        if ( !fake_pls )
        {
            if ( (npl=bcf_get_format_int32(args->sm_hdr, sm_line, "PL", &args->pl_arr, &args->npl_arr)) <= 0 )
                error("PL not present at %s:%d?", args->sm_hdr->id[BCF_DT_CTG][sm_line->rid].key, sm_line->pos+1);
            npl /= bcf_hdr_nsamples(args->sm_hdr);
        }
        else
            npl = fake_PLs(args, args->sm_hdr, sm_line);

        // Calculate likelihoods for all samples, assuming diploid genotypes

        // For faster access to genotype likelihoods (PLs) of the query sample.
        // After the loop max_ipl is the number of PL slots before the vector end.
        int max_ipl, *pl_ptr = args->pl_arr + query_isample*npl;
        double sum_pl = 0; // for converting PLs to probs
        for (max_ipl=0; max_ipl<npl; max_ipl++)
        {
            if ( pl_ptr[max_ipl]==bcf_int32_vector_end ) break;
            if ( pl_ptr[max_ipl]==bcf_int32_missing ) continue;
            sum_pl += pow(10, -0.1*pl_ptr[max_ipl]);
        }
        if ( sum_pl==0 ) continue; // no PLs present

        // The main stats: concordance of the query sample with the target -g samples
        for (i=0; i<bcf_hdr_nsamples(args->gt_hdr); i++)
        {
            int *gt_ptr = gt_arr + i*ngt;
            if ( gt_ptr[1]==bcf_int32_vector_end ) continue;    // skip haploid genotypes
            int a = bcf_gt_allele(gt_ptr[0]);
            int b = bcf_gt_allele(gt_ptr[1]);
            if ( a<0 || b<0 ) continue; // missing genotypes
            if ( args->hom_only && a!=b ) continue; // heterozygous genotype
            int igt_tgt = bcf_alleles2gt(a,b);  // genotype index in the target file
            int igt_qry = gt2ipl[igt_tgt];      // corresponding genotype in query file
            // NOTE(review): init_gt2ipl() is not visible here; the igt_qry<0 test
            // guards against an unmapped genotype being used as an array index.
            if ( igt_qry<0 || igt_qry>=max_ipl || pl_ptr[igt_qry]<0 ) continue;   // genotype not present in query sample: haploid or missing
            args->lks[i] += log(pow(10, -0.1*pl_ptr[igt_qry])/sum_pl);
            args->sites[i]++;
        }

        if ( args->all_sites )
        {
            // Print LKs at all sites for debugging
            int *gt_ptr = gt_arr + tgt_isample*ngt;
            if ( gt_ptr[1]==bcf_int32_vector_end ) continue;    // skip haploid genotypes
            int a = bcf_gt_allele(gt_ptr[0]);
            int b = bcf_gt_allele(gt_ptr[1]);
            if ( args->hom_only && a!=b ) continue; // heterozygous genotype
            fprintf(fp, "SC\t%s\t%d", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, gt_line->pos+1);
            for (i=0; i<gt_line->n_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', gt_line->d.allele[i]);
            fprintf(fp, "\t%s/%s", a>=0 ? gt_line->d.allele[a] : ".", b>=0 ? gt_line->d.allele[b] : ".");

            // PLs of the query sample, one tab-separated column per value
            int igt;
            for (i=0; i<sm_line->n_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', sm_line->d.allele[i]);
            for (igt=0; igt<npl; igt++)
            {
                if ( pl_ptr[igt]==bcf_int32_vector_end ) break;
                // a missing value gets its own column as well
                if ( pl_ptr[igt]==bcf_int32_missing ) fprintf(fp, "\t.");
                else fprintf(fp, "\t%d", pl_ptr[igt]);
            }
            fprintf(fp, "\n");
        }
    }
    free(gt2ipl);
    free(gt_arr);
    // Reset the pointers kept in args together with their capacities so that
    // nothing can reuse or free the released buffers.
    free(args->pl_arr);  args->pl_arr  = NULL; args->npl_arr  = 0;
    free(args->tmp_arr); args->tmp_arr = NULL; args->ntmp_arr = 0;

    // Scale LKs and certainties: divide by the smallest total so that the
    // values become non-negative ratios.
    int nsamples = bcf_hdr_nsamples(args->gt_hdr);
    double min = args->lks[0];
    for (i=1; i<nsamples; i++) if ( min>args->lks[i] ) min = args->lks[i];
    for (i=0; i<nsamples; i++) args->lks[i] = min ? args->lks[i] / min : 0;
    double max_avg = args->sites[0] ? args->lks[0]/args->sites[0] : 0;
    for (i=1; i<nsamples; i++)
    {
        double val = args->sites[i] ? args->lks[i]/args->sites[i] : 0;
        if ( max_avg<val ) max_avg = val;
    }

    // Sorted output. Pointers into args->lks are sorted so that the sample
    // index can be recovered as p[i]-args->lks.
    double **p = (double**) malloc(sizeof(*p)*nsamples);
    for (i=0; i<nsamples; i++) p[i] = &args->lks[i];
    qsort(p, nsamples, sizeof(*p), cmp_doubleptr);

    fprintf(fp, "# [1]CN\t[2]Concordance with %s (total)\t[3]Concordance (average)\t[4]Number of sites compared\t[5]Sample\t[6]Sample ID\n", args->sm_hdr->samples[query_isample]);
    for (i=0; i<nsamples; i++)
    {
        int idx = p[i] - args->lks;
        double avg = args->sites[idx] ? args->lks[idx]/args->sites[idx] : 0;
        fprintf(fp, "CN\t%e\t%e\t%.0f\t%s\t%d\n", 1-args->lks[idx], 1-avg/max_avg, args->sites[idx], args->gt_hdr->samples[idx], i);
    }
    free(p);

    if ( args->plot )
    {
        fclose(fp);
        plot_check(args, args->target_sample ? args->target_sample : "", args->sm_hdr->samples[query_isample]);
    }
}
int main(int argc, char **argv) { if ( argc!=2 ) { fprintf(stderr,"Usage: test-vcf-sweep <file.bcf|file.vcf>\n"); return 1; } // Init variables. The checksum is just for this test program to output // something and verify that all sites are read in both passes - fwd and // bwd. bcf_sweep_t *sw = bcf_sweep_init(argv[1]); bcf_hdr_t *hdr = bcf_sweep_hdr(sw); int chksum = 0; // First we must sweep forward and read the whole file to build an index. // If this is undesirable, we can require the presence of a .gzi index // which can be created with `bgzip -r` from the samtools/htslib package bcf1_t *rec; while ( (rec = bcf_sweep_fwd(sw)) ) chksum += rec->pos+1; printf("fwd position chksum: %d\n", chksum); // Now sweep backward. chksum = 0; while ( (rec = bcf_sweep_bwd(sw)) ) chksum += rec->pos+1; printf("bwd position chksum: %d\n", chksum); // And forward and backward again, this time summing the PL vectors int i,j, mPLs = 0, nPLs; int32_t *PLs = NULL; chksum = 0; while ( (rec = bcf_sweep_fwd(sw)) ) { // get copy of the PL vectors nPLs = bcf_get_format_int32(hdr, rec, "PL", &PLs, &mPLs); if ( !nPLs ) continue; // PL not present // how many values are there per sample int nvals = nPLs / bcf_hdr_nsamples(hdr); int32_t *ptr = PLs; for (i=0; i<bcf_hdr_nsamples(hdr); i++) { for (j=0; j<nvals; j++) { // check for shorter vectors (haploid genotypes amongst diploids) if ( ptr[j]==bcf_int32_vector_end ) break; // skip missing values if ( ptr[j]==bcf_int32_missing ) continue; chksum += ptr[j]; } ptr += nvals; } } printf("fwd PL chksum: %d\n", chksum); // And the same backwards.. 
chksum = 0; while ( (rec = bcf_sweep_bwd(sw)) ) { nPLs = bcf_get_format_int32(hdr, rec, "PL", &PLs, &mPLs); if ( !nPLs ) continue; int nvals = nPLs / bcf_hdr_nsamples(hdr); int32_t *ptr = PLs; for (i=0; i<bcf_hdr_nsamples(hdr); i++) { for (j=0; j<nvals; j++) { if ( ptr[j]==bcf_int32_vector_end ) break; if ( ptr[j]==bcf_int32_missing ) continue; chksum += ptr[j]; } ptr += nvals; } } printf("bwd PL chksum: %d\n", chksum); // Clean up bcf_sweep_destroy(sw); return 0; }
/************************** * PROCESS INPUT VCF FILE * **************************/ void vcf2raw(char **filename, char **out_filename, char **cross, int *n_parent1, char **parent1, int *n_parent2, char **parent2, double *min_class) { // We assume the input file exists (checked in R) bcf_sweep_t *in_vcf = bcf_sweep_init(*filename); if (in_vcf == NULL) { bcf_sweep_destroy(in_vcf); error("Could not parse input VCF file."); } bcf_hdr_t *vcf_hdr = bcf_sweep_hdr(in_vcf); // Get reference sequence IDs int n_seq = 0; const char **seq_names = NULL; seq_names = bcf_hdr_seqnames(vcf_hdr, &n_seq); if (seq_names == NULL || n_seq == 0) { free(seq_names); error("Could not correctly parse sequence names in VCF file. Is the input file tabix indexed?\n"); } // Map parent names to sample indices int idx_parent1[*n_parent1]; int idx_parent2[*n_parent2]; get_parents_idx(*n_parent1, idx_parent1, *n_parent2, idx_parent2, vcf_hdr, parent1, parent2); // Get progeny sample indices (all samples that are not set as parents) int n_samples = bcf_hdr_nsamples(vcf_hdr); int n_progeny = n_samples - *n_parent1 - *n_parent2; if (n_progeny == 0) { error("Input file must contain at least one progeny individual."); } int idx_progeny[n_progeny]; int i = 0, s; for (s = 0; s < n_samples; s++) { if (!is_val_in_arr(s, idx_parent1, *n_parent1)) { if (!is_val_in_arr(s, idx_parent2, *n_parent2)) { idx_progeny[i++] = s; } } } // Minimum count to assign parent genotype int min_class_parent1 = (int)ceil(*min_class * *n_parent1); int min_class_parent2 = (int)ceil(*min_class * *n_parent2); // Convert cross type int cross_type = get_cross_type(cross); // We need to write to a temporary file, because the number of markers in the header is unknown FILE *temp_f; char temp_filename[] = "tmp_raw_XXXXXX"; int temp_fd; temp_fd = mkstemp(temp_filename); if (temp_fd == -1) { error("Could not open temporary output file.\n"); } unlink(temp_filename); temp_f = fdopen(temp_fd, "w+"); if (temp_f == NULL) { error("Could not open 
temporary output file.\n"); } // CHROM and POS fields will be placed at the end of the output file int marker_count = 0; int * chrom = malloc(MAX_VARIANTS * sizeof(int)); if (chrom == NULL) { error("Could not allocate vector.\n"); } int * pos = malloc(MAX_VARIANTS * sizeof(int)); if (pos == NULL) { error("Could not allocate vector.\n"); } // Mapping of VCF genotypes to ONEMAP genotypes const char * const D_BC_ref[GT_TYPES_LEN] = { "a", "-", "ab", "-", "-", "-", "-" }; const char * const D_BC_alt[GT_TYPES_LEN] = { "-", "a", "ab", "-", "-", "-", "-" }; const char * const RI_ref[GT_TYPES_LEN] = { "a", "b", "-", "-", "-", "-", "-" }; const char * const RI_alt[GT_TYPES_LEN] = { "b", "a", "-", "-", "-", "-", "-" }; const char * const B3_F2_ref[GT_TYPES_LEN] = { "a", "b", "ab", "-", "-", "-", "-" }; const char * const B3_F2_alt[GT_TYPES_LEN] = { "b", "a", "ab", "-", "-", "-", "-" }; // Scan all records in VCF file and print valid markers to output bcf1_t *record; int32_t *GTs = NULL; int nGT_arr = 0; while ((record = bcf_sweep_fwd(in_vcf)) && marker_count < MAX_VARIANTS) { // We only consider biallelic SNP and INDEL markers int var_type = bcf_get_variant_types(record); if ((var_type == VCF_SNP || var_type == VCF_INDEL) && record->n_allele == 2) { int nGTs = bcf_get_format_int32(vcf_hdr, record, "GT", >s, &nGT_arr); // We only consider diploid variants (number of alleles in genotypes == 2) nGTs /= n_samples; if (nGTs == 2) { bcf_fmt_t *fmt_ptr = bcf_get_fmt(vcf_hdr, record, "GT"); // First, check which parents are heterozygous or homozygous (REF or ALT allele) bool is_het_parent1 = false, is_hom_ref_parent1 = false, is_hom_alt_parent1 = false; get_consensus_parent_gt(fmt_ptr, *n_parent1, idx_parent1, min_class_parent1, &is_het_parent1, &is_hom_ref_parent1, &is_hom_alt_parent1); bool is_het_parent2 = false, is_hom_ref_parent2 = false, is_hom_alt_parent2 = false; get_consensus_parent_gt(fmt_ptr, *n_parent2, idx_parent2, min_class_parent2, &is_het_parent2, 
&is_hom_ref_parent2, &is_hom_alt_parent2); // Convert to appropriate marker type char marker_type[MARKER_TYPE_LEN]; int type = get_marker_type(marker_type, cross_type, is_het_parent1, is_hom_ref_parent1, is_hom_alt_parent1, is_het_parent2, is_hom_ref_parent2, is_hom_alt_parent2); const char * const(*type_ptr)[GT_TYPES_LEN]; bool valid_marker = true; switch(type) { case marker_B3: case marker_F2_ref: type_ptr = &B3_F2_ref; break; case marker_F2_alt: type_ptr = &B3_F2_alt; break; case marker_D_ref: case marker_BC_ref: type_ptr = &D_BC_ref; break; case marker_D_alt: case marker_BC_alt: type_ptr = &D_BC_alt; break; case marker_RI_ref: type_ptr = &RI_ref; break; case marker_RI_alt: type_ptr = &RI_alt; break; default: valid_marker = false; } if (valid_marker) { // Store CHROM and POS fields for valid markers chrom[marker_count] = record->rid; pos[marker_count] = record->pos + 1; // Check if marker name exists; if negative, create one char *marker_name = record->d.id; if (!strcmp(marker_name, ".")) { sprintf(marker_name, "%s.%d", seq_names[chrom[marker_count]], pos[marker_count]); } // Output variant in ONEMAP format to temporary file print_record(temp_f, marker_name, marker_type, fmt_ptr, n_progeny, idx_progeny, type_ptr); marker_count++; } } } } // Write final output file header FILE *final_f = fopen(*out_filename, "w"); if (final_f == NULL) { error("Could not open output file.\n"); } fprintf(final_f, "data type %s\n", *cross); // The next header line contains the following information: number of individuals, number of markers, 1 for the presence of CHROM information, 1 for the presence of POS information and 0 for the absence of phenotypes (these need to be manually included later) fprintf(final_f, "%d %d 1 1 0\n", n_progeny, marker_count); // The next header line contains the sample names char *cur_sample_name = vcf_hdr->samples[idx_progeny[0]]; fprintf(final_f, "%s", cur_sample_name); for (i = 1; i < n_progeny; i++) { cur_sample_name = 
vcf_hdr->samples[idx_progeny[i]]; fprintf(final_f, "\t%s", cur_sample_name); } fprintf(final_f, "\n"); // Copy marker data from temporary file to final file rewind(temp_f); char buf[BUFSIZ]; size_t size; while ((size = fread(buf, 1, BUFSIZ, temp_f))) { fwrite(buf, 1, size, final_f); } // Write CHROM and POS data to output file if (marker_count) { fprintf(final_f, "*CHROM\t"); fprintf(final_f, "%s", seq_names[chrom[0]]); for (i = 1; i < marker_count; i++) { fprintf(final_f, " %s", seq_names[chrom[i]]); } fprintf(final_f, "\n*POS\t"); fprintf(final_f, "%d", pos[0]); for (i = 1; i < marker_count; i++) { fprintf(final_f, " %d", pos[i]); } } // Clean-up free(chrom); free(pos); free(GTs); bcf_sweep_destroy(in_vcf); fclose(temp_f); close(temp_fd); fclose(final_f); }
/*
 *  parse_line() - get the ALT allele frequency and P(D|G) of the sample
 *                 args->ismpl at one site
 *
 *  alt_freq  .. set to the ALT allele frequency, taken from (in this order of
 *               precedence) the INFO tag args->af_tag, the file args->af_fname,
 *               INFO/AC and INFO/AN, or estimate_AF(). A frequency of zero is
 *               replaced by args->dflt_AF.
 *  pdg       .. three values summing to one, for the genotypes hom-ref, het
 *               and hom-alt. Derived from GT when args->fake_PLs is set,
 *               otherwise from the first three values of FORMAT/PL.
 *
 *  Returns 0 on success and a negative value when the site cannot be used.
 */
int parse_line(args_t *args, bcf1_t *line, double *alt_freq, double *pdg)
{
    args->nitmp = 0;

    // Set allele frequency
    int ret;
    if ( args->af_tag )
    {
        // Use an INFO tag provided by the user. With more than one value the
        // first one is taken, the same way AC is read below; otherwise
        // *alt_freq would be left unset while the site is still processed.
        ret = bcf_get_info_float(args->hdr, line, args->af_tag, &args->AFs, &args->mAFs);
        if ( ret>0 )
            *alt_freq = args->AFs[0];
        if ( ret==-2 )
            error("Type mismatch for INFO/%s tag at %s:%d\n", args->af_tag, bcf_seqname(args->hdr,line), line->pos+1);
    }
    else if ( args->af_fname )
    {
        // Read AF from a file
        ret = read_AF(args->files->targets, line, alt_freq);
    }
    else
    {
        // Use GTs or AC/AN: GTs when AC/AN not present or when GTs explicitly requested by --estimate-AF
        ret = -1;
        if ( !args->estimate_AF )
        {
            int AC = -1, AN = 0;
            ret = bcf_get_info_int32(args->hdr, line, "AN", &args->itmp, &args->mitmp);
            if ( ret==1 )
            {
                AN = args->itmp[0];
                ret = bcf_get_info_int32(args->hdr, line, "AC", &args->itmp, &args->mitmp);
                if ( ret>0 )
                    AC = args->itmp[0];
            }
            if ( AN<=0 || AC<0 )
                ret = -1;
            else
                *alt_freq = (double) AC/AN;
        }
        if ( ret==-1 )
            ret = estimate_AF(args, line, alt_freq);    // reads GTs into args->itmp
    }

    if ( ret<0 ) return ret;
    if ( *alt_freq==0.0 )
    {
        if ( args->dflt_AF==0 ) return -1;       // we skip sites with AF=0
        *alt_freq = args->dflt_AF;
    }

    // Set P(D|G)
    if ( args->fake_PLs )
    {
        // args->nitmp is non-zero only if the genotypes were already read above
        if ( !args->nitmp )
        {
            args->nitmp = bcf_get_genotypes(args->hdr, line, &args->itmp, &args->mitmp);
            if ( args->nitmp != 2*args->nsmpl ) return -1;     // not diploid?
            args->nitmp /= args->nsmpl;
        }

        int32_t *gt = &args->itmp[args->ismpl*args->nitmp];
        if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) return -1;
        if ( gt[1]==bcf_int32_vector_end ) return -1;   // haploid genotype, the second slot is padding

        int a = bcf_gt_allele(gt[0]);
        int b = bcf_gt_allele(gt[1]);
        if ( a!=b )
        {
            // het
            pdg[0] = pdg[2] = args->unseen_PL;
            pdg[1] = 1 - 2*args->unseen_PL;
        }
        else if ( a==0 )
        {
            // hom-ref
            pdg[0] = 1 - 2*args->unseen_PL;
            pdg[1] = pdg[2] = args->unseen_PL;
        }
        else
        {
            // hom-alt
            pdg[0] = pdg[1] = args->unseen_PL;
            pdg[2] = 1 - 2*args->unseen_PL;
        }
    }
    else
    {
        args->nitmp = bcf_get_format_int32(args->hdr, line, "PL", &args->itmp, &args->mitmp);
        if ( args->nitmp<=0 ) return -1;    // PL not present

        // n_allele*(n_allele+1) is always even, the division is exact
        int npl_diploid = line->n_allele*(line->n_allele+1)/2;
        if ( args->nitmp != args->nsmpl*npl_diploid ) return -1;    // not diploid?
        args->nitmp /= args->nsmpl;

        // Three values per sample are read below; a site without an ALT
        // allele has a single PL only.
        if ( args->nitmp<3 ) return -1;

        int32_t *pl = &args->itmp[args->ismpl*args->nitmp];
        int k;
        for (k=0; k<3; k++)
        {
            // Negative values (this includes the missing and vector-end
            // markers) cannot be used as an index into pl2p
            if ( pl[k]<0 ) return -1;

            // NOTE(review): pl2p is assumed to hold 256 entries, as the <256
            // bound implies, and to decrease with growing PL; a PL beyond the
            // table is given the last, smallest entry - confirm.
            pdg[k] = pl[k] < 256 ? args->pl2p[ pl[k] ] : args->pl2p[255];
        }

        double sum = pdg[0] + pdg[1] + pdg[2];
        if ( !sum ) return -1;
        pdg[0] /= sum;
        pdg[1] /= sum;
        pdg[2] /= sum;
    }

    return 0;
}
/*
 *  process_region_guess() - count het/hom/missing genotypes per sample in one region
 *
 *  args  .. program state; args->guess selects the data source (GT, PL or GL)
 *  seq   .. sequence name, used only when itr is given
 *  itr   .. current region. When NULL, the background region given as a
 *           region string in args->background is processed and the counts go
 *           to args->bg_counts instead of the region's own stats.
 *
 *  Returns the number of consecutive regions of the iterator which share the
 *  same start and end coordinates (1 when itr is NULL), also when the
 *  sequence is not present in the file.
 */
int process_region_guess(args_t *args, char *seq, regitr_t *itr)
{
    int kitr = 1;
    uint32_t start = 0, end = INT_MAX;
    reg_stats_t *stats = NULL;

    // set the start and the end position
    if ( itr )
    {
        start = itr->reg[itr->i].start;
        end   = itr->reg[itr->i].end;

        // flush all records with the same coordinates
        while ( itr->i+kitr<itr->n && start==itr->reg[itr->i+kitr].start && end==itr->reg[itr->i+kitr].end ) kitr++;

        int min,max,ret = ploidy_query(args->ploidy, seq, start, args->sex2ploidy, &min, &max);
        assert(ret);
        stats = expand_regs(args, seq,start,end);
    }
    else
    {
        // background region: split args->background into the sequence name
        // (copied into a temporary buffer) and the coordinates
        int spos, epos;
        const char *ptr = hts_parse_reg(args->background, &spos, &epos);
        if ( !ptr )
            error("Could not parse the region: %s\n", args->background);
        seq = (char*) malloc(ptr - args->background + 1);
        memcpy(seq,args->background,ptr-args->background);
        seq[ptr-args->background] = 0;
        start = spos;
        end   = epos;
    }

    if ( bcf_sr_seek(args->sr,seq,start)!=0 )
    {
        // sequence not present
        if ( !itr ) free(seq);
        return kitr;
    }

    int ismpl, rid = bcf_hdr_name2id(args->hdr,seq);
    if ( !itr ) free(seq);  // the copy made above is no longer needed

    while ( bcf_sr_next_line(args->sr) )
    {
        bcf1_t *rec = bcf_sr_get_line(args->sr,0);
        if ( rec->rid!=rid || rec->pos > end ) break;

        if ( args->guess & GUESS_GT )   // use GTs to guess the ploidy
        {
            bcf_fmt_t *fmt = bcf_get_fmt(args->hdr, rec, "GT");
            if ( !fmt ) continue;
            for (ismpl=0; ismpl<args->nsample; ismpl++)
            {
                count_t *counts = stats ? &stats->counts[ismpl] : &args->bg_counts[ismpl];
                int gt = bcf_gt_type(fmt, ismpl, NULL,NULL);
                if ( gt==GT_UNKN ) counts->nmiss++;
                else if ( gt==GT_HET_RA || gt==GT_HET_AA ) counts->nhet++;
                else counts->nhom++;
            }
        }
        else    // use PLs to guess the ploidy
        {
            // GL values are negated so that, like for PL, smaller means more likely.
            // NOTE(review): GL is read with the int32 getter here; confirm this
            // works for headers which declare GL as Float.
            int gl2pl = args->guess & GUESS_PL ? 1 : -1;
            int npl = bcf_get_format_int32(args->hdr,rec,args->guess&GUESS_PL?"PL":"GL",&args->pls,&args->npls);
            if ( npl<=0 ) continue;
            npl /= args->nsample;
            for (ismpl=0; ismpl<args->nsample; ismpl++)
            {
                int32_t *ptr = args->pls + ismpl*npl;
                int phom = INT_MAX, phet = INT_MAX, ial, jal, k = 0;

                // Walk the values in the diploid genotype order; k counts how
                // many were consumed. The k>=npl test keeps the walk inside
                // this sample's values: when every sample of the record is
                // haploid there are only n_allele values per sample and no
                // vector-end marker to stop at.
                for (ial=0; ial<rec->n_allele; ial++)
                {
                    for (jal=0; jal<ial; jal++)
                    {
                        if ( k>=npl || ptr[k] == bcf_int32_missing || ptr[k] == bcf_int32_vector_end ) break;
                        ptr[k] *= gl2pl;
                        if ( phet > ptr[k] ) phet = ptr[k];
                        k++;
                    }
                    if ( k>=npl || ptr[k] == bcf_int32_missing || ptr[k] == bcf_int32_vector_end ) break;
                    ptr[k] *= gl2pl;
                    if ( phom > ptr[k] ) phom = ptr[k];
                    k++;
                }
                count_t *counts = stats ? &stats->counts[ismpl] : &args->bg_counts[ismpl];
                if ( k == rec->n_allele ) counts->nhom++;    // haploid
                else if ( phet == phom || k != rec->n_allele*(rec->n_allele+1)/2 ) counts->nmiss++;
                else if ( phet < phom ) counts->nhet++;
                else counts->nhom++;
            }
        }
    }
    return kitr;
}