int main(int argc, char **argv) { if ( argc!=2 ) { fprintf(stderr,"Usage: test-vcf-sweep <file.bcf|file.vcf>\n"); return 1; } // Init variables. The checksum is just for this test program to output // something and verify that all sites are read in both passes - fwd and // bwd. bcf_sweep_t *sw = bcf_sweep_init(argv[1]); bcf_hdr_t *hdr = bcf_sweep_hdr(sw); int chksum = 0; // First we must sweep forward and read the whole file to build an index. // If this is undesirable, we can require the presence of a .gzi index // which can be created with `bgzip -r` from the samtools/htslib package bcf1_t *rec; while ( (rec = bcf_sweep_fwd(sw)) ) chksum += rec->pos+1; printf("fwd position chksum: %d\n", chksum); // Now sweep backward. chksum = 0; while ( (rec = bcf_sweep_bwd(sw)) ) chksum += rec->pos+1; printf("bwd position chksum: %d\n", chksum); // And forward and backward again, this time summing the PL vectors int i,j, mPLs = 0, nPLs; int32_t *PLs = NULL; chksum = 0; while ( (rec = bcf_sweep_fwd(sw)) ) { // get copy of the PL vectors nPLs = bcf_get_format_int32(hdr, rec, "PL", &PLs, &mPLs); if ( !nPLs ) continue; // PL not present // how many values are there per sample int nvals = nPLs / bcf_hdr_nsamples(hdr); int32_t *ptr = PLs; for (i=0; i<bcf_hdr_nsamples(hdr); i++) { for (j=0; j<nvals; j++) { // check for shorter vectors (haploid genotypes amongst diploids) if ( ptr[j]==bcf_int32_vector_end ) break; // skip missing values if ( ptr[j]==bcf_int32_missing ) continue; chksum += ptr[j]; } ptr += nvals; } } printf("fwd PL chksum: %d\n", chksum); // And the same backwards.. chksum = 0; while ( (rec = bcf_sweep_bwd(sw)) ) { nPLs = bcf_get_format_int32(hdr, rec, "PL", &PLs, &mPLs); if ( !nPLs ) continue; int nvals = nPLs / bcf_hdr_nsamples(hdr); int32_t *ptr = PLs; for (i=0; i<bcf_hdr_nsamples(hdr); i++) { for (j=0; j<nvals; j++) { if ( ptr[j]==bcf_int32_vector_end ) break; if ( ptr[j]==bcf_int32_missing ) continue; chksum += ptr[j]; } ptr += nvals; } } printf("bwd PL chksum: %d\n", chksum); // Clean up bcf_sweep_destroy(sw); return 0; }
/************************** * PROCESS INPUT VCF FILE * **************************/ void vcf2raw(char **filename, char **out_filename, char **cross, int *n_parent1, char **parent1, int *n_parent2, char **parent2, double *min_class) { // We assume the input file exists (checked in R) bcf_sweep_t *in_vcf = bcf_sweep_init(*filename); if (in_vcf == NULL) { bcf_sweep_destroy(in_vcf); error("Could not parse input VCF file."); } bcf_hdr_t *vcf_hdr = bcf_sweep_hdr(in_vcf); // Get reference sequence IDs int n_seq = 0; const char **seq_names = NULL; seq_names = bcf_hdr_seqnames(vcf_hdr, &n_seq); if (seq_names == NULL || n_seq == 0) { free(seq_names); error("Could not correctly parse sequence names in VCF file. Is the input file tabix indexed?\n"); } // Map parent names to sample indices int idx_parent1[*n_parent1]; int idx_parent2[*n_parent2]; get_parents_idx(*n_parent1, idx_parent1, *n_parent2, idx_parent2, vcf_hdr, parent1, parent2); // Get progeny sample indices (all samples that are not set as parents) int n_samples = bcf_hdr_nsamples(vcf_hdr); int n_progeny = n_samples - *n_parent1 - *n_parent2; if (n_progeny == 0) { error("Input file must contain at least one progeny individual."); } int idx_progeny[n_progeny]; int i = 0, s; for (s = 0; s < n_samples; s++) { if (!is_val_in_arr(s, idx_parent1, *n_parent1)) { if (!is_val_in_arr(s, idx_parent2, *n_parent2)) { idx_progeny[i++] = s; } } } // Minimum count to assign parent genotype int min_class_parent1 = (int)ceil(*min_class * *n_parent1); int min_class_parent2 = (int)ceil(*min_class * *n_parent2); // Convert cross type int cross_type = get_cross_type(cross); // We need to write to a temporary file, because the number of markers in the header is unknown FILE *temp_f; char temp_filename[] = "tmp_raw_XXXXXX"; int temp_fd; temp_fd = mkstemp(temp_filename); if (temp_fd == -1) { error("Could not open temporary output file.\n"); } unlink(temp_filename); temp_f = fdopen(temp_fd, "w+"); if (temp_f == NULL) { error("Could not open temporary output file.\n"); } // CHROM and POS fields will be placed at the end of the output file int marker_count = 0; int * chrom = malloc(MAX_VARIANTS * sizeof(int)); if (chrom == NULL) { error("Could not allocate vector.\n"); } int * pos = malloc(MAX_VARIANTS * sizeof(int)); if (pos == NULL) { error("Could not allocate vector.\n"); } // Mapping of VCF genotypes to ONEMAP genotypes const char * const D_BC_ref[GT_TYPES_LEN] = { "a", "-", "ab", "-", "-", "-", "-" }; const char * const D_BC_alt[GT_TYPES_LEN] = { "-", "a", "ab", "-", "-", "-", "-" }; const char * const RI_ref[GT_TYPES_LEN] = { "a", "b", "-", "-", "-", "-", "-" }; const char * const RI_alt[GT_TYPES_LEN] = { "b", "a", "-", "-", "-", "-", "-" }; const char * const B3_F2_ref[GT_TYPES_LEN] = { "a", "b", "ab", "-", "-", "-", "-" }; const char * const B3_F2_alt[GT_TYPES_LEN] = { "b", "a", "ab", "-", "-", "-", "-" }; // Scan all records in VCF file and print valid markers to output bcf1_t *record; int32_t *GTs = NULL; int nGT_arr = 0; while ((record = bcf_sweep_fwd(in_vcf)) && marker_count < MAX_VARIANTS) { // We only consider biallelic SNP and INDEL markers int var_type = bcf_get_variant_types(record); if ((var_type == VCF_SNP || var_type == VCF_INDEL) && record->n_allele == 2) { int nGTs = bcf_get_format_int32(vcf_hdr, record, "GT", >s, &nGT_arr); // We only consider diploid variants (number of alleles in genotypes == 2) nGTs /= n_samples; if (nGTs == 2) { bcf_fmt_t *fmt_ptr = bcf_get_fmt(vcf_hdr, record, "GT"); // First, check which parents are heterozygous or homozygous (REF or ALT allele) bool is_het_parent1 = false, is_hom_ref_parent1 = false, is_hom_alt_parent1 = false; get_consensus_parent_gt(fmt_ptr, *n_parent1, idx_parent1, min_class_parent1, &is_het_parent1, &is_hom_ref_parent1, &is_hom_alt_parent1); bool is_het_parent2 = false, is_hom_ref_parent2 = false, is_hom_alt_parent2 = false; get_consensus_parent_gt(fmt_ptr, *n_parent2, idx_parent2, min_class_parent2, &is_het_parent2, &is_hom_ref_parent2, &is_hom_alt_parent2); // Convert to appropriate marker type char marker_type[MARKER_TYPE_LEN]; int type = get_marker_type(marker_type, cross_type, is_het_parent1, is_hom_ref_parent1, is_hom_alt_parent1, is_het_parent2, is_hom_ref_parent2, is_hom_alt_parent2); const char * const(*type_ptr)[GT_TYPES_LEN]; bool valid_marker = true; switch(type) { case marker_B3: case marker_F2_ref: type_ptr = &B3_F2_ref; break; case marker_F2_alt: type_ptr = &B3_F2_alt; break; case marker_D_ref: case marker_BC_ref: type_ptr = &D_BC_ref; break; case marker_D_alt: case marker_BC_alt: type_ptr = &D_BC_alt; break; case marker_RI_ref: type_ptr = &RI_ref; break; case marker_RI_alt: type_ptr = &RI_alt; break; default: valid_marker = false; } if (valid_marker) { // Store CHROM and POS fields for valid markers chrom[marker_count] = record->rid; pos[marker_count] = record->pos + 1; // Check if marker name exists; if negative, create one char *marker_name = record->d.id; if (!strcmp(marker_name, ".")) { sprintf(marker_name, "%s.%d", seq_names[chrom[marker_count]], pos[marker_count]); } // Output variant in ONEMAP format to temporary file print_record(temp_f, marker_name, marker_type, fmt_ptr, n_progeny, idx_progeny, type_ptr); marker_count++; } } } } // Write final output file header FILE *final_f = fopen(*out_filename, "w"); if (final_f == NULL) { error("Could not open output file.\n"); } fprintf(final_f, "data type %s\n", *cross); // The next header line contains the following information: number of individuals, number of markers, 1 for the presence of CHROM information, 1 for the presence of POS information and 0 for the absence of phenotypes (these need to be manually included later) fprintf(final_f, "%d %d 1 1 0\n", n_progeny, marker_count); // The next header line contains the sample names char *cur_sample_name = vcf_hdr->samples[idx_progeny[0]]; fprintf(final_f, "%s", cur_sample_name); for (i = 1; i < n_progeny; i++) { cur_sample_name = vcf_hdr->samples[idx_progeny[i]]; fprintf(final_f, "\t%s", cur_sample_name); } fprintf(final_f, "\n"); // Copy marker data from temporary file to final file rewind(temp_f); char buf[BUFSIZ]; size_t size; while ((size = fread(buf, 1, BUFSIZ, temp_f))) { fwrite(buf, 1, size, final_f); } // Write CHROM and POS data to output file if (marker_count) { fprintf(final_f, "*CHROM\t"); fprintf(final_f, "%s", seq_names[chrom[0]]); for (i = 1; i < marker_count; i++) { fprintf(final_f, " %s", seq_names[chrom[i]]); } fprintf(final_f, "\n*POS\t"); fprintf(final_f, "%d", pos[0]); for (i = 1; i < marker_count; i++) { fprintf(final_f, " %d", pos[i]); } } // Clean-up free(chrom); free(pos); free(GTs); bcf_sweep_destroy(in_vcf); fclose(temp_f); close(temp_fd); fclose(final_f); }