Пример #1
0
int main(int argc, char **argv)
{
    if ( argc!=2 ) 
    {
        fprintf(stderr,"Usage: test-vcf-sweep <file.bcf|file.vcf>\n");
        return 1;
    }

    // Init variables. The checksum is just for this test program to output
    // something and verify that all sites are read in both passes - fwd and
    // bwd.
    bcf_sweep_t *sw = bcf_sweep_init(argv[1]);
    bcf_hdr_t *hdr  = bcf_sweep_hdr(sw);
    int chksum = 0;

    // First we must sweep forward and read the whole file to build an index.
    // If this is undesirable, we can require the presence of a .gzi index
    // which can be created with `bgzip -r` from the samtools/htslib package
    bcf1_t *rec;
    while ( (rec = bcf_sweep_fwd(sw)) ) chksum += rec->pos+1;
    printf("fwd position chksum: %d\n", chksum);

    // Now sweep backward. 
    chksum = 0;
    while ( (rec = bcf_sweep_bwd(sw)) ) chksum += rec->pos+1;
    printf("bwd position chksum: %d\n", chksum);

    // And forward and backward again, this time summing the PL vectors
    int i,j, mPLs = 0, nPLs;
    int32_t *PLs = NULL;
    chksum = 0;
    while ( (rec = bcf_sweep_fwd(sw)) ) 
    {
        // get copy of the PL vectors
        nPLs = bcf_get_format_int32(hdr, rec, "PL", &PLs, &mPLs);
        if ( !nPLs ) continue;  // PL not present

        // how many values are there per sample
        int nvals = nPLs / bcf_hdr_nsamples(hdr);

        int32_t *ptr = PLs;
        for (i=0; i<bcf_hdr_nsamples(hdr); i++)
        {
            for (j=0; j<nvals; j++)
            {
                // check for shorter vectors (haploid genotypes amongst diploids)
                if ( ptr[j]==bcf_int32_vector_end ) break;

                // skip missing values
                if ( ptr[j]==bcf_int32_missing ) continue;

                chksum += ptr[j];
            }
            ptr += nvals;
        }
    }
    printf("fwd PL chksum: %d\n", chksum);

    // And the same backwards..
    chksum = 0;
    while ( (rec = bcf_sweep_bwd(sw)) )
    {
        nPLs = bcf_get_format_int32(hdr, rec, "PL", &PLs, &mPLs);
        if ( !nPLs ) continue;
        int nvals = nPLs / bcf_hdr_nsamples(hdr);
        int32_t *ptr = PLs;
        for (i=0; i<bcf_hdr_nsamples(hdr); i++)
        {
            for (j=0; j<nvals; j++)
            {
                if ( ptr[j]==bcf_int32_vector_end ) break;
                if ( ptr[j]==bcf_int32_missing ) continue;
                chksum += ptr[j];
            }
            ptr += nvals;
        }
    }
    printf("bwd PL chksum: %d\n", chksum);

    // Clean up
    bcf_sweep_destroy(sw);
    return 0;
}
Пример #2
0
/**************************
 * PROCESS INPUT VCF FILE *
 **************************/
void vcf2raw(char **filename, char **out_filename, char **cross, int *n_parent1,
             char **parent1, int *n_parent2, char **parent2, double *min_class) {
  // We assume the input file exists (checked in R)
  bcf_sweep_t *in_vcf = bcf_sweep_init(*filename);
  if (in_vcf == NULL) {
    bcf_sweep_destroy(in_vcf);
    error("Could not parse input VCF file.");
  }
  bcf_hdr_t *vcf_hdr = bcf_sweep_hdr(in_vcf);

  // Get reference sequence IDs
  int n_seq = 0;
  const char **seq_names = NULL;
  seq_names = bcf_hdr_seqnames(vcf_hdr, &n_seq);
  if (seq_names == NULL || n_seq == 0) {
    free(seq_names);
    error("Could not correctly parse sequence names in VCF file. Is the input file tabix indexed?\n");
  }
  
  // Map parent names to sample indices
  int idx_parent1[*n_parent1];
  int idx_parent2[*n_parent2];
  get_parents_idx(*n_parent1, idx_parent1, *n_parent2, idx_parent2, vcf_hdr, parent1, parent2);

  // Get progeny sample indices (all samples that are not set as parents)
  int n_samples = bcf_hdr_nsamples(vcf_hdr);
  int n_progeny = n_samples - *n_parent1 - *n_parent2;
  if (n_progeny == 0) {
    error("Input file must contain at least one progeny individual.");
  }
  int idx_progeny[n_progeny];
  int i = 0, s;
  for (s = 0; s < n_samples; s++) {
    if (!is_val_in_arr(s, idx_parent1, *n_parent1)) {
      if (!is_val_in_arr(s, idx_parent2, *n_parent2)) {
        idx_progeny[i++] = s;
      }
    }
  }
  
  // Minimum count to assign parent genotype
  int min_class_parent1 = (int)ceil(*min_class * *n_parent1);
  int min_class_parent2 = (int)ceil(*min_class * *n_parent2);

  // Convert cross type
  int cross_type = get_cross_type(cross);

  // We need to write to a temporary file, because the number of markers in the header is unknown
  FILE *temp_f;
  char temp_filename[] = "tmp_raw_XXXXXX";
  int temp_fd;
  temp_fd = mkstemp(temp_filename);
  if (temp_fd == -1) {
    error("Could not open temporary output file.\n");
  }
  unlink(temp_filename);
  temp_f = fdopen(temp_fd, "w+");
  if (temp_f == NULL) {
    error("Could not open temporary output file.\n");
  }

  // CHROM and POS fields will be placed at the end of the output file
  int marker_count = 0;
  int * chrom = malloc(MAX_VARIANTS * sizeof(int));
  if (chrom == NULL) {
    error("Could not allocate vector.\n");
  }
  int * pos = malloc(MAX_VARIANTS * sizeof(int));
  if (pos == NULL) {
    error("Could not allocate vector.\n");
  }

  // Mapping of VCF genotypes to ONEMAP genotypes
  const char * const D_BC_ref[GT_TYPES_LEN] = { "a", "-", "ab", "-", "-", "-", "-" };
  const char * const D_BC_alt[GT_TYPES_LEN] = { "-", "a", "ab", "-", "-", "-", "-" };
  const char * const RI_ref[GT_TYPES_LEN] = { "a", "b", "-", "-", "-", "-", "-" };
  const char * const RI_alt[GT_TYPES_LEN] = { "b", "a", "-", "-", "-", "-", "-" };
  const char * const B3_F2_ref[GT_TYPES_LEN] = { "a", "b", "ab", "-", "-", "-", "-" };
  const char * const B3_F2_alt[GT_TYPES_LEN] = { "b", "a", "ab", "-", "-", "-", "-" };

  // Scan all records in VCF file and print valid markers to output
  bcf1_t *record;
  int32_t *GTs = NULL;
  int nGT_arr = 0;

  while ((record = bcf_sweep_fwd(in_vcf)) && marker_count < MAX_VARIANTS) {
    // We only consider biallelic SNP and INDEL markers
    int var_type = bcf_get_variant_types(record);
    if ((var_type == VCF_SNP || var_type == VCF_INDEL) && record->n_allele == 2) {
      int nGTs = bcf_get_format_int32(vcf_hdr, record, "GT", &GTs, &nGT_arr);
      // We only consider diploid variants (number of alleles in genotypes == 2)
      nGTs /= n_samples;
      if (nGTs == 2) {

        bcf_fmt_t *fmt_ptr = bcf_get_fmt(vcf_hdr, record, "GT");

        // First, check which parents are heterozygous or homozygous (REF or ALT allele)
        bool is_het_parent1 = false, is_hom_ref_parent1 = false, is_hom_alt_parent1 = false;
        get_consensus_parent_gt(fmt_ptr, *n_parent1, idx_parent1, min_class_parent1, &is_het_parent1,
                                &is_hom_ref_parent1, &is_hom_alt_parent1);
        bool is_het_parent2 = false, is_hom_ref_parent2 = false, is_hom_alt_parent2 = false;
        get_consensus_parent_gt(fmt_ptr, *n_parent2, idx_parent2, min_class_parent2, &is_het_parent2,
                                &is_hom_ref_parent2, &is_hom_alt_parent2);

        // Convert to appropriate marker type
        char marker_type[MARKER_TYPE_LEN];
        int type = get_marker_type(marker_type, cross_type,
                                   is_het_parent1, is_hom_ref_parent1, is_hom_alt_parent1,
                                   is_het_parent2, is_hom_ref_parent2, is_hom_alt_parent2);

        const char * const(*type_ptr)[GT_TYPES_LEN];
        bool valid_marker = true;
        switch(type)
        {
        case marker_B3:
        case marker_F2_ref:
          type_ptr = &B3_F2_ref;
          break;
        case marker_F2_alt:
          type_ptr = &B3_F2_alt;
          break;
        case marker_D_ref:
        case marker_BC_ref:
          type_ptr = &D_BC_ref;
          break;
        case marker_D_alt:
        case marker_BC_alt:
          type_ptr = &D_BC_alt;
          break;
        case marker_RI_ref:
          type_ptr = &RI_ref;
          break;
        case marker_RI_alt:
          type_ptr = &RI_alt;
          break;
        default:
          valid_marker = false;
        }

        if (valid_marker) {
          // Store CHROM and POS fields for valid markers
          chrom[marker_count] = record->rid;
          pos[marker_count] = record->pos + 1;

          // Check if marker name exists; if negative, create one
          char *marker_name = record->d.id;
          if (!strcmp(marker_name, ".")) {
            sprintf(marker_name, "%s.%d", seq_names[chrom[marker_count]], pos[marker_count]);
          }

          // Output variant in ONEMAP format to temporary file
          print_record(temp_f, marker_name, marker_type, fmt_ptr, n_progeny, idx_progeny, type_ptr);

          marker_count++;
        }
      }
    }
  }

  // Write final output file header
  FILE *final_f = fopen(*out_filename, "w");
  if (final_f == NULL) {
    error("Could not open output file.\n");
  }
  fprintf(final_f, "data type %s\n", *cross);
  // The next header line contains the following information: number of individuals, number of markers, 1 for the presence of CHROM information, 1 for the presence of POS information and 0 for the absence of phenotypes (these need to be manually included later)
  fprintf(final_f, "%d %d 1 1 0\n", n_progeny, marker_count);
  // The next header line contains the sample names
  char *cur_sample_name = vcf_hdr->samples[idx_progeny[0]];
  fprintf(final_f, "%s", cur_sample_name);
  for (i = 1; i < n_progeny; i++) {
    cur_sample_name = vcf_hdr->samples[idx_progeny[i]];
    fprintf(final_f, "\t%s", cur_sample_name);
  }
  fprintf(final_f, "\n");
  
  // Copy marker data from temporary file to final file
  rewind(temp_f);
  char buf[BUFSIZ];
  size_t size;
  while ((size = fread(buf, 1, BUFSIZ, temp_f))) {
    fwrite(buf, 1, size, final_f);
  }

  // Write CHROM and POS data to output file
  if (marker_count) {
    fprintf(final_f, "*CHROM\t");
    fprintf(final_f, "%s", seq_names[chrom[0]]);
    for (i = 1; i < marker_count; i++) {
      fprintf(final_f, " %s", seq_names[chrom[i]]);
    }
    fprintf(final_f, "\n*POS\t");
    fprintf(final_f, "%d", pos[0]);
    for (i = 1; i < marker_count; i++) {
      fprintf(final_f, " %d", pos[i]);
    }
  }

  // Clean-up
  free(chrom);
  free(pos);

  free(GTs);
  bcf_sweep_destroy(in_vcf);

  fclose(temp_f);
  close(temp_fd);
  fclose(final_f);
}