Ejemplo n.º 1
0
/**
 * Populate sequence names from the i-th input file.
 * Searches the header first, followed by the tabix index.
 *
 * Every sequence name that is not yet a key of intervals_map is appended to
 * intervals and mapped to its position in that list (the map size before the
 * insertion).
 *
 * @i - index into hdrs and tbxs of the file whose sequence names are added
 */
void BCFSyncedStreamReader::add_interval(int32_t i)
{
    int32_t nseqs = 0;
    const char **seq_names = NULL;

    if (hdrs[i])
    {
        seq_names = bcf_hdr_seqnames(hdrs[i], &nseqs);
        if (seq_names)
        {
            for (int32_t j=0; j<nseqs; ++j)
            {
                std::string seq(seq_names[j]);
                if (intervals_map.find(seq)==intervals_map.end())
                {
                    // Read the size BEFORE operator[] inserts the key: before
                    // C++17 the evaluation order of the two sides of an
                    // assignment is unspecified, so the one-liner
                    // "map[seq] = map.size()" could store size+1.
                    const size_t next_index = intervals_map.size();
                    intervals_map[seq] = next_index;
                    intervals.push_back(GenomeInterval(seq));
                }
            }
            // Only the array of pointers is released here, as in the original.
            free(seq_names);
        }
    }

    if (tbxs[i])
    {
        // Reset so a stale count from the header lookup is never paired with
        // the result of this call.
        nseqs = 0;
        seq_names = tbx_seqnames(tbxs[i], &nseqs);
        if (seq_names)
        {
            for (int32_t j=0; j<nseqs; ++j)
            {
                std::string seq(seq_names[j]);
                if (intervals_map.find(seq)==intervals_map.end())
                {
                    const size_t next_index = intervals_map.size();
                    intervals_map[seq] = next_index;
                    intervals.push_back(GenomeInterval(seq));
                }
            }
            free(seq_names);
        }
    }
}
Ejemplo n.º 2
0
/*
  Build a BAM-style header whose target list mirrors the sequence names of the
  VCF/BCF header held in hs->hdr.

  Every target length is set to the placeholder 0x7fffffff; the real contig
  lengths are not looked up.

  Returns a newly initialised header (from bam_hdr_init) that the caller owns,
  or NULL if an allocation fails or the sequence names cannot be obtained.
  In builds where assert() is active, a NULL sequence-name list still aborts,
  as it did before.
*/
bam_hdr_t *bcf_hdr_2_bam_hdr_t (htsstuff *hs){
  bam_hdr_t *ret = bam_hdr_init();
  if(ret==NULL)
    return NULL;
  ret->l_text = 0;
  ret->text =NULL;

  int nseq = 0;
  const char **seqnames = bcf_hdr_seqnames(hs->hdr, &nseq);
  assert(seqnames);
  // Keep a real check for builds compiled with NDEBUG, where assert() vanishes
  if(seqnames==NULL || nseq<0){
    free(seqnames);
    bam_hdr_destroy(ret);
    return NULL;
  }

  ret->n_targets = 0;
  ret->target_len = (uint32_t*) malloc(sizeof(uint32_t)*nseq);
  // calloc so that unfilled slots are NULL and can be freed safely on failure
  ret->target_name = (char**) calloc(nseq, sizeof(char*));

  // A NULL result is only a failure when something was actually requested
  int ok = !(nseq>0 && (ret->target_len==NULL || ret->target_name==NULL));
  for(int i=0;ok && i<nseq;i++){
    ret->target_len[i] =0x7fffffff; // placeholder, not the true contig length
    ret->target_name[i] =strdup(seqnames[i]);
    if(ret->target_name[i]==NULL)
      ok = 0;
  }
  // Only the pointer array is released; the names themselves were copied above
  free(seqnames);

  if(!ok){
    // Release the partially built target list by hand, then let
    // bam_hdr_destroy dispose of the (now empty) header object.
    if(ret->target_name!=NULL){
      for(int i=0;i<nseq;i++)
        free(ret->target_name[i]);
      free(ret->target_name);
      ret->target_name = NULL;
    }
    free(ret->target_len);
    ret->target_len = NULL;
    bam_hdr_destroy(ret);
    return NULL;
  }

  ret->n_targets = nseq;
  return ret;
}
Ejemplo n.º 3
0
/**
 * Open fname and append it as a new reader to the reader set `files`.
 *
 * Returns 1 on success. Returns 0 on failure, with files->errnum set to one of
 * open_failed, not_bgzf, idx_load_failed, file_type_error, api_usage_error or
 * header_error.
 *
 * Notes visible from the code below:
 *  - If the open itself fails, `files` is left untouched apart from errnum.
 *  - For every later failure the reader slot has already been appended
 *    (files->nreaders was incremented) and reader->file stays open; nothing
 *    is rolled back here.
 *  - The fname pointer is stored in the reader as-is, not copied.
 *  - Without files->require_index the set is switched to streaming mode.
 */
int bcf_sr_add_reader(bcf_srs_t *files, const char *fname)
{
    // Open before touching `files`, so a failed open does not register a reader
    htsFile* file_ptr = hts_open(fname, "r");
    if ( ! file_ptr ) {
        files->errnum = open_failed;
        return 0;
    }

    // Grow the per-reader arrays by one slot and zero the new reader
    // NOTE(review): the realloc results are not checked
    files->has_line = (int*) realloc(files->has_line, sizeof(int)*(files->nreaders+1));
    files->has_line[files->nreaders] = 0;
    files->readers  = (bcf_sr_t*) realloc(files->readers, sizeof(bcf_sr_t)*(files->nreaders+1));
    bcf_sr_t *reader = &files->readers[files->nreaders++];
    memset(reader,0,sizeof(bcf_sr_t));

    reader->file = file_ptr;

    files->errnum = 0;

    if ( files->require_index )
    {
        // Indexed access: the file must be bgzf-compressed VCF or BCF
        if ( reader->file->format.format==vcf )
        {
            if ( reader->file->format.compression!=bgzf )
            {
                files->errnum = not_bgzf;
                return 0;
            }

            // VCF: the index is loaded first, then the header is read
            reader->tbx_idx = tbx_index_load(fname);
            if ( !reader->tbx_idx )
            {
                files->errnum = idx_load_failed;
                return 0;
            }

            reader->header = bcf_hdr_read(reader->file);
        }
        else if ( reader->file->format.format==bcf )
        {
            if ( reader->file->format.compression!=bgzf )
            {
                files->errnum = not_bgzf;
                return 0;
            }

            // BCF: the header is read first, then the index is loaded
            reader->header = bcf_hdr_read(reader->file);

            reader->bcf_idx = bcf_index_load(fname);
            if ( !reader->bcf_idx )
            {
                files->errnum = idx_load_failed;
                return 0;
            }
        }
        else
        {
            files->errnum = file_type_error;
            return 0;
        }
    }
    else
    {
        // No index required: read the header only and mark the set as streaming
        if ( reader->file->format.format==bcf || reader->file->format.format==vcf )
        {
            reader->header = bcf_hdr_read(reader->file);
        }
        else
        {
            files->errnum = file_type_error;
            return 0;
        }
        files->streaming = 1;
    }
    // Streaming mode is rejected with more than one reader...
    if ( files->streaming && files->nreaders>1 )
    {
        files->errnum = api_usage_error;
        fprintf(stderr,"[%s:%d %s] Error: %d readers, yet require_index not set\n", __FILE__,__LINE__,__FUNCTION__,files->nreaders);
        return 0;
    }
    // ...and when regions have been set
    if ( files->streaming && files->regions )
    {
        files->errnum = api_usage_error;
        fprintf(stderr,"[%s:%d %s] Error: cannot tabix-jump in streaming mode\n", __FILE__,__LINE__,__FUNCTION__);
        return 0;
    }
    // The header check is deferred to here, so it covers all branches above
    if ( !reader->header )
    {
        files->errnum = header_error;
        return 0;
    }

    // The caller's pointer is stored, not duplicated
    reader->fname = fname;
    if ( files->apply_filters )
        reader->filter_ids = init_filters(reader->header, files->apply_filters, &reader->nfilter_ids);

    // Update list of chromosomes: unless regions were given explicitly or the
    // set is streaming, add every sequence name of this reader to
    // files->regions. Names come from the tabix index when one was loaded,
    // otherwise from the header.
    if ( !files->explicit_regs && !files->streaming )
    {
        int n,i;
        const char **names = reader->tbx_idx ? tbx_seqnames(reader->tbx_idx, &n) : bcf_hdr_seqnames(reader->header, &n);
        for (i=0; i<n; i++)
        {
            // The first name creates the regions object; later ones are added
            // with -1,-1 as the coordinate arguments
            if ( !files->regions )
                files->regions = _regions_init_string(names[i]);
            else
                _regions_add(files->regions, names[i], -1, -1);
        }
        // Only the array of name pointers is freed here
        free(names);
    }

    return 1;
}
Ejemplo n.º 4
0
/**************************
 * PROCESS INPUT VCF FILE *
 **************************/

/* Release every resource vcf2raw may be holding; any argument may be NULL. */
static void vcf2raw_release(bcf_sweep_t *in_vcf, const char **seq_names, FILE *temp_f,
                            FILE *final_f, int *chrom, int *pos, int32_t *GTs) {
  free(chrom);
  free(pos);
  free(GTs);
  free(seq_names);
  if (temp_f != NULL) {
    fclose(temp_f);
  }
  if (final_f != NULL) {
    fclose(final_f);
  }
  if (in_vcf != NULL) {
    bcf_sweep_destroy(in_vcf);
  }
}

/*
 * Convert a VCF file into a ONEMAP raw file.
 *
 * All arguments are passed by pointer (presumably because the function is
 * called from R -- confirm against the caller):
 *   filename      path of the input VCF file
 *   out_filename  path of the output file to create
 *   cross         cross type string; printed in the "data type" header line
 *                 and translated with get_cross_type()
 *   n_parent1, parent1   number and names of the samples forming parent 1
 *   n_parent2, parent2   number and names of the samples forming parent 2
 *   min_class     fraction of a parent's samples (rounded up) used as the
 *                 minimum count passed to get_consensus_parent_gt()
 *
 * Only biallelic SNP/INDEL records with diploid genotypes and a recognised
 * marker type are written; at most MAX_VARIANTS markers are processed.
 *
 * Failures are reported through error(), which is assumed not to return
 * (the original code already relied on that); every resource acquired so far
 * is therefore released before each error() call.
 */
void vcf2raw(char **filename, char **out_filename, char **cross, int *n_parent1,
             char **parent1, int *n_parent2, char **parent2, double *min_class) {
  // We assume the input file exists (checked in R)
  bcf_sweep_t *in_vcf = bcf_sweep_init(*filename);
  if (in_vcf == NULL) {
    // Nothing to destroy: the handle was never created
    error("Could not parse input VCF file.");
  }
  bcf_hdr_t *vcf_hdr = bcf_sweep_hdr(in_vcf);

  // Get reference sequence IDs
  int n_seq = 0;
  const char **seq_names = bcf_hdr_seqnames(vcf_hdr, &n_seq);
  if (seq_names == NULL || n_seq == 0) {
    vcf2raw_release(in_vcf, seq_names, NULL, NULL, NULL, NULL, NULL);
    error("Could not correctly parse sequence names in VCF file. Is the input file tabix indexed?\n");
  }

  // Map parent names to sample indices
  int idx_parent1[*n_parent1];
  int idx_parent2[*n_parent2];
  get_parents_idx(*n_parent1, idx_parent1, *n_parent2, idx_parent2, vcf_hdr, parent1, parent2);

  // Get progeny sample indices (all samples that are not set as parents)
  int n_samples = bcf_hdr_nsamples(vcf_hdr);
  int n_progeny = n_samples - *n_parent1 - *n_parent2;
  // "<= 0" rather than "== 0": a negative count must never size the array below
  if (n_progeny <= 0) {
    vcf2raw_release(in_vcf, seq_names, NULL, NULL, NULL, NULL, NULL);
    error("Input file must contain at least one progeny individual.");
  }
  int idx_progeny[n_progeny];
  int i = 0, s;
  for (s = 0; s < n_samples; s++) {
    if (!is_val_in_arr(s, idx_parent1, *n_parent1)) {
      if (!is_val_in_arr(s, idx_parent2, *n_parent2)) {
        idx_progeny[i++] = s;
      }
    }
  }

  // Minimum count to assign parent genotype
  int min_class_parent1 = (int)ceil(*min_class * *n_parent1);
  int min_class_parent2 = (int)ceil(*min_class * *n_parent2);

  // Convert cross type
  int cross_type = get_cross_type(cross);

  // We need to write to a temporary file, because the number of markers in the header is unknown
  FILE *temp_f;
  char temp_filename[] = "tmp_raw_XXXXXX";
  int temp_fd;
  temp_fd = mkstemp(temp_filename);
  if (temp_fd == -1) {
    vcf2raw_release(in_vcf, seq_names, NULL, NULL, NULL, NULL, NULL);
    error("Could not open temporary output file.\n");
  }
  // Unlink right away: the file lives on only through the open descriptor
  unlink(temp_filename);
  temp_f = fdopen(temp_fd, "w+");
  if (temp_f == NULL) {
    // No stream took ownership of the descriptor, so close it directly
    close(temp_fd);
    vcf2raw_release(in_vcf, seq_names, NULL, NULL, NULL, NULL, NULL);
    error("Could not open temporary output file.\n");
  }

  // CHROM and POS fields will be placed at the end of the output file
  int marker_count = 0;
  int * chrom = malloc(MAX_VARIANTS * sizeof(int));
  if (chrom == NULL) {
    vcf2raw_release(in_vcf, seq_names, temp_f, NULL, NULL, NULL, NULL);
    error("Could not allocate vector.\n");
  }
  int * pos = malloc(MAX_VARIANTS * sizeof(int));
  if (pos == NULL) {
    vcf2raw_release(in_vcf, seq_names, temp_f, NULL, chrom, NULL, NULL);
    error("Could not allocate vector.\n");
  }

  // Mapping of VCF genotypes to ONEMAP genotypes
  const char * const D_BC_ref[GT_TYPES_LEN] = { "a", "-", "ab", "-", "-", "-", "-" };
  const char * const D_BC_alt[GT_TYPES_LEN] = { "-", "a", "ab", "-", "-", "-", "-" };
  const char * const RI_ref[GT_TYPES_LEN] = { "a", "b", "-", "-", "-", "-", "-" };
  const char * const RI_alt[GT_TYPES_LEN] = { "b", "a", "-", "-", "-", "-", "-" };
  const char * const B3_F2_ref[GT_TYPES_LEN] = { "a", "b", "ab", "-", "-", "-", "-" };
  const char * const B3_F2_alt[GT_TYPES_LEN] = { "b", "a", "ab", "-", "-", "-", "-" };

  // Scan all records in VCF file and print valid markers to output
  bcf1_t *record;
  int32_t *GTs = NULL;
  int nGT_arr = 0;

  while ((record = bcf_sweep_fwd(in_vcf)) && marker_count < MAX_VARIANTS) {
    // We only consider biallelic SNP and INDEL markers
    int var_type = bcf_get_variant_types(record);
    if ((var_type == VCF_SNP || var_type == VCF_INDEL) && record->n_allele == 2) {
      int nGTs = bcf_get_format_int32(vcf_hdr, record, "GT", &GTs, &nGT_arr);
      // We only consider diploid variants (number of alleles in genotypes == 2)
      nGTs /= n_samples;
      if (nGTs == 2) {

        bcf_fmt_t *fmt_ptr = bcf_get_fmt(vcf_hdr, record, "GT");

        // First, check which parents are heterozygous or homozygous (REF or ALT allele)
        bool is_het_parent1 = false, is_hom_ref_parent1 = false, is_hom_alt_parent1 = false;
        get_consensus_parent_gt(fmt_ptr, *n_parent1, idx_parent1, min_class_parent1, &is_het_parent1,
                                &is_hom_ref_parent1, &is_hom_alt_parent1);
        bool is_het_parent2 = false, is_hom_ref_parent2 = false, is_hom_alt_parent2 = false;
        get_consensus_parent_gt(fmt_ptr, *n_parent2, idx_parent2, min_class_parent2, &is_het_parent2,
                                &is_hom_ref_parent2, &is_hom_alt_parent2);

        // Convert to appropriate marker type
        char marker_type[MARKER_TYPE_LEN];
        int type = get_marker_type(marker_type, cross_type,
                                   is_het_parent1, is_hom_ref_parent1, is_hom_alt_parent1,
                                   is_het_parent2, is_hom_ref_parent2, is_hom_alt_parent2);

        const char * const(*type_ptr)[GT_TYPES_LEN] = NULL;
        bool valid_marker = true;
        switch(type)
        {
        case marker_B3:
        case marker_F2_ref:
          type_ptr = &B3_F2_ref;
          break;
        case marker_F2_alt:
          type_ptr = &B3_F2_alt;
          break;
        case marker_D_ref:
        case marker_BC_ref:
          type_ptr = &D_BC_ref;
          break;
        case marker_D_alt:
        case marker_BC_alt:
          type_ptr = &D_BC_alt;
          break;
        case marker_RI_ref:
          type_ptr = &RI_ref;
          break;
        case marker_RI_alt:
          type_ptr = &RI_alt;
          break;
        default:
          valid_marker = false;
        }

        if (valid_marker) {
          // Store CHROM and POS fields for valid markers
          chrom[marker_count] = record->rid;
          pos[marker_count] = record->pos + 1;

          // Check if marker name exists; if not ("."), create one as "<seq>.<pos>".
          // The name is built in its own buffer: the record's ID buffer holds
          // only "." here and must not be written past.
          char *marker_name = record->d.id;
          char *generated_name = NULL;
          if (!strcmp(marker_name, ".")) {
            const char *seq_name = seq_names[chrom[marker_count]];
            int name_len = snprintf(NULL, 0, "%s.%d", seq_name, pos[marker_count]);
            if (name_len >= 0) {
              generated_name = malloc((size_t)name_len + 1);
            }
            if (generated_name == NULL) {
              vcf2raw_release(in_vcf, seq_names, temp_f, NULL, chrom, pos, GTs);
              error("Could not allocate vector.\n");
            }
            snprintf(generated_name, (size_t)name_len + 1, "%s.%d", seq_name, pos[marker_count]);
            marker_name = generated_name;
          }

          // Output variant in ONEMAP format to temporary file
          print_record(temp_f, marker_name, marker_type, fmt_ptr, n_progeny, idx_progeny, type_ptr);
          free(generated_name);

          marker_count++;
        }
      }
    }
  }

  // Write final output file header
  FILE *final_f = fopen(*out_filename, "w");
  if (final_f == NULL) {
    vcf2raw_release(in_vcf, seq_names, temp_f, NULL, chrom, pos, GTs);
    error("Could not open output file.\n");
  }
  fprintf(final_f, "data type %s\n", *cross);
  // The next header line contains the following information: number of individuals, number of markers, 1 for the presence of CHROM information, 1 for the presence of POS information and 0 for the absence of phenotypes (these need to be manually included later)
  fprintf(final_f, "%d %d 1 1 0\n", n_progeny, marker_count);
  // The next header line contains the sample names
  char *cur_sample_name = vcf_hdr->samples[idx_progeny[0]];
  fprintf(final_f, "%s", cur_sample_name);
  for (i = 1; i < n_progeny; i++) {
    cur_sample_name = vcf_hdr->samples[idx_progeny[i]];
    fprintf(final_f, "\t%s", cur_sample_name);
  }
  fprintf(final_f, "\n");

  // Copy marker data from temporary file to final file
  rewind(temp_f);
  char buf[BUFSIZ];
  size_t size;
  while ((size = fread(buf, 1, BUFSIZ, temp_f))) {
    fwrite(buf, 1, size, final_f);
  }

  // Write CHROM and POS data to output file
  if (marker_count) {
    fprintf(final_f, "*CHROM\t");
    fprintf(final_f, "%s", seq_names[chrom[0]]);
    for (i = 1; i < marker_count; i++) {
      fprintf(final_f, " %s", seq_names[chrom[i]]);
    }
    fprintf(final_f, "\n*POS\t");
    fprintf(final_f, "%d", pos[0]);
    for (i = 1; i < marker_count; i++) {
      fprintf(final_f, " %d", pos[i]);
    }
  }

  // Clean-up. fclose(temp_f) also closes temp_fd, so the descriptor is not
  // closed a second time.
  vcf2raw_release(in_vcf, seq_names, temp_f, final_f, chrom, pos, GTs);
}
Ejemplo n.º 5
0
/**
 * Open fname and append it as a new reader to the reader set `files`.
 *
 * Returns 1 on success and 0 on failure (a message is printed to stderr for
 * most failures).
 *
 * The file is opened and the per-reader arrays are grown BEFORE the reader is
 * registered, so a failed open or allocation leaves `files` unchanged. For
 * later failures (index, header, API misuse) the reader has already been
 * appended and is left in place, as before.
 *
 * With files->require_index set, only compressed VCF (tabix index) and
 * compressed BCF (BCF index) are accepted. Otherwise just the header is read
 * and the set is switched to streaming mode, which is rejected when there is
 * more than one reader or when regions are set.
 *
 * The fname pointer is stored in the reader as-is, not copied.
 */
int bcf_sr_add_reader(bcf_srs_t *files, const char *fname)
{
    // Open first: nothing in `files` is touched if the file cannot be opened
    htsFile *fp = hts_open(fname, "r");
    if ( !fp ) return 0;

    // Grow through temporaries so the old arrays survive a failed realloc
    int *has_line = (int*) realloc(files->has_line, sizeof(int)*(files->nreaders+1));
    if ( !has_line )
    {
        hts_close(fp);
        return 0;
    }
    files->has_line = has_line;
    files->has_line[files->nreaders] = 0;

    bcf_sr_t *readers = (bcf_sr_t*) realloc(files->readers, sizeof(bcf_sr_t)*(files->nreaders+1));
    if ( !readers )
    {
        hts_close(fp);
        return 0;
    }
    files->readers = readers;

    bcf_sr_t *reader = &files->readers[files->nreaders++];
    memset(reader,0,sizeof(bcf_sr_t));
    reader->file = fp;

    reader->type = reader->file->is_bin? FT_BCF : FT_VCF;
    if (reader->file->is_compressed) reader->type |= FT_GZ;

    if ( files->require_index )
    {
        if ( reader->type==FT_VCF_GZ )
        {
            // Compressed VCF: the index is loaded first, then the header is read
            reader->tbx_idx = tbx_index_load(fname);
            if ( !reader->tbx_idx )
            {
                fprintf(stderr,"[add_reader] Could not load the index of %s\n", fname);
                return 0;
            }

            reader->header = bcf_hdr_read(reader->file);
        }
        else if ( reader->type==FT_BCF_GZ )
        {
            // Compressed BCF: the header is read first, then the index is loaded
            reader->header = bcf_hdr_read(reader->file);

            reader->bcf_idx = bcf_index_load(fname);
            if ( !reader->bcf_idx )
            {
                fprintf(stderr,"[add_reader] Could not load the index of %s\n", fname);
                return 0;   // not indexed..?
            }
        }
        else
        {
            fprintf(stderr,"Index required, expected .vcf.gz or .bcf file: %s\n", fname);
            return 0;
        }
    }
    else
    {
        // BCF and VCF are handled identically when no index is needed
        if ( reader->type & (FT_BCF|FT_VCF) )
        {
            reader->header = bcf_hdr_read(reader->file);
        }
        else
        {
            fprintf(stderr,"File type not recognised: %s\n", fname);
            return 0;
        }
        files->streaming = 1;
    }
    if ( files->streaming && files->nreaders>1 )
    {
        fprintf(stderr,"[%s:%d %s] Error: %d readers, yet require_index not set\n", __FILE__,__LINE__,__FUNCTION__,files->nreaders);
        return 0;
    }
    if ( files->streaming && files->regions )
    {
        fprintf(stderr,"[%s:%d %s] Error: cannot tabix-jump in streaming mode\n", __FILE__,__LINE__,__FUNCTION__);
        return 0;
    }
    // The header check is deferred to here, so it covers all branches above
    if ( !reader->header ) return 0;

    // The caller's pointer is stored, not duplicated
    reader->fname = fname;
    if ( files->apply_filters )
        reader->filter_ids = init_filters(reader->header, files->apply_filters, &reader->nfilter_ids);

    // Update list of chromosomes: names come from the tabix index when one
    // was loaded, otherwise from the header
    if ( !files->explicit_regs && !files->streaming )
    {
        int n = 0, i;
        const char **names = reader->tbx_idx ? tbx_seqnames(reader->tbx_idx, &n) : bcf_hdr_seqnames(reader->header, &n);
        // Guard against a NULL list so a failed lookup is never indexed
        for (i=0; names && i<n; i++)
        {
            if ( !files->regions )
                files->regions = _regions_init_string(names[i]);
            else
                _regions_add(files->regions, names[i], -1, -1);
        }
        // Only the array of name pointers is freed here
        free(names);
    }

    return 1;
}
Ejemplo n.º 6
0
/**
 * Constructor.
 *
 * Opens every file in file_names, reads its header and, when intervals are
 * given, loads its index. Any failure prints a message to stderr and
 * terminates the process with exit(1).
 *
 * @file_names  - files to read; "+" and "-" both denote STDIN ("+" is
 *                rewritten to "-" in the caller's vector). At most one STDIN
 *                stream is allowed.
 * @intervals   - if non-empty, random access is enabled and an index must be
 *                loadable for every file
 * @sync_by_pos - stored in the member of the same name
 *
 * For every file after the first, the contig names in its header must match
 * those of the first file in number and order, and neither list may be empty.
 */
BCFSyncedReader::BCFSyncedReader(std::vector<std::string>& file_names, std::vector<GenomeInterval>& intervals, bool sync_by_pos)
:file_names(file_names), intervals(intervals), sync_by_pos(sync_by_pos)
{
    nfiles = file_names.size();
    files.resize(nfiles, 0);
    hdrs.resize(nfiles, 0);
    idxs.resize(nfiles, 0);
    tbxs.resize(nfiles, 0);
    itrs.resize(nfiles, 0);
    ftypes.resize(nfiles);

    current_interval = "";
    current_pos1 = 0;

    buffer.resize(nfiles);
    s = {0, 0, 0};

    random_access = (intervals.size()!=0);
    for (size_t i=0; i<intervals.size(); ++i)
    {
        intervals_map[intervals[i].to_string()] = i;
    }
    intervals_index = 0;

    uint32_t no_stdins = 0;

    for (size_t i = 0; i<nfiles; ++i)
    {
        // Inspect the CURRENT file name (not always the first one), so each
        // STDIN stream is counted exactly once.
        if (file_names[i]=="+")
        {
            file_names[i]="-";
            ++no_stdins;
        }
        else if (file_names[i]=="-")
        {
            ++no_stdins;
        }

        if (no_stdins>1)
        {
            fprintf(stderr, "[E:%s:%d %s] BCFSyncedReader does not support reading from more than one STDIN stream\n", __FILE__, __LINE__, __FUNCTION__);
            exit(1);
        }

        files[i] = hts_open(file_names[i].c_str(), "r");
        if (files[i]==NULL)
        {
            fprintf(stderr, "[%s:%d %s] Cannot open %s\n", __FILE__, __LINE__, __FUNCTION__, file_names[i].c_str());
            exit(1);
        }
        ftypes[i] = files[i]->format;

        //check format
        if (ftypes[i].format!=vcf && ftypes[i].format!=bcf)
        {
            fprintf(stderr, "[E:%s:%d %s] %s not a VCF or BCF file\n", __FILE__, __LINE__, __FUNCTION__, file_names[i].c_str());
            exit(1);
        }

        //read header
        hdrs[i] = bcf_alt_hdr_read(files[i]);
        if (!hdrs[i])
        {
            fprintf(stderr, "[E:%s:%d %s] header cannot be read for %s\n", __FILE__, __LINE__, __FUNCTION__, file_names[i].c_str());
            exit(1);
        }

        //load index if intervals are specified
        if (random_access && !load_index(i))
        {
            fprintf(stderr, "[E:%s:%d %s] index cannot be loaded for %s for random access\n", __FILE__, __LINE__, __FUNCTION__, file_names[i].c_str());
            exit(1);
        }

        //check contigs consistency against the FIRST file's header
        if (i)
        {
            int32_t nseqs0 = 0;
            const char ** seqnames0 = bcf_hdr_seqnames(hdrs[0], &nseqs0);

            int32_t nseqs = 0;
            const char ** seqnames = bcf_hdr_seqnames(hdrs[i], &nseqs);

            if (!seqnames0 || !seqnames || nseqs0==0 || nseqs==0 || nseqs0!=nseqs)
            {
                fprintf(stderr, "[E:%s:%d %s] contigs in header not consistent with first file for %s\n", __FILE__, __LINE__, __FUNCTION__, file_names[i].c_str());
                exit(1);
            }

            for (int32_t j=0; j<nseqs; ++j)
            {
                if (strcmp(seqnames0[j], seqnames[j]))
                {
                    fprintf(stderr, "[E:%s:%d %s] contigs in header not consistent with first file for %s\n", __FILE__, __LINE__, __FUNCTION__, file_names[i].c_str());
                    exit(1);
                }
            }

            // Only the arrays of name pointers are released here
            free(seqnames0);
            free(seqnames);
        }
    }
}