Example #1
0
int process(bcf1_t *rec)
{
    if ( rec->n_allele<2 ) return 0;    // not a variant

    int type = bcf_get_variant_types(rec);
    if ( !(type&VCF_INDEL) ) return 0;  // not an indel

    int i, len = 0;
    for (i=1; i<rec->n_allele; i++)
        if ( len > rec->d.var[i].n ) len = rec->d.var[i].n;

    int pos_to = len!=0 ? rec->pos : rec->pos - len;    // len is negative
    if ( bcf_sr_regions_overlap(exons, bcf_seqname(in_hdr,rec),rec->pos,pos_to) ) return 0;  // no overlap

    hts_expand(int32_t,rec->n_allele-1,nfrm,frm);
    for (i=1; i<rec->n_allele; i++)
    {
        if ( rec->d.var[i].type!=VCF_INDEL ) {
            frm[i-1] = -1;
            continue;
        }

        int len = rec->d.var[i].n, tlen = 0;
        if ( len>0 )
        {
            // insertion
            if ( exons->start <= rec->pos && exons->end > rec->pos ) tlen = abs(len);
        }
        else if ( exons->start <= rec->pos + abs(len) )
        {
            // deletion
            tlen = abs(len);
            if ( rec->pos < exons->start )              // trim the beginning
                tlen -= exons->start - rec->pos + 1;
            if ( exons->end < rec->pos + abs(len) )     // trim the end
                tlen -= rec->pos + abs(len) - exons->end;
        }
        if ( tlen )     // there are some deleted/inserted bases in the exon
        {
            if ( tlen%3 ) frm[i-1] = 1; // out-of-frame
            else frm[i-1] = 0;          // in-frame
        }
        else frm[i-1] = -1;             // not applicable (is outside)
    }

    if ( bcf_update_info_int32(out_hdr,rec,"OOF",frm,rec->n_allele-1)<0 ) return -1;
    return 0;
}
/*
   Removes duplicate records from the buffer. The meaning of "duplicate" is
   controlled by the $collapse variable, which can cause that from multiple
   <indel|snp|any> lines only the first is considered and the rest is ignored.
   The removal is done by setting the redundant lines' positions to -1 and
   moving these lines at the end of the buffer.
 */
static void collapse_buffer(bcf_srs_t *files, bcf_sr_t *reader)
{
    int irec,jrec, has_snp=0, has_indel=0, has_any=0;
    for (irec=1; irec<=reader->nbuffer; irec++)
    {
        bcf1_t *line = reader->buffer[irec];
        if ( line->pos != reader->buffer[1]->pos ) break;
        if ( files->collapse&COLLAPSE_ANY )
        {
            if ( !has_any ) has_any = 1;
            else line->pos = -1;
        }
        int line_type = bcf_get_variant_types(line);
        if ( files->collapse&COLLAPSE_SNPS && line_type&(VCF_SNP|VCF_MNP) )
        {
            if ( !has_snp ) has_snp = 1;
            else line->pos = -1;
        }
        if ( files->collapse&COLLAPSE_INDELS && line_type&VCF_INDEL )
        {
            if ( !has_indel ) has_indel = 1;
            else line->pos = -1;
        }
    }
    bcf1_t *tmp;
    irec = jrec = 1;
    while ( irec<=reader->nbuffer && jrec<=reader->nbuffer )
    {
        if ( reader->buffer[irec]->pos != -1 ) { irec++; continue; }
        if ( jrec<=irec ) jrec = irec+1;
        while ( jrec<=reader->nbuffer && reader->buffer[jrec]->pos==-1 ) jrec++;
        if ( jrec<=reader->nbuffer )
        {
            tmp = reader->buffer[irec]; reader->buffer[irec] = reader->buffer[jrec]; reader->buffer[jrec] = tmp;
        }
    }
    reader->nbuffer = irec - 1;
}
Example #3
0
static void filters_set_type(bcf1_t *line, token_t *tok)
{
    tok->num_value = bcf_get_variant_types(line);
}
Example #4
0
static void bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, int min_pos)
{
    if ( !srt->grp_str2int )
    {
        // first time here, initialize
        if ( !srt->pair )
        {
            if ( readers->collapse==COLLAPSE_NONE ) readers->collapse = BCF_SR_PAIR_EXACT;
            bcf_sr_set_opt(readers, BCF_SR_PAIR_LOGIC, readers->collapse);
        }
        bcf_sr_init_scores(srt);
        srt->grp_str2int = khash_str2int_init();
        srt->var_str2int = khash_str2int_init();
    }
    int k;
    khash_t(str2int) *hash;
    hash = srt->grp_str2int;
    for (k=0; k < kh_end(hash); k++)
        if ( kh_exist(hash,k) ) free((char*)kh_key(hash,k));
    hash = srt->var_str2int;
    for (k=0; k < kh_end(hash); k++)
        if ( kh_exist(hash,k) ) free((char*)kh_key(hash,k));
    kh_clear(str2int, srt->grp_str2int);
    kh_clear(str2int, srt->var_str2int);
    srt->ngrp = srt->nvar = srt->nvset = 0;

    grp_t grp;
    memset(&grp,0,sizeof(grp_t));

    // group VCFs into groups, each with a unique combination of variants in the duplicate lines
    int ireader,ivar,irec,igrp,ivset,iact;
    for (ireader=0; ireader<readers->nreaders; ireader++) srt->vcf_buf[ireader].nrec = 0;
    for (iact=0; iact<srt->nactive; iact++)
    {
        ireader = srt->active[iact];
        bcf_sr_t *reader = &readers->readers[ireader];
        int rid   = bcf_hdr_name2id(reader->header, chr);
        grp.nvar  = 0;
        hts_expand(int,reader->nbuffer,srt->moff,srt->off);
        srt->noff  = 0;
        srt->str.l = 0;
        for (irec=1; irec<=reader->nbuffer; irec++)
        {
            bcf1_t *line = reader->buffer[irec];
            if ( line->rid!=rid || line->pos!=min_pos ) break;

            if ( srt->str.l ) kputc(';',&srt->str);
            srt->off[srt->noff++] = srt->str.l;
            size_t beg = srt->str.l;
            for (ivar=1; ivar<line->n_allele; ivar++)
            {
                if ( ivar>1 ) kputc(',',&srt->str);
                kputs(line->d.allele[0],&srt->str);
                kputc('>',&srt->str);
                kputs(line->d.allele[ivar],&srt->str);
            }
            if ( line->n_allele==1 )
            {
                kputs(line->d.allele[0],&srt->str);
                kputsn(">.",2,&srt->str);
            }

            // Create new variant or attach to existing one. But careful, there can be duplicate
            // records with the same POS,REF,ALT (e.g. in dbSNP-b142)
            char *var_str = beg + srt->str.s;
            int ret, var_idx = 0, var_end = srt->str.l;
            while ( 1 )
            {
                ret = khash_str2int_get(srt->var_str2int, var_str, &ivar);
                if ( ret==-1 ) break;

                var_t *var = &srt->var[ivar];
                if ( var->vcf[var->nvcf-1] != ireader ) break;

                srt->str.l = var_end;
                kputw(var_idx, &srt->str);
                var_str = beg + srt->str.s;
                var_idx++;
            }
            if ( ret==-1 )
            {
                ivar = srt->nvar++;
                hts_expand0(var_t,srt->nvar,srt->mvar,srt->var);
                srt->var[ivar].nvcf = 0;
                khash_str2int_set(srt->var_str2int, strdup(var_str), ivar);
                free(srt->var[ivar].str);   // possible left-over from the previous position
            }
            var_t *var = &srt->var[ivar];
            var->nalt = line->n_allele - 1;
            var->type = bcf_get_variant_types(line);
            srt->str.s[var_end] = 0;
            if ( ret==-1 )
                var->str = strdup(var_str);

            int mvcf = var->mvcf;
            var->nvcf++;
            hts_expand0(int*, var->nvcf, var->mvcf, var->vcf);
            if ( mvcf != var->mvcf ) var->rec = (bcf1_t **) realloc(var->rec,sizeof(bcf1_t*)*var->mvcf);
            var->vcf[var->nvcf-1] = ireader;
            var->rec[var->nvcf-1] = line;

            grp.nvar++;
            hts_expand(var_t,grp.nvar,grp.mvar,grp.var);
            grp.var[grp.nvar-1] = ivar;
        }
        char *grp_key = grp_create_key(srt);
        int ret = khash_str2int_get(srt->grp_str2int, grp_key, &igrp);
        if ( ret==-1 )
        {
            igrp = srt->ngrp++;
            hts_expand0(grp_t, srt->ngrp, srt->mgrp, srt->grp);
            free(srt->grp[igrp].var);
            srt->grp[igrp] = grp;
            srt->grp[igrp].key = grp_key;
            khash_str2int_set(srt->grp_str2int, grp_key, igrp);
            memset(&grp,0,sizeof(grp_t));
        }
        else
            free(grp_key);
        srt->grp[igrp].nvcf++;
    }
    free(grp.var);

    // initialize bitmask - which groups is the variant present in
    for (ivar=0; ivar<srt->nvar; ivar++)
    {
        srt->var[ivar].mask = kbs_resize(srt->var[ivar].mask, srt->ngrp);
        kbs_clear(srt->var[ivar].mask);
    }
    for (igrp=0; igrp<srt->ngrp; igrp++)
    {
        for (ivar=0; ivar<srt->grp[igrp].nvar; ivar++)
        {
            int i = srt->grp[igrp].var[ivar];
            kbs_insert(srt->var[i].mask, igrp);
        }
    }

    // create the initial list of variant sets
    for (ivar=0; ivar<srt->nvar; ivar++)
    {
        ivset = srt->nvset++;
        hts_expand0(varset_t, srt->nvset, srt->mvset, srt->vset);

        varset_t *vset = &srt->vset[ivset];
        vset->nvar = 1;
        hts_expand0(var_t, vset->nvar, vset->mvar, vset->var);
        vset->var[vset->nvar-1] = ivar;
        var_t *var  = &srt->var[ivar];
        vset->cnt   = var->nvcf;
        vset->mask  = kbs_resize(vset->mask, srt->ngrp);
        kbs_clear(vset->mask);
        kbs_bitwise_or(vset->mask, var->mask);

        int type = 0;
        if ( var->type==VCF_REF ) type |= SR_REF;
        else
        {
            if ( var->type & VCF_SNP ) type |= SR_SNP;
            if ( var->type & VCF_MNP ) type |= SR_SNP;
            if ( var->type & VCF_INDEL ) type |= SR_INDEL;
            if ( var->type & VCF_OTHER ) type |= SR_OTHER;
        }
        var->type = type;
    }
#if DEBUG_VSETS
    debug_vsets(srt);
#endif

    // initialize the pairing matrix
    hts_expand(int, srt->ngrp*srt->nvset, srt->mpmat, srt->pmat);
    hts_expand(int, srt->nvset, srt->mcnt, srt->cnt);
    memset(srt->pmat, 0, sizeof(*srt->pmat)*srt->ngrp*srt->nvset);
    for (ivset=0; ivset<srt->nvset; ivset++)
    {
        varset_t *vset = &srt->vset[ivset];
        for (igrp=0; igrp<srt->ngrp; igrp++) srt->pmat[ivset*srt->ngrp+igrp] = 0;
        srt->cnt[ivset] = vset->cnt;
    }

    // pair the lines
    while ( srt->nvset )
    {
#if DEBUG_VSETS
    fprintf(stderr,"\n");
    debug_vsets(srt);
#endif

        int imax = 0;
        for (ivset=1; ivset<srt->nvset; ivset++)
            if ( srt->cnt[imax] < srt->cnt[ivset] ) imax = ivset;

        int ipair = -1;
        uint32_t max_score = 0;
        for (ivset=0; ivset<srt->nvset; ivset++)
        {
            if ( kbs_logical_and(srt->vset[imax].mask,srt->vset[ivset].mask) ) continue;   // cannot be merged
            uint32_t score = pairing_score(srt, imax, ivset);
            // fprintf(stderr,"score: %d %d, logic=%d \t..\t %u\n", imax,ivset,srt->pair,score);
            if ( max_score < score ) { max_score = score; ipair = ivset; }
        }

        // merge rows creating a new variant set this way
        if ( ipair!=-1 && ipair!=imax )
        {
            imax = merge_vsets(srt, imax, ipair);
            continue;
        }

        push_vset(srt, imax);
    }

    srt->chr = chr;
    srt->pos = min_pos;
}
/*
 *  _reader_match_alleles() - from multiple buffered lines selects the one which
 *  corresponds best to the template line. The logic is controlled by COLLAPSE_*
 *  Returns 0 on success or -1 when no good matching line is found.
 */
static int _reader_match_alleles(bcf_srs_t *files, bcf_sr_t *reader, bcf1_t *tmpl)
{
    int i, irec = -1;

    // if no template given, use the first available record
    if ( !tmpl )
        irec = 1;
    else
    {
        int tmpl_type = bcf_get_variant_types(tmpl);
        for (i=1; i<=reader->nbuffer; i++)
        {
            bcf1_t *line = reader->buffer[i];
            if ( line->pos != reader->buffer[1]->pos ) break;  // done with this reader

            // Easiest case: matching by position only
            if ( files->collapse&COLLAPSE_ANY ) { irec=i; break; }

            int line_type = bcf_get_variant_types(line);

            // No matter what the alleles are, as long as they are both SNPs
            if ( files->collapse&COLLAPSE_SNPS && tmpl_type&VCF_SNP && line_type&VCF_SNP ) { irec=i; break; }
            // ... or indels
            if ( files->collapse&COLLAPSE_INDELS && tmpl_type&VCF_INDEL && line_type&VCF_INDEL ) { irec=i; break; }

            // More thorough checking: REFs must match
            if ( tmpl->rlen != line->rlen ) continue;  // different length
            if ( strcmp(tmpl->d.allele[0], line->d.allele[0]) ) continue; // the strings do not match

            int ial,jal;
            if ( files->collapse==COLLAPSE_NONE )
            {
                // Exact match, all alleles must be identical
                if ( tmpl->n_allele!=line->n_allele ) continue;   // different number of alleles, skip

                int nmatch = 1; // REF has been already checked
                for (ial=1; ial<tmpl->n_allele; ial++)
                {
                    for (jal=1; jal<line->n_allele; jal++)
                        if ( !strcmp(tmpl->d.allele[ial], line->d.allele[jal]) ) { nmatch++; break; }
                }
                if ( nmatch==tmpl->n_allele ) { irec=i; break; }    // found: exact match
                continue;
            }

            if ( line->n_allele==1 && tmpl->n_allele==1 ) { irec=i; break; }    // both sites are non-variant

            // COLLAPSE_SOME: at least some ALTs must match
            for (ial=1; ial<tmpl->n_allele; ial++)
            {
                for (jal=1; jal<line->n_allele; jal++)
                    if ( !strcmp(tmpl->d.allele[ial], line->d.allele[jal]) ) { irec=i; break; }
                if ( irec>=1 ) break;
            }
            if ( irec>=1 ) break;
        }
        if ( irec==-1 ) return -1;  // no matching line was found
    }

    // Set the selected line (irec) as active: set it to buffer[0], move the remaining lines forward
    // and put the old bcf1_t record at the end.
    bcf1_t *tmp = reader->buffer[0];
    reader->buffer[0] = reader->buffer[irec];
    for (i=irec+1; i<=reader->nbuffer; i++) reader->buffer[i-1] = reader->buffer[i];
    reader->buffer[ reader->nbuffer ] = tmp;
    reader->nbuffer--;

    return 0;
}
            if ( *se=='\t' ) break;
            if ( *se!=',' ) continue;
            reg->als[reg->nals] = &reg->als_str.s[reg->als_str.l];
            kputsn(ss,se-ss,&reg->als_str);
            if ( &reg->als_str.s[reg->als_str.l] - reg->als[reg->nals] > max_len ) max_len = &reg->als_str.s[reg->als_str.l] - reg->als[reg->nals];
            reg->als_str.l++;
            reg->nals++;
            ss = ++se;
        }
        reg->als[reg->nals] = &reg->als_str.s[reg->als_str.l];
        kputsn(ss,se-ss,&reg->als_str);
        if ( &reg->als_str.s[reg->als_str.l] - reg->als[reg->nals] > max_len ) max_len = &reg->als_str.s[reg->als_str.l] - reg->als[reg->nals];
        reg->nals++;
        reg->als_type = max_len > 1 ? VCF_INDEL : VCF_SNP;  // this is a simplified check, see vcf.c:bcf_set_variant_types
    }
    int type = bcf_get_variant_types(rec);
    if ( reg->als_type & VCF_INDEL )
        return type & VCF_INDEL ? 1 : 0;
    return !(type & VCF_INDEL) ? 1 : 0;
}

int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, int start, int end)
{
    int iseq;
    if ( khash_str2int_get(reg->seq_hash, seq, &iseq)<0 ) return -1;    // no such sequence

    if ( reg->prev_seq==-1 || iseq!=reg->prev_seq || reg->prev_start > start ) // new chromosome or after a seek
    {
        // flush regions left on previous chromosome
        if ( reg->missed_reg_handler && reg->prev_seq!=-1 && reg->iseq!=-1 )
            bcf_sr_regions_flush(reg);
Example #7
0
/**************************
 * PROCESS INPUT VCF FILE *
 **************************/
void vcf2raw(char **filename, char **out_filename, char **cross, int *n_parent1,
             char **parent1, int *n_parent2, char **parent2, double *min_class) {
  // We assume the input file exists (checked in R)
  bcf_sweep_t *in_vcf = bcf_sweep_init(*filename);
  if (in_vcf == NULL) {
    bcf_sweep_destroy(in_vcf);
    error("Could not parse input VCF file.");
  }
  bcf_hdr_t *vcf_hdr = bcf_sweep_hdr(in_vcf);

  // Get reference sequence IDs
  int n_seq = 0;
  const char **seq_names = NULL;
  seq_names = bcf_hdr_seqnames(vcf_hdr, &n_seq);
  if (seq_names == NULL || n_seq == 0) {
    free(seq_names);
    error("Could not correctly parse sequence names in VCF file. Is the input file tabix indexed?\n");
  }
  
  // Map parent names to sample indices
  int idx_parent1[*n_parent1];
  int idx_parent2[*n_parent2];
  get_parents_idx(*n_parent1, idx_parent1, *n_parent2, idx_parent2, vcf_hdr, parent1, parent2);

  // Get progeny sample indices (all samples that are not set as parents)
  int n_samples = bcf_hdr_nsamples(vcf_hdr);
  int n_progeny = n_samples - *n_parent1 - *n_parent2;
  if (n_progeny == 0) {
    error("Input file must contain at least one progeny individual.");
  }
  int idx_progeny[n_progeny];
  int i = 0, s;
  for (s = 0; s < n_samples; s++) {
    if (!is_val_in_arr(s, idx_parent1, *n_parent1)) {
      if (!is_val_in_arr(s, idx_parent2, *n_parent2)) {
        idx_progeny[i++] = s;
      }
    }
  }
  
  // Minimum count to assign parent genotype
  int min_class_parent1 = (int)ceil(*min_class * *n_parent1);
  int min_class_parent2 = (int)ceil(*min_class * *n_parent2);

  // Convert cross type
  int cross_type = get_cross_type(cross);

  // We need to write to a temporary file, because the number of markers in the header is unknown
  FILE *temp_f;
  char temp_filename[] = "tmp_raw_XXXXXX";
  int temp_fd;
  temp_fd = mkstemp(temp_filename);
  if (temp_fd == -1) {
    error("Could not open temporary output file.\n");
  }
  unlink(temp_filename);
  temp_f = fdopen(temp_fd, "w+");
  if (temp_f == NULL) {
    error("Could not open temporary output file.\n");
  }

  // CHROM and POS fields will be placed at the end of the output file
  int marker_count = 0;
  int * chrom = malloc(MAX_VARIANTS * sizeof(int));
  if (chrom == NULL) {
    error("Could not allocate vector.\n");
  }
  int * pos = malloc(MAX_VARIANTS * sizeof(int));
  if (pos == NULL) {
    error("Could not allocate vector.\n");
  }

  // Mapping of VCF genotypes to ONEMAP genotypes
  const char * const D_BC_ref[GT_TYPES_LEN] = { "a", "-", "ab", "-", "-", "-", "-" };
  const char * const D_BC_alt[GT_TYPES_LEN] = { "-", "a", "ab", "-", "-", "-", "-" };
  const char * const RI_ref[GT_TYPES_LEN] = { "a", "b", "-", "-", "-", "-", "-" };
  const char * const RI_alt[GT_TYPES_LEN] = { "b", "a", "-", "-", "-", "-", "-" };
  const char * const B3_F2_ref[GT_TYPES_LEN] = { "a", "b", "ab", "-", "-", "-", "-" };
  const char * const B3_F2_alt[GT_TYPES_LEN] = { "b", "a", "ab", "-", "-", "-", "-" };

  // Scan all records in VCF file and print valid markers to output
  bcf1_t *record;
  int32_t *GTs = NULL;
  int nGT_arr = 0;

  while ((record = bcf_sweep_fwd(in_vcf)) && marker_count < MAX_VARIANTS) {
    // We only consider biallelic SNP and INDEL markers
    int var_type = bcf_get_variant_types(record);
    if ((var_type == VCF_SNP || var_type == VCF_INDEL) && record->n_allele == 2) {
      int nGTs = bcf_get_format_int32(vcf_hdr, record, "GT", &GTs, &nGT_arr);
      // We only consider diploid variants (number of alleles in genotypes == 2)
      nGTs /= n_samples;
      if (nGTs == 2) {

        bcf_fmt_t *fmt_ptr = bcf_get_fmt(vcf_hdr, record, "GT");

        // First, check which parents are heterozygous or homozygous (REF or ALT allele)
        bool is_het_parent1 = false, is_hom_ref_parent1 = false, is_hom_alt_parent1 = false;
        get_consensus_parent_gt(fmt_ptr, *n_parent1, idx_parent1, min_class_parent1, &is_het_parent1,
                                &is_hom_ref_parent1, &is_hom_alt_parent1);
        bool is_het_parent2 = false, is_hom_ref_parent2 = false, is_hom_alt_parent2 = false;
        get_consensus_parent_gt(fmt_ptr, *n_parent2, idx_parent2, min_class_parent2, &is_het_parent2,
                                &is_hom_ref_parent2, &is_hom_alt_parent2);

        // Convert to appropriate marker type
        char marker_type[MARKER_TYPE_LEN];
        int type = get_marker_type(marker_type, cross_type,
                                   is_het_parent1, is_hom_ref_parent1, is_hom_alt_parent1,
                                   is_het_parent2, is_hom_ref_parent2, is_hom_alt_parent2);

        const char * const(*type_ptr)[GT_TYPES_LEN];
        bool valid_marker = true;
        switch(type)
        {
        case marker_B3:
        case marker_F2_ref:
          type_ptr = &B3_F2_ref;
          break;
        case marker_F2_alt:
          type_ptr = &B3_F2_alt;
          break;
        case marker_D_ref:
        case marker_BC_ref:
          type_ptr = &D_BC_ref;
          break;
        case marker_D_alt:
        case marker_BC_alt:
          type_ptr = &D_BC_alt;
          break;
        case marker_RI_ref:
          type_ptr = &RI_ref;
          break;
        case marker_RI_alt:
          type_ptr = &RI_alt;
          break;
        default:
          valid_marker = false;
        }

        if (valid_marker) {
          // Store CHROM and POS fields for valid markers
          chrom[marker_count] = record->rid;
          pos[marker_count] = record->pos + 1;

          // Check if marker name exists; if negative, create one
          char *marker_name = record->d.id;
          if (!strcmp(marker_name, ".")) {
            sprintf(marker_name, "%s.%d", seq_names[chrom[marker_count]], pos[marker_count]);
          }

          // Output variant in ONEMAP format to temporary file
          print_record(temp_f, marker_name, marker_type, fmt_ptr, n_progeny, idx_progeny, type_ptr);

          marker_count++;
        }
      }
    }
  }

  // Write final output file header
  FILE *final_f = fopen(*out_filename, "w");
  if (final_f == NULL) {
    error("Could not open output file.\n");
  }
  fprintf(final_f, "data type %s\n", *cross);
  // The next header line contains the following information: number of individuals, number of markers, 1 for the presence of CHROM information, 1 for the presence of POS information and 0 for the absence of phenotypes (these need to be manually included later)
  fprintf(final_f, "%d %d 1 1 0\n", n_progeny, marker_count);
  // The next header line contains the sample names
  char *cur_sample_name = vcf_hdr->samples[idx_progeny[0]];
  fprintf(final_f, "%s", cur_sample_name);
  for (i = 1; i < n_progeny; i++) {
    cur_sample_name = vcf_hdr->samples[idx_progeny[i]];
    fprintf(final_f, "\t%s", cur_sample_name);
  }
  fprintf(final_f, "\n");
  
  // Copy marker data from temporary file to final file
  rewind(temp_f);
  char buf[BUFSIZ];
  size_t size;
  while ((size = fread(buf, 1, BUFSIZ, temp_f))) {
    fwrite(buf, 1, size, final_f);
  }

  // Write CHROM and POS data to output file
  if (marker_count) {
    fprintf(final_f, "*CHROM\t");
    fprintf(final_f, "%s", seq_names[chrom[0]]);
    for (i = 1; i < marker_count; i++) {
      fprintf(final_f, " %s", seq_names[chrom[i]]);
    }
    fprintf(final_f, "\n*POS\t");
    fprintf(final_f, "%d", pos[0]);
    for (i = 1; i < marker_count; i++) {
      fprintf(final_f, " %d", pos[i]);
    }
  }

  // Clean-up
  free(chrom);
  free(pos);

  free(GTs);
  bcf_sweep_destroy(in_vcf);

  fclose(temp_f);
  close(temp_fd);
  fclose(final_f);
}
Example #8
0
static void buffered_filters(args_t *args, bcf1_t *line)
{
    /**
     *  The logic of SnpGap=3. The SNPs at positions 1 and 7 are filtered,
     *  positions 0 and 8 are not:
     *           0123456789
     *      ref  .G.GT..G..
     *      del  .A.G-..A..
     *  Here the positions 1 and 6 are filtered, 0 and 7 are not:
     *           0123-456789
     *      ref  .G.G-..G..
     *      ins  .A.GT..A..
     *
     *  The logic of IndelGap=2. The second indel is filtered:
     *           012345678901
     *      ref  .GT.GT..GT..
     *      del  .G-.G-..G-..
     *  And similarly here, the second is filtered:
     *           01 23 456 78
     *      ref  .A-.A-..A-..
     *      ins  .AT.AT..AT..
     */

    // To avoid additional data structure, we abuse bcf1_t's var and var_type records.
    const int SnpGap_set     = VCF_OTHER<<1;
    const int IndelGap_set   = VCF_OTHER<<2;
    const int IndelGap_flush = VCF_OTHER<<3;

    int var_type = 0, i;
    if ( line )
    {
        // Still on the same chromosome?
        int ilast = rbuf_last(&args->rbuf);
        if ( ilast>=0 && line->rid != args->rbuf_lines[ilast]->rid )
            flush_buffer(args, args->rbuf.n); // new chromosome, flush everything

        rbuf_expand0(&args->rbuf,bcf1_t*,args->rbuf.n,args->rbuf_lines);

        // Insert the new record in the buffer. The line would be overwritten in
        // the next bcf_sr_next_line call, therefore we need to swap it with an
        // unused one
        ilast = rbuf_append(&args->rbuf);
        if ( !args->rbuf_lines[ilast] ) args->rbuf_lines[ilast] = bcf_init1();
        SWAP(bcf1_t*, args->files->readers[0].buffer[0], args->rbuf_lines[ilast]);

        var_type = bcf_get_variant_types(line);

        // Find out the size of an indel. The indel boundaries are based on REF
        // (POS+1,POS+rlen-1). This is not entirely correct: mpileup likes to
        // output REF=CAGAGAGAGA, ALT=CAGAGAGAGAGA where REF=C,ALT=CGA could be
        // used. This filter is therefore more strict and may remove some valid
        // SNPs.
        int len = 1;
        if ( var_type & VCF_INDEL )
        {
            for (i=1; i<line->n_allele; i++)
                if ( len < 1-line->d.var[i].n ) len = 1-line->d.var[i].n;
        }

        // Set the REF allele's length to max deletion length or to 1 if a SNP or an insertion.
        line->d.var[0].n = len;
    }

    int k_flush = 1;
    if ( args->indel_gap )
    {
        k_flush = 0;
        // Find indels which are too close to each other
        int last_to = -1;
        for (i=-1; rbuf_next(&args->rbuf,&i); )
        {
            bcf1_t *rec  = args->rbuf_lines[i];
            int rec_from = rec->pos;
            if ( last_to!=-1 && last_to < rec_from ) break;

            k_flush++;
            if ( !(rec->d.var_type & VCF_INDEL) ) continue;

            rec->d.var_type |= IndelGap_set;
            last_to = args->indel_gap + rec->pos + rec->d.var[0].n - 1;
        }
        if ( i==args->rbuf.f && line && last_to!=-1 ) k_flush = 0;
        if ( k_flush || !line )
        {
            // Select the best indel from the cluster of k_flush indels
            int k = 0, max_ac = -1, imax_ac = -1;
            for (i=-1; rbuf_next(&args->rbuf,&i) && k<k_flush; )
            {
                k++;
                bcf1_t *rec  = args->rbuf_lines[i];
                if ( !(rec->d.var_type & IndelGap_set) ) continue;
                hts_expand(int, rec->n_allele, args->ntmpi, args->tmpi);
                int ret = bcf_calc_ac(args->hdr, rec, args->tmpi, BCF_UN_ALL);
                if ( imax_ac==-1 || (ret && max_ac < args->tmpi[1]) ) { max_ac = args->tmpi[1]; imax_ac = i; }
            }

            // Filter all but the best indel (with max AF or first if AF not available)
            k = 0;
            for (i=-1; rbuf_next(&args->rbuf,&i) && k<k_flush; )
            {
                k++;
                bcf1_t *rec = args->rbuf_lines[i];
                if ( !(rec->d.var_type & IndelGap_set) ) continue;
                rec->d.var_type |= IndelGap_flush;
                if ( i!=imax_ac ) bcf_add_filter(args->hdr, args->rbuf_lines[i], args->IndelGap_id);
            }
        }
Example #9
0
int subset_vcf(args_t *args, bcf1_t *line)
{
    if ( args->min_alleles && line->n_allele < args->min_alleles ) return 0; // min alleles
    if ( args->max_alleles && line->n_allele > args->max_alleles ) return 0; // max alleles
    if (args->novel || args->known)
    {
        if ( args->novel && (line->d.id[0]!='.' || line->d.id[1]!=0) ) return 0; // skip sites which are known, ID != '.'
        if ( args->known && line->d.id[0]=='.' && line->d.id[1]==0 ) return 0;  // skip sites which are novel, ID == '.'
    }

    if (args->include || args->exclude)
    {
        int line_type = bcf_get_variant_types(line);
        if ( args->include && !(line_type&args->include) ) return 0; // include only given variant types
        if ( args->exclude &&   line_type&args->exclude  ) return 0; // exclude given variant types
    }

    if ( args->filter )
    {
        int ret = filter_test(args->filter, line, NULL);
        if ( args->filter_logic==FLT_INCLUDE ) { if ( !ret ) return 0; }
        else if ( ret ) return 0;
    }

    hts_expand(int, line->n_allele, args->mac, args->ac);
    int i, an = 0, non_ref_ac = 0;
    if (args->calc_ac) {
        bcf_calc_ac(args->hdr, line, args->ac, BCF_UN_INFO|BCF_UN_FMT); // get original AC and AN values from INFO field if available, otherwise calculate
        for (i=1; i<line->n_allele; i++)
            non_ref_ac += args->ac[i];
        for (i=0; i<line->n_allele; i++)
            an += args->ac[i];
    }

    if (args->n_samples)
    {
        int non_ref_ac_sub = 0, *ac_sub = (int*) calloc(line->n_allele,sizeof(int));
        bcf_subset(args->hdr, line, args->n_samples, args->imap);
        if (args->calc_ac) {
            bcf_calc_ac(args->hsub, line, ac_sub, BCF_UN_FMT); // recalculate AC and AN
            an = 0;
            for (i=0; i<line->n_allele; i++) {
                args->ac[i] = ac_sub[i];
                an += ac_sub[i];
            }
            for (i=1; i<line->n_allele; i++)
                non_ref_ac_sub += ac_sub[i];
            if (args->private_vars) {
                if (args->private_vars == FLT_INCLUDE && !(non_ref_ac_sub > 0 && non_ref_ac == non_ref_ac_sub)) { free(ac_sub); return 0; } // select private sites
                if (args->private_vars == FLT_EXCLUDE && non_ref_ac_sub > 0 && non_ref_ac == non_ref_ac_sub) { free(ac_sub); return 0; } // exclude private sites
            }
            non_ref_ac = non_ref_ac_sub;
        }
        free(ac_sub);
    }

    bcf_fmt_t *gt_fmt;
    if ( args->gt_type && (gt_fmt=bcf_get_fmt(args->hdr,line,"GT")) )
    {
        int nhet = 0, nhom = 0, nmiss = 0;
        for (i=0; i<bcf_hdr_nsamples(args->hdr); i++)
        {
            int type = bcf_gt_type(gt_fmt,i,NULL,NULL);
            if ( type==GT_HET_RA || type==GT_HET_AA )
            {
                if ( args->gt_type==GT_NO_HET ) return 0;
                nhet = 1;
            }
            else if ( type==GT_UNKN )
            {
                if ( args->gt_type==GT_NO_MISSING ) return 0;
                nmiss = 1;
            }
            else
            {
                if ( args->gt_type==GT_NO_HOM ) return 0;
                nhom = 1;
            }
        }
        if ( args->gt_type==GT_NEED_HOM && !nhom ) return 0;
        else if ( args->gt_type==GT_NEED_HET && !nhet ) return 0;
        else if ( args->gt_type==GT_NEED_MISSING && !nmiss ) return 0;
    }

    int minor_ac = 0;
    int major_ac = 0;
    if ( args->calc_ac )
    {
        minor_ac = args->ac[0];
        major_ac = args->ac[0];
        for (i=1; i<line->n_allele; i++){
            if (args->ac[i] < minor_ac) { minor_ac = args->ac[i]; }
            if (args->ac[i] > major_ac) { major_ac = args->ac[i]; }
        }
    }

    if (args->min_ac)
    {
        if (args->min_ac_type == ALLELE_NONREF && args->min_ac>non_ref_ac) return 0; // min AC
        else if (args->min_ac_type == ALLELE_MINOR && args->min_ac>minor_ac) return 0; // min minor AC
        else if (args->min_ac_type == ALLELE_ALT1 && args->min_ac>args->ac[1]) return 0; // min 1st alternate AC
        else if (args->min_ac_type == ALLELE_MAJOR && args->min_ac > major_ac) return 0; // min major AC
        else if (args->min_ac_type == ALLELE_NONMAJOR && args->min_ac > an-major_ac) return 0; // min non-major AC
    }
    if (args->max_ac)
    {
        if (args->max_ac_type == ALLELE_NONREF && args->max_ac<non_ref_ac) return 0; // max AC
        else if (args->max_ac_type == ALLELE_MINOR && args->max_ac<minor_ac) return 0; // max minor AC
        else if (args->max_ac_type == ALLELE_ALT1 && args->max_ac<args->ac[1]) return 0; // max 1st alternate AC
        else if (args->max_ac_type == ALLELE_MAJOR && args->max_ac < major_ac) return 0; // max major AC
        else if (args->max_ac_type == ALLELE_NONMAJOR && args->max_ac < an-major_ac) return 0; // max non-major AC
    }
    if (args->min_af)
    {
        if (an == 0) return 0; // freq not defined, skip site
        if (args->min_af_type == ALLELE_NONREF && args->min_af>non_ref_ac/(double)an) return 0; // min AF
        else if (args->min_af_type == ALLELE_MINOR && args->min_af>minor_ac/(double)an) return 0; // min minor AF
        else if (args->min_af_type == ALLELE_ALT1 && args->min_af>args->ac[1]/(double)an) return 0; // min 1st alternate AF
        else if (args->min_af_type == ALLELE_MAJOR && args->min_af > major_ac/(double)an) return 0; // min major AF
        else if (args->min_af_type == ALLELE_NONMAJOR && args->min_af > (an-major_ac)/(double)an) return 0; // min non-major AF
    }
    if (args->max_af)
    {
        if (an == 0) return 0; // freq not defined, skip site
        if (args->max_af_type == ALLELE_NONREF && args->max_af<non_ref_ac/(double)an) return 0; // max AF
        else if (args->max_af_type == ALLELE_MINOR && args->max_af<minor_ac/(double)an) return 0; // max minor AF
        else if (args->max_af_type == ALLELE_ALT1 && args->max_af<args->ac[1]/(double)an) return 0; // max 1st alternate AF
        else if (args->max_af_type == ALLELE_MAJOR && args->max_af < major_ac/(double)an) return 0; // max major AF
        else if (args->max_af_type == ALLELE_NONMAJOR && args->max_af < (an-major_ac)/(double)an) return 0; // max non-major AF
    }
    if (args->uncalled) {
        if (args->uncalled == FLT_INCLUDE && an > 0) return 0; // select uncalled
        if (args->uncalled == FLT_EXCLUDE && an == 0) return 0; // skip if uncalled
    }
    if (args->calc_ac && args->update_info) {
        bcf_update_info_int32(args->hdr, line, "AC", &args->ac[1], line->n_allele-1);
        bcf_update_info_int32(args->hdr, line, "AN", &an, 1);
    }
    if (args->trim_alts)
    {
        int ret = bcf_trim_alleles(args->hsub ? args->hsub : args->hdr, line);
        if ( ret==-1 ) error("Error: some GT index is out of bounds at %s:%d\n", bcf_seqname(args->hsub ? args->hsub : args->hdr, line), line->pos+1);
    }
    if (args->phased) {
        int phased = bcf_all_phased(args->hdr, line);
        if (args->phased == FLT_INCLUDE && !phased) { return 0; } // skip unphased
        if (args->phased == FLT_EXCLUDE && phased) { return 0; } // skip phased
    }
    if (args->sites_only) bcf_subset(args->hsub ? args->hsub : args->hdr, line, 0, 0);
    return 1;
}