/*
 * process() - annotates indel alleles of a record with their frameshift status.
 *
 * For every ALT allele the INFO tag "OOF" receives:
 *    1 .. the number of inserted/deleted bases inside the exon is not a multiple of 3 (out-of-frame)
 *    0 .. the number is a multiple of 3 (in-frame)
 *   -1 .. not applicable (the allele is not an indel or it lies outside of the exon)
 *
 * Records that are not variants, carry no indel, or do not overlap the current
 * exon region are left untouched.
 *
 * Returns 0 on success (including the untouched cases) and -1 when the INFO
 * tag could not be updated.
 */
int process(bcf1_t *rec)
{
    if ( rec->n_allele<2 ) return 0;    // not a variant

    int type = bcf_get_variant_types(rec);
    if ( !(type&VCF_INDEL) ) return 0;  // not an indel

    // Longest deletion among the alleles: d.var[i].n is negative for deletions,
    // therefore the minimum is kept (0 if there are insertions only).
    int i, max_del = 0;
    for (i=1; i<rec->n_allele; i++)
        if ( max_del > rec->d.var[i].n ) max_del = rec->d.var[i].n;

    // The queried interval must reach the last deleted base. The previous
    // expression "len!=0 ? rec->pos : rec->pos - len" evaluated to rec->pos in
    // both branches, so deletions starting upstream of an exon were never seen.
    int pos_to = rec->pos - max_del;    // max_del is negative or zero
    if ( bcf_sr_regions_overlap(exons, bcf_seqname(in_hdr,rec),rec->pos,pos_to) ) return 0;  // no overlap

    hts_expand(int32_t,rec->n_allele-1,nfrm,frm);
    for (i=1; i<rec->n_allele; i++)
    {
        if ( rec->d.var[i].type!=VCF_INDEL ) { frm[i-1] = -1; continue; }

        int len = rec->d.var[i].n, tlen = 0;
        if ( len>0 )
        {
            // insertion: counts only when placed inside the exon
            if ( exons->start <= rec->pos && exons->end > rec->pos ) tlen = abs(len);
        }
        else if ( exons->start <= rec->pos + abs(len) )
        {
            // deletion: the deleted bases are rec->pos+1 .. rec->pos+abs(len)
            tlen = abs(len);
            if ( rec->pos < exons->start )              // trim the beginning
                tlen -= exons->start - rec->pos - 1;    // bases rec->pos+1 .. exons->start-1 lie before the exon
            if ( exons->end < rec->pos + abs(len) )     // trim the end
                tlen -= rec->pos + abs(len) - exons->end;
        }
        if ( tlen )     // there are some deleted/inserted bases in the exon
        {
            if ( tlen%3 ) frm[i-1] = 1; // out-of-frame
            else frm[i-1] = 0;          // in-frame
        }
        else frm[i-1] = -1;             // not applicable (is outside)
    }

    if ( bcf_update_info_int32(out_hdr,rec,"OOF",frm,rec->n_allele-1)<0 ) return -1;
    return 0;
}
/*
 *  collapse_buffer() - drops redundant records sharing the position of the
 *  first buffered record. Which records count as redundant is determined by
 *  files->collapse: of several <any|snp|indel> lines only the first one is
 *  kept. Redundant lines are flagged by setting their position to -1 and are
 *  then swapped behind the retained lines; reader->nbuffer is shrunk so that
 *  it covers the leading run of retained lines.
 */
static void collapse_buffer(bcf_srs_t *files, bcf_sr_t *reader)
{
    bcf1_t **buf = reader->buffer;
    const int collapse = files->collapse;
    int seen_any = 0, seen_snp = 0, seen_indel = 0;
    int i;

    // Pass 1: flag the duplicates at the position of the first record
    for (i = 1; i <= reader->nbuffer; i++)
    {
        bcf1_t *rec = buf[i];
        if ( rec->pos != buf[1]->pos ) break;

        if ( collapse & COLLAPSE_ANY )
        {
            if ( seen_any ) rec->pos = -1;
            seen_any = 1;
        }

        int rec_type = bcf_get_variant_types(rec);

        if ( (collapse & COLLAPSE_SNPS) && (rec_type & (VCF_SNP|VCF_MNP)) )
        {
            if ( seen_snp ) rec->pos = -1;
            seen_snp = 1;
        }
        if ( (collapse & COLLAPSE_INDELS) && (rec_type & VCF_INDEL) )
        {
            if ( seen_indel ) rec->pos = -1;
            seen_indel = 1;
        }
    }

    // Pass 2: swap each flagged line with the next unflagged one found behind it
    int dst = 1, src = 1;
    while ( dst <= reader->nbuffer && src <= reader->nbuffer )
    {
        if ( buf[dst]->pos != -1 )
        {
            dst++;
            continue;
        }

        if ( src <= dst ) src = dst + 1;
        while ( src <= reader->nbuffer && buf[src]->pos == -1 ) src++;

        if ( src <= reader->nbuffer )
        {
            bcf1_t *swap = buf[dst];
            buf[dst] = buf[src];
            buf[src] = swap;
        }
    }
    reader->nbuffer = dst - 1;
}
/* Stores the value returned by bcf_get_variant_types() for the record in the token's numeric value. */
static void filters_set_type(bcf1_t *line, token_t *tok)
{
    const int variant_types = bcf_get_variant_types(line);
    tok->num_value = variant_types;
}
/*
 *  bcf_sr_sort_set() - rebuilds the sorter state for the records buffered at chr:min_pos.
 *
 *  The steps, in the order they appear below:
 *    1. on the first call, set up the pairing logic, the scores and the two string hashes;
 *    2. free the keys left from the previous call, clear both hashes and reset the counters;
 *    3. for each active reader, describe every record at chr:min_pos by a "REF>ALT" string,
 *       register it as a variant (var_t) and collect the variant indices in a group; readers
 *       yielding the same group key share one grp_t;
 *    4. per variant, build a bitmask of the groups it occurs in;
 *    5. create one variant set per variant and translate VCF_* types to SR_* types;
 *    6. zero the pairing matrix and copy the per-set counts;
 *    7. repeatedly pick the set with the highest count and either merge it with the best
 *       scoring compatible set or hand it to push_vset().
 *
 *  readers  .. the synced reader; its collapse setting may be changed on the first call
 *  srt      .. the sorter state to fill
 *  chr      .. sequence name; the pointer itself is stored in srt->chr (not copied)
 *  min_pos  .. position of the records to process, compared against bcf1_t.pos
 */
static void bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, int min_pos)
{
    if ( !srt->grp_str2int )
    {
        // first time here, initialize
        if ( !srt->pair )
        {
            if ( readers->collapse==COLLAPSE_NONE ) readers->collapse = BCF_SR_PAIR_EXACT;
            bcf_sr_set_opt(readers, BCF_SR_PAIR_LOGIC, readers->collapse);
        }
        bcf_sr_init_scores(srt);
        srt->grp_str2int = khash_str2int_init();
        srt->var_str2int = khash_str2int_init();
    }

    // The hash keys are heap strings (strdup'ed variant strings and group keys set below),
    // release them before the hashes are cleared
    int k;
    khash_t(str2int) *hash;
    hash = srt->grp_str2int;
    for (k=0; k < kh_end(hash); k++)
        if ( kh_exist(hash,k) ) free((char*)kh_key(hash,k));
    hash = srt->var_str2int;
    for (k=0; k < kh_end(hash); k++)
        if ( kh_exist(hash,k) ) free((char*)kh_key(hash,k));
    kh_clear(str2int, srt->grp_str2int);
    kh_clear(str2int, srt->var_str2int);
    srt->ngrp = srt->nvar = srt->nvset = 0;

    // Scratch group filled per reader; its content is handed over to srt->grp when the
    // reader's key is new
    grp_t grp;
    memset(&grp,0,sizeof(grp_t));

    // group VCFs into groups, each with a unique combination of variants in the duplicate lines
    int ireader,ivar,irec,igrp,ivset,iact;
    for (ireader=0; ireader<readers->nreaders; ireader++) srt->vcf_buf[ireader].nrec = 0;
    for (iact=0; iact<srt->nactive; iact++)
    {
        ireader = srt->active[iact];
        bcf_sr_t *reader = &readers->readers[ireader];
        int rid = bcf_hdr_name2id(reader->header, chr);
        grp.nvar = 0;
        hts_expand(int,reader->nbuffer,srt->moff,srt->off);
        srt->noff = 0;
        srt->str.l = 0;
        for (irec=1; irec<=reader->nbuffer; irec++)
        {
            bcf1_t *line = reader->buffer[irec];
            if ( line->rid!=rid || line->pos!=min_pos ) break;

            // Append this record's description to srt->str: records are separated by ';',
            // alleles by ',', each allele is written as REF>ALT; srt->off remembers where
            // each record's description starts
            if ( srt->str.l ) kputc(';',&srt->str);
            srt->off[srt->noff++] = srt->str.l;
            size_t beg = srt->str.l;
            for (ivar=1; ivar<line->n_allele; ivar++)
            {
                if ( ivar>1 ) kputc(',',&srt->str);
                kputs(line->d.allele[0],&srt->str);
                kputc('>',&srt->str);
                kputs(line->d.allele[ivar],&srt->str);
            }
            if ( line->n_allele==1 )
            {
                // no ALT allele, written as "REF>."
                kputs(line->d.allele[0],&srt->str);
                kputsn(">.",2,&srt->str);
            }

            // Create new variant or attach to existing one. But careful, there can be duplicate
            // records with the same POS,REF,ALT (e.g. in dbSNP-b142)
            char *var_str = beg + srt->str.s;
            int ret, var_idx = 0, var_end = srt->str.l;
            while ( 1 )
            {
                ret = khash_str2int_get(srt->var_str2int, var_str, &ivar);
                if ( ret==-1 ) break;   // the key is not known yet

                var_t *var = &srt->var[ivar];
                if ( var->vcf[var->nvcf-1] != ireader ) break;  // known, last added by a different reader: attach to it

                // The same reader has this key already: make the key unique by appending a
                // running number after the original end of the string and look it up again
                srt->str.l = var_end;
                kputw(var_idx, &srt->str);
                var_str = beg + srt->str.s;     // the buffer may have been reallocated
                var_idx++;
            }
            if ( ret==-1 )
            {
                ivar = srt->nvar++;
                hts_expand0(var_t,srt->nvar,srt->mvar,srt->var);
                srt->var[ivar].nvcf = 0;
                khash_str2int_set(srt->var_str2int, strdup(var_str), ivar);
                free(srt->var[ivar].str);   // possible left-over from the previous position
            }
            var_t *var = &srt->var[ivar];
            var->nalt = line->n_allele - 1;
            var->type = bcf_get_variant_types(line);
            srt->str.s[var_end] = 0;    // cut off the numeric suffix, var->str keeps the plain description
            if ( ret==-1 )
                var->str = strdup(var_str);

            // Keep var->rec the same capacity as var->vcf
            // NOTE(review): the realloc result is not checked - confirm the allocation policy
            int mvcf = var->mvcf;
            var->nvcf++;
            hts_expand0(int*, var->nvcf, var->mvcf, var->vcf);
            if ( mvcf != var->mvcf ) var->rec = (bcf1_t **) realloc(var->rec,sizeof(bcf1_t*)*var->mvcf);
            var->vcf[var->nvcf-1] = ireader;
            var->rec[var->nvcf-1] = line;

            grp.nvar++;
            hts_expand(var_t,grp.nvar,grp.mvar,grp.var);
            grp.var[grp.nvar-1] = ivar;
        }

        // The key comes from grp_create_key(), which is not shown here; presumably it is built
        // from srt->str and srt->off - TODO confirm. Ownership: the key is either stored in the
        // hash and in the new group, or freed right away.
        char *grp_key = grp_create_key(srt);
        int ret = khash_str2int_get(srt->grp_str2int, grp_key, &igrp);
        if ( ret==-1 )
        {
            igrp = srt->ngrp++;
            hts_expand0(grp_t, srt->ngrp, srt->mgrp, srt->grp);
            free(srt->grp[igrp].var);
            srt->grp[igrp] = grp;
            srt->grp[igrp].key = grp_key;
            khash_str2int_set(srt->grp_str2int, grp_key, igrp);
            memset(&grp,0,sizeof(grp_t));   // grp.var now belongs to srt->grp[igrp], start from scratch
        }
        else
            free(grp_key);
        srt->grp[igrp].nvcf++;
    }
    free(grp.var);

    // initialize bitmask - which groups is the variant present in
    for (ivar=0; ivar<srt->nvar; ivar++)
    {
        srt->var[ivar].mask = kbs_resize(srt->var[ivar].mask, srt->ngrp);
        kbs_clear(srt->var[ivar].mask);
    }
    for (igrp=0; igrp<srt->ngrp; igrp++)
    {
        for (ivar=0; ivar<srt->grp[igrp].nvar; ivar++)
        {
            int i = srt->grp[igrp].var[ivar];
            kbs_insert(srt->var[i].mask, igrp);
        }
    }

    // create the initial list of variant sets
    for (ivar=0; ivar<srt->nvar; ivar++)
    {
        ivset = srt->nvset++;
        hts_expand0(varset_t, srt->nvset, srt->mvset, srt->vset);

        // each set starts with a single variant and inherits its count and group mask
        varset_t *vset = &srt->vset[ivset];
        vset->nvar = 1;
        hts_expand0(var_t, vset->nvar, vset->mvar, vset->var);
        vset->var[vset->nvar-1] = ivar;
        var_t *var = &srt->var[ivar];
        vset->cnt = var->nvcf;
        vset->mask = kbs_resize(vset->mask, srt->ngrp);
        kbs_clear(vset->mask);
        kbs_bitwise_or(vset->mask, var->mask);

        // translate the VCF_* type bits to SR_* bits; MNPs are treated as SNPs
        int type = 0;
        if ( var->type==VCF_REF ) type |= SR_REF;
        else
        {
            if ( var->type & VCF_SNP ) type |= SR_SNP;
            if ( var->type & VCF_MNP ) type |= SR_SNP;
            if ( var->type & VCF_INDEL ) type |= SR_INDEL;
            if ( var->type & VCF_OTHER ) type |= SR_OTHER;
        }
        var->type = type;
    }
#if DEBUG_VSETS
    debug_vsets(srt);
#endif

    // initialize the pairing matrix
    hts_expand(int, srt->ngrp*srt->nvset, srt->mpmat, srt->pmat);
    hts_expand(int, srt->nvset, srt->mcnt, srt->cnt);
    memset(srt->pmat, 0, sizeof(*srt->pmat)*srt->ngrp*srt->nvset);
    for (ivset=0; ivset<srt->nvset; ivset++)
    {
        varset_t *vset = &srt->vset[ivset];
        for (igrp=0; igrp<srt->ngrp; igrp++) srt->pmat[ivset*srt->ngrp+igrp] = 0;
        srt->cnt[ivset] = vset->cnt;
    }

    // pair the lines
    // (the loop relies on merge_vsets() and push_vset() to reduce srt->nvset; neither is shown here)
    while ( srt->nvset )
    {
#if DEBUG_VSETS
    fprintf(stderr,"\n");
    debug_vsets(srt);
#endif

        // the set with the highest count; the first one wins on ties
        int imax = 0;
        for (ivset=1; ivset<srt->nvset; ivset++)
            if ( srt->cnt[imax] < srt->cnt[ivset] ) imax = ivset;

        // the best scoring partner among the sets which share no group with imax
        int ipair = -1;
        uint32_t max_score = 0;
        for (ivset=0; ivset<srt->nvset; ivset++)
        {
            if ( kbs_logical_and(srt->vset[imax].mask,srt->vset[ivset].mask) ) continue;   // cannot be merged
            uint32_t score = pairing_score(srt, imax, ivset);
            // fprintf(stderr,"score: %d %d, logic=%d \t..\t %u\n", imax,ivset,srt->pair,score);
            if ( max_score < score ) { max_score = score; ipair = ivset; }
        }

        // merge rows creating a new variant set this way
        if ( ipair!=-1 && ipair!=imax )
        {
            imax = merge_vsets(srt, imax, ipair);
            continue;
        }

        push_vset(srt, imax);
    }

    srt->chr = chr;
    srt->pos = min_pos;
}
/*
 *  _reader_match_alleles() - picks, among the buffered lines sharing the position
 *  of the first buffered line, the first one compatible with the template line
 *  and makes it the active record (buffer[0]). What "compatible" means depends
 *  on files->collapse (COLLAPSE_*). Without a template the first buffered line
 *  is taken.
 *
 *  Returns 0 on success or -1 when no matching line was found; in the latter
 *  case the buffer is left as it was.
 */
static int _reader_match_alleles(bcf_srs_t *files, bcf_sr_t *reader, bcf1_t *tmpl)
{
    int selected = -1;

    if ( tmpl==NULL )
    {
        // no template given, use the first available record
        selected = 1;
    }
    else
    {
        const int tmpl_type = bcf_get_variant_types(tmpl);
        int cand;
        for (cand=1; cand<=reader->nbuffer; cand++)
        {
            bcf1_t *line = reader->buffer[cand];
            if ( line->pos != reader->buffer[1]->pos ) break;   // past the lines at this position

            int matched = 0;
            if ( files->collapse & COLLAPSE_ANY )
            {
                // the position alone decides
                matched = 1;
            }
            else
            {
                const int line_type = bcf_get_variant_types(line);

                if ( (files->collapse & COLLAPSE_SNPS) && (tmpl_type & VCF_SNP) && (line_type & VCF_SNP) )
                {
                    // both are SNPs, the alleles do not matter
                    matched = 1;
                }
                else if ( (files->collapse & COLLAPSE_INDELS) && (tmpl_type & VCF_INDEL) && (line_type & VCF_INDEL) )
                {
                    // both are indels, the alleles do not matter
                    matched = 1;
                }
                else if ( tmpl->rlen == line->rlen && strcmp(tmpl->d.allele[0], line->d.allele[0]) == 0 )
                {
                    // the REF alleles agree, now look at the ALTs
                    int a, b;
                    if ( files->collapse == COLLAPSE_NONE )
                    {
                        // exact match: same number of alleles and every template ALT found in the line
                        if ( tmpl->n_allele == line->n_allele )
                        {
                            int nfound = 1;     // REF is counted as well
                            for (a=1; a<tmpl->n_allele; a++)
                            {
                                for (b=1; b<line->n_allele; b++)
                                {
                                    if ( strcmp(tmpl->d.allele[a], line->d.allele[b]) == 0 )
                                    {
                                        nfound++;
                                        break;
                                    }
                                }
                            }
                            if ( nfound == tmpl->n_allele ) matched = 1;
                        }
                    }
                    else if ( line->n_allele==1 && tmpl->n_allele==1 )
                    {
                        // both sites are non-variant
                        matched = 1;
                    }
                    else
                    {
                        // COLLAPSE_SOME: a single shared ALT is enough
                        for (a=1; a<tmpl->n_allele && !matched; a++)
                        {
                            for (b=1; b<line->n_allele; b++)
                            {
                                if ( strcmp(tmpl->d.allele[a], line->d.allele[b]) == 0 )
                                {
                                    matched = 1;
                                    break;
                                }
                            }
                        }
                    }
                }
            }

            if ( matched )
            {
                selected = cand;
                break;
            }
        }
        if ( selected < 0 ) return -1;  // nothing suitable in the buffer
    }

    // Activate the selected line: it goes to buffer[0], the lines behind it move
    // one slot forward and the previous buffer[0] record is parked at the end.
    bcf1_t *recycled = reader->buffer[0];
    reader->buffer[0] = reader->buffer[selected];
    int k = selected;
    while ( k < reader->nbuffer )
    {
        reader->buffer[k] = reader->buffer[k+1];
        k++;
    }
    reader->buffer[ reader->nbuffer ] = recycled;
    reader->nbuffer--;

    return 0;
}
if ( *se=='\t' ) break; if ( *se!=',' ) continue; reg->als[reg->nals] = ®->als_str.s[reg->als_str.l]; kputsn(ss,se-ss,®->als_str); if ( ®->als_str.s[reg->als_str.l] - reg->als[reg->nals] > max_len ) max_len = ®->als_str.s[reg->als_str.l] - reg->als[reg->nals]; reg->als_str.l++; reg->nals++; ss = ++se; } reg->als[reg->nals] = ®->als_str.s[reg->als_str.l]; kputsn(ss,se-ss,®->als_str); if ( ®->als_str.s[reg->als_str.l] - reg->als[reg->nals] > max_len ) max_len = ®->als_str.s[reg->als_str.l] - reg->als[reg->nals]; reg->nals++; reg->als_type = max_len > 1 ? VCF_INDEL : VCF_SNP; // this is a simplified check, see vcf.c:bcf_set_variant_types } int type = bcf_get_variant_types(rec); if ( reg->als_type & VCF_INDEL ) return type & VCF_INDEL ? 1 : 0; return !(type & VCF_INDEL) ? 1 : 0; } int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, int start, int end) { int iseq; if ( khash_str2int_get(reg->seq_hash, seq, &iseq)<0 ) return -1; // no such sequence if ( reg->prev_seq==-1 || iseq!=reg->prev_seq || reg->prev_start > start ) // new chromosome or after a seek { // flush regions left on previous chromosome if ( reg->missed_reg_handler && reg->prev_seq!=-1 && reg->iseq!=-1 ) bcf_sr_regions_flush(reg);
/************************** * PROCESS INPUT VCF FILE * **************************/ void vcf2raw(char **filename, char **out_filename, char **cross, int *n_parent1, char **parent1, int *n_parent2, char **parent2, double *min_class) { // We assume the input file exists (checked in R) bcf_sweep_t *in_vcf = bcf_sweep_init(*filename); if (in_vcf == NULL) { bcf_sweep_destroy(in_vcf); error("Could not parse input VCF file."); } bcf_hdr_t *vcf_hdr = bcf_sweep_hdr(in_vcf); // Get reference sequence IDs int n_seq = 0; const char **seq_names = NULL; seq_names = bcf_hdr_seqnames(vcf_hdr, &n_seq); if (seq_names == NULL || n_seq == 0) { free(seq_names); error("Could not correctly parse sequence names in VCF file. Is the input file tabix indexed?\n"); } // Map parent names to sample indices int idx_parent1[*n_parent1]; int idx_parent2[*n_parent2]; get_parents_idx(*n_parent1, idx_parent1, *n_parent2, idx_parent2, vcf_hdr, parent1, parent2); // Get progeny sample indices (all samples that are not set as parents) int n_samples = bcf_hdr_nsamples(vcf_hdr); int n_progeny = n_samples - *n_parent1 - *n_parent2; if (n_progeny == 0) { error("Input file must contain at least one progeny individual."); } int idx_progeny[n_progeny]; int i = 0, s; for (s = 0; s < n_samples; s++) { if (!is_val_in_arr(s, idx_parent1, *n_parent1)) { if (!is_val_in_arr(s, idx_parent2, *n_parent2)) { idx_progeny[i++] = s; } } } // Minimum count to assign parent genotype int min_class_parent1 = (int)ceil(*min_class * *n_parent1); int min_class_parent2 = (int)ceil(*min_class * *n_parent2); // Convert cross type int cross_type = get_cross_type(cross); // We need to write to a temporary file, because the number of markers in the header is unknown FILE *temp_f; char temp_filename[] = "tmp_raw_XXXXXX"; int temp_fd; temp_fd = mkstemp(temp_filename); if (temp_fd == -1) { error("Could not open temporary output file.\n"); } unlink(temp_filename); temp_f = fdopen(temp_fd, "w+"); if (temp_f == NULL) { error("Could not open 
temporary output file.\n"); } // CHROM and POS fields will be placed at the end of the output file int marker_count = 0; int * chrom = malloc(MAX_VARIANTS * sizeof(int)); if (chrom == NULL) { error("Could not allocate vector.\n"); } int * pos = malloc(MAX_VARIANTS * sizeof(int)); if (pos == NULL) { error("Could not allocate vector.\n"); } // Mapping of VCF genotypes to ONEMAP genotypes const char * const D_BC_ref[GT_TYPES_LEN] = { "a", "-", "ab", "-", "-", "-", "-" }; const char * const D_BC_alt[GT_TYPES_LEN] = { "-", "a", "ab", "-", "-", "-", "-" }; const char * const RI_ref[GT_TYPES_LEN] = { "a", "b", "-", "-", "-", "-", "-" }; const char * const RI_alt[GT_TYPES_LEN] = { "b", "a", "-", "-", "-", "-", "-" }; const char * const B3_F2_ref[GT_TYPES_LEN] = { "a", "b", "ab", "-", "-", "-", "-" }; const char * const B3_F2_alt[GT_TYPES_LEN] = { "b", "a", "ab", "-", "-", "-", "-" }; // Scan all records in VCF file and print valid markers to output bcf1_t *record; int32_t *GTs = NULL; int nGT_arr = 0; while ((record = bcf_sweep_fwd(in_vcf)) && marker_count < MAX_VARIANTS) { // We only consider biallelic SNP and INDEL markers int var_type = bcf_get_variant_types(record); if ((var_type == VCF_SNP || var_type == VCF_INDEL) && record->n_allele == 2) { int nGTs = bcf_get_format_int32(vcf_hdr, record, "GT", >s, &nGT_arr); // We only consider diploid variants (number of alleles in genotypes == 2) nGTs /= n_samples; if (nGTs == 2) { bcf_fmt_t *fmt_ptr = bcf_get_fmt(vcf_hdr, record, "GT"); // First, check which parents are heterozygous or homozygous (REF or ALT allele) bool is_het_parent1 = false, is_hom_ref_parent1 = false, is_hom_alt_parent1 = false; get_consensus_parent_gt(fmt_ptr, *n_parent1, idx_parent1, min_class_parent1, &is_het_parent1, &is_hom_ref_parent1, &is_hom_alt_parent1); bool is_het_parent2 = false, is_hom_ref_parent2 = false, is_hom_alt_parent2 = false; get_consensus_parent_gt(fmt_ptr, *n_parent2, idx_parent2, min_class_parent2, &is_het_parent2, 
&is_hom_ref_parent2, &is_hom_alt_parent2); // Convert to appropriate marker type char marker_type[MARKER_TYPE_LEN]; int type = get_marker_type(marker_type, cross_type, is_het_parent1, is_hom_ref_parent1, is_hom_alt_parent1, is_het_parent2, is_hom_ref_parent2, is_hom_alt_parent2); const char * const(*type_ptr)[GT_TYPES_LEN]; bool valid_marker = true; switch(type) { case marker_B3: case marker_F2_ref: type_ptr = &B3_F2_ref; break; case marker_F2_alt: type_ptr = &B3_F2_alt; break; case marker_D_ref: case marker_BC_ref: type_ptr = &D_BC_ref; break; case marker_D_alt: case marker_BC_alt: type_ptr = &D_BC_alt; break; case marker_RI_ref: type_ptr = &RI_ref; break; case marker_RI_alt: type_ptr = &RI_alt; break; default: valid_marker = false; } if (valid_marker) { // Store CHROM and POS fields for valid markers chrom[marker_count] = record->rid; pos[marker_count] = record->pos + 1; // Check if marker name exists; if negative, create one char *marker_name = record->d.id; if (!strcmp(marker_name, ".")) { sprintf(marker_name, "%s.%d", seq_names[chrom[marker_count]], pos[marker_count]); } // Output variant in ONEMAP format to temporary file print_record(temp_f, marker_name, marker_type, fmt_ptr, n_progeny, idx_progeny, type_ptr); marker_count++; } } } } // Write final output file header FILE *final_f = fopen(*out_filename, "w"); if (final_f == NULL) { error("Could not open output file.\n"); } fprintf(final_f, "data type %s\n", *cross); // The next header line contains the following information: number of individuals, number of markers, 1 for the presence of CHROM information, 1 for the presence of POS information and 0 for the absence of phenotypes (these need to be manually included later) fprintf(final_f, "%d %d 1 1 0\n", n_progeny, marker_count); // The next header line contains the sample names char *cur_sample_name = vcf_hdr->samples[idx_progeny[0]]; fprintf(final_f, "%s", cur_sample_name); for (i = 1; i < n_progeny; i++) { cur_sample_name = 
vcf_hdr->samples[idx_progeny[i]]; fprintf(final_f, "\t%s", cur_sample_name); } fprintf(final_f, "\n"); // Copy marker data from temporary file to final file rewind(temp_f); char buf[BUFSIZ]; size_t size; while ((size = fread(buf, 1, BUFSIZ, temp_f))) { fwrite(buf, 1, size, final_f); } // Write CHROM and POS data to output file if (marker_count) { fprintf(final_f, "*CHROM\t"); fprintf(final_f, "%s", seq_names[chrom[0]]); for (i = 1; i < marker_count; i++) { fprintf(final_f, " %s", seq_names[chrom[i]]); } fprintf(final_f, "\n*POS\t"); fprintf(final_f, "%d", pos[0]); for (i = 1; i < marker_count; i++) { fprintf(final_f, " %d", pos[i]); } } // Clean-up free(chrom); free(pos); free(GTs); bcf_sweep_destroy(in_vcf); fclose(temp_f); close(temp_fd); fclose(final_f); }
static void buffered_filters(args_t *args, bcf1_t *line) { /** * The logic of SnpGap=3. The SNPs at positions 1 and 7 are filtered, * positions 0 and 8 are not: * 0123456789 * ref .G.GT..G.. * del .A.G-..A.. * Here the positions 1 and 6 are filtered, 0 and 7 are not: * 0123-456789 * ref .G.G-..G.. * ins .A.GT..A.. * * The logic of IndelGap=2. The second indel is filtered: * 012345678901 * ref .GT.GT..GT.. * del .G-.G-..G-.. * And similarly here, the second is filtered: * 01 23 456 78 * ref .A-.A-..A-.. * ins .AT.AT..AT.. */ // To avoid additional data structure, we abuse bcf1_t's var and var_type records. const int SnpGap_set = VCF_OTHER<<1; const int IndelGap_set = VCF_OTHER<<2; const int IndelGap_flush = VCF_OTHER<<3; int var_type = 0, i; if ( line ) { // Still on the same chromosome? int ilast = rbuf_last(&args->rbuf); if ( ilast>=0 && line->rid != args->rbuf_lines[ilast]->rid ) flush_buffer(args, args->rbuf.n); // new chromosome, flush everything rbuf_expand0(&args->rbuf,bcf1_t*,args->rbuf.n,args->rbuf_lines); // Insert the new record in the buffer. The line would be overwritten in // the next bcf_sr_next_line call, therefore we need to swap it with an // unused one ilast = rbuf_append(&args->rbuf); if ( !args->rbuf_lines[ilast] ) args->rbuf_lines[ilast] = bcf_init1(); SWAP(bcf1_t*, args->files->readers[0].buffer[0], args->rbuf_lines[ilast]); var_type = bcf_get_variant_types(line); // Find out the size of an indel. The indel boundaries are based on REF // (POS+1,POS+rlen-1). This is not entirely correct: mpileup likes to // output REF=CAGAGAGAGA, ALT=CAGAGAGAGAGA where REF=C,ALT=CGA could be // used. This filter is therefore more strict and may remove some valid // SNPs. int len = 1; if ( var_type & VCF_INDEL ) { for (i=1; i<line->n_allele; i++) if ( len < 1-line->d.var[i].n ) len = 1-line->d.var[i].n; } // Set the REF allele's length to max deletion length or to 1 if a SNP or an insertion. 
line->d.var[0].n = len; } int k_flush = 1; if ( args->indel_gap ) { k_flush = 0; // Find indels which are too close to each other int last_to = -1; for (i=-1; rbuf_next(&args->rbuf,&i); ) { bcf1_t *rec = args->rbuf_lines[i]; int rec_from = rec->pos; if ( last_to!=-1 && last_to < rec_from ) break; k_flush++; if ( !(rec->d.var_type & VCF_INDEL) ) continue; rec->d.var_type |= IndelGap_set; last_to = args->indel_gap + rec->pos + rec->d.var[0].n - 1; } if ( i==args->rbuf.f && line && last_to!=-1 ) k_flush = 0; if ( k_flush || !line ) { // Select the best indel from the cluster of k_flush indels int k = 0, max_ac = -1, imax_ac = -1; for (i=-1; rbuf_next(&args->rbuf,&i) && k<k_flush; ) { k++; bcf1_t *rec = args->rbuf_lines[i]; if ( !(rec->d.var_type & IndelGap_set) ) continue; hts_expand(int, rec->n_allele, args->ntmpi, args->tmpi); int ret = bcf_calc_ac(args->hdr, rec, args->tmpi, BCF_UN_ALL); if ( imax_ac==-1 || (ret && max_ac < args->tmpi[1]) ) { max_ac = args->tmpi[1]; imax_ac = i; } } // Filter all but the best indel (with max AF or first if AF not available) k = 0; for (i=-1; rbuf_next(&args->rbuf,&i) && k<k_flush; ) { k++; bcf1_t *rec = args->rbuf_lines[i]; if ( !(rec->d.var_type & IndelGap_set) ) continue; rec->d.var_type |= IndelGap_flush; if ( i!=imax_ac ) bcf_add_filter(args->hdr, args->rbuf_lines[i], args->IndelGap_id); } }
/*
 *  subset_vcf() - decides whether a record passes the site filters set in args
 *  and applies the requested modifications to it (sample subsetting, AC/AN
 *  update, trimming of unused ALT alleles, dropping of the genotype columns).
 *
 *  The tests are applied in this order: number of alleles, known/novel ID,
 *  variant type, filter expression, private variants (with sample subsetting),
 *  genotype content, allele counts, allele frequencies, uncalled sites, phasing.
 *  Note that the record may have been modified already (e.g. subset) when a
 *  later test rejects it.
 *
 *  Returns 1 when the record should be kept, 0 when it should be skipped.
 */
int subset_vcf(args_t *args, bcf1_t *line)
{
    if ( args->min_alleles && line->n_allele < args->min_alleles ) return 0; // min alleles
    if ( args->max_alleles && line->n_allele > args->max_alleles ) return 0; // max alleles
    if (args->novel || args->known)
    {
        if ( args->novel && (line->d.id[0]!='.' || line->d.id[1]!=0) ) return 0; // skip sites which are known, ID != '.'
        if ( args->known && line->d.id[0]=='.' && line->d.id[1]==0 ) return 0;  // skip sites which are novel, ID == '.'
    }

    if (args->include || args->exclude)
    {
        int line_type = bcf_get_variant_types(line);
        if ( args->include && !(line_type&args->include) ) return 0; // include only given variant types
        if ( args->exclude &&   line_type&args->exclude  ) return 0; // exclude given variant types
    }

    if ( args->filter )
    {
        int ret = filter_test(args->filter, line, NULL);
        if ( args->filter_logic==FLT_INCLUDE ) { if ( !ret ) return 0; }
        else if ( ret ) return 0;
    }

    hts_expand(int, line->n_allele, args->mac, args->ac);
    int i, an = 0, non_ref_ac = 0;
    if (args->calc_ac) {
        bcf_calc_ac(args->hdr, line, args->ac, BCF_UN_INFO|BCF_UN_FMT); // get original AC and AN values from INFO field if available, otherwise calculate
        for (i=1; i<line->n_allele; i++)
            non_ref_ac += args->ac[i];
        for (i=0; i<line->n_allele; i++)
            an += args->ac[i];
    }

    if (args->n_samples)
    {
        int non_ref_ac_sub = 0, *ac_sub = (int*) calloc(line->n_allele,sizeof(int));
        if ( !ac_sub ) error("Error: could not allocate memory\n");
        bcf_subset(args->hdr, line, args->n_samples, args->imap);
        if (args->calc_ac) {
            bcf_calc_ac(args->hsub, line, ac_sub, BCF_UN_FMT); // recalculate AC and AN
            an = 0;
            for (i=0; i<line->n_allele; i++) {
                args->ac[i] = ac_sub[i];
                an += ac_sub[i];
            }
            for (i=1; i<line->n_allele; i++)
                non_ref_ac_sub += ac_sub[i];
            if (args->private_vars) {
                // private: all non-reference alleles of the site are carried by the subset samples
                if (args->private_vars == FLT_INCLUDE && !(non_ref_ac_sub > 0 && non_ref_ac == non_ref_ac_sub)) { free(ac_sub); return 0; } // select private sites
                if (args->private_vars == FLT_EXCLUDE && non_ref_ac_sub > 0 && non_ref_ac == non_ref_ac_sub) { free(ac_sub); return 0; } // exclude private sites
            }
            non_ref_ac = non_ref_ac_sub;
        }
        free(ac_sub);
    }

    bcf_fmt_t *gt_fmt;
    if ( args->gt_type && (gt_fmt=bcf_get_fmt(args->hdr,line,"GT")) )
    {
        int nhet = 0, nhom = 0, nmiss = 0;
        // Iterate over the samples present in the record: after bcf_subset() above
        // there can be fewer of them than in args->hdr
        int nsmpl = line->n_sample;
        for (i=0; i<nsmpl; i++)
        {
            int type = bcf_gt_type(gt_fmt,i,NULL,NULL);
            if ( type==GT_HET_RA || type==GT_HET_AA )
            {
                if ( args->gt_type==GT_NO_HET ) return 0;
                nhet = 1;
            }
            else if ( type==GT_UNKN )
            {
                if ( args->gt_type==GT_NO_MISSING ) return 0;
                nmiss = 1;
            }
            else
            {
                if ( args->gt_type==GT_NO_HOM ) return 0;
                nhom = 1;
            }
        }
        if ( args->gt_type==GT_NEED_HOM && !nhom ) return 0;
        else if ( args->gt_type==GT_NEED_HET && !nhet ) return 0;
        else if ( args->gt_type==GT_NEED_MISSING && !nmiss ) return 0;
    }

    // Counts used by the AC/AF filters; they stay 0 when AC is not calculated.
    // alt1_ac is the count of the first ALT allele: args->ac is guaranteed to hold
    // n_allele elements only, so ac[1] must not be read for records without ALT.
    int minor_ac = 0;
    int major_ac = 0;
    int alt1_ac  = 0;
    if ( args->calc_ac )
    {
        minor_ac = args->ac[0];
        major_ac = args->ac[0];
        for (i=1; i<line->n_allele; i++){
            if (args->ac[i] < minor_ac) { minor_ac = args->ac[i]; }
            if (args->ac[i] > major_ac) { major_ac = args->ac[i]; }
        }
        if ( line->n_allele > 1 ) alt1_ac = args->ac[1];
    }

    if (args->min_ac)
    {
        if (args->min_ac_type == ALLELE_NONREF && args->min_ac>non_ref_ac) return 0; // min AC
        else if (args->min_ac_type == ALLELE_MINOR && args->min_ac>minor_ac) return 0; // min minor AC
        else if (args->min_ac_type == ALLELE_ALT1 && args->min_ac>alt1_ac) return 0; // min 1st alternate AC
        else if (args->min_ac_type == ALLELE_MAJOR && args->min_ac > major_ac) return 0; // min major AC
        else if (args->min_ac_type == ALLELE_NONMAJOR && args->min_ac > an-major_ac) return 0; // min non-major AC
    }
    if (args->max_ac)
    {
        if (args->max_ac_type == ALLELE_NONREF && args->max_ac<non_ref_ac) return 0; // max AC
        else if (args->max_ac_type == ALLELE_MINOR && args->max_ac<minor_ac) return 0; // max minor AC
        else if (args->max_ac_type == ALLELE_ALT1 && args->max_ac<alt1_ac) return 0; // max 1st alternate AC
        else if (args->max_ac_type == ALLELE_MAJOR && args->max_ac < major_ac) return 0; // max major AC
        else if (args->max_ac_type == ALLELE_NONMAJOR && args->max_ac < an-major_ac) return 0; // max non-major AC
    }
    if (args->min_af)
    {
        if (an == 0) return 0; // freq not defined, skip site
        if (args->min_af_type == ALLELE_NONREF && args->min_af>non_ref_ac/(double)an) return 0; // min AF
        else if (args->min_af_type == ALLELE_MINOR && args->min_af>minor_ac/(double)an) return 0; // min minor AF
        else if (args->min_af_type == ALLELE_ALT1 && args->min_af>alt1_ac/(double)an) return 0; // min 1st alternate AF
        else if (args->min_af_type == ALLELE_MAJOR && args->min_af > major_ac/(double)an) return 0; // min major AF
        else if (args->min_af_type == ALLELE_NONMAJOR && args->min_af > (an-major_ac)/(double)an) return 0; // min non-major AF
    }
    if (args->max_af)
    {
        if (an == 0) return 0; // freq not defined, skip site
        if (args->max_af_type == ALLELE_NONREF && args->max_af<non_ref_ac/(double)an) return 0; // max AF
        else if (args->max_af_type == ALLELE_MINOR && args->max_af<minor_ac/(double)an) return 0; // max minor AF
        else if (args->max_af_type == ALLELE_ALT1 && args->max_af<alt1_ac/(double)an) return 0; // max 1st alternate AF
        else if (args->max_af_type == ALLELE_MAJOR && args->max_af < major_ac/(double)an) return 0; // max major AF
        else if (args->max_af_type == ALLELE_NONMAJOR && args->max_af < (an-major_ac)/(double)an) return 0; // max non-major AF
    }
    if (args->uncalled)
    {
        if (args->uncalled == FLT_INCLUDE && an > 0) return 0; // select uncalled
        if (args->uncalled == FLT_EXCLUDE && an == 0) return 0; // skip if uncalled
    }
    if (args->calc_ac && args->update_info)
    {
        bcf_update_info_int32(args->hdr, line, "AC", &args->ac[1], line->n_allele-1);
        bcf_update_info_int32(args->hdr, line, "AN", &an, 1);
    }
    if (args->trim_alts)
    {
        int ret = bcf_trim_alleles(args->hsub ? args->hsub : args->hdr, line);
        if ( ret==-1 ) error("Error: some GT index is out of bounds at %s:%d\n", bcf_seqname(args->hsub ? args->hsub : args->hdr, line), line->pos+1);
    }
    if (args->phased)
    {
        int phased = bcf_all_phased(args->hdr, line);
        if (args->phased == FLT_INCLUDE && !phased) { return 0; } // skip unphased
        if (args->phased == FLT_EXCLUDE && phased) { return 0; } // skip phased
    }
    if (args->sites_only) bcf_subset(args->hsub ? args->hsub : args->hdr, line, 0, 0);
    return 1;
}