/*
 * Write the chain record accumulated in args->chain to args->fp_chain.
 *
 * Example chain format (see: https://genome.ucsc.edu/goldenPath/help/chain.html):
 *
 *     chain 1 500 + 480 500 1 501 + 480 501 1
 *     12 3 1
 *     1 0 3
 *     484
 *
 * Fields of the "chain" header line, in output order:
 *   - the literal word "chain"
 *   - score: here the summed length of all ungapped blocks
 *   - ref_seqname, ref_seqlength, ref_strand (always '+'), ref_start, ref_end
 *   - alt_seqname (same name as the reference), alt_seqlength, alt_strand
 *     (always '+'), alt_start (same as ref_start), alt_end
 *   - chain_num: an auto-incremented id (args->chain_id is pre-incremented here)
 *
 * The lines that follow carry, per block:
 *   - length of the ungapped alignment block
 *   - gap on the ref sequence up to the next block (all but the last line)
 *   - gap on the alt sequence up to the next block (all but the last line)
 *
 * The final block is not stored in the chain; its length is derived from the
 * end of the reference and the origin of the last recorded block.
 */
static void print_chain(args_t *args)
{
    chain_t *chain = args->chain;
    const char *seqname = bcf_hdr_id2name(args->hdr, args->rid);

    /* Trailing block: everything from the last recorded block origin to the end. */
    const int ref_end = args->fa_length + chain->ori_pos;
    const int tail_len = ref_end - chain->ref_last_block_ori;
    const int alt_end = chain->alt_last_block_ori + tail_len;

    int iblock;
    int score = tail_len;
    for (iblock = 0; iblock < chain->num; iblock++)
        score += chain->block_lengths[iblock];

    fprintf(args->fp_chain, "chain %d %s %d + %d %d %s %d + %d %d %d\n",
            score,
            seqname, ref_end, chain->ori_pos, ref_end,
            seqname, alt_end, chain->ori_pos, alt_end,
            ++args->chain_id);

    for (iblock = 0; iblock < chain->num; iblock++)
    {
        fprintf(args->fp_chain, "%d %d %d\n",
                chain->block_lengths[iblock],
                chain->ref_gaps[iblock],
                chain->alt_gaps[iblock]);
    }

    /* Last block has no following gaps; a blank line terminates the chain. */
    fprintf(args->fp_chain, "%d\n\n", tail_len);
}
/*
 * Overwrite with 'N' every base of `seq` that falls inside a region of
 * args->mask on the current chromosome (args->rid).
 *
 * seq[k] is treated as the base at coordinate (args->fa_src_pos - len + k),
 * i.e. the buffer covers the `len` positions ending at args->fa_src_pos.
 * Regions reaching outside the buffer are clipped to [0, len-1].
 */
static void mask_region(args_t *args, char *seq, int len)
{
    char *chr = (char*) bcf_hdr_id2name(args->hdr, args->rid);
    int start = args->fa_src_pos - len;
    int end = args->fa_src_pos;

    regitr_t itr;
    if ( !regidx_overlap(args->mask, chr, start, end, &itr) ) return;

    for ( ; REGITR_OVERLAP(itr,start,end); itr.i++ )
    {
        int k;
        int first = REGITR_START(itr) - start;
        int last = REGITR_END(itr) - start;

        /* clip the region to the part of the sequence held in seq */
        if ( first < 0 ) first = 0;
        if ( last > len - 1 ) last = len - 1;

        for (k = first; k <= last; k++) seq[k] = 'N';
    }
}
/*
 * Apply one VCF/BCF record to the buffered reference sequence args->fa_buf.
 *
 * The record is skipped when it has no ALT allele, when it starts at or before
 * the last modified position (args->fa_frz_pos), when it overlaps a masked
 * region (args->mask), or when the selected allele is missing or is REF.
 *
 * Allele selection:
 *   - no sample (args->isample < 0): the first ALT; with args->output_iupac
 *     and a biallelic SNP, the ALT base is replaced by the REF/ALT IUPAC code;
 *   - args->haplotype set: the allele of that (1-based) haplotype;
 *   - args->output_iupac: the first two GT alleles; a heterozygous SNP is
 *     written as an IUPAC ambiguity code;
 *   - otherwise: the first non-reference allele found in the sample's GT.
 *
 * On success the function edits fa_buf in place, optionally records the gap
 * in args->chain, and updates fa_buf.l, fa_mod_off and fa_frz_pos.
 * Inconsistent input (broken GT, REF not matching the fasta, unsupported
 * symbolic allele) is reported through error().
 *
 * NOTE(review): GT values are addressed with byte offsets (ptr is uint8_t*),
 * which looks like it presumes an 8-bit GT encoding - confirm before relying
 * on this with records that have very many alleles.
 */
static void apply_variant(args_t *args, bcf1_t *rec)
{
    if ( rec->n_allele==1 ) return;

    if ( rec->pos <= args->fa_frz_pos )
    {
        fprintf(pysamerr,"The site %s:%d overlaps with another variant, skipping...\n", bcf_seqname(args->hdr,rec),rec->pos+1);
        return;
    }
    if ( args->mask )
    {
        char *chr = (char*)bcf_hdr_id2name(args->hdr,args->rid);
        int start = rec->pos;
        int end = rec->pos + rec->rlen - 1;
        if ( regidx_overlap(args->mask, chr,start,end,NULL) ) return;
    }

    int i, ialt = 1;
    if ( args->isample >= 0 )
    {
        bcf_fmt_t *fmt = bcf_get_fmt(args->hdr, rec, "GT");
        if ( !fmt ) return;
        if ( args->haplotype )
        {
            if ( args->haplotype > fmt->n ) error("Can't apply %d-th haplotype at %s:%d\n", args->haplotype,bcf_seqname(args->hdr,rec),rec->pos+1);
            uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample + args->haplotype - 1;
            ialt = bcf_dec_int1(ptr, fmt->type, &ignore);
            if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
            ialt = bcf_gt_allele(ialt);
        }
        else if ( args->output_iupac )
        {
            uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample;
            ialt = bcf_dec_int1(ptr, fmt->type, &ignore);
            if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
            ialt = bcf_gt_allele(ialt);

            int jalt;
            if ( fmt->n>1 )
            {
                ptr = fmt->p + fmt->size*args->isample + 1;
                jalt = bcf_dec_int1(ptr, fmt->type, &ignore);
                if ( bcf_gt_is_missing(jalt) || jalt==bcf_int32_vector_end ) jalt = ialt;
                else jalt = bcf_gt_allele(jalt);
            }
            else jalt = ialt;
            if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
            if ( ialt!=jalt && !rec->d.allele[ialt][1] && !rec->d.allele[jalt][1] ) // is this a het snp?
            {
                char ial = rec->d.allele[ialt][0];
                char jal = rec->d.allele[jalt][0];
                // Only ialt is applied below and ialt==0 means "nothing to do":
                // for a 0/1 genotype switch to the non-reference allele so the
                // site is not ignored and REF is left untouched.
                if ( !ialt ) ialt = jalt;
                rec->d.allele[ialt][0] = gt2iupac(ial,jal);
            }
        }
        else
        {
            // take the first non-reference allele of the sample, if any
            for (i=0; i<fmt->n; i++)
            {
                uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample + i;
                ialt = bcf_dec_int1(ptr, fmt->type, &ignore);
                if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
                ialt = bcf_gt_allele(ialt);
                if ( ialt ) break;
            }
        }
        if ( !ialt ) return;  // ref allele
        if ( rec->n_allele <= ialt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
    }
    else if ( args->output_iupac && !rec->d.allele[0][1] && !rec->d.allele[1][1] )
    {
        char ial = rec->d.allele[0][0];
        char jal = rec->d.allele[1][0];
        rec->d.allele[1][0] = gt2iupac(ial,jal);
    }

    // index of the variant within the (already modified) sequence buffer
    int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off;
    if ( idx<0 || idx>=args->fa_buf.l )
        error("FIXME: %s:%d .. idx=%d, ori_pos=%d, len=%d, off=%d\n",bcf_seqname(args->hdr,rec),rec->pos+1,idx,args->fa_ori_pos,(int)args->fa_buf.l,args->fa_mod_off);

    // sanity check the reference base
    int len_diff = 0, alen = 0;
    if ( rec->d.allele[ialt][0]=='<' )
    {
        if ( strcasecmp(rec->d.allele[ialt], "<DEL>") )
            error("Symbolic alleles other than <DEL> are currently not supported: %s at %s:%d\n",rec->d.allele[ialt],bcf_seqname(args->hdr,rec),rec->pos+1);
        assert( rec->d.allele[0][1]==0 );           // todo: for now expecting strlen(REF) = 1
        len_diff = 1-rec->rlen;
        rec->d.allele[ialt] = rec->d.allele[0];     // according to VCF spec, REF must precede the event
        alen = strlen(rec->d.allele[ialt]);
    }
    else if ( strncasecmp(rec->d.allele[0],args->fa_buf.s+idx,rec->rlen) )
    {
        // Terminate the buffer right after the REF span so the message can show
        // the mismatching bases separately from what follows them.
        char tmp = 0;
        if ( args->fa_buf.l - idx > rec->rlen )
        {
            tmp = args->fa_buf.s[idx+rec->rlen];
            args->fa_buf.s[idx+rec->rlen] = 0;
        }
        error(
            "The fasta sequence does not match the REF allele at %s:%d:\n"
            " .vcf: [%s]\n"
            " .vcf: [%s] <- (ALT)\n"
            " .fa: [%s]%c%s\n",
            bcf_seqname(args->hdr,rec),rec->pos+1, rec->d.allele[0], rec->d.allele[ialt], args->fa_buf.s+idx,
            tmp?tmp:' ',tmp?args->fa_buf.s+idx+rec->rlen+1:""
            );
    }
    else
    {
        alen = strlen(rec->d.allele[ialt]);
        len_diff = alen - rec->rlen;
    }

    // match the case of the allele to args->fa_case; the cast keeps the
    // <ctype.h> calls defined for bytes outside the ASCII range
    if ( args->fa_case )
        for (i=0; i<alen; i++) rec->d.allele[ialt][i] = toupper((unsigned char)rec->d.allele[ialt][i]);
    else
        for (i=0; i<alen; i++) rec->d.allele[ialt][i] = tolower((unsigned char)rec->d.allele[ialt][i]);

    if ( len_diff <= 0 )
    {
        // deletion or same size event
        for (i=0; i<alen; i++)
            args->fa_buf.s[idx+i] = rec->d.allele[ialt][i];
        if ( len_diff )
            memmove(args->fa_buf.s+idx+alen,args->fa_buf.s+idx+rec->rlen,args->fa_buf.l-idx-rec->rlen);
    }
    else
    {
        // insertion
        ks_resize(&args->fa_buf, args->fa_buf.l + len_diff);
        memmove(args->fa_buf.s + idx + rec->rlen + len_diff, args->fa_buf.s + idx + rec->rlen, args->fa_buf.l - idx - rec->rlen);
        for (i=0; i<alen; i++)
            args->fa_buf.s[idx+i] = rec->d.allele[ialt][i];
    }
    if (args->chain && len_diff != 0)
    {
        // If first nucleotide of both REF and ALT are the same... (indels typically include the nucleotide before the variant)
        if ( strncasecmp(rec->d.allele[0],rec->d.allele[ialt],1) == 0)
        {
            // ...extend the block by 1 bp: start is 1 bp further and alleles are 1 bp shorter
            push_chain_gap(args->chain, rec->pos + 1, rec->rlen - 1, rec->pos + 1 + args->fa_mod_off, alen - 1);
        }
        else
        {
            // otherwise, just the coordinates of the variant as given
            push_chain_gap(args->chain, rec->pos, rec->rlen, rec->pos + args->fa_mod_off, alen);
        }
    }
    args->fa_buf.l += len_diff;
    args->fa_mod_off += len_diff;
    args->fa_frz_pos = rec->pos + rec->rlen - 1;
}
/*
 * Load biallelic variants from an indexed VCF/BCF file, restricted to `region`,
 * into the genotype_* members (which are cleared first).
 *
 * fvcf   : path of the VCF/BCF file handed to the htslib synced reader.
 * region : region string handed to bcf_sr_set_regions().
 *
 * For every retained variant the function stores its id, chromosome, 1-based
 * start, end (INFO/END when present, otherwise start + length(REF) - 1) and one
 * value per known sample: FORMAT/DS when available, otherwise the sum of the
 * two GT alleles, or bcf_float_missing when either allele is missing.
 * Variants are dropped when multi-allelic, when they carry neither a complete
 * DS nor a complete diploid GT field, or when filter_genotype rejects the id.
 *
 * NOTE(review): the return value of bcf_sr_set_regions() is not checked here,
 * as in the original code - confirm whether an invalid region should be fatal.
 */
void union_data::readGenotypesVCF(string fvcf,string region) {
    int n_includedG = 0;
    int n_excludedG_mult = 0;
    int n_excludedG_void = 0;
    int n_excludedG_user = 0;
    int n_includedS = 0;
    vector < int > mappingS;
    genotype_id.clear();
    genotype_chr.clear();
    genotype_start.clear();
    genotype_end.clear();
    genotype_val.clear();
    genotype_count=0;
    genotype_id_to_idx.clear();

    //Opening files
    bcf_srs_t * sr = bcf_sr_init();
    bcf_sr_set_regions(sr, region.c_str(), 0);
    if(!(bcf_sr_add_reader (sr, fvcf.c_str()))) {
        // presumably vrb.error() does not return, so the cases never fall through
        switch (sr->errnum) {
        case not_bgzf: vrb.error("File not compressed with bgzip!");
        case idx_load_failed: vrb.error("Impossible to load index file!");
        case file_type_error: vrb.error("File format not detected by htslib!");
        default : vrb.error("Unknown error!");
        }
    }

    //Sample processing: map each file sample to its internal index (negative when unknown)
    int n_samples = bcf_hdr_nsamples(sr->readers[0].header);
    for (int i0 = 0 ; i0 < n_samples ; i0 ++) {
        mappingS.push_back(findSample(string(sr->readers[0].header->samples[i0])));
        if (mappingS.back() >= 0) n_includedS++;
    }

    //Read genotype data
    int ngt, ngt_arr = 0, nds, nds_arr = 0, * gt_arr = NULL, nsl, nsl_arr = 0, * sl_arr = NULL;
    float * ds_arr = NULL;
    bcf1_t * line;
    unsigned int linecount = 0;
    while(bcf_sr_next_line (sr)) {
        linecount ++;
        if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
        line = bcf_sr_get_line(sr, 0);
        if (line->n_allele == 2) {
            ngt = bcf_get_genotypes(sr->readers[0].header, line, &gt_arr, &ngt_arr);
            nds = bcf_get_format_float(sr->readers[0].header, line,"DS", &ds_arr, &nds_arr);
            if (nds == n_samples || ngt == 2*n_samples) {
                bcf_unpack(line, BCF_UN_STR);
                string sid = string(line->d.id);
                if (filter_genotype.check(sid)) {
                    genotype_id.push_back(sid);
                    genotype_chr.push_back(string(bcf_hdr_id2name(sr->readers[0].header, line->rid)));
                    string genotype_ref = string(line->d.allele[0]);
                    genotype_start.push_back(line->pos + 1);
                    // The return value is the number of values read for this record;
                    // nsl_arr is only the capacity of the reusable buffer.
                    nsl = bcf_get_info_int32(sr->readers[0].header, line, "END", &sl_arr, &nsl_arr);
                    if (nsl == 1) genotype_end.push_back(sl_arr[0]);
                    else genotype_end.push_back(genotype_start.back() + genotype_ref.size() - 1);
                    genotype_val.push_back(vector < float > (sample_count, 0.0));

                    for(int i = 0 ; i < n_samples ; i ++) {
                        if (mappingS[i] >= 0) {
                            if (nds > 0) genotype_val.back()[mappingS[i]] = ds_arr[i];
                            else {
                                if (gt_arr[2*i+0] == bcf_gt_missing || gt_arr[2*i+1] == bcf_gt_missing) genotype_val.back()[mappingS[i]] = bcf_float_missing;
                                else genotype_val.back()[mappingS[i]] = bcf_gt_allele(gt_arr[2*i+0]) + bcf_gt_allele(gt_arr[2*i+1]);
                            }
                        }
                    }
                    pair < string, int > temp (sid,n_includedG);
                    genotype_id_to_idx.insert(temp);
                    n_includedG++;
                } else n_excludedG_user ++;
            } else n_excludedG_void ++;
        } else n_excludedG_mult ++;
    }

    //Finalize: release every htslib-allocated buffer, including the INFO/END one
    free(gt_arr);
    free(ds_arr);
    free(sl_arr);
    bcf_sr_destroy(sr);
    genotype_count = n_includedG;
    //vrb.bullet(stb.str(n_includedG) + " variants included");
    //if (n_excludedG_user > 0) vrb.bullet(stb.str(n_excludedG_user) + " variants excluded by user");
    //if (n_excludedG_mult > 0) vrb.bullet(stb.str(n_excludedG_mult) + " multi-allelic variants excluded");
    //if (n_excludedG_void > 0) vrb.bullet(stb.str(n_excludedG_void) + " uninformative variants excluded [no GT/DS]");
    //if (genotype_count == 0) vrb.leave("Cannot find genotypes in target region!");
}
int main(int argc, char **argv) { if (argc < 3) { fprintf(stderr,"%s <bcf file> <num vars>\n", argv[0]); return 1; } char *fname = argv[1]; uint32_t num_vars = atoi(argv[2]); htsFile *fp = hts_open(fname,"rb"); bcf_hdr_t *hdr = bcf_hdr_read(fp); bcf1_t *line = bcf_init1(); int32_t *gt_p = NULL; uint32_t num_inds = bcf_hdr_nsamples(hdr); int32_t i, j, k, ntmp = 0, int_i = 0, two_bit_i = 0, sum, t_sum = 0; uint32_t num_ind_ints = 1 + ((num_inds - 1) / 16); pri_queue q = priq_new(0); priority p; uint32_t *packed_ints = (uint32_t *) calloc(num_ind_ints, sizeof(uint32_t)); FILE *gt_of = fopen("gt.tmp.packed","wb"); FILE *md_of = fopen("md.tmp.packed","w"); uint32_t *md_index = (uint32_t *) malloc(num_vars * sizeof(uint32_t)); uint32_t md_i = 0; unsigned long t_bcf_read = 0, t_bcf_dup = 0, t_bcf_unpack = 0, t_bcf_get_genotypes = 0, t_bcf_hdr_nsamples = 0, t_q = 0, t_write = 0, t_get_md = 0, t_md_write = 0, t_pack = 0; for (i = 0; i < num_vars; ++i) { sum = 0; int_i = 0; two_bit_i = 0; int r = bcf_read(fp, hdr, line); // Copy bcf1_t *t_line = bcf_dup(line); // Unpack bcf_unpack(t_line, BCF_UN_ALL); // Get metadata size_t len = strlen(bcf_hdr_id2name(hdr, t_line->rid)) + 10 + // max length of pos strlen(t_line->d.id) + strlen(t_line->d.allele[0]) + strlen(t_line->d.allele[1]) + 4; //tabs char *md = (char *) malloc(len * sizeof(char)); sprintf(md, "%s\t%d\t%s\t%s\t%s", bcf_hdr_id2name(hdr, t_line->rid), t_line->pos + 1, t_line->d.id, t_line->d.allele[0], t_line->d.allele[1]); // Write metadata md_i += strlen(md); md_index[i] = md_i; fprintf(md_of, "%s", md); // Get gentotypes uint32_t num_gts_per_sample = bcf_get_genotypes(hdr, t_line, >_p, &ntmp); num_gts_per_sample /= num_inds; int32_t *gt_i = gt_p; // Pack genotypes for (j = 0; j < num_inds; ++j) { uint32_t gt = 0; for (k = 0; k < num_gts_per_sample; ++k) { gt += bcf_gt_allele(gt_i[k]); } packed_ints[int_i] += gt << (30 - 2*two_bit_i); two_bit_i += 1; if (two_bit_i == 16) { two_bit_i = 0; int_i += 1; } sum += gt; gt_i += 
num_gts_per_sample; } // Get a priority for the variant based on the sum and number of // leading zeros p.sum = sum; uint32_t prefix_len = 0; j = 0; while ((j < num_ind_ints) && (packed_ints[j] == 0)){ prefix_len += 32; j += 1; } if (j < num_ind_ints) prefix_len += nlz1(packed_ints[j]); // Push it into the q p.len = prefix_len; int *j = (int *) malloc (sizeof(int)); j[0] = i; priq_push(q, j, p); // Write to file fwrite(packed_ints, sizeof(uint32_t), num_ind_ints,gt_of); memset(packed_ints, 0, num_ind_ints*sizeof(uint32_t)); t_sum += sum; bcf_destroy(t_line); free(md); } fclose(gt_of); fclose(md_of); md_of = fopen("md.tmp.packed","r"); FILE *md_out = fopen("md.bim","w"); gt_of = fopen("gt.tmp.packed","rb"); FILE *s_gt_of = fopen("s.gt.tmp.packed","wb"); // Get variants in order and rewrite a variant-major sorted matrix while ( priq_top(q, &p) != NULL ) { int *d = priq_pop(q, &p); uint32_t start = 0; if (*d != 0) start = md_index[*d - 1]; uint32_t len = md_index[*d] - start; fseek(md_of, start*sizeof(char), SEEK_SET); char buf[len+1]; fread(buf, sizeof(char), len, md_of); buf[len] = '\0'; fseek(gt_of, (*d)*num_ind_ints*sizeof(uint32_t), SEEK_SET); fread(packed_ints, sizeof(uint32_t), num_ind_ints, gt_of); fwrite(packed_ints, sizeof(uint32_t), num_ind_ints,s_gt_of); fprintf(md_out, "%s\n", buf); } fclose(md_out); fclose(md_of); fclose(gt_of); fclose(s_gt_of); /* * In a packed-int variant-major matrix there will be a num_vars * number of rows, and a num_inds number of values packed into * num_inds_ints number of intergers. For examples, 16 rows of 16 values * will be 16 ints, where each int encodes 16 values. 
* */ uint32_t num_var_ints = 1 + ((num_vars - 1) / 16); uint32_t *I_data = (uint32_t *)calloc(num_var_ints*16,sizeof(uint32_t)); uint32_t **I = (uint32_t **)malloc(16*sizeof(uint32_t*)); for (i = 0; i < 16; ++i) I[i] = I_data + i*num_var_ints; uint32_t I_i = 0, I_int_i = 0; uint32_t v; s_gt_of = fopen("s.gt.tmp.packed","rb"); FILE *rs_gt_of = fopen("r.s.gt.tmp.packed","wb"); // Write these to values to that this is a well-formed uncompressed // packed int binary file (ubin) file fwrite(&num_vars, sizeof(uint32_t), 1, rs_gt_of); fwrite(&num_inds, sizeof(uint32_t), 1, rs_gt_of); /* * we need to loop over the columns in the v-major file. * There are num_vars rows, and num_ind_ints 16-ind packed columns * * In this loop : * i: cols in var-major form, rows in ind-major form * j: rows in var-major form, cols in ind-major form */ uint32_t num_inds_to_write = num_inds; for (i = 0; i < num_ind_ints; ++i) { // loop over each int col for (j = 0; j < num_vars; ++j) { // loop over head row in that col // skip to the value at the row/col fseek(s_gt_of, j*num_ind_ints*sizeof(uint32_t) + //row i*sizeof(uint32_t), //col SEEK_SET); fread(&v, sizeof(uint32_t), 1, s_gt_of); // one int corresponds to a col of 16 two-bit values // two_bit_i will move across the cols for (two_bit_i = 0; two_bit_i < 16; ++two_bit_i) { I[two_bit_i][I_i] += ((v >> (30 - 2*two_bit_i)) & 3) << (30 - 2*I_int_i); } I_int_i += 1; if (I_int_i == 16) { I_i += 1; I_int_i = 0; } } // When we are at the end of the file, and the number of lines // is not a factor of 16, only write out the lines that contain values if (num_inds_to_write >= 16) { fwrite(I_data, sizeof(uint32_t), num_var_ints*16, rs_gt_of); num_inds_to_write -= 16; } else { fwrite(I_data, sizeof(uint32_t), num_var_ints*num_inds_to_write, rs_gt_of); } memset(I_data, 0, num_var_ints*16*sizeof(uint32_t)); I_int_i = 0; I_i = 0; } fclose(s_gt_of); fclose(rs_gt_of); free(md_index); free(packed_ints); }
/*
 * Run the HMM over the buffered sites and print the per-site state calls.
 *
 * Without args->vi_training the buffer holds one chromosome (args->prev_rid):
 * a single Viterbi pass plus forward-backward is run and each site is printed
 * as "chr, 1-based position, state (1 when STATE_AZ, else 0), phred score".
 *
 * With args->vi_training the buffer holds args->nrids chromosomes delimited by
 * args->rid_offs. The 2x2 transition matrix is re-estimated from the observed
 * Viterbi transitions until neither tracked transition probability changes,
 * then the final path is printed with ".." in place of the score.
 *
 * Does nothing when no sites are buffered.
 */
static void flush_viterbi(args_t *args)
{
    int i,j;

    if ( !args->nsites ) return;

    if ( !args->vi_training )
    {
        // single viterbi pass, one chromosome
        hmm_run_viterbi(args->hmm, args->nsites, args->eprob, args->sites);
        hmm_run_fwd_bwd(args->hmm, args->nsites, args->eprob, args->sites);
        double *posterior = hmm_get_fwd_bwd_prob(args->hmm);

        const char *chr = bcf_hdr_id2name(args->hdr,args->prev_rid);
        uint8_t *path = hmm_get_viterbi_path(args->hmm);
        for (i=0; i<args->nsites; i++)
        {
            int state = 0;
            if ( path[i*2]==STATE_AZ ) state = 1;
            double *site_prob = posterior + i*2;
            printf("%s\t%d\t%d\t%.1f\n", chr,args->sites[i]+1, state, phred_score(1.0-site_prob[state]));
        }
        return;
    }

    // viterbi training, multiple chromosomes
    double az_before, hw_before;
    double deltaz, delthw;
    int niter = 0;
    for (;;)
    {
        double *tprob_arr = hmm_get_tprob(args->hmm);
        az_before = MAT(tprob_arr,2,1,0);
        hw_before = MAT(tprob_arr,2,0,1);
        double tcounts[] = { 0,0,0,0 };

        for (i=0; i<args->nrids; i++)
        {
            // eprob and sites hold several chromosomes back to back,
            // rid_offs marks where each of them starts
            int ioff = args->rid_offs[i];
            int chr_end;
            if ( i+1==args->nrids ) chr_end = args->nsites;
            else chr_end = args->rid_offs[i+1];
            int chr_nsites = chr_end - ioff;

            hmm_run_viterbi(args->hmm, chr_nsites, args->eprob+ioff*2, args->sites+ioff);

            // tally the transitions seen along the path
            uint8_t *path = hmm_get_viterbi_path(args->hmm);
            for (j=1; j<chr_nsites; j++)
            {
                int from = path[2*(j-1)];
                int to = path[2*j];
                MAT(tcounts,2,to,from) += 1;
            }
        }

        // turn each row of counts into probabilities
        for (i=0; i<2; i++)
        {
            int n = 0;
            for (j=0; j<2; j++) n += MAT(tcounts,2,i,j);
            if ( !n) error("fixme: state %d not observed\n", i+1);
            for (j=0; j<2; j++) MAT(tcounts,2,i,j) /= n;
        }

        int have_rate = args->genmap_fname || args->rec_rate > 0;
        hmm_set_tprob(args->hmm, tcounts, have_rate ? 0 : 10000);

        tprob_arr = hmm_get_tprob(args->hmm);
        deltaz = fabs(MAT(tprob_arr,2,1,0)-az_before);
        delthw = fabs(MAT(tprob_arr,2,0,1)-hw_before);
        niter++;
        fprintf(pysamerr,"%d: %f %f\n", niter,deltaz,delthw);

        // stop once neither tracked probability moved
        if ( !(deltaz > 0.0 || delthw > 0.0) ) break;
    }

    fprintf(pysamerr, "Viterbi training converged in %d iterations to", niter);
    double *final_tprob = hmm_get_tprob(args->hmm);
    for (i=0; i<2; i++)
    {
        for (j=0; j<2; j++) fprintf(pysamerr, " %f", MAT(final_tprob,2,i,j));
    }
    fprintf(pysamerr, "\n");

    // output the results
    for (i=0; i<args->nrids; i++)
    {
        int ioff = args->rid_offs[i];
        int chr_end;
        if ( i+1==args->nrids ) chr_end = args->nsites;
        else chr_end = args->rid_offs[i+1];
        int chr_nsites = chr_end - ioff;

        hmm_run_viterbi(args->hmm, chr_nsites, args->eprob+ioff*2, args->sites+ioff);
        uint8_t *path = hmm_get_viterbi_path(args->hmm);

        const char *chr = bcf_hdr_id2name(args->hdr,args->rids[i]);
        for (j=0; j<chr_nsites; j++)
        {
            printf("%s\t%d\t%d\t..\n", chr,args->sites[ioff+j]+1,path[j*2]==STATE_AZ ? 1 : 0);
        }
    }
}
/*
 * Run Viterbi over the buffered sites of the current chromosome
 * (args->prev_rid) and append the result to "<args->prefix>.dat".
 *
 * The output file is opened, and its two header lines written, on the first
 * call. For each run of identical states an "SG" (shared segment) line is
 * written, whose columns depend on args->mode (C_UNRL or C_TRIO). In trio
 * mode haplotype switches of the mother and father are counted using the
 * hap_switch table. Two "SW" lines with the switch statistics follow, after
 * which the site and het counters in args are reset.
 *
 * A sample name that is not available in the current mode is printed as "-".
 *
 * NOTE(review): a segment is emitted only when the state changes or at the
 * last site, and always ends at sites[i-1]; the last site itself does not seem
 * to be covered by any SG line - confirm whether that is intended.
 */
void flush_viterbi(args_t *args)
{
    const char *s1 = NULL, *s2 = NULL, *s3 = NULL;
    if ( args->mode==C_UNRL )
    {
        s1 = bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,args->isample);
        s2 = bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,args->jsample);
    }
    else if ( args->mode==C_TRIO )
    {
        s1 = bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,args->imother);
        s3 = bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,args->ifather);
        s2 = bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,args->ichild);
    }

    // Never hand a NULL pointer to "%s": s3 is unset in C_UNRL mode and all
    // three names are unset for any other mode.
    const char *name1 = s1 ? s1 : "-";
    const char *name2 = s2 ? s2 : "-";
    const char *name3 = s3 ? s3 : "-";

    if ( !args->fp )
    {
        kstring_t str = {0,0,0};
        kputs(args->prefix, &str);
        kputs(".dat", &str);
        args->fp = fopen(str.s,"w");
        if ( !args->fp ) error("%s: %s\n", str.s,strerror(errno));
        free(str.s);
        fprintf(args->fp,"# SG, shared segment\t[2]Chromosome\t[3]Start\t[4]End\t[5]%s:1\t[6]%s:2\n",name2,name2);
        fprintf(args->fp,"# SW, number of switches\t[3]Sample\t[4]Chromosome\t[5]nHets\t[5]nSwitches\t[6]switch rate\n");
    }

    hmm_run_viterbi(args->hmm,args->nsites,args->eprob,args->sites);
    uint8_t *vpath = hmm_get_viterbi_path(args->hmm);
    int i, iprev = -1, prev_state = -1, nstates = hmm_get_nstates(args->hmm);
    int nswitch_mother = 0, nswitch_father = 0;
    for (i=0; i<args->nsites; i++)
    {
        int state = vpath[i*nstates];
        if ( state!=prev_state || i+1==args->nsites )
        {
            uint32_t start = iprev>=0 ? args->sites[iprev]+1 : 1, end = i>0 ? args->sites[i-1] : 1;
            const char *chr = bcf_hdr_id2name(args->hdr,args->prev_rid);
            if ( args->mode==C_UNRL )
            {
                switch (prev_state)
                {
                    case UNRL_0x0x:
                        fprintf(args->fp,"SG\t%s\t%d\t%d\t%s:1\t-\n", chr,start,end,name1); break;
                    case UNRL_0xx0:
                        fprintf(args->fp,"SG\t%s\t%d\t%d\t-\t%s:1\n", chr,start,end,name1); break;
                    case UNRL_x00x:
                        fprintf(args->fp,"SG\t%s\t%d\t%d\t%s:2\t-\n", chr,start,end,name1); break;
                    case UNRL_x0x0:
                        fprintf(args->fp,"SG\t%s\t%d\t%d\t-\t%s:2\n", chr,start,end,name1); break;
                    case UNRL_0101:
                        fprintf(args->fp,"SG\t%s\t%d\t%d\t%s:1\t%s:2\n", chr,start,end,name1,name1); break;
                    case UNRL_0110:
                        fprintf(args->fp,"SG\t%s\t%d\t%d\t%s:2\t%s:1\n", chr,start,end,name1,name1); break;
                    default:
                        break;  // no previous state yet, or a state without a segment line
                }
            }
            else if ( args->mode==C_TRIO )
            {
                switch (prev_state)
                {
                    case TRIO_AC:
                        fprintf(args->fp,"SG\t%s\t%d\t%d\t%s:1\t%s:1\n", chr,start,end,name1,name3); break;
                    case TRIO_AD:
                        fprintf(args->fp,"SG\t%s\t%d\t%d\t%s:1\t%s:2\n", chr,start,end,name1,name3); break;
                    case TRIO_BC:
                        fprintf(args->fp,"SG\t%s\t%d\t%d\t%s:2\t%s:1\n", chr,start,end,name1,name3); break;
                    case TRIO_BD:
                        fprintf(args->fp,"SG\t%s\t%d\t%d\t%s:2\t%s:2\n", chr,start,end,name1,name3); break;
                    case TRIO_CA:
                        fprintf(args->fp,"SG\t%s\t%d\t%d\t%s:1\t%s:1\n", chr,start,end,name3,name1); break;
                    case TRIO_DA:
                        fprintf(args->fp,"SG\t%s\t%d\t%d\t%s:2\t%s:1\n", chr,start,end,name3,name1); break;
                    case TRIO_CB:
                        fprintf(args->fp,"SG\t%s\t%d\t%d\t%s:1\t%s:2\n", chr,start,end,name3,name1); break;
                    case TRIO_DB:
                        fprintf(args->fp,"SG\t%s\t%d\t%d\t%s:2\t%s:2\n", chr,start,end,name3,name1); break;
                    default:
                        break;  // no previous state yet, or a state without a segment line
                }
                // At the first site prev_state is still -1: there is no
                // transition to classify and -1 must not index hap_switch.
                if ( prev_state >= 0 )
                {
                    if ( hap_switch[state][prev_state] & SW_MOTHER ) nswitch_mother++;
                    if ( hap_switch[state][prev_state] & SW_FATHER ) nswitch_father++;
                }
            }
            iprev = i-1;
        }
        prev_state = state;
    }

    // switch rate = switches per interval between consecutive het sites
    float mrate = args->nhet_mother>1 ? (float)nswitch_mother/(args->nhet_mother-1) : 0;
    float frate = args->nhet_father>1 ? (float)nswitch_father/(args->nhet_father-1) : 0;
    fprintf(args->fp,"SW\t%s\t%s\t%d\t%d\t%f\n", name1,bcf_hdr_id2name(args->hdr,args->prev_rid),args->nhet_mother,nswitch_mother,mrate);
    fprintf(args->fp,"SW\t%s\t%s\t%d\t%d\t%f\n", name3,bcf_hdr_id2name(args->hdr,args->prev_rid),args->nhet_father,nswitch_father,frate);

    args->nsites = 0;
    args->nhet_father = args->nhet_mother = 0;
}
/*
 * Load the reference variant panel from a VCF/BCF file.
 *
 * fvcf : path of the VCF/BCF file handed to the htslib synced reader.
 *
 * A variant is kept when it is biallelic, lies on a chromosome known to
 * findCHR(), has a complete diploid GT field, has a minor allele frequency
 * (computed over the known samples only) of at least threshold_maf, and is
 * less than 1e6 away according to getDistance(). For each kept variant the
 * function records its chromosome index, 1-based position, MAF, distance and
 * the two haplotype alleles of every known sample, and registers the id
 * "<chr>_<pos>" in genotype_uuid. genotype_qtl, genotype_gwas and genotype_bin
 * are then sized to the number of kept variants.
 *
 * Missing genotypes are not supported: they trip the assert below.
 */
void genrich_data::readReferenceGenotypes(string fvcf) {
    vector < int > mappingS;

    //Opening files
    vrb.title("Reading variant list in [" + fvcf + "] MAF=" + stb.str(threshold_maf));
    bcf_srs_t * sr = bcf_sr_init();
    if(!(bcf_sr_add_reader (sr, fvcf.c_str()))) {
        // presumably vrb.error() does not return, so the cases never fall through
        switch (sr->errnum) {
        case not_bgzf: vrb.error("File not compressed with bgzip!");
        case idx_load_failed: vrb.error("Impossible to load index file!");
        case file_type_error: vrb.error("File format not detected by htslib!");
        default : vrb.error("Unknown error!");
        }
    }

    //Sample processing: map each file sample to its internal index (negative when unknown)
    int included_sample = 0;
    int n_samples = bcf_hdr_nsamples(sr->readers[0].header);
    for (int i = 0 ; i < n_samples ; i ++) {
        mappingS.push_back(findSample(string(sr->readers[0].header->samples[i])));
        if (mappingS.back() >= 0) included_sample ++;
    }
    vrb.bullet("#samples = " + stb.str(included_sample));

    //Variant processing
    unsigned int n_excludedV_mult = 0, n_excludedV_void = 0, n_excludedV_rare = 0, n_excludedV_uchr = 0, n_line = 0, n_excludedV_toofar = 0;
    int ngt, ngt_arr = 0, *gt_arr = NULL;
    bcf1_t * line;
    while(bcf_sr_next_line (sr)) {
        line = bcf_sr_get_line(sr, 0);
        if (line->n_allele == 2) {
            bcf_unpack(line, BCF_UN_STR);
            string sid = string(line->d.id);
            string chr = string(bcf_hdr_id2name(sr->readers[0].header, line->rid));
            int chr_idx = findCHR(chr);
            if (chr_idx >= 0) {
                unsigned int pos = line->pos + 1;
                ngt = bcf_get_genotypes(sr->readers[0].header, line, &gt_arr, &ngt_arr);
                if (ngt == 2*n_samples) {
                    // alternate allele frequency over the known samples
                    double freq = 0.0, tot = 0.0;
                    for(int i = 0 ; i < n_samples ; i ++) {
                        assert(gt_arr[2*i+0] != bcf_gt_missing && gt_arr[2*i+1] != bcf_gt_missing);
                        if (mappingS[i] >= 0) {
                            freq += bcf_gt_allele(gt_arr[2*i+0]) + bcf_gt_allele(gt_arr[2*i+1]);
                            tot += 2.0;
                        }
                    }
                    double maf = freq / tot;
                    if (maf > 0.5) maf = 1.0 - maf;
                    if (maf >= threshold_maf) {
                        int dist_tss = getDistance(chr_idx, pos);
                        if (dist_tss < 1e6) {
                            string tmp_id = chr + "_" + stb.str(pos);
                            genotype_uuid.insert(pair < string, unsigned int > (tmp_id, genotype_pos.size()));
                            genotype_chr.push_back(chr_idx);
                            genotype_pos.push_back(pos);
                            genotype_maf.push_back(maf);
                            genotype_dist.push_back(dist_tss);
                            genotype_haps.push_back(vector < bool > (2 * included_sample, false));
                            for(int i = 0 ; i < n_samples ; i ++) {
                                if (mappingS[i] >= 0) {
                                    genotype_haps.back()[2 * mappingS[i] + 0] = bcf_gt_allele(gt_arr[2 * i + 0]);
                                    genotype_haps.back()[2 * mappingS[i] + 1] = bcf_gt_allele(gt_arr[2 * i + 1]);
                                }
                            }
                        } else n_excludedV_toofar++;
                    } else n_excludedV_rare ++;
                } else n_excludedV_void ++;
            } else n_excludedV_uchr ++;
        } else n_excludedV_mult ++;

        if (n_line % 100000 == 0) vrb.bullet("#lines = " + stb.str(n_line));
        n_line ++;
    }
    genotype_qtl = vector < bool > (genotype_pos.size(), false);
    genotype_gwas = vector < bool > (genotype_pos.size(), false);
    genotype_bin = vector < int > (genotype_pos.size(), -1);

    //Finalize: the genotype buffer is allocated by htslib and owned by the caller
    free(gt_arr);
    bcf_sr_destroy(sr);
    vrb.bullet(stb.str(genotype_pos.size()) + " variants included");
    if (n_excludedV_mult > 0) vrb.bullet(stb.str(n_excludedV_mult) + " multi-allelic variants excluded");
    if (n_excludedV_uchr > 0) vrb.bullet(stb.str(n_excludedV_uchr) + " variants with unreferenced chromosome in --tss");
    if (n_excludedV_rare > 0) vrb.bullet(stb.str(n_excludedV_rare) + " maf filtered variants");
    if (n_excludedV_toofar > 0) vrb.bullet(stb.str(n_excludedV_toofar) + " too far variants");
}