// @param vcf_pos is 0-based // @param prev_base is -1 if SNP otherwise previous base // @param next_base is -1 unless indel at position 0 static void print_vcf_entry(size_t vcf_pos, int8_t prev_base, int8_t next_base, const char *ref, const char *alt, size_t len, const uint8_t *gts, size_t nsamples, CallDecomp *dc, const AlignedCall *call, size_t max_allele_len) { dc->stats.nvars++; StrBuf *sbuf = &dc->sbuf; strbuf_reset(sbuf); // Check actual allele length size_t i, alt_bases = 0; for(i = 0; i < len; i++) alt_bases += (alt[i] != '-'); if(alt_bases > max_allele_len) { dc->stats.nallele_too_long++; return; } // CHROM POS ID REF ALT QUAL FILTER INFO strbuf_append_str(sbuf, call->chrom->name.b); strbuf_append_char(sbuf, '\t'); strbuf_append_ulong(sbuf, vcf_pos+1); strbuf_append_str(sbuf, "\t.\t"); print_vcf_allele(ref, len, prev_base, next_base, sbuf); strbuf_append_char(sbuf, '\t'); print_vcf_allele(alt, len, prev_base, next_base, sbuf); strbuf_append_str(sbuf, "\t.\tPASS\t"); strbuf_append_str(sbuf, call->info.b ? call->info.b : "."); strbuf_append_str(sbuf, "\tGT"); // Print genotypes for(i = 0; i < nsamples; i++) { strbuf_append_char(sbuf, '\t'); strbuf_append_char(sbuf, gts[i] ? '1' : '.'); } strbuf_append_char(sbuf, '\n'); // fprintf(stderr, " prev_base:%i next_base:%i info:%s\n", prev_base, next_base, call->info.b); // fprintf(stderr, "%s [%zu vs %zu]\n", sbuf->b, sbuf->end, strlen(sbuf->b)); kstring_t ks = {.l = sbuf->end, .m = sbuf->size, .s = sbuf->b}; if(vcf_parse(&ks, dc->vcfhdr, dc->v) != 0) die("Cannot construct VCF entry: %s", sbuf->b); if(bcf_write(dc->vcffh, dc->vcfhdr, dc->v) != 0) die("Cannot write VCF entry [nsamples: %zu vs %zu]", nsamples, (size_t)bcf_hdr_nsamples(dc->vcfhdr)); // Move back into our string buffer sbuf->b = ks.s; sbuf->size = ks.m; dc->stats.nvars_printed++; } // `ref` and `alt` are aligned alleles - should both be same length strings // of 'ACGT-' // return first mismatch position or -1 static int align_get_start(const char *ref, const char *alt) { const char *start = ref; while(*ref) { if(*ref != *alt) return (ref - start); ref++; alt++; } return -1; } // `ref` and `alt` are aligned alleles - should both be same length strings // of 'ACGT-' // return first matching position static int align_get_end(const char *ref, const char *alt) { int i = 0; while(ref[i] && ref[i] != alt[i]) i++; return i; }
/** * Print paths to a string buffer. Paths are sorted before being written. * * @param hkey All paths associated with hkey are written to the buffer * @param sbuf paths are written this string buffer * @param subset is a temp variable that is reused each time * @param nbuf temporary buffer, if not NULL, used to add seq=... to output * @param jposbuf temporary buffer, if not NULL, used to add juncpos=... to output */ void gpath_save_sbuf(hkey_t hkey, StrBuf *sbuf, GPathSubset *subset, dBNodeBuffer *nbuf, SizeBuffer *jposbuf, const dBGraph *db_graph) { ctx_assert(db_graph->num_of_cols == 1 || nbuf == NULL); ctx_assert(db_graph->num_of_cols == 1 || jposbuf == NULL); const GPathStore *gpstore = &db_graph->gpstore; const GPathSet *gpset = &gpstore->gpset; const size_t ncols = gpstore->gpset.ncols; GPath *first_gpath = gpath_store_fetch(gpstore, hkey); const GPath *gpath; size_t i, j, col; // Load and sort paths for given kmer gpath_subset_reset(subset); gpath_subset_load_llist(subset, first_gpath); gpath_subset_sort(subset); if(subset->list.len == 0) return; // Print "<kmer> <npaths>" BinaryKmer bkmer = db_graph->ht.table[hkey]; char bkstr[MAX_KMER_SIZE+1]; binary_kmer_to_str(bkmer, db_graph->kmer_size, bkstr); // strbuf_sprintf(sbuf, "%s %zu\n", bkstr, subset->list.len); strbuf_append_strn(sbuf, bkstr, db_graph->kmer_size); strbuf_append_char(sbuf, ' '); strbuf_append_ulong(sbuf, subset->list.len); strbuf_append_char(sbuf, '\n'); char orchar[2] = {0}; orchar[FORWARD] = 'F'; orchar[REVERSE] = 'R'; const uint8_t *nseenptr; for(i = 0; i < subset->list.len; i++) { gpath = subset->list.b[i]; nseenptr = gpath_set_get_nseen(gpset, gpath); // strbuf_sprintf(sbuf, "%c %zu %u %u", orchar[gpath->orient], klen, // gpath->num_juncs, (uint32_t)nseenptr[0]); strbuf_append_char(sbuf, orchar[gpath->orient]); strbuf_append_char(sbuf, ' '); strbuf_append_ulong(sbuf, gpath->num_juncs); strbuf_append_char(sbuf, ' '); strbuf_append_ulong(sbuf, nseenptr[0]); for(col = 1; col < ncols; col++) { // strbuf_sprintf(sbuf, ",%u", (uint32_t)nseenptr[col]); strbuf_append_char(sbuf, ','); strbuf_append_ulong(sbuf, nseenptr[col]); } strbuf_append_char(sbuf, ' '); strbuf_ensure_capacity(sbuf, sbuf->end + gpath->num_juncs + 2); binary_seq_to_str(gpath->seq, gpath->num_juncs, sbuf->b+sbuf->end); sbuf->end += gpath->num_juncs; if(nbuf) { // Trace this path through the graph // First, find a colour this path is in for(col = 0; col < ncols && !gpath_has_colour(gpath, ncols, col); col++) {} if(col == ncols) die("path is not in any colours"); dBNode node = {.key = hkey, .orient = gpath->orient}; db_node_buf_reset(nbuf); if(jposbuf) size_buf_reset(jposbuf); // indices of junctions in nbuf gpath_fetch(node, gpath, nbuf, jposbuf, col, db_graph); strbuf_append_str(sbuf, " seq="); strbuf_ensure_capacity(sbuf, sbuf->end + db_graph->kmer_size + nbuf->len); sbuf->end += db_nodes_to_str(nbuf->b, nbuf->len, db_graph, sbuf->b+sbuf->end); if(jposbuf) { strbuf_append_str(sbuf, " juncpos="); strbuf_append_ulong(sbuf, jposbuf->b[0]); for(j = 1; j < jposbuf->len; j++) { strbuf_append_char(sbuf, ','); strbuf_append_ulong(sbuf, jposbuf->b[j]); } } } strbuf_append_char(sbuf, '\n'); } } // @subset is a temp variable that is reused each time // @sbuf is a temp variable that is reused each time static inline int _gpath_gzsave_node(hkey_t hkey, StrBuf *sbuf, GPathSubset *subset, dBNodeBuffer *nbuf, SizeBuffer *jposbuf, gzFile gzout, pthread_mutex_t *outlock, const dBGraph *db_graph) { gpath_save_sbuf(hkey, sbuf, subset, nbuf, jposbuf, db_graph); if(sbuf->end > DEFAULT_IO_BUFSIZE) _gpath_save_flush(gzout, sbuf, outlock); return 0; // => keep iterating }
// Potential bubble - filter ref and duplicate alleles static void print_bubble(BubbleCaller *caller, GCacheStep **steps, size_t num_paths) { const BubbleCallingPrefs prefs = caller->prefs; const dBGraph *db_graph = caller->db_graph; GCacheSnode *snode; size_t i; dBNodeBuffer *flank5p = &caller->flank5p; if(flank5p->len == 0) { // Haven't fetched 5p flank yet // flank5p[0] already contains the first node flank5p->len = 1; supernode_extend(flank5p, prefs.max_flank_len, db_graph); db_nodes_reverse_complement(flank5p->b, flank5p->len); } // // Print Bubble // // write to string buffer then flush to gzFile StrBuf *sbuf = &caller->output_buf; strbuf_reset(sbuf); // Temporary node buffer to use dBNodeBuffer *pathbuf = &caller->pathbuf; db_node_buf_reset(pathbuf); // Get bubble number (threadsafe num_bubbles_ptr++) size_t id = __sync_fetch_and_add((volatile size_t*)caller->num_bubbles_ptr, 1); // This can be set to anything without a '.' in it const char prefix[] = "call"; // 5p flank // strbuf_sprintf(sbuf, ">bubble.%s%zu.5pflank kmers=%zu\n", prefix, id, flank5p->len); strbuf_append_str(sbuf, ">bubble."); strbuf_append_str(sbuf, prefix); strbuf_append_ulong(sbuf, id); strbuf_append_str(sbuf, ".5pflank kmers="); strbuf_append_ulong(sbuf, flank5p->len); strbuf_append_char(sbuf, '\n'); branch_to_str(flank5p->b, flank5p->len, true, sbuf, db_graph); // 3p flank db_node_buf_reset(pathbuf); snode = graph_cache_snode(&caller->cache, steps[0]->supernode); graph_cache_snode_fetch_nodes(&caller->cache, snode, steps[0]->orient, pathbuf); // strbuf_sprintf(sbuf, ">bubble.%s%zu.3pflank kmers=%zu\n", prefix, id, pathbuf->len); strbuf_append_str(sbuf, ">bubble."); strbuf_append_str(sbuf, prefix); strbuf_append_ulong(sbuf, id); strbuf_append_str(sbuf, ".3pflank kmers="); strbuf_append_ulong(sbuf, pathbuf->len); strbuf_append_char(sbuf, '\n'); branch_to_str(pathbuf->b, pathbuf->len, false, sbuf, db_graph); // Print alleles for(i = 0; i < num_paths; i++) { db_node_buf_reset(pathbuf); graph_cache_step_fetch_nodes(&caller->cache, steps[i], pathbuf); // strbuf_sprintf(sbuf, ">bubble.%s%zu.branch.%zu kmers=%zu\n", // prefix, id, i, pathbuf->len); strbuf_append_str(sbuf, ">bubble."); strbuf_append_str(sbuf, prefix); strbuf_append_ulong(sbuf, id); strbuf_append_str(sbuf, ".branch."); strbuf_append_ulong(sbuf, i); strbuf_append_str(sbuf, " kmers="); strbuf_append_ulong(sbuf, pathbuf->len); strbuf_append_char(sbuf, '\n'); branch_to_str(pathbuf->b, pathbuf->len, false, sbuf, db_graph); } strbuf_append_char(sbuf, '\n'); ctx_assert(strlen(sbuf->b) == sbuf->end); // lock, print, unlock pthread_mutex_lock(caller->out_lock); gzwrite(caller->gzout, sbuf->b, sbuf->end); pthread_mutex_unlock(caller->out_lock); }