/**
 * Fetch the reference sequence `ref_name` from an indexed FASTA and store it
 * in `seq`, one byte per base, using the codes from seq_nt16_table.
 *
 * Gap characters ('-' and '*') are stored as 0 so that the result lines up
 * with what unpad_seq produces. Any character that seq_nt16_table maps to 0
 * ('=') or 16 (unexpected) is rejected.
 *
 * @param fai       FASTA index handle passed to fai_fetch.
 * @param ref_name  Name of the sequence to fetch.
 * @param ref_len   Expected (padded) length of the sequence.
 * @param seq       Output buffer; seq->l is set to ref_len on success.
 * @return 0 on success, -1 on failure (a message is printed to stderr).
 */
int load_unpadded_ref(faidx_t *fai, char *ref_name, int ref_len, kstring_t *seq)
{
    int fai_ref_len = 0;
    int k;
    char *fai_ref = fai_fetch(fai, ref_name, &fai_ref_len);

    /* A NULL result is treated as a length mismatch so it is never dereferenced. */
    if (fai_ref == NULL || fai_ref_len != ref_len) {
        fprintf(stderr, "[depad] ERROR: FASTA sequence %s length %i, expected %i\n", ref_name, fai_ref_len, ref_len);
        free(fai_ref);
        return -1;
    }

    if (ks_resize(seq, ref_len) < 0) {
        fprintf(stderr, "[depad] ERROR: Out of memory loading FASTA sequence %s\n", ref_name);
        free(fai_ref);
        return -1;
    }

    seq->l = 0;
    for (k = 0; k < ref_len; ++k) {
        char base = fai_ref[k];
        if (base == '-' || base == '*') {
            // Map gaps to null to match unpad_seq function
            seq->s[seq->l++] = 0;
        } else {
            /* Index via unsigned char: a plain char may be negative for
             * bytes >= 0x80, which would read before the start of the table. */
            int code = seq_nt16_table[(unsigned char)base];
            if (code == 0 || code == 16) { // Equals maps to 0, anything unexpected to 16
                fprintf(stderr, "[depad] ERROR: Invalid character %c (ASCII %i) in FASTA sequence %s\n", base, (int)base, ref_name);
                free(fai_ref);
                return -1;
            }
            seq->s[seq->l++] = code;
        }
    }
    assert((size_t)ref_len == seq->l);
    free(fai_ref);
    return 0;
}
void call_stdout(marksplit_settings_t *settings, splitterhash_params_t *params, char *ffq_r1, char *ffq_r2) { kstring_t str1{0, 0, nullptr}, str2{0, 0, nullptr}; kstring_t final{0, 0, nullptr}; kputs((settings->gzip_output)? "zcat": "cat", &str1); ks_resize(&str1, 1 << 10); for(int i(0); i < settings->n_handles; ++i) ksprintf(&str1, " %s", params->outfnames_r1[i]); kputsnl(" | paste -d'~' - - - - ", &str1); str2.s = dlib::kstrdup(&str1); // strdup the string. for(uint32_t i(0); i < str2.l; ++i) { LOG_DEBUG("Current str.s + i: %s.\n", str2.s + i); if(memcmp(str2.s + i, "R1", 2) == 0) str2.s[i + 1] = '2'; } const char final_template[]{"pr -mts'~' <(%s) <(%s) | tr '~' '\\n'"}; ksprintf(&final, final_template, str1.s, str2.s); dlib::bash_system(final.s); free(str1.s), free(str2.s); free(final.s); } void cat_fastqs(marksplit_settings_t *settings, splitterhash_params_t *params, char *ffq_r1, char *ffq_r2) { settings->is_se ? cat_fastqs_se(settings, params, ffq_r1) : cat_fastqs_pe(settings, params, ffq_r1, ffq_r2); }
/**
 * Read one BCF record from `fp` into `v` without unpacking it.
 *
 * The record starts with eight 32-bit words:
 *   x[0]    length of the shared block, including six 32-bit header words
 *   x[1]    length of the per-sample (indiv) block
 *   x[2..5] 16 bytes copied verbatim onto the start of *v
 *   x[6]    n_allele in the upper 16 bits, n_info in the lower 16 bits
 *   x[7]    n_fmt in the upper 8 bits, n_sample in the lower 24 bits
 * followed by the shared and indiv payloads.
 *
 * NOTE(review): the words are used in host byte order and the memcpy relies
 * on the layout of the first 16 bytes of bcf1_t — confirm both against the
 * struct definition and the targeted platforms.
 *
 * @return 0 on success, -1 if no bytes could be read (end of file),
 *         -2 on a short read, a malformed length or allocation failure.
 */
static inline int bcf_read1_core(BGZF *fp, bcf1_t *v)
{
    uint32_t x[8];
    ssize_t got;

    got = bgzf_read(fp, x, 32);
    if (got != 32) {
        return got == 0 ? -1 : -2;
    }

    /* The shared length must cover the six header words, else the
     * subtraction below would wrap to a huge value. */
    if (x[0] < 24) return -2;
    x[0] -= 24; // to exclude six 32-bit integers

    if (ks_resize(&v->shared, x[0]) != 0) return -2;
    if (ks_resize(&v->indiv, x[1]) != 0) return -2;

    memcpy(v, x + 2, 16);
    v->n_allele = x[6] >> 16;
    v->n_info   = x[6] & 0xffff;
    v->n_fmt    = x[7] >> 24;
    v->n_sample = x[7] & 0xffffff;
    v->shared.l = x[0];
    v->indiv.l  = x[1];
    v->unpacked = 0;
    v->unpack_ptr = NULL;

    /* A truncated payload must be an error, not a silently half-filled record. */
    got = bgzf_read(fp, v->shared.s, v->shared.l);
    if (got < 0 || (size_t)got != v->shared.l) return -2;
    got = bgzf_read(fp, v->indiv.s, v->indiv.l);
    if (got < 0 || (size_t)got != v->indiv.l) return -2;

    return 0;
}
/*
 * On first use for the current region line (reg->nals == 0), parse the
 * comma-separated allele list found in tab-separated column `als_idx` of
 * reg->line into reg->als / reg->als_str, and set reg->als_type to VCF_INDEL
 * if any allele is longer than one character, otherwise VCF_SNP.
 *
 * Repairs a text-encoding corruption: every "&reg" in this function had been
 * turned into the single character "®", which is not valid C.
 */
static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *rec)
{
    int i = 0, max_len = 0;
    if ( !reg->nals )
    {
        // Advance to the start of column als_idx
        char *ss = reg->line.s;
        while ( i<als_idx && *ss )
        {
            if ( *ss=='\t' ) i++;
            ss++;
        }

        // Count the alleles in that column
        char *se = ss;
        reg->nals = 1;
        while ( *se && *se!='\t' )
        {
            if ( *se==',' ) reg->nals++;
            se++;
        }

        // Reserve room for all alleles plus one terminator each before any
        // pointer into als_str.s is stored in reg->als
        ks_resize(&reg->als_str, se-ss+1+reg->nals);
        reg->als_str.l = 0;
        hts_expand(char*,reg->nals,reg->mals,reg->als);
        reg->nals = 0;

        // Copy each comma-terminated allele, keeping a terminator between them
        se = ss;
        while ( *(++se) )
        {
            if ( *se=='\t' ) break;
            if ( *se!=',' ) continue;
            reg->als[reg->nals] = &reg->als_str.s[reg->als_str.l];
            kputsn(ss,se-ss,&reg->als_str);
            if ( &reg->als_str.s[reg->als_str.l] - reg->als[reg->nals] > max_len ) max_len = &reg->als_str.s[reg->als_str.l] - reg->als[reg->nals];
            reg->als_str.l++;
            reg->nals++;
            ss = ++se;
        }

        // Last (or only) allele in the column
        reg->als[reg->nals] = &reg->als_str.s[reg->als_str.l];
        kputsn(ss,se-ss,&reg->als_str);
        if ( &reg->als_str.s[reg->als_str.l] - reg->als[reg->nals] > max_len ) max_len = &reg->als_str.s[reg->als_str.l] - reg->als[reg->nals];
        reg->nals++;
        reg->als_type = max_len > 1 ? VCF_INDEL : VCF_SNP;    // this is a simplified check, see vcf.c:bcf_set_variant_types
    }
    /* NOTE(review): the function is cut off at this point in this file — no
     * return statement or closing brace follows, and `rec` is never used.
     * Restore the remainder from the upstream source. */
/*
 * Expand the aligned part of read `b` into `s`, one byte per padded
 * reference column: the 4-bit base code for each M column and 0 for each
 * D column. Soft-clipped bases are skipped. s->l is set to the number of
 * columns written.
 *
 * Only M, D and S CIGAR operations are accepted (enforced by assert).
 *
 * Fixes two defects:
 *  - the query index was advanced once per M operation, not once per base,
 *    so every column of an M run repeated the same base;
 *  - the buffer was sized from l_qseq, which counts soft clips but not
 *    deletions, so it could be smaller than the M+D output.
 */
static void unpad_seq(bam1_t *b, kstring_t *s)
{
    int k, j, i;
    size_t padded_len = 0;
    uint32_t *cigar = bam1_cigar(b);
    uint8_t *seq = bam1_seq(b);

    /* First pass: count the output columns (M and D) to size the buffer. */
    for (k = 0; k < b->core.n_cigar; ++k) {
        int op = bam_cigar_op(cigar[k]);
        if (op == BAM_CMATCH || op == BAM_CDEL)
            padded_len += bam_cigar_oplen(cigar[k]);
    }
    ks_resize(s, padded_len);

    /* Second pass: j walks the query bases, s->l the output columns. */
    for (k = 0, s->l = 0, j = 0; k < b->core.n_cigar; ++k) {
        int op, ol;
        op = bam_cigar_op(cigar[k]);
        ol = bam_cigar_oplen(cigar[k]);
        assert(op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CSOFT_CLIP);
        if (op == BAM_CMATCH) {
            for (i = 0; i < ol; ++i, ++j)
                s->s[s->l++] = bam1_seqi(seq, j);
        } else if (op == BAM_CSOFT_CLIP) {
            j += ol;
        } else {
            for (i = 0; i < ol; ++i)
                s->s[s->l++] = 0;
        }
    }
}
/*
 * Expand the aligned part of read `b` into `s`, one byte per padded
 * reference column: the 4-bit base code for M/=/X columns and 0 for D
 * (and N) columns. Soft clips advance the query position without producing
 * output; hard clips are ignored.
 *
 * Returns 0 on success. Returns -1 if the buffer cannot be allocated or an
 * unexpected CIGAR operation is met (a message is printed to stderr), and 1
 * if the number of columns written differs from the length computed by
 * bam_cigar2rlen.
 */
static int unpad_seq(bam1_t *b, kstring_t *s)
{
    int k, j, i;
    int length;
    int cigar_n_warning = 0; /* Make this a global and limit to one CIGAR N warning? */
    uint32_t *cigar = bam_get_cigar(b);
    uint8_t *seq = bam_get_seq(b);

    // b->core.l_qseq gives length of the SEQ entry (including soft clips, S)
    // We need the padded length after alignment from the CIGAR (excluding
    // soft clips S, but including pads from CIGAR D operations)
    length = bam_cigar2rlen(b->core.n_cigar, cigar);
    if (ks_resize(s, length) < 0) {
        fprintf(stderr, "[depad] ERROR: Out of memory unpadding read %s\n", bam_get_qname(b));
        return -1;
    }

    s->l = 0;
    j = 0;
    for (k = 0; k < b->core.n_cigar; ++k) {
        const int op = bam_cigar_op(cigar[k]);
        const int ol = bam_cigar_oplen(cigar[k]);

        switch (op) {
        case BAM_CMATCH:
        case BAM_CEQUAL:
        case BAM_CDIFF:
            for (i = 0; i < ol; ++i, ++j)
                s->s[s->l++] = bam_seqi(seq, j);
            break;
        case BAM_CSOFT_CLIP:
            j += ol;
            break;
        case BAM_CHARD_CLIP:
            /* do nothing */
            break;
        case BAM_CDEL:
            for (i = 0; i < ol; ++i)
                s->s[s->l++] = 0;
            break;
        case BAM_CREF_SKIP:
            /* Treat CIGAR N as D (not ideal, but better than ignoring it) */
            for (i = 0; i < ol; ++i)
                s->s[s->l++] = 0;
            if (0 == cigar_n_warning) {
                cigar_n_warning = -1;
                fprintf(stderr, "[depad] WARNING: CIGAR op N treated as op D in read %s\n", bam_get_qname(b));
            }
            break;
        default:
            fprintf(stderr, "[depad] ERROR: Didn't expect CIGAR op %c in read %s\n", BAM_CIGAR_STR[op], bam_get_qname(b));
            return -1;
        }
    }
    return (size_t)length != s->l;
}
/*
 * Apply one variant record to the in-memory FASTA buffer args->fa_buf.
 *
 * One allele index (ialt) is chosen from the record, the REF allele is
 * checked against the buffer, and the REF span in the buffer is replaced by
 * the chosen allele. Afterwards fa_buf.l, fa_mod_off and fa_frz_pos are
 * updated, and a chain gap is pushed when args->chain is set and the length
 * changed.
 *
 * The record is skipped (nothing modified) when it has a single allele, when
 * it starts at or before args->fa_frz_pos, when it overlaps args->mask, when
 * the GT field is absent or the decoded genotype value is missing, or when
 * the selected allele is the reference.
 *
 * Note that the record itself is modified: the chosen allele may be
 * overwritten with an IUPAC code and is case-converted in place.
 */
static void apply_variant(args_t *args, bcf1_t *rec)
{
    if ( rec->n_allele==1 ) return;

    // Records starting at or before the last modified position are skipped
    if ( rec->pos <= args->fa_frz_pos )
    {
        fprintf(pysamerr,"The site %s:%d overlaps with another variant, skipping...\n", bcf_seqname(args->hdr,rec),rec->pos+1);
        return;
    }
    // Skip records whose REF span [pos, pos+rlen-1] overlaps the mask
    if ( args->mask )
    {
        char *chr = (char*)bcf_hdr_id2name(args->hdr,args->rid);
        int start = rec->pos;
        int end   = rec->pos + rec->rlen - 1;
        if ( regidx_overlap(args->mask, chr,start,end,NULL) ) return;
    }

    // Default to the first ALT; overridden below when a sample is selected
    int i, ialt = 1;
    if ( args->isample >= 0 )
    {
        bcf_fmt_t *fmt = bcf_get_fmt(args->hdr, rec, "GT");
        if ( !fmt ) return;
        if ( args->haplotype )
        {
            // Use the allele of the requested (1-based) haplotype
            if ( args->haplotype > fmt->n ) error("Can't apply %d-th haplotype at %s:%d\n", args->haplotype,bcf_seqname(args->hdr,rec),rec->pos+1);
            uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample + args->haplotype - 1;
            ialt = bcf_dec_int1(ptr, fmt->type, &ignore);
            if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
            ialt = bcf_gt_allele(ialt);
        }
        else if ( args->output_iupac )
        {
            // Decode the first two genotype alleles; a missing or absent
            // second allele falls back to the first
            uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample;
            ialt = bcf_dec_int1(ptr, fmt->type, &ignore);
            if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
            ialt = bcf_gt_allele(ialt);

            int jalt;
            if ( fmt->n>1 )
            {
                ptr = fmt->p + fmt->size*args->isample + 1;
                jalt = bcf_dec_int1(ptr, fmt->type, &ignore);
                if ( bcf_gt_is_missing(jalt) || jalt==bcf_int32_vector_end ) jalt = ialt;
                else jalt = bcf_gt_allele(jalt);
            }
            else jalt = ialt;
            if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
            if ( ialt!=jalt && !rec->d.allele[ialt][1] && !rec->d.allele[jalt][1] ) // is this a het snp?
            {
                // Overwrite allele ialt in the record with the combined code
                char ial = rec->d.allele[ialt][0];
                char jal = rec->d.allele[jalt][0];
                rec->d.allele[ialt][0] = gt2iupac(ial,jal);
            }
        }
        else
        {
            // Take the first non-reference allele in the genotype
            for (i=0; i<fmt->n; i++)
            {
                uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample + i;
                ialt = bcf_dec_int1(ptr, fmt->type, &ignore);
                if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
                ialt = bcf_gt_allele(ialt);
                if ( ialt ) break;
            }
        }
        if ( !ialt ) return;  // ref allele
        if ( rec->n_allele <= ialt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
    }
    else if ( args->output_iupac && !rec->d.allele[0][1] && !rec->d.allele[1][1] )
    {
        // No sample selected: combine single-base REF and first ALT into ALT
        char ial = rec->d.allele[0][0];
        char jal = rec->d.allele[1][0];
        rec->d.allele[1][0] = gt2iupac(ial,jal);
    }

    // Offset of the record in the (already modified) buffer
    int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off;
    // NOTE(review): fa_buf.l is passed for a %d conversion below; if it is a
    // size_t the format and argument types do not match — confirm.
    if ( idx<0 || idx>=args->fa_buf.l )
        error("FIXME: %s:%d .. idx=%d, ori_pos=%d, len=%d, off=%d\n",bcf_seqname(args->hdr,rec),rec->pos+1,idx,args->fa_ori_pos,args->fa_buf.l,args->fa_mod_off);

    // sanity check the reference base
    int len_diff = 0, alen = 0;
    if ( rec->d.allele[ialt][0]=='<' )
    {
        if ( strcasecmp(rec->d.allele[ialt], "<DEL>") )
            error("Symbolic alleles other than <DEL> are currently not supported: %s at %s:%d\n",rec->d.allele[ialt],bcf_seqname(args->hdr,rec),rec->pos+1);
        assert( rec->d.allele[0][1]==0 );           // todo: for now expecting strlen(REF) = 1
        len_diff = 1-rec->rlen;
        // The ALT pointer is redirected to REF, so the case conversion below
        // also rewrites the REF allele text
        rec->d.allele[ialt] = rec->d.allele[0];     // according to VCF spec, REF must precede the event
        alen = strlen(rec->d.allele[ialt]);
    }
    else if ( strncasecmp(rec->d.allele[0],args->fa_buf.s+idx,rec->rlen) )
    {
        // fprintf(pysamerr,"%d .. [%s], idx=%d ori=%d off=%d\n",args->fa_ori_pos,args->fa_buf.s,idx,args->fa_ori_pos,args->fa_mod_off);
        // Temporarily terminate the buffer after the REF span so the message
        // can show the mismatching bases separately from what follows
        char tmp = 0;
        if ( args->fa_buf.l - idx > rec->rlen )
        {
            tmp = args->fa_buf.s[idx+rec->rlen];
            args->fa_buf.s[idx+rec->rlen] = 0;
        }
        error(
            "The fasta sequence does not match the REF allele at %s:%d:\n"
            " .vcf: [%s]\n"
            " .vcf: [%s] <- (ALT)\n"
            " .fa: [%s]%c%s\n",
            bcf_seqname(args->hdr,rec),rec->pos+1, rec->d.allele[0], rec->d.allele[ialt], args->fa_buf.s+idx,
            tmp?tmp:' ',tmp?args->fa_buf.s+idx+rec->rlen+1:""
            );
    }
    else
    {
        alen = strlen(rec->d.allele[ialt]);
        len_diff = alen - rec->rlen;
    }

    // Convert the allele to the case selected by args->fa_case
    // NOTE(review): toupper/tolower receive a plain char; bytes >= 0x80 would
    // be out of range on platforms where char is signed.
    if ( args->fa_case )
        for (i=0; i<alen; i++) rec->d.allele[ialt][i] = toupper(rec->d.allele[ialt][i]);
    else
        for (i=0; i<alen; i++) rec->d.allele[ialt][i] = tolower(rec->d.allele[ialt][i]);

    if ( len_diff <= 0 )
    {
        // deletion or same size event
        for (i=0; i<alen; i++)
            args->fa_buf.s[idx+i] = rec->d.allele[ialt][i];
        // NOTE(review): only idx < fa_buf.l was checked above; if the REF
        // span runs past the end of the buffer this byte count goes negative
        // (or wraps) — confirm callers guarantee idx+rlen <= fa_buf.l.
        if ( len_diff )
            memmove(args->fa_buf.s+idx+alen,args->fa_buf.s+idx+rec->rlen,args->fa_buf.l-idx-rec->rlen);
    }
    else
    {
        // insertion
        // Grow the buffer, shift the tail right by len_diff, then write ALT
        ks_resize(&args->fa_buf, args->fa_buf.l + len_diff);
        memmove(args->fa_buf.s + idx + rec->rlen + len_diff, args->fa_buf.s + idx + rec->rlen, args->fa_buf.l - idx - rec->rlen);
        for (i=0; i<alen; i++)
            args->fa_buf.s[idx+i] = rec->d.allele[ialt][i];
    }
    if (args->chain && len_diff != 0)
    {
        // If first nucleotide of both REF and ALT are the same... (indels typically include the nucleotide before the variant)
        if ( strncasecmp(rec->d.allele[0],rec->d.allele[ialt],1) == 0)
        {
            // ...extend the block by 1 bp: start is 1 bp further and alleles are 1 bp shorter
            push_chain_gap(args->chain, rec->pos + 1, rec->rlen - 1, rec->pos + 1 + args->fa_mod_off, alen - 1);
        }
        else
        {
            // otherwise, just the coordinates of the variant as given
            push_chain_gap(args->chain, rec->pos, rec->rlen, rec->pos + args->fa_mod_off, alen);
        }
    }
    // Record the new buffer length, the cumulative offset shift, and the last
    // original position touched by this variant
    args->fa_buf.l += len_diff;
    args->fa_mod_off += len_diff;
    args->fa_frz_pos  = rec->pos + rec->rlen - 1;
}
/*
 * Append a textual representation of `d` to the string `s`.
 *
 * Zero is written as "0" or "-0". Magnitudes outside [0.0001, 999999]
 * (including NaN, which fails the range test) are delegated to
 * sprintf("%g"). Everything else is formatted by hand: the value is scaled
 * to a fixed-point integer with 10 fractional digits, rounded, cut to 6
 * significant digits, and trailing zeros after the decimal point are removed.
 *
 * Returns the number of characters appended, or EOF if the buffer could not
 * be grown for the "%g" path. The results of kputc/kputsn are not checked.
 */
int kputd(double d, kstring_t *s) {
	int len = 0;
	// Digits are generated backwards from the end of buf; cp tracks the
	// first character of the text, ep the last character kept.
	char buf[21], *cp = buf+20, *ep;
	if (d == 0) {
		// signbit distinguishes negative zero from positive zero
		if (signbit(d)) {
			kputsn("-0",2,s);
			return 2;
		} else {
			kputsn("0",1,s);
			return 1;
		}
	}

	// Emit the sign now and continue with the magnitude
	if (d < 0) {
		kputc('-',s);
		len = 1;
		d=-d;
	}
	if (!(d >= 0.0001 && d <= 999999)) {
		// Reserve 50 bytes of room for the sprintf output
		if (ks_resize(s, s->l + 50) < 0)
			return EOF;
		// We let stdio handle the exponent cases
		int s2 = sprintf(s->s + s->l, "%g", d);
		len += s2;
		s->l += s2;
		return len;
	}

	// Fixed-point value: d scaled by 10^10, truncated to an integer
	uint64_t i = d*10000000000LL;
	// Correction for rounding - rather ugly
	// (adds half a unit of the 6th significant digit for each decade; the
	// first test cannot be true here because d >= 0.0001)

	// Optimised for small numbers.
	// Better still would be __builtin_clz on hi/lo 32 and get the
	// starting point very rapidly.
	if (d<.0001)
		i+=0;
	else if (d<0.001)
		i+=5;
	else if (d < 0.01)
		i+=50;
	else if (d < 0.1)
		i+=500;
	else if (d < 1)
		i+=5000;
	else if (d < 10)
		i+=50000;
	else if (d < 100)
		i+=500000;
	else if (d < 1000)
		i+=5000000;
	else if (d < 10000)
		i+=50000000;
	else if (d < 100000)
		i+=500000000;
	else
		i+=5000000000LL;

	// Write the decimal digits of i, least significant first, ending at buf+19
	do {
		*--cp = '0' + i%10;
		i /= 10;
	} while (i >= 1);
	buf[20] = 0;
	// p = number of digits produced; the last 10 of them are fractional
	int p = buf+20-cp;
	if (p <= 10) { // d < 1
		//assert(d/1);
		// Keep 6 significant digits, then left-pad with zeros to 10
		// fractional digits and prepend "0."
		cp[6] = 0; ep = cp+5;// 6 precision
		while (p < 10) {
			*--cp = '0';
			p++;
		}
		*--cp = '.';
		*--cp = '0';
	} else {
		// Shift the integer digits one place left to open a slot for '.'
		char *xp = --cp;
		while (p > 10) {
			xp[0] = xp[1];
			p--;
			xp++;
		}
		xp[0] = '.';
		// 6 digits plus the '.' are kept; if the '.' would come last
		// (6 integer digits) it is dropped as well
		cp[7] = 0; ep=cp+6;
		if (cp[6] == '.') cp[6] = 0;
	}

	// Cull trailing zeros
	while (*ep == '0' && ep > cp)
		ep--;
	// z marks where the text would end once the trailing zeros are dropped.
	// The cut is applied only if a '.' lies at or before that point, so
	// zeros belonging to an integer part are never removed.
	char *z = ep+1;
	while (ep > cp) {
		if (*ep == '.') {
			// Drop a '.' left dangling at the end, else cut after the last non-zero digit
			if (z[-1] == '.')
				z[-1] = 0;
			else
				z[0] = 0;
			break;
		}
		ep--;
	}

	int sl = strlen(cp);
	len += sl;
	kputsn(cp, sl, s);
	return len;
}
static void lt_proxy_loop_read(void* arg) { proxy_conn_t *proxy = (proxy_conn_t*)arg; proxy_conn_t *other = proxy->other; proxy_server *srv = &g_srv; kstring_t *buf = NULL; int ret, n; lthread_chan_t* chans[3]; void* msgs[3]; lthread_sel_t* sel = lthread_sel_create(); chans[0] = other->write_ch; chans[1] = proxy->die_ch; chans[2] = srv->die_ch; fprintf(stderr, "%s#%d started\n", __FUNCTION__, __LINE__); while(!proxy->is_die) { buf = (kstring_t*)calloc(1, sizeof(kstring_t)); ks_resize(buf, 4096); n = net_conn_read(&proxy->conn.base, (uint8_t*)buf->s, (int)buf->m, (uint64_t)10000); if(n <= 0) { fprintf(stderr, "recv error n=%d priv=%d\n", n, proxy->is_priv); break; } else { buf->l = n; msgs[0] = buf; ret = chan_select(sel, chans, msgs, 1, 2, (uint64_t)10000); switch(ret) { case 0: //send ok buf = NULL; //fprintf(stderr, "send new writer\n"); break; case 1: //proxy die ret = -10; break; case 2: //server die ret = -11; break; default: break; } if(ret < 0) { fprintf(stderr, "%s#%d error ret=%d is_priv=%d\n", __FUNCTION__, __LINE__, ret, proxy->is_priv); break; } } } if(NULL != buf) { ks_free(buf); free(buf); buf = NULL; } chan_close(other->write_ch); if(!proxy->is_die) { proxy->is_die = 1; chan_close(proxy->die_ch); } lthread_sel_dispose(sel); }