Exemple #1
0
/*
 * Fetch the sequence named ref_name through the FASTA index and store it in
 * seq, one byte per base, translated through seq_nt16_table.  Gap characters
 * ('-' and '*') are stored as 0 so the result lines up with unpad_seq output.
 *
 *   fai       FASTA index handle handed to fai_fetch()
 *   ref_name  name of the sequence to load
 *   ref_len   length the fetched sequence must have
 *   seq       output buffer; resized to ref_len and overwritten from offset 0
 *
 * Returns 0 on success.  Returns -1 (after printing a message to stderr) when
 * the sequence could not be fetched, its length differs from ref_len, or it
 * contains a character that seq_nt16_table maps to 0 or 16.
 */
int load_unpadded_ref(faidx_t *fai, char *ref_name, int ref_len, kstring_t *seq)
{
    char base;
    char *fai_ref = 0;
    int fai_ref_len = 0, k;

    fai_ref = fai_fetch(fai, ref_name, &fai_ref_len);
    // A failed fetch is reported the same way as a length mismatch.
    if (!fai_ref || fai_ref_len != ref_len) {
        fprintf(stderr, "[depad] ERROR: FASTA sequence %s length %i, expected %i\n", ref_name, fai_ref_len, ref_len);
        free(fai_ref);
        return -1;
    }
    ks_resize(seq, ref_len);
    seq->l = 0;
    for (k = 0; k < ref_len; ++k) {
        base = fai_ref[k];
        if (base == '-' || base == '*') {
            // Map gaps to null to match unpad_seq function
            seq->s[seq->l++] = 0;
        } else {
            // Index through unsigned char: plain char may be signed, and a
            // byte >= 0x80 would otherwise become a negative table index.
            int i = seq_nt16_table[(unsigned char)base];
            if (i == 0 || i==16) { // Equals maps to 0, anything unexpected to 16
                fprintf(stderr, "[depad] ERROR: Invalid character %c (ASCII %i) in FASTA sequence %s\n", base, (int)base, ref_name);
                free(fai_ref);
                return -1;
            }
            seq->s[seq->l++] = i;
        }
    }
    assert(ref_len == seq->l);
    free(fai_ref);
    return 0;
}
Exemple #2
0
/*
 * Build and run a bash pipeline that writes the split output files to stdout.
 *
 * A command of the form "<cat|zcat> <r1 files...> | paste -d'~' - - - - " is
 * assembled from params->outfnames_r1; a copy of that command with every
 * "R1" rewritten to "R2" is made for the second stream, and the two are
 * merged with `pr -mts'~'` and expanded back with `tr '~' '\n'`.
 *
 *   settings  supplies gzip_output (zcat vs cat) and n_handles (file count)
 *   params    supplies outfnames_r1, the read-1 output file names
 *   ffq_r1, ffq_r2  not used by this function
 *
 * NOTE(review): file names are interpolated into the shell command unquoted;
 * names containing whitespace or shell metacharacters will break the command.
 */
void call_stdout(marksplit_settings_t *settings, splitterhash_params_t *params, char *ffq_r1, char *ffq_r2)
{
    kstring_t str1{0, 0, nullptr}, str2{0, 0, nullptr};
    kstring_t final{0, 0, nullptr};
    kputs((settings->gzip_output)? "zcat": "cat", &str1);
    ks_resize(&str1, 1 << 10);
    for(int i(0); i < settings->n_handles; ++i)
        ksprintf(&str1, " %s", params->outfnames_r1[i]);
    kputsnl(" | paste -d'~' - - - - ", &str1);
    str2.s = dlib::kstrdup(&str1); // strdup the string.
    // The duplicate only provides the character data; record its length too,
    // otherwise the rewrite loop below never executes and both streams read R1.
    str2.l = str2.s ? str1.l : 0;
    // Stop one short of the end so the two-byte compare stays inside the text.
    for(uint32_t i(0); i + 1 < str2.l; ++i) {
        if(memcmp(str2.s + i, "R1", 2) == 0)
            str2.s[i + 1] = '2';
    }

    const char final_template[]{"pr -mts'~' <(%s) <(%s) | tr '~' '\\n'"};
    ksprintf(&final, final_template, str1.s, str2.s);
    dlib::bash_system(final.s);
    free(str1.s), free(str2.s);
    free(final.s);
}

/*
 * Dispatch to the single-end or paired-end concatenation routine according
 * to settings->is_se.  ffq_r2 is only forwarded in the paired-end case.
 */
void cat_fastqs(marksplit_settings_t *settings, splitterhash_params_t *params, char *ffq_r1, char *ffq_r2)
{
    if(settings->is_se) {
        cat_fastqs_se(settings, params, ffq_r1);
        return;
    }
    cat_fastqs_pe(settings, params, ffq_r1, ffq_r2);
}
Exemple #3
0
/*
 * Read one BCF record from fp into v without unpacking it.
 *
 * The record starts with eight 32-bit words:
 *   x[0]     length of the shared block, including 24 bytes of fixed fields
 *   x[1]     length of the per-sample (indiv) block
 *   x[2..5]  16 bytes copied verbatim over the start of *v
 *   x[6]     n_allele in the high 16 bits, n_info in the low 16 bits
 *   x[7]     n_fmt in the high 8 bits, n_sample in the low 24 bits
 * followed by the shared block and the indiv block, which are read into
 * v->shared and v->indiv.
 *
 * Returns 0 on success, -1 when the first read returns 0 bytes, and -2 on a
 * short read, an impossible shared length, or a failed buffer resize.
 */
static inline int bcf_read1_core(BGZF *fp, bcf1_t *v)
{
	uint32_t x[8];
	int ret;
	if ((ret = bgzf_read(fp, x, 32)) != 32) {
		if (ret == 0) return -1;
		return -2;
	}
	// A shared length below 24 would wrap around in the subtraction below
	// and request a multi-gigabyte buffer; treat it as a corrupt record.
	if (x[0] < 24) return -2;
	x[0] -= 24; // to exclude six 32-bit integers
	if (ks_resize(&v->shared, x[0]) < 0) return -2;
	if (ks_resize(&v->indiv, x[1]) < 0) return -2;
	memcpy(v, x + 2, 16);
	v->n_allele = x[6]>>16; v->n_info = x[6]&0xffff;
	v->n_fmt = x[7]>>24; v->n_sample = x[7]&0xffffff;
	v->shared.l = x[0], v->indiv.l = x[1];
	v->unpacked = 0;
	v->unpack_ptr = NULL;
	// A truncated body must not be reported as a complete record.
	if (bgzf_read(fp, v->shared.s, v->shared.l) != (int64_t)v->shared.l) return -2;
	if (bgzf_read(fp, v->indiv.s, v->indiv.l) != (int64_t)v->indiv.l) return -2;
	return 0;
}
static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *rec)
{
    int i = 0, max_len = 0;
    if ( !reg->nals )
    {
        char *ss = reg->line.s;
        while ( i<als_idx && *ss )
        {
            if ( *ss=='\t' ) i++;
            ss++;
        }
        char *se = ss;
        reg->nals = 1;
        while ( *se && *se!='\t' )
        {
            if ( *se==',' ) reg->nals++;
            se++;
        }
        ks_resize(&reg->als_str, se-ss+1+reg->nals);
        reg->als_str.l = 0;
        hts_expand(char*,reg->nals,reg->mals,reg->als);
        reg->nals = 0;

        se = ss;
        while ( *(++se) )
        {
            if ( *se=='\t' ) break;
            if ( *se!=',' ) continue;
            reg->als[reg->nals] = &reg->als_str.s[reg->als_str.l];
            kputsn(ss,se-ss,&reg->als_str);
            if ( &reg->als_str.s[reg->als_str.l] - reg->als[reg->nals] > max_len ) max_len = &reg->als_str.s[reg->als_str.l] - reg->als[reg->nals];
            reg->als_str.l++;
            reg->nals++;
            ss = ++se;
        }
        reg->als[reg->nals] = &reg->als_str.s[reg->als_str.l];
        kputsn(ss,se-ss,&reg->als_str);
        if ( &reg->als_str.s[reg->als_str.l] - reg->als[reg->nals] > max_len ) max_len = &reg->als_str.s[reg->als_str.l] - reg->als[reg->nals];
        reg->nals++;
        reg->als_type = max_len > 1 ? VCF_INDEL : VCF_SNP;  // this is a simplified check, see vcf.c:bcf_set_variant_types
    }
Exemple #5
0
/*
 * Expand read b into s along its CIGAR: each M position contributes the
 * 4-bit base code read from the query sequence, each D position contributes
 * a 0 byte, and soft-clipped query bases are skipped.
 *
 *   b  alignment record; only M, D and S operations are expected (asserted)
 *   s  output buffer; resized and overwritten from offset 0, s->l is set to
 *      the number of bytes written
 */
static void unpad_seq(bam1_t *b, kstring_t *s)
{
	int k, j, i;
	int length = 0;
	uint32_t *cigar = bam1_cigar(b);
	uint8_t *seq = bam1_seq(b);
	// Size the buffer from the CIGAR rather than from l_qseq: every op other
	// than a soft clip writes one byte per position, and D ops write bytes
	// that have no counterpart in the query sequence.
	for (k = 0; k < b->core.n_cigar; ++k) {
		if (bam_cigar_op(cigar[k]) != BAM_CSOFT_CLIP)
			length += bam_cigar_oplen(cigar[k]);
	}
	ks_resize(s, length);
	for (k = 0, s->l = 0, j = 0; k < b->core.n_cigar; ++k) {
		int op, ol;
		op = bam_cigar_op(cigar[k]);
		ol = bam_cigar_oplen(cigar[k]);
		assert(op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CSOFT_CLIP);
		if (op == BAM_CMATCH) {
			// Advance the query position with every base copied; advancing
			// it once per op would repeat a single base ol times.
			for (i = 0; i < ol; ++i, ++j) s->s[s->l++] = bam1_seqi(seq, j);
		} else if (op == BAM_CSOFT_CLIP) {
			j += ol;
		} else {
			for (i = 0; i < ol; ++i) s->s[s->l++] = 0;
		}
	}
}
Exemple #6
0
/*
 * Expand read b into s along its CIGAR.
 *
 * M, = and X positions contribute the 4-bit base code taken from the query
 * sequence; D and N positions contribute a 0 byte (N is treated as D, with a
 * warning printed once per call); S skips query bases; H is ignored.
 *
 * Returns -1 when any other CIGAR operation is met, 1 when the number of
 * bytes written differs from the length reported by bam_cigar2rlen(), and 0
 * otherwise.
 */
static int unpad_seq(bam1_t *b, kstring_t *s)
{
    uint32_t *cig = bam_get_cigar(b);
    uint8_t *bases = bam_get_seq(b);
    int warned_n = 0; /* Make this a global and limit to one CIGAR N warning? */
    int qpos = 0;     /* current position in the query sequence */
    int idx, n;
    int length;

    /* b->core.l_qseq counts soft-clipped bases as well; what is needed here
     * is the aligned length taken from the CIGAR, which leaves soft clips
     * out but includes the pads coming from D operations. */
    length = bam_cigar2rlen(b->core.n_cigar, cig);
    ks_resize(s, length);
    s->l = 0;

    for (idx = 0; idx < b->core.n_cigar; ++idx) {
        int op = bam_cigar_op(cig[idx]);
        int ol = bam_cigar_oplen(cig[idx]);

        switch (op) {
        case BAM_CMATCH:
        case BAM_CEQUAL:
        case BAM_CDIFF:
            for (n = 0; n < ol; ++n) {
                s->s[s->l++] = bam_seqi(bases, qpos);
                ++qpos;
            }
            break;
        case BAM_CSOFT_CLIP:
            qpos += ol;
            break;
        case BAM_CHARD_CLIP:
            /* nothing to emit and nothing to skip */
            break;
        case BAM_CDEL:
            for (n = 0; n < ol; ++n) {
                s->s[s->l++] = 0;
            }
            break;
        case BAM_CREF_SKIP:
            /* Treat CIGAR N as D (not ideal, but better than ignoring it) */
            for (n = 0; n < ol; ++n) {
                s->s[s->l++] = 0;
            }
            if (warned_n == 0) {
                warned_n = -1;
                fprintf(stderr, "[depad] WARNING: CIGAR op N treated as op D in read %s\n", bam_get_qname(b));
            }
            break;
        default:
            fprintf(stderr, "[depad] ERROR: Didn't expect CIGAR op %c in read %s\n", BAM_CIGAR_STR[op], bam_get_qname(b));
            return -1;
        }
    }

    if (length != s->l) {
        return 1;
    }
    return 0;
}
Exemple #7
0
/*
 * Apply one variant record to the in-memory FASTA buffer args->fa_buf.
 *
 * Steps, in order:
 *   1. Skip records with a single allele, records whose position is not past
 *      args->fa_frz_pos (reported as overlapping), and records overlapping
 *      a region of args->mask when a mask is set.
 *   2. Choose the allele index ialt to apply.  It defaults to 1; when
 *      args->isample >= 0 it is derived from that sample's GT field
 *      (a specific haplotype, an IUPAC-coded genotype, or the first
 *      non-reference allele).  Reference-only genotypes return early.
 *   3. Check that REF matches the buffer at the record's position (or handle
 *      the symbolic <DEL> allele); a mismatch is reported through error().
 *   4. Overwrite / shift the buffer contents, optionally record a chain gap,
 *      and update fa_buf.l, fa_mod_off and fa_frz_pos.
 *
 * Note that the function modifies rec->d.allele in place (IUPAC substitution,
 * case conversion, <DEL> aliasing to REF).
 */
static void apply_variant(args_t *args, bcf1_t *rec)
{
    if ( rec->n_allele==1 ) return;

    // Positions at or before fa_frz_pos were already covered by an earlier variant
    if ( rec->pos <= args->fa_frz_pos )
    {
        fprintf(pysamerr,"The site %s:%d overlaps with another variant, skipping...\n", bcf_seqname(args->hdr,rec),rec->pos+1);
        return;
    }
    if ( args->mask )
    {
        // Leave the record unapplied when [pos, pos+rlen-1] overlaps the mask
        char *chr = (char*)bcf_hdr_id2name(args->hdr,args->rid);
        int start = rec->pos;
        int end   = rec->pos + rec->rlen - 1;
        if ( regidx_overlap(args->mask, chr,start,end,NULL) ) return;
    }

    int i, ialt = 1;
    if ( args->isample >= 0 )
    {
        bcf_fmt_t *fmt = bcf_get_fmt(args->hdr, rec, "GT");
        if ( !fmt ) return;
        if ( args->haplotype )
        {
            // args->haplotype is 1-based: it selects the (haplotype-1)-th GT value of the sample
            if ( args->haplotype > fmt->n ) error("Can't apply %d-th haplotype at %s:%d\n", args->haplotype,bcf_seqname(args->hdr,rec),rec->pos+1);
            uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample + args->haplotype - 1;
            ialt = bcf_dec_int1(ptr, fmt->type, &ignore);
            if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
            ialt = bcf_gt_allele(ialt);
        }
        else if ( args->output_iupac ) 
        {
            // First GT value gives ialt; the second (if present and not missing) gives jalt
            uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample;
            ialt = bcf_dec_int1(ptr, fmt->type, &ignore);
            if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
            ialt = bcf_gt_allele(ialt);

            int jalt;
            if ( fmt->n>1 )
            {
                ptr = fmt->p + fmt->size*args->isample + 1;
                jalt = bcf_dec_int1(ptr, fmt->type, &ignore);
                if ( bcf_gt_is_missing(jalt) || jalt==bcf_int32_vector_end ) jalt = ialt;
                else jalt = bcf_gt_allele(jalt);
            }
            else jalt = ialt;
            if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
            if ( ialt!=jalt && !rec->d.allele[ialt][1] && !rec->d.allele[jalt][1] ) // is this a het snp?
            {
                // Replace the single-base allele with the code returned by gt2iupac for the pair
                char ial = rec->d.allele[ialt][0];
                char jal = rec->d.allele[jalt][0];
                rec->d.allele[ialt][0] = gt2iupac(ial,jal);
            }
        }
        else
        {
            // Take the first non-reference allele in the genotype; give up on a missing value
            for (i=0; i<fmt->n; i++)
            {
                uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample + i;
                ialt = bcf_dec_int1(ptr, fmt->type, &ignore);
                if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
                ialt = bcf_gt_allele(ialt);
                if ( ialt ) break;
            }
        }
        if ( !ialt ) return;  // ref allele
        if ( rec->n_allele <= ialt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
    }
    else if ( args->output_iupac && !rec->d.allele[0][1] && !rec->d.allele[1][1] )
    {
        // No sample selected: combine single-base REF and first ALT into one code
        char ial = rec->d.allele[0][0];
        char jal = rec->d.allele[1][0];
        rec->d.allele[1][0] = gt2iupac(ial,jal);
    }

    // Index of the record's position inside fa_buf, corrected by the length
    // changes accumulated so far in fa_mod_off
    int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off;
    if ( idx<0 || idx>=args->fa_buf.l ) 
        error("FIXME: %s:%d .. idx=%d, ori_pos=%d, len=%d, off=%d\n",bcf_seqname(args->hdr,rec),rec->pos+1,idx,args->fa_ori_pos,args->fa_buf.l,args->fa_mod_off);

    // sanity check the reference base
    // len_diff: change in buffer length caused by this variant; alen: length of the applied allele
    int len_diff = 0, alen = 0;
    if ( rec->d.allele[ialt][0]=='<' )
    {
        if ( strcasecmp(rec->d.allele[ialt], "<DEL>") )
            error("Symbolic alleles other than <DEL> are currently not supported: %s at %s:%d\n",rec->d.allele[ialt],bcf_seqname(args->hdr,rec),rec->pos+1);
        assert( rec->d.allele[0][1]==0 );           // todo: for now expecting strlen(REF) = 1
        len_diff = 1-rec->rlen;
        rec->d.allele[ialt] = rec->d.allele[0];     // according to VCF spec, REF must precede the event
        alen = strlen(rec->d.allele[ialt]);
    }
    else if ( strncasecmp(rec->d.allele[0],args->fa_buf.s+idx,rec->rlen) )
    {
        // fprintf(pysamerr,"%d .. [%s], idx=%d ori=%d off=%d\n",args->fa_ori_pos,args->fa_buf.s,idx,args->fa_ori_pos,args->fa_mod_off);
        // Temporarily terminate the buffer after rlen bases so the message can
        // show the mismatching stretch separately from what follows it
        char tmp = 0;
        if ( args->fa_buf.l - idx > rec->rlen ) 
        { 
            tmp = args->fa_buf.s[idx+rec->rlen];
            args->fa_buf.s[idx+rec->rlen] = 0;
        }
        error(
            "The fasta sequence does not match the REF allele at %s:%d:\n"
            "   .vcf: [%s]\n" 
            "   .vcf: [%s] <- (ALT)\n" 
            "   .fa:  [%s]%c%s\n",
            bcf_seqname(args->hdr,rec),rec->pos+1, rec->d.allele[0], rec->d.allele[ialt], args->fa_buf.s+idx, 
            tmp?tmp:' ',tmp?args->fa_buf.s+idx+rec->rlen+1:""
            );
    }
    else
    {
        alen = strlen(rec->d.allele[ialt]);
        len_diff = alen - rec->rlen;
    }

    // Convert the applied allele to upper or lower case according to args->fa_case
    if ( args->fa_case )
        for (i=0; i<alen; i++) rec->d.allele[ialt][i] = toupper(rec->d.allele[ialt][i]);
    else
        for (i=0; i<alen; i++) rec->d.allele[ialt][i] = tolower(rec->d.allele[ialt][i]);

    if ( len_diff <= 0 )
    {
        // deletion or same size event
        for (i=0; i<alen; i++)
            args->fa_buf.s[idx+i] = rec->d.allele[ialt][i];
        // For a deletion, pull the tail of the buffer forward over the removed bases
        if ( len_diff )
            memmove(args->fa_buf.s+idx+alen,args->fa_buf.s+idx+rec->rlen,args->fa_buf.l-idx-rec->rlen);
    }
    else
    {
        // insertion
        // Grow the buffer, push the tail back by len_diff, then write the allele
        ks_resize(&args->fa_buf, args->fa_buf.l + len_diff);
        memmove(args->fa_buf.s + idx + rec->rlen + len_diff, args->fa_buf.s + idx + rec->rlen, args->fa_buf.l - idx - rec->rlen);
        for (i=0; i<alen; i++)
            args->fa_buf.s[idx+i] = rec->d.allele[ialt][i];
    }
    if (args->chain && len_diff != 0)
    {
        // If first nucleotide of both REF and ALT are the same... (indels typically include the nucleotide before the variant)
        if ( strncasecmp(rec->d.allele[0],rec->d.allele[ialt],1) == 0)
        {
            // ...extend the block by 1 bp: start is 1 bp further and alleles are 1 bp shorter
            push_chain_gap(args->chain, rec->pos + 1, rec->rlen - 1, rec->pos + 1 + args->fa_mod_off, alen - 1);
        }
        else
        {
            // otherwise, just the coordinates of the variant as given
            push_chain_gap(args->chain, rec->pos, rec->rlen, rec->pos + args->fa_mod_off, alen);
        }
    }
    // Bookkeeping: new buffer length, accumulated offset, and the last
    // reference position covered by this variant
    args->fa_buf.l += len_diff;
    args->fa_mod_off += len_diff;
    args->fa_frz_pos  = rec->pos + rec->rlen - 1;
}
Exemple #8
0
/*
 * Append a decimal text representation of d to the kstring s.
 *
 *  - Zero is written as "0" or "-0" depending on its sign bit.
 *  - A negative value gets a leading '-' and is then handled as its magnitude.
 *  - Magnitudes outside [0.0001, 999999] (and any value failing that test,
 *    e.g. NaN) are formatted with sprintf("%g").
 *  - Everything else is converted by hand: the value is scaled by 1e10 into
 *    an integer, a magnitude-dependent rounding increment is added, the
 *    digits are generated right-to-left into a local buffer, a decimal point
 *    is inserted, the text is cut to a fixed width and trailing zeros are
 *    removed.
 *
 * Returns the number of characters appended, or EOF if growing s fails in
 * the "%g" path.  The return values of kputsn()/kputc() are not checked.
 */
int kputd(double d, kstring_t *s) {
	int len = 0;
	// Digits are written backwards starting at buf+20; ep marks the last
	// character that is kept after truncation.
	char buf[21], *cp = buf+20, *ep;
	if (d == 0) {
		if (signbit(d)) {
			kputsn("-0",2,s);
			return 2;
		} else {
			kputsn("0",1,s);
			return 1;
		}
	}

	if (d < 0) {
		kputc('-',s);
		len = 1;
		d=-d;
	}
	if (!(d >= 0.0001 && d <= 999999)) {
		// Reserve room for the sprintf output before writing directly into s
		if (ks_resize(s, s->l + 50) < 0)
			return EOF;
		// We let stdio handle the exponent cases
		int s2 = sprintf(s->s + s->l, "%g", d);
		len += s2;
		s->l += s2;
		return len;
	}

	// Fixed-point form of d with ten decimal places
	uint64_t i = d*10000000000LL;
	// Correction for rounding - rather ugly

	// Optimised for small numbers.
	// Better still would be __builtin_clz on hi/lo 32 and get the
	// starting point very rapidly.
	// The first branch is not reached here: d < 0.0001 already returned
	// through the "%g" path above.
	// NOTE(review): the increments look chosen to round at the sixth
	// significant digit - confirm.
	if (d<.0001)
		i+=0;
	else if (d<0.001)
		i+=5;
	else if (d < 0.01)
		i+=50;
	else if (d < 0.1)
		i+=500;
	else if (d < 1)
		i+=5000;
	else if (d < 10)
		i+=50000;
	else if (d < 100)
		i+=500000;
	else if (d < 1000)
		i+=5000000;
	else if (d < 10000)
		i+=50000000;
	else if (d < 100000)
		i+=500000000;
	else
		i+=5000000000LL;

	// Emit the decimal digits of i, least significant first, ending at buf+19
	do {
		*--cp = '0' + i%10;
		i /= 10;
	} while (i >= 1);
	buf[20] = 0;
	// p = number of digits produced; the last ten of them are the fraction
	int p = buf+20-cp;
	if (p <= 10) { // d < 1
		//assert(d/1);
		cp[6] = 0; ep = cp+5;// 6 precision
		// Left-pad the fraction with zeros up to ten places, then prefix "0."
		while (p < 10) {
			*--cp = '0';
			p++;
		}
		*--cp = '.';
		*--cp = '0';
	} else {
		// Shift the integer-part digits one slot to the left to open a gap
		// for the decimal point in front of the ten fractional digits
		char *xp = --cp;
		while (p > 10) {
			xp[0] = xp[1];
			p--;
			xp++;
		}
		xp[0] = '.';
		// Keep at most seven characters; drop a decimal point left at the end
		cp[7] = 0; ep=cp+6;
		if (cp[6] == '.') cp[6] = 0;
	}

	// Cull trailing zeros
	while (*ep == '0' && ep > cp)
		ep--;
	// z points just past the last character that is not a trailing zero.
	// Scan back for a decimal point: only when one exists is the text cut
	// at z (or at the point itself when nothing follows it).
	char *z = ep+1;
	while (ep > cp) {
		if (*ep == '.') {
			if (z[-1] == '.')
				z[-1] = 0;
			else
				z[0] = 0;
			break;
		}
		ep--;
	}

	int sl = strlen(cp);
	len += sl;
	kputsn(cp, sl, s);
	return len;
}
Exemple #9
0
/*
 * Reader loop for one side of a proxied connection.
 *
 * arg is a proxy_conn_t*.  Until proxy->is_die is set, the loop allocates a
 * fresh kstring_t, reads up to its capacity from proxy->conn with a timeout
 * argument of 10000, and offers the buffer to the peer's write channel
 * (other->write_ch) via chan_select() while also watching proxy->die_ch and
 * the server's die_ch.
 *
 * Ownership: a buffer accepted by the write channel (chan_select() returning
 * 0) now belongs to the receiver and is not touched again here.  Any buffer
 * that was not handed over is released by this function.
 *
 * On exit the peer's write channel is closed, proxy->is_die is set (closing
 * proxy->die_ch if it was not already set), and the selector is disposed.
 */
static void lt_proxy_loop_read(void* arg)
{
    proxy_conn_t *proxy = (proxy_conn_t*)arg;
    proxy_conn_t *other = proxy->other;
    proxy_server *srv = &g_srv;
    kstring_t *buf = NULL;
    int ret, n;
    lthread_chan_t* chans[3];
    void* msgs[3];
    lthread_sel_t* sel = lthread_sel_create();
    chans[0] = other->write_ch;
    chans[1] = proxy->die_ch;
    chans[2] = srv->die_ch;

    fprintf(stderr, "%s#%d started\n", __FUNCTION__, __LINE__);
    while(!proxy->is_die) {
        buf = (kstring_t*)calloc(1, sizeof(kstring_t));
        if(NULL == buf) {
            fprintf(stderr, "%s#%d out of memory is_priv=%d\n", __FUNCTION__, __LINE__, proxy->is_priv);
            break;
        }
        ks_resize(buf, 4096);
        if(NULL == buf->s) {
            /* No storage to read into; buf itself is released after the loop. */
            fprintf(stderr, "%s#%d out of memory is_priv=%d\n", __FUNCTION__, __LINE__, proxy->is_priv);
            break;
        }
        n = net_conn_read(&proxy->conn.base, (uint8_t*)buf->s, (int)buf->m, (uint64_t)10000);
        if(n <= 0) {
            fprintf(stderr, "recv error n=%d priv=%d\n", n, proxy->is_priv);
            break;
        }

        buf->l = n;
        msgs[0] = buf;
        ret = chan_select(sel, chans, msgs, 1, 2, (uint64_t)10000);
        switch(ret) {
        case 0:
            //send ok: the receiver owns the buffer from here on
            buf = NULL;
            //fprintf(stderr, "send new writer\n");
            break;
        case 1:
            //proxy die
            ret = -10;
            break;
        case 2:
            //server die
            ret = -11;
            break;
        default:
            break;
        }
        if(ret < 0) {
            fprintf(stderr, "%s#%d error ret=%d is_priv=%d\n", __FUNCTION__, __LINE__, ret, proxy->is_priv);
            break;
        }
        if(NULL != buf) {
            /* Not handed over and not a fatal result: release the buffer
             * now, because the next iteration allocates a new one and would
             * otherwise overwrite (and leak) this pointer. */
            ks_free(buf);
            free(buf);
            buf = NULL;
        }
    }

    /* Covers every break above that left an unsent buffer behind. */
    if(NULL != buf) {
        ks_free(buf);
        free(buf);
        buf = NULL;
    }

    chan_close(other->write_ch);
    if(!proxy->is_die) {
        proxy->is_die = 1;
        chan_close(proxy->die_ch);
    }

    lthread_sel_dispose(sel);
}