// File name or a list of genomic locations. If file name, NULL is returned. static bcf_sr_regions_t *_regions_init_string(const char *str) { bcf_sr_regions_t *reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t)); reg->start = reg->end = -1; reg->prev_start = reg->prev_seq = -1; kstring_t tmp = {0,0,0}; const char *sp = str, *ep = str; int from, to; while ( 1 ) { while ( *ep && *ep!=',' && *ep!=':' ) ep++; tmp.l = 0; kputsn(sp,ep-sp,&tmp); if ( *ep==':' ) { sp = ep+1; from = strtol(sp,(char**)&ep,10); if ( sp==ep ) { fprintf(stderr,"[%s:%d %s] Could not parse the region(s): %s\n", __FILE__,__LINE__,__FUNCTION__,str); free(reg); free(tmp.s); return NULL; } if ( !*ep || *ep==',' ) { _regions_add(reg, tmp.s, from, from); sp = ep; continue; } if ( *ep!='-' ) { fprintf(stderr,"[%s:%d %s] Could not parse the region(s): %s\n", __FILE__,__LINE__,__FUNCTION__,str); free(reg); free(tmp.s); return NULL; } ep++; sp = ep; to = strtol(sp,(char**)&ep,10); if ( *ep && *ep!=',' ) { fprintf(stderr,"[%s:%d %s] Could not parse the region(s): %s\n", __FILE__,__LINE__,__FUNCTION__,str); free(reg); free(tmp.s); return NULL; } if ( sp==ep ) to = MAX_CSI_COOR-1; _regions_add(reg, tmp.s, from, to); if ( !*ep ) break; sp = ep; } else { if ( tmp.l ) _regions_add(reg, tmp.s, -1, -1); if ( !*ep ) break; sp = ++ep; } } free(tmp.s); return reg; }
// remove pads static void remove_pads(const kstring_t *src, kstring_t *dst) { int i, j; dst->l = 0; kputsn(src->s, src->l, dst); for (i = j = 0; i < dst->l; ++i) if (dst->s[i] != '*') dst->s[j++] = dst->s[i]; dst->s[j] = 0; dst->l = j; }
static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *rec) { int i = 0, max_len = 0; if ( !reg->nals ) { char *ss = reg->line.s; while ( i<als_idx && *ss ) { if ( *ss=='\t' ) i++; ss++; } char *se = ss; reg->nals = 1; while ( *se && *se!='\t' ) { if ( *se==',' ) reg->nals++; se++; } ks_resize(®->als_str, se-ss+1+reg->nals); reg->als_str.l = 0; hts_expand(char*,reg->nals,reg->mals,reg->als); reg->nals = 0; se = ss; while ( *(++se) ) { if ( *se=='\t' ) break; if ( *se!=',' ) continue; reg->als[reg->nals] = ®->als_str.s[reg->als_str.l]; kputsn(ss,se-ss,®->als_str); if ( ®->als_str.s[reg->als_str.l] - reg->als[reg->nals] > max_len ) max_len = ®->als_str.s[reg->als_str.l] - reg->als[reg->nals]; reg->als_str.l++; reg->nals++; ss = ++se; } reg->als[reg->nals] = ®->als_str.s[reg->als_str.l]; kputsn(ss,se-ss,®->als_str); if ( ®->als_str.s[reg->als_str.l] - reg->als[reg->nals] > max_len ) max_len = ®->als_str.s[reg->als_str.l] - reg->als[reg->nals]; reg->nals++; reg->als_type = max_len > 1 ? VCF_INDEL : VCF_SNP; // this is a simplified check, see vcf.c:bcf_set_variant_types }
/*
 * Load the genetic map named by args->genmap_fname into args->genmap.
 *
 * If the file name contains "{CHROM}", that token is replaced by the sequence
 * name of `line`. The file must start with the header
 * "position COMBINED_rate(cM/Mb) Genetic_Map(cM)"; from each following line
 * the first column is stored as the position and the third as the rate.
 * After loading, all rates are divided by the rate of the last entry.
 *
 * Returns 0 on success (also when no map file is configured, in which case
 * ngenmap is set to 0) and -1 when the file cannot be opened. Malformed
 * content is reported through error().
 */
static int load_genmap(args_t *args, bcf1_t *line)
{
    if ( !args->genmap_fname ) { args->ngenmap = 0; return 0; }

    // The expanded file name gets its own buffer: it is still needed for
    // error messages after the line buffer has been overwritten.
    kstring_t name = {0,0,0};
    kstring_t str = {0,0,0};
    char *fname = strstr(args->genmap_fname,"{CHROM}");
    if ( fname )
    {
        kputsn(args->genmap_fname, fname - args->genmap_fname, &name);
        kputs(bcf_seqname(args->hdr,line), &name);
        kputs(fname+7,&name);
        fname = name.s;
    }
    else
        fname = args->genmap_fname;

    htsFile *fp = hts_open(fname, "rb");
    if ( !fp )
    {
        args->ngenmap = 0;
        free(name.s);
        return -1;
    }

    if ( hts_getline(fp, KS_SEP_LINE, &str) < 0 || !str.s )
        error("Could not read the header line: %s\n", fname);
    if ( strcmp(str.s,"position COMBINED_rate(cM/Mb) Genetic_Map(cM)") )
        error("Unexpected header in %s, found:\n\t[%s], but expected:\n\t[position COMBINED_rate(cM/Mb) Genetic_Map(cM)]\n", fname, str.s);

    args->ngenmap = args->igenmap = 0;
    while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 )
    {
        args->ngenmap++;
        hts_expand(genmap_t,args->ngenmap,args->mgenmap,args->genmap);
        genmap_t *gm = &args->genmap[args->ngenmap-1];

        char *tmp, *end;
        gm->pos = strtol(str.s, &tmp, 10);
        if ( str.s==tmp ) error("Could not parse %s: %s\n", fname, str.s);

        // skip second column
        if ( *tmp ) tmp++;
        while ( *tmp && !isspace((unsigned char)*tmp) ) tmp++;
        if ( !*tmp ) error("Could not parse %s: %s\n", fname, str.s);   // fewer than three columns

        // read the genetic map in cM
        gm->rate = strtod(tmp+1, &end);
        if ( tmp+1==end ) error("Could not parse %s: %s\n", fname, str.s);
    }
    if ( !args->ngenmap ) error("Genetic map empty?\n");

    // scale to 1; the last element is divided by itself on the final pass
    int i;
    for (i=0; i<args->ngenmap; i++) args->genmap[i].rate /= args->genmap[args->ngenmap-1].rate;

    if ( hts_close(fp) ) error("Close failed\n");
    free(str.s);
    free(name.s);
    return 0;
}
/*
 * Read the first two whitespace-separated words of a file into `id` and
 * `secret`.
 *
 * All lines of the file are joined with single spaces, leading whitespace is
 * skipped, and the first word is appended to `id` and the second to `secret`.
 * Nothing is appended when the file cannot be opened or yields no lines.
 */
static void parse_simple(const char *fname, kstring_t *id, kstring_t *secret)
{
    kstring_t text = { 0, 0, NULL };
    char *s;
    size_t len;

    FILE *fp = expand_tilde_open(fname, "r");
    if (fp == NULL) return;

    while (kgetline(&text, (kgets_func *) fgets, fp) >= 0)
        kputc(' ', &text);
    fclose(fp);

    // No line was read: text holds no terminated string (text.s may even be
    // NULL), so there is nothing safe to scan.
    if (text.l == 0) {
        free(text.s);
        return;
    }

    s = text.s;
    while (isspace_c(*s)) s++;
    kputsn(s, len = strcspn(s, " \t"), id);

    s += len;
    while (isspace_c(*s)) s++;
    kputsn(s, strcspn(s, " \t"), secret);

    free(text.s);
}
/*
 * Read the header of a text VCF stream (or delegate to bcf_hdr_read() when
 * the handle is not VCF).
 *
 * "##" meta lines are collected, newline-terminated, into the header text.
 * On the "#CHROM" line every token from the tenth column onward is stored as
 * a NUL-terminated sample name. Lines shorter than two characters are
 * ignored.
 *
 * Returns the new header, or 0 when a non-'#' line is met before the sample
 * line or when the header cannot be allocated.
 */
bcf_hdr_t *vcf_hdr_read(bcf_t *bp)
{
    kstring_t meta, smpl;
    int dret;
    vcf_t *v;
    bcf_hdr_t *h;

    if (!bp->is_vcf) return bcf_hdr_read(bp);
    h = calloc(1, sizeof(bcf_hdr_t));
    if (h == 0) return 0;
    v = (vcf_t*)bp->v;
    v->line.l = 0;
    memset(&meta, 0, sizeof(kstring_t));
    memset(&smpl, 0, sizeof(kstring_t));
    while (ks_getuntil(v->ks, '\n', &v->line, &dret) >= 0) {
        if (v->line.l < 2) continue;
        if (v->line.s[0] != '#') { // no sample line
            free(meta.s);
            free(smpl.s);
            free(h);
            return 0;
        }
        if (v->line.s[1] == '#') {
            kputsn(v->line.s, v->line.l, &meta);
            kputc('\n', &meta);
        } else {
            int k;
            ks_tokaux_t aux;
            char *p;
            // columns 0..8 are the fixed VCF fields; the rest are sample names
            for (p = kstrtok(v->line.s, "\t\n", &aux), k = 0; p; p = kstrtok(0, 0, &aux), ++k) {
                if (k >= 9) {
                    kputsn(p, aux.p - p, &smpl);
                    kputc('\0', &smpl);
                }
            }
            break;
        }
    }
    kputc('\0', &meta);
    h->name = 0;
    h->sname = smpl.s; h->l_smpl = smpl.l;
    h->txt = meta.s; h->l_txt = meta.l;
    bcf_hdr_sync(h);
    return h;
}
int ploidy_parse(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr) { int i, ret; ploidy_t *ploidy = (ploidy_t*) usr; void *sex2id = ploidy->sex2id; // Check for special case of default ploidy "* * * <sex> <ploidy>" int default_ploidy_def = 0; char *ss = (char*) line; while ( *ss && isspace(*ss) ) ss++; if ( ss[0]=='*' && (!ss[1] || isspace(ss[1])) ) default_ploidy_def = 1; // definition of default ploidy, chr="*" else { // Fill CHR,FROM,TO ret = regidx_parse_tab(line,chr_beg,chr_end,reg,NULL,NULL); if ( ret!=0 ) return ret; } // Skip the fields already parsed by regidx_parse_tab ss = (char*) line; while ( *ss && isspace(*ss) ) ss++; for (i=0; i<3; i++) { while ( *ss && !isspace(*ss) ) ss++; if ( !*ss ) return -2; // wrong number of fields while ( *ss && isspace(*ss) ) ss++; } if ( !*ss ) return -2; // Parse the payload char *se = ss; while ( *se && !isspace(*se) ) se++; if ( !*se || se==ss ) error("Could not parse: %s\n", line); ploidy->tmp_str.l = 0; kputsn(ss,se-ss,&ploidy->tmp_str); sex_ploidy_t *sp = (sex_ploidy_t*) payload; if ( khash_str2int_get(sex2id, ploidy->tmp_str.s, &sp->sex) != 0 ) { ploidy->nsex++; hts_expand0(char*,ploidy->nsex,ploidy->msex,ploidy->id2sex); ploidy->id2sex[ploidy->nsex-1] = strdup(ploidy->tmp_str.s); sp->sex = khash_str2int_inc(ploidy->sex2id, ploidy->id2sex[ploidy->nsex-1]); ploidy->sex2dflt = (int*) realloc(ploidy->sex2dflt,sizeof(int)*ploidy->nsex); ploidy->sex2dflt[ploidy->nsex-1] = ploidy->dflt; }
/*
 * Replace the contents of hdr with the text of the file fname.
 * Lines are re-joined with '\n'; trailing whitespace is trimmed and the
 * result is terminated by exactly one newline. Open and close failures are
 * reported through error().
 */
static void read_header_file(char *fname, kstring_t *hdr)
{
    kstring_t line = {0,0,0};
    htsFile *in;

    hdr->l = 0;
    in = hts_open(fname, "r");
    if ( in==NULL ) error("Could not read: %s\n", fname);

    for (;;)
    {
        if ( hts_getline(in, KS_SEP_LINE, &line) <= 0 ) break;
        kputsn(line.s,line.l,hdr);
        kputc('\n',hdr);
    }
    if ( hts_close(in)!=0 ) error("Close failed: %s\n", fname);
    free(line.s);

    // drop trailing newlines and other whitespace, then add a single newline back
    while ( hdr->l>0 )
    {
        if ( !isspace(hdr->s[hdr->l-1]) ) break;
        hdr->l--;
    }
    kputc('\n',hdr);
}
inline int regidx_push(regidx_t *idx, char *chr_beg, char *chr_end, uint32_t beg, uint32_t end, void *payload) { if ( beg > MAX_COOR_0 ) beg = MAX_COOR_0; if ( end > MAX_COOR_0 ) end = MAX_COOR_0; int rid; idx->str.l = 0; kputsn(chr_beg, chr_end-chr_beg+1, &idx->str); if ( khash_str2int_get(idx->seq2regs, idx->str.s, &rid)!=0 ) { // new chromosome idx->nseq++; int m_prev = idx->mseq; hts_expand0(reglist_t,idx->nseq,idx->mseq,idx->seq); hts_expand0(char*,idx->nseq,m_prev,idx->seq_names); idx->seq_names[idx->nseq-1] = strdup(idx->str.s); rid = khash_str2int_inc(idx->seq2regs, idx->seq_names[idx->nseq-1]); }
int main_getalt(int argc, char *argv[]) { int c; char *fn; BGZF *fp; bcf1_t *b; bcf_hdr_t *h; kstring_t s = {0,0,0}; while ((c = getopt(argc, argv, "")) >= 0) { } if (argc - optind == 0) { fprintf(stderr, "Usage: bgt getalt <bgt-base>\n"); return 1; } fn = (char*)calloc(strlen(argv[optind]) + 5, 1); sprintf(fn, "%s.bcf", argv[optind]); fp = bgzf_open(fn, "r"); free(fn); assert(fp); h = bcf_hdr_read(fp); b = bcf_init1(); while (bcf_read1(fp, b) >= 0) { char *ref, *alt; int l_ref, l_alt, i, min_l; bcf_get_ref_alt1(b, &l_ref, &ref, &l_alt, &alt); min_l = l_ref < l_alt? l_ref : l_alt; for (i = 0; i < min_l && ref[i] == alt[i]; ++i); s.l = 0; kputs(h->id[BCF_DT_CTG][b->rid].key, &s); kputc(':', &s); kputw(b->pos + 1 + i, &s); kputc(':', &s); kputw(b->rlen - i, &s); kputc(':', &s); kputsn(alt + i, l_alt - i, &s); puts(s.s); } bcf_destroy1(b); bcf_hdr_destroy(h); bgzf_close(fp); free(s.s); return 0; }
/*
 * Rewrite the header of an uncompressed VCF and stream the result to stdout.
 *
 * The leading '#' lines of args->fname form the header. It is replaced by the
 * contents of args->header_fname when that is set, and passed through
 * set_samples() when args->samples_fname yields a sample list. The remaining
 * lines are copied through unchanged. Failures are reported through error().
 */
static void reheader_vcf(args_t *args)
{
    kstring_t hdr = {0,0,0};
    htsFile *fp = hts_open(args->fname, "r");
    if ( !fp ) error("Failed to open: %s\n", args->fname);
    while ( hts_getline(fp, KS_SEP_LINE, &fp->line) >=0 )
    {
        kputc('\n',&fp->line);  // hts_getline eats the newline character
        if ( fp->line.s[0]!='#' ) break;    // first body line stays in fp->line
        kputsn(fp->line.s,fp->line.l,&hdr);
    }

    int nsamples = 0;
    char **samples = NULL;
    if ( args->samples_fname )
        samples = hts_readlines(args->samples_fname, &nsamples);
    if ( args->header_fname )
    {
        free(hdr.s); hdr.s = NULL; hdr.l = hdr.m = 0;
        read_header_file(args->header_fname, &hdr);
    }
    if ( samples )
    {
        set_samples(samples, nsamples, &hdr);
        int i;
        for (i=0; i<nsamples; i++) free(samples[i]);
        free(samples);
    }

    int out = STDOUT_FILENO;
    if ( write(out, hdr.s, hdr.l)!=(ssize_t)hdr.l ) error("Failed to write %zu bytes\n", (size_t)hdr.l);
    free(hdr.s);

    // the body line that terminated the header loop, if any
    if ( fp->line.l )
    {
        if ( write(out, fp->line.s, fp->line.l)!=(ssize_t)fp->line.l ) error("Failed to write %zu bytes\n", (size_t)fp->line.l);
    }
    while ( hts_getline(fp, KS_SEP_LINE, &fp->line) >=0 )   // uncompressed file implies small size, we don't worry about speed
    {
        kputc('\n',&fp->line);
        if ( write(out, fp->line.s, fp->line.l)!=(ssize_t)fp->line.l ) error("Failed to write %zu bytes\n", (size_t)fp->line.l);
    }
    hts_close(fp);
}
// Expands a output filename format string static char* expand_format_string(const char* format_string, const char* basename, const char* rg_id, const int rg_idx, const htsFormat *format) { kstring_t str = { 0, 0, NULL }; const char* pointer = format_string; const char* next; while ((next = strchr(pointer, '%')) != NULL) { kputsn(pointer, next-pointer, &str); ++next; switch (*next) { case '%': kputc('%', &str); break; case '*': kputs(basename, &str); break; case '#': kputl(rg_idx, &str); break; case '!': kputs(rg_id, &str); break; case '.': // Only really need to cope with sam, bam, cram if (format->format != unknown_format) kputs(hts_format_file_extension(format), &str); else kputs("bam", &str); break; case '\0': // Error is: fprintf(pysam_stderr, "bad format string, trailing %%\n"); free(str.s); return NULL; default: // Error is: fprintf(pysam_stderr, "bad format string, unknown format specifier\n"); free(str.s); return NULL; } pointer = next + 1; } kputs(pointer, &str); return ks_release(&str); }
/*
 * Split `line` on `delim` and pass each piece to regidx_insert().
 * Returns 0 when every piece was inserted, or -1 as soon as regidx_insert()
 * reports a negative value.
 */
int regidx_insert_list(regidx_t *idx, char *line, char delim)
{
    kstring_t item = {0,0,0};
    char *beg, *end;
    int status = 0;

    for (beg = line; *beg; beg = end + 1)
    {
        // find the end of the current item
        for (end = beg; *end && *end!=delim; end++) ;

        item.l = 0;
        kputsn(beg, end-beg, &item);
        if ( regidx_insert(idx,item.s) < 0 ) { status = -1; break; }
        if ( !*end ) break;     // last item, no delimiter to step over
    }
    free(item.s);
    return status;
}
/*
 * Append the base64 encoding of data[0..len) to str, padded with '='.
 */
static void base64_kput(const unsigned char *data, size_t len, kstring_t *str)
{
    static const char alphabet[] =
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

    size_t next = 0;        /* index of the next input byte */
    unsigned acc = 0;       /* bit accumulator */
    int nbits = 0;          /* bits in acc not yet emitted */
    int npad = 0;           /* zero bytes shifted in past the end of the input */

    while (nbits != 0 || next < len) {
        if (nbits < 6) {
            acc <<= 8;
            nbits += 8;
            if (next >= len)
                npad++;
            else
                acc |= data[next++];
        }
        nbits -= 6;
        kputc(alphabet[(acc >> nbits) & 0x3f], str);
    }

    /* the last npad characters encode only padding: replace them with '=' */
    str->l -= npad;
    kputsn("==", npad, str);
}
void printAuxBuffered(uint8_t *s, uint8_t *sStop,kstring_t &str ) { // fprintf(stderr,"\ncomp:%p vs %p\n",s,sStop); while (s < sStop) { uint8_t type; kputc('\t', &str);kputc(s[0], &str);kputc(s[1], &str); kputc(':', &str); // fprintf(stderr,"\t%c%c:",s[0],s[1]); s += 2; type = *s; ++s; // fprintf(stderr,"\ntype=%c\n",type);//,(char)*s); // kputc('\t', &str); kputsn((char*)key, 2, &str); kputc(':', &str); if (type == 'A') { kputsn("A:", 2, &str); kputc(*s, &str); ++s; } else if (type == 'C') { kputsn("i:", 2, &str); kputw(*s, &str); ++s; } else if (type == 'c') { kputsn("i:", 2, &str); kputw(*(int8_t*)s, &str); ++s; } else if (type == 'S') { kputsn("i:", 2, &str); kputw(*(uint16_t*)s, &str); s += 2; } else if (type == 's') { kputsn("i:", 2, &str); kputw(*(int16_t*)s, &str); s += 2; } else if (type == 'I') { kputsn("i:", 2, &str); kputuw(*(uint32_t*)s, &str); s += 4; } else if (type == 'i') { kputsn("i:", 2, &str); kputw(*(int32_t*)s, &str); s += 4; } else if (type == 'f') { ksprintf(&str, "f:%g", *(float*)s); s += 4; } else if (type == 'd') { ksprintf(&str, "d:%lg", *(double*)s); s += 8; } else if (type == 'Z' || type == 'H') { kputc(type, &str); kputc(':', &str); while (*s) kputc(*s++, &str); ++s; } else if (type == 'B') { uint8_t sub_type = *(s++); int32_t n; memcpy(&n, s, 4); s += 4; // no point to the start of the array kputc(type, &str); kputc(':', &str); kputc(sub_type, &str); // write the typing for (int i = 0; i < n; ++i) { kputc(',', &str); if ('c' == sub_type || 'c' == sub_type) { kputw(*(int8_t*)s, &str); ++s; } else if ('C' == sub_type) { kputw(*(uint8_t*)s, &str); ++s; } else if ('s' == sub_type) { kputw(*(int16_t*)s, &str); s += 2; } else if ('S' == sub_type) { kputw(*(uint16_t*)s, &str); s += 2; } else if ('i' == sub_type) { kputw(*(int32_t*)s, &str); s += 4; } else if ('I' == sub_type) { kputuw(*(uint32_t*)s, &str); s += 4; } else if ('f' == sub_type) { ksprintf(&str, "%g", *(float*)s); s += 4; } } } } // fprintf(stderr,"done\n"); }
int regidx_insert(regidx_t *idx, char *line) { if ( !line ) return _regidx_build_index(idx); char *chr_from, *chr_to; reg_t reg; int ret = idx->parse(line,&chr_from,&chr_to,®,idx->payload,idx->usr); if ( ret==-2 ) return -1; // error if ( ret==-1 ) return 0; // skip the line int rid; idx->str.l = 0; kputsn(chr_from, chr_to-chr_from+1, &idx->str); if ( khash_str2int_get(idx->seq2regs, idx->str.s, &rid)!=0 ) { idx->nseq++; int m_prev = idx->mseq; hts_expand0(reglist_t,idx->nseq,idx->mseq,idx->seq); hts_expand0(char*,idx->nseq,m_prev,idx->seq_names); idx->seq_names[idx->nseq-1] = strdup(idx->str.s); rid = khash_str2int_inc(idx->seq2regs, idx->seq_names[idx->nseq-1]); }
char *config_get_string(const char *haystack, char *needle) { kstring_t str = {0,0,0}; ksprintf(&str,"%s=", needle); char *ret; while ( *haystack && (ret = strstr(haystack,str.s)) ) { if ( !ret ) break; if ( ret!=haystack && ret[-1]!=':' ) { // shared prefix haystack = ret+1; continue; } ret += str.l; char *se = ret; while ( *se && *se!=':' ) se++; str.l = 0; kputsn(ret,se-ret,&str); return str.s; } free(str.s); return NULL; }
/*
 * Read a VCF header from fp (or delegate to bcf_hdr_read() for binary files).
 *
 * Header lines are accumulated up to and including the "#CHROM" line. When
 * fp->fn_aux is set, a "##contig=<ID=...,length=...>" line is generated from
 * the first two fields of each line of that file and inserted just before
 * the "#CHROM" line. After parsing, sequence names present in the tabix
 * index of fp->fn but missing from the header are added as contigs with
 * length "-1".
 *
 * Returns the header, or 0 when a non-'#' line precedes the sample line,
 * when no header text could be read at all, or when the auxiliary file
 * cannot be opened.
 */
bcf_hdr_t *vcf_hdr_read(htsFile *fp)
{
    if (!fp->is_bin) {
        kstring_t txt, *s = &fp->line;
        bcf_hdr_t *h;
        h = bcf_hdr_init();
        txt.l = txt.m = 0; txt.s = 0;
        while (hts_getline(fp, KS_SEP_LINE, s) >= 0) {
            if (s->l == 0) continue;
            if (s->s[0] != '#') {
                if (hts_verbose >= 2)
                    fprintf(stderr, "[E::%s] no sample line\n", __func__);
                free(txt.s);
                bcf_hdr_destroy(h);
                return 0;
            }
            if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here
                int dret;
                gzFile f;
                kstream_t *ks;
                kstring_t tmp;
                tmp.l = tmp.m = 0; tmp.s = 0;
                f = gzopen(fp->fn_aux, "r");
                if (f == NULL) {
                    if (hts_verbose >= 2)
                        fprintf(stderr, "[E::%s] could not open \"%s\"\n", __func__, fp->fn_aux);
                    free(txt.s);
                    bcf_hdr_destroy(h);
                    return 0;
                }
                ks = ks_init(f);
                while (ks_getuntil(ks, 0, &tmp, &dret) >= 0) {
                    int c;
                    kputs("##contig=<ID=", &txt); kputs(tmp.s, &txt);
                    ks_getuntil(ks, 0, &tmp, &dret);
                    kputs(",length=", &txt); kputw(atol(tmp.s), &txt);
                    kputsn(">\n", 2, &txt);
                    if (dret != '\n')
                        while ((c = ks_getc(ks)) != '\n' && c != -1); // skip the rest of the line
                }
                free(tmp.s);
                ks_destroy(ks);
                gzclose(f);
            }
            kputsn(s->s, s->l, &txt);
            if (s->s[1] != '#') break;  // the "#CHROM" line ends the header
            kputc('\n', &txt);
        }
        if (!txt.s) {
            // nothing was read: there is no text to hand to the parser
            if (hts_verbose >= 2)
                fprintf(stderr, "[E::%s] could not read the header\n", __func__);
            bcf_hdr_destroy(h);
            return 0;
        }
        h->l_text = txt.l + 1; // including NULL
        h->text = txt.s;
        bcf_hdr_parse(h);

        // check tabix index, are all contigs listed in the header? add the missing ones
        tbx_t *idx = tbx_index_load(fp->fn);
        if ( idx )
        {
            int i, n, need_sync = 0;
            const char **names = tbx_seqnames(idx, &n);
            for (i=0; i<n; i++)
            {
                bcf_hrec_t *hrec = bcf_hdr_get_hrec(h, BCF_DT_CTG, (char*) names[i]);
                if ( hrec ) continue;
                hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
                hrec->key = strdup("contig");
                bcf_hrec_add_key(hrec, "ID", strlen("ID"));
                bcf_hrec_set_val(hrec, hrec->nkeys-1, (char*) names[i], strlen(names[i]), 0);
                bcf_hrec_add_key(hrec, "length", strlen("length"));
                bcf_hrec_set_val(hrec, hrec->nkeys-1, "-1", strlen("-1"), 0);   // what is a good default value?
                bcf_hdr_add_hrec(h, hrec);
                need_sync = 1;
            }
            free(names);
            tbx_destroy(idx);
            if ( need_sync )
            {
                bcf_hdr_sync(h);
                bcf_hdr_fmt_text(h);
            }
        }
        return h;
    } else return bcf_hdr_read((BGZF*)fp->fp);
}
/** * Classifies variants. */ int32_t VariantManip::classify_variant(bcf_hdr_t *h, bcf1_t *v, Variant& var) { bcf_unpack(v, BCF_UN_STR); const char* chrom = bcf_get_chrom(h, v); uint32_t pos1 = bcf_get_pos1(v); char** allele = bcf_get_allele(v); int32_t n_allele = bcf_get_n_allele(v); int32_t pos0 = pos1-1; var.ts = 0; var.tv = 0; var.ins = 0; var.del = 0; var.clear(); // this sets the type to VT_REF by default. bool homogeneous_length = true; char* ref = allele[0]; int32_t rlen = strlen(ref); //if only ref allele, skip this entire for loop for (size_t i=1; i<n_allele; ++i) { int32_t type = VT_REF; //check for tags if (strchr(allele[i],'<')) { size_t len = strlen(allele[i]); if (len>=5) { //VN/d+ if (allele[i][0]=='<' && allele[i][1]=='V' && allele[i][2]=='N' && allele[i][len-1]=='>' ) { for (size_t j=3; j<len-1; ++j) { if (allele[i][j]<'0' || allele[i][j]>'9') { type = VT_VNTR; } } } //VNTR else if (len==6 && allele[i][0]=='<' && allele[i][1]=='V' && allele[i][2]=='N' && allele[i][3]=='T' && allele[i][4]=='R' && allele[i][5]=='>' ) { type = VT_VNTR; } //ST/d+ else if (allele[i][0]=='<' && allele[i][1]=='S' && allele[i][2]=='T' && allele[i][len-1]=='>' ) { for (size_t j=3; j<len-1; ++j) { if (allele[i][j]<'0' || allele[i][j]>'9') { type = VT_VNTR; } } } //STR else if (len==5 && allele[i][0]=='<' && allele[i][1]=='S' && allele[i][2]=='T' && allele[i][3]=='R' && allele[i][4]=='>' ) { type = VT_VNTR; } } if (type==VT_VNTR) { type = VT_VNTR; var.type |= type; var.alleles.push_back(Allele(type)); } else { type = VT_SV; var.type |= type; std::string sv_type(allele[i]); var.alleles.push_back(Allele(type, sv_type)); } } else if (allele[i][0]=='.' 
|| strcmp(allele[i],allele[0])==0) { type = VT_REF; } else { kstring_t REF = {0,0,0}; kstring_t ALT = {0,0,0}; ref = allele[0]; char* alt = allele[i]; int32_t alen = strlen(alt); if (rlen!=alen) { homogeneous_length = false; } //trimming //this is required in particular for the //characterization of multiallelics and //in general, any unnormalized variant int32_t rl = rlen; int32_t al = alen; //trim right while (rl!=1 && al!=1) { if (ref[rl-1]==alt[al-1]) { --rl; --al; } else { break; } } //trim left while (rl !=1 && al!=1) { if (ref[0]==alt[0]) { ++ref; ++alt; --rl; --al; } else { break; } } kputsn(ref, rl, &REF); kputsn(alt, al, &ALT); ref = REF.s; alt = ALT.s; int32_t mlen = std::min(rl, al); int32_t dlen = al-rl; int32_t diff = 0; int32_t ts = 0; int32_t tv = 0; if (mlen==1 && dlen) { char ls, le, ss; if (rl>al) { ls = ref[0]; le = ref[rl-1]; ss = alt[0]; } else { ls = alt[0]; le = alt[al-1]; ss = ref[0]; } if (ls!=ss && le!=ss) { ++diff; if ((ls=='G' && ss=='A') || (ls=='A' && ss=='G') || (ls=='C' && ss=='T') || (ls=='T' && ss=='C')) { ++ts; } else { ++tv; } } } else { for (int32_t j=0; j<mlen; ++j) { if (ref[j]!=alt[j]) { ++diff; if ((ref[j]=='G' && alt[j]=='A') || (ref[j]=='A' && alt[j]=='G') || (ref[j]=='C' && alt[j]=='T') || (ref[j]=='T' && alt[j]=='C')) { ++ts; } else { ++tv; } } } } //substitution variants if (mlen==diff) { type |= mlen==1 ? VT_SNP : VT_MNP; } //indel variants if (dlen) { type |= VT_INDEL; } //clumped SNPs and MNPs if (diff && diff < mlen) //internal gaps { type |= VT_CLUMPED; } var.type |= type; var.alleles.push_back(Allele(type, diff, alen, dlen, mlen, ts, tv)); var.ts += ts; var.tv += tv; var.ins = dlen>0?1:0; var.del = dlen<0?1:0; if (REF.m) free(REF.s); if (ALT.m) free(ALT.s); } } if (var.type==VT_VNTR) { bcf_unpack(v, BCF_UN_INFO); //populate motif, motif len etc. etc. 
// char* str = NULL; // int32_t n = 0; // int32_t ret = bcf_get_info_string(h, v, "MOTIF", &str, &n); // if (ret>0) // { // var.motif = std::string(str); // var.mlen = var.motif.size(); // } // ret = bcf_get_info_string(h, v, "RU", &str, &n); // if (ret>0) // { // var.ru = std::string(str); // var.mlen = var.ru.size(); // } // if (n) free(str); // // int32_t* no = NULL; // n = 0; // ret = bcf_get_info_int32(h, v, "RL", &no, &n); // if (ret>0) var.rlen = *no; // if (n) free(no); // // int32_t* fl = NULL; // n = 0; // ret = bcf_get_info_int32(h, v, "REF", &fl, &n); // if (ret>0) var.rcn = *fl; // if (n) free(fl); } //additionally define MNPs by length of all alleles if (!(var.type&(VT_VNTR|VT_SV)) && var.type!=VT_REF) { if (homogeneous_length && rlen>1 && n_allele>1) { var.type |= VT_MNP; } } return var.type; }
int main(int argc, char *argv[]) { gzFile fp; kstream_t *ks; kstring_t s, t[N_TMPSTR]; int dret, i, k, af_n, af_max, af_i, c, is_padded = 0, write_cns = 0, *p2u = 0; long m_cigar = 0, n_cigar = 0; unsigned *af, *cigar = 0; while ((c = getopt(argc, argv, "pc")) >= 0) { switch (c) { case 'p': is_padded = 1; break; case 'c': write_cns = 1; break; } } if (argc == optind) { fprintf(stderr, "\nUsage: ace2sam [-pc] <in.ace>\n\n"); fprintf(stderr, "Options: -p output padded SAM\n"); fprintf(stderr, " -c write the contig sequence in SAM\n\n"); fprintf(stderr, "Notes: 1. Fields must appear in the following order: (CO->[BQ]->(AF)->(RD->QA))\n"); fprintf(stderr, " 2. The order of reads in AF and in RD must be identical\n"); fprintf(stderr, " 3. Except in BQ, words and numbers must be separated by a single SPACE or TAB\n"); fprintf(stderr, " 4. This program writes the headerless SAM to stdout and header to stderr\n\n"); return 1; } s.l = s.m = 0; s.s = 0; af_n = af_max = af_i = 0; af = 0; for (i = 0; i < N_TMPSTR; ++i) t[i].l = t[i].m = 0, t[i].s = 0; fp = strcmp(argv[1], "-")? 
gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); ks = ks_init(fp); while (ks_getuntil(ks, 0, &s, &dret) >= 0) { if (strcmp(s.s, "CO") == 0) { // contig sequence kstring_t *cns; t[0].l = t[1].l = t[2].l = t[3].l = t[4].l = 0; // 0: name; 1: padded ctg; 2: unpadded ctg/padded read; 3: unpadded read; 4: SAM line af_n = af_i = 0; // reset the af array ks_getuntil(ks, 0, &s, &dret); kputs(s.s, &t[0]); // contig name ks_getuntil(ks, '\n', &s, &dret); // read the whole line while (ks_getuntil(ks, '\n', &s, &dret) >= 0 && s.l > 0) kputsn(s.s, s.l, &t[1]); // read the padded consensus sequence remove_pads(&t[1], &t[2]); // construct the unpadded sequence // compute the array for mapping padded positions to unpadded positions p2u = realloc(p2u, t[1].m * sizeof(int)); for (i = k = 0; i < t[1].l; ++i) { p2u[i] = k; if (t[1].s[i] != '*') ++k; } // write out the SAM header and contig sequences fprintf(stderr, "H @SQ\tSN:%s\tLN:%llu\n", t[0].s, (unsigned long long)(t[is_padded?1:2].l)); // The SAM header line cns = &t[is_padded?1:2]; fprintf(stderr, "S >%s\n", t[0].s); for (i = 0; i < cns->l; i += LINE_LEN) { fputs("S ", stderr); for (k = 0; k < LINE_LEN && i + k < cns->l; ++k) fputc(cns->s[i + k], stderr); fputc('\n', stderr); } #define __padded2cigar(sp) do { \ int i, l_M = 0, l_D = 0; \ for (i = 0; i < sp.l; ++i) { \ if (sp.s[i] == '*') { \ if (l_M) write_cigar(cigar, n_cigar, m_cigar, l_M<<4); \ ++l_D; l_M = 0; \ } else { \ if (l_D) write_cigar(cigar, n_cigar, m_cigar, l_D<<4 | 2); \ ++l_M; l_D = 0; \ } \ } \ if (l_M) write_cigar(cigar, n_cigar, m_cigar, l_M<<4); \ else write_cigar(cigar, n_cigar, m_cigar, l_D<<4 | 2); \ } while (0) if (write_cns) { // write the consensus SAM line (dummy read) n_cigar = 0; if (is_padded) __padded2cigar(t[1]); else write_cigar(cigar, n_cigar, m_cigar, t[2].l<<4); kputsn(t[0].s, t[0].l, &t[4]); kputs("\t516\t", &t[4]); kputsn(t[0].s, t[0].l, &t[4]); kputs("\t1\t60\t", &t[4]); for (i = 0; i < n_cigar; ++i) { kputw(cigar[i]>>4, &t[4]); 
kputc("MIDNSHP=X"[cigar[i]&0xf], &t[4]); } kputs("\t*\t0\t0\t", &t[4]); kputsn(t[2].s, t[2].l, &t[4]); kputs("\t*", &t[4]); } } else if (strcmp(s.s, "BQ") == 0) { // contig quality
int ingest1(const char *input,const char *output,char *ref,bool exit_on_mismatch=true) { cerr << "Input: " << input << "\tOutput: "<<output<<endl; kstream_t *ks; kstring_t str = {0,0,0}; gzFile fp = gzopen(input, "r"); VarBuffer vbuf(1000); int prev_rid = -1; if(fp==NULL) { fprintf(stderr,"problem opening %s\n",input); exit(1); } char *out_fname = (char *)malloc(strlen(output)+5); strcpy(out_fname,output); strcat(out_fname,".tmp"); if(fileexists(out_fname)) { fprintf(stderr,"%s file already exists. will not overwrite\n",out_fname); exit(1); } printf("depth: %s\n",out_fname); gzFile depth_fp = gzopen(out_fname, "wb1"); strcpy(out_fname,output); strcat(out_fname,".bcf"); if(fileexists(out_fname)) { fprintf(stderr,"%s file already exists. will not overwrite\n",out_fname); exit(1); } printf("variants: %s\n",out_fname); htsFile *variant_fp=hts_open(out_fname,"wb1"); if(variant_fp==NULL) { fprintf(stderr,"problem opening %s\n",input); exit(1); } ks = ks_init(fp); htsFile *hfp=hts_open(input, "r"); bcf_hdr_t *hdr_in = bcf_hdr_read(hfp); hts_close(hfp); //this is a hack to fix gvcfs where AD is incorrectly defined in the header. (vcf4.2 does not technically allow Number=R) bcf_hdr_remove(hdr_in,BCF_HL_FMT,"AD"); assert( bcf_hdr_append(hdr_in,"##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths for the ref and alt alleles in the order listed. 
For indels this value only includes reads which confidently support each allele (posterior prob 0.999 or higher that read contains indicated allele vs all other intersecting indel alleles)\">") == 0); //this is a hack to fix broken gvcfs where GQ is incorrectly labelled as float (v4.3 spec says it should be integer) bcf_hdr_remove(hdr_in,BCF_HL_FMT,"GQ"); assert( bcf_hdr_append(hdr_in,"##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">") == 0); // bcf_hdr_t *hdr_out=hdr_in; bcf_hdr_t *hdr_out = bcf_hdr_dup(hdr_in); remove_hdr_lines(hdr_out,BCF_HL_INFO); remove_hdr_lines(hdr_out,BCF_HL_FLT); bcf_hdr_sync(hdr_out); //here we add FORMAT/PF. which is the pass filter flag for alts. assert( bcf_hdr_append(hdr_out,"##FORMAT=<ID=PF,Number=A,Type=Integer,Description=\"variant was PASS filter in original sample gvcf\">") == 0); args_t *norm_args = init_vcfnorm(hdr_out,ref); norm_args->check_ref |= CHECK_REF_WARN; bcf1_t *bcf_rec = bcf_init(); bcf_hdr_write(variant_fp, hdr_out); kstring_t work1 = {0,0,0}; int buf[5]; ks_tokaux_t aux; int ndec=0; int ref_len,alt_len; while( ks_getuntil(ks, '\n', &str, 0) >=0) { // fprintf(stderr,"%s\n",str.s); if(str.s[0]!='#') { char *ptr = kstrtok(str.s,"\t",&aux);//chrom ptr = kstrtok(NULL,NULL,&aux);//pos work1.l=0; kputsn(str.s,ptr-str.s-1, &work1); buf[0] = bcf_hdr_name2id(hdr_in, work1.s); assert( buf[0]>=0); buf[1]=atoi(ptr)-1; ptr = kstrtok(NULL,NULL,&aux);//ID ptr = kstrtok(NULL,NULL,&aux);//REF ref_len=0; while(ptr[ref_len]!='\t') ref_len++; ptr = kstrtok(NULL,NULL,&aux);//ALT bool is_variant=false; alt_len=0; while(ptr[alt_len]!='\t') alt_len++; if(ptr[0]!='.') is_variant=true; char * QUAL_ptr = kstrtok(NULL, NULL, &aux); assert (QUAL_ptr != NULL); for(int i=0;i<2;i++) ptr = kstrtok(NULL,NULL,&aux);// gets us to INFO //find END if it is there char *end_ptr=strstr(ptr,"END=") ; if(end_ptr!=NULL) buf[2]=atoi(end_ptr+4)-1; else buf[2]=buf[1]+alt_len-1; ptr = kstrtok(NULL,NULL,&aux);//FORMAT //find index of DP (if 
present) //if not present, dont output anything (indels ignored) char *DP_ptr = find_format(ptr,"DP"); int GQX = 0; int QUAL = 0; // AH: change code to use the minimum of GQ and QUAL fields if // GQX is not defined. See here: // https://support.basespace.illumina.com/knowledgebase/articles/144844-vcf-file // "GQXGenotype quality. GQX is the minimum of the GQ value // and the QUAL column. In general, these are similar values; // taking the minimum makes GQX the more conservative measure of // genotype quality." if(DP_ptr!=NULL) { buf[3]=atoi(DP_ptr); char *GQX_ptr = find_format(ptr,"GQX"); if (GQX_ptr == NULL) { GQX_ptr = find_format(ptr,"GQ"); GQX = atoi(GQX_ptr); if (QUAL_ptr[0] != '.') { QUAL = atoi(QUAL_ptr); if (QUAL < GQX) GQX = QUAL; } } else { GQX = atoi(GQX_ptr); } //trying to reduce entropy on GQ to get better compression performance. //1. rounds down to nearest 10. //2. sets gq to min(gq,100). buf[4]=GQX/10; buf[4]*=10; if(buf[4]>100) buf[4]=100; // printf("%d\t%d\t%d\t%d\t%d\n",buf[0],buf[1],buf[2],buf[3],buf[4]); if(gzwrite(depth_fp,buf,5*sizeof(int))!=(5*sizeof(int))) die("ERROR: problem writing "+(string)out_fname+".tmp"); } if(is_variant) {//wass this a variant? 
if so write it out to the bcf norm_args->ntotal++; vcf_parse(&str,hdr_in,bcf_rec); // cerr<<bcf_rec->rid<<":"<<bcf_rec->pos<<endl; if(prev_rid!=bcf_rec->rid) vbuf.flush(variant_fp,hdr_out); else vbuf.flush(bcf_rec->pos,variant_fp,hdr_out); prev_rid=bcf_rec->rid; int32_t pass = bcf_has_filter(hdr_in, bcf_rec, "."); bcf_update_format_int32(hdr_out,bcf_rec,"PF",&pass,1); bcf_update_filter(hdr_out,bcf_rec,NULL,0); if(bcf_rec->n_allele>2) {//split multi-allelics (using vcfnorm.c from bcftools1.3 norm_args->nsplit++; split_multiallelic_to_biallelics(norm_args,bcf_rec ); for(int i=0;i<norm_args->ntmp_lines;i++){ remove_info(norm_args->tmp_lines[i]); if(realign(norm_args,norm_args->tmp_lines[i]) != ERR_REF_MISMATCH) ndec+=decompose(norm_args->tmp_lines[i],hdr_out,vbuf); else if(exit_on_mismatch) die("vcf did not match the reference"); else norm_args->nskipped++; } } else { remove_info(bcf_rec); if( realign(norm_args,bcf_rec) != ERR_REF_MISMATCH) ndec+=decompose(bcf_rec,hdr_out,vbuf); else if(exit_on_mismatch) die("vcf did not match the reference"); else norm_args->nskipped++; } vbuf.flush(bcf_rec->pos,variant_fp,hdr_out); } } } vbuf.flush(variant_fp,hdr_out); bcf_hdr_destroy(hdr_in); bcf_hdr_destroy(hdr_out); bcf_destroy1(bcf_rec); ks_destroy(ks); gzclose(fp); gzclose(depth_fp); free(str.s); free(work1.s); hts_close(variant_fp); destroy_data(norm_args); fprintf(stderr,"Variant lines total/split/realigned/skipped:\t%d/%d/%d/%d\n", norm_args->ntotal,norm_args->nsplit,norm_args->nchanged,norm_args->nskipped); fprintf(stderr,"Decomposed %d MNPs\n", ndec); fprintf(stderr,"Indexing %s\n",out_fname); bcf_index_build(out_fname, BCF_LIDX_SHIFT); free(out_fname); return 0; }
/*
 * Append a decimal rendering of d to the kstring s.
 *
 * Zero is written as "0" or "-0".  Any magnitude that fails the
 * [0.0001, 999999] range test (which includes NaN) is handed to
 * sprintf("%g").  Everything else is formatted by hand: the value is
 * scaled by 1e10, a magnitude-dependent rounding increment is added, the
 * digits are generated right-to-left into a small buffer, a decimal point
 * is spliced in and trailing zeros are culled.
 *
 * Returns the number of characters appended, or EOF when the buffer could
 * not be grown for the "%g" path.
 */
int kputd(double d, kstring_t *s)
{
    /* Rounding increment (in units of 1e-10) applied when d is below the
     * paired bound; anything not below the last bound uses the default. */
    static const struct {
        double bound;
        uint64_t incr;
    } round_tab[] = {
        { .0001,  0 },
        { 0.001,  5 },
        { 0.01,   50 },
        { 0.1,    500 },
        { 1,      5000 },
        { 10,     50000 },
        { 100,    500000 },
        { 1000,   5000000 },
        { 10000,  50000000 },
        { 100000, 500000000 },
    };
    char digits[21];
    char *head = digits + 20;   /* digits are produced right-to-left */
    char *tail;
    int written = 0;
    size_t k;

    if (d == 0) {
        if (signbit(d)) {
            kputsn("-0", 2, s);
            return 2;
        }
        kputsn("0", 1, s);
        return 1;
    }

    if (d < 0) {
        kputc('-', s);
        written = 1;
        d = -d;
    }

    if (!(d >= 0.0001 && d <= 999999)) {
        /* stdio deals with the exponent cases */
        if (ks_resize(s, s->l + 50) < 0)
            return EOF;
        int n = sprintf(s->s + s->l, "%g", d);
        written += n;
        s->l += n;
        return written;
    }

    uint64_t scaled = d*10000000000LL;

    /* Correction for rounding, selected by the magnitude of d */
    uint64_t incr = 5000000000LL;
    for (k = 0; k < sizeof(round_tab) / sizeof(round_tab[0]); k++) {
        if (d < round_tab[k].bound) {
            incr = round_tab[k].incr;
            break;
        }
    }
    scaled += incr;

    do {
        *--head = '0' + scaled % 10;
        scaled /= 10;
    } while (scaled != 0);
    digits[20] = 0;

    int ndigits = digits + 20 - head;
    if (ndigits <= 10) {
        /* d < 1: truncate to 6 digits, then left-pad with zeros and "0." */
        head[6] = 0;
        tail = head + 5;
        for (; ndigits < 10; ndigits++)
            *--head = '0';
        *--head = '.';
        *--head = '0';
    } else {
        /* Shift the integer part one place left to open a slot for '.' */
        char *slot = --head;
        for (; ndigits > 10; ndigits--, slot++)
            slot[0] = slot[1];
        slot[0] = '.';
        head[7] = 0;
        tail = head + 6;
        if (head[6] == '.')
            head[6] = 0;
    }

    /* Cull trailing zeros */
    while (*tail == '0' && tail > head)
        tail--;
    char *after = tail + 1;
    for (; tail > head; tail--) {
        if (*tail == '.') {
            /* Drop a dangling '.', otherwise terminate after the last digit */
            if (after[-1] == '.')
                after[-1] = 0;
            else
                after[0] = 0;
            break;
        }
    }

    int sl = strlen(head);
    written += sl;
    kputsn(head, sl, s);
    return written;
}
/*
 * Loads a CRAM .crai index into memory.
 *
 * The index file name is fn with ".crai" appended.  The whole file is read,
 * inflated when it starts with the gzip magic bytes, and parsed one line at
 * a time into a per-reference tree: fd->index[refid+1] is the root for each
 * reference and entries are nested under the most recent entry whose
 * [start,end] range contains them.
 *
 * NOTE(review): on failure fd->index is left allocated (possibly partially
 * filled), so a later call will take the "already loaded" early return.
 *
 * Returns 0 for success
 *        -1 for failure
 */
int cram_index_load(cram_fd *fd, const char *fn) {
    char fn2[PATH_MAX];
    char buf[65536];
    ssize_t len;
    kstring_t kstr = {0};
    hFILE *fp = NULL;
    cram_index *idx;
    cram_index **idx_stack = NULL, *ep, e;
    int idx_stack_alloc = 0, idx_stack_ptr = 0;
    size_t pos = 0;
    int n, ret = -1;

    /* Check if already loaded */
    if (fd->index)
        return 0;

    fd->index = calloc((fd->index_sz = 1), sizeof(*fd->index));
    if (!fd->index)
        return -1;

    idx = &fd->index[0];
    idx->refid = -1;
    idx->start = INT_MIN;
    idx->end   = INT_MAX;

    idx_stack = calloc(++idx_stack_alloc, sizeof(*idx_stack));
    if (!idx_stack)
        goto done;
    idx_stack[idx_stack_ptr] = idx;

    /* Bounded formatting: refuse names that do not fit rather than overflow */
    n = snprintf(fn2, sizeof(fn2), "%s.crai", fn);
    if (n < 0 || (size_t)n >= sizeof(fn2))
        goto done;

    if (!(fp = hopen(fn2, "r"))) {
        perror(fn2);
        goto done;
    }

    // Load the file into memory
    while ((len = hread(fp, buf, sizeof(buf))) > 0)
        kputsn(buf, len, &kstr);
    if (len < 0 || kstr.l < 2)
        goto done;              // fp is closed by the cleanup below

    {
        int close_err = hclose(fp);
        fp = NULL;              // the handle is gone whatever hclose() returned
        if (close_err)
            goto done;
    }

    // Uncompress if required
    if (kstr.s[0] == 31 && (uc)kstr.s[1] == 139) {
        size_t l;
        char *s = zlib_mem_inflate(kstr.s, kstr.l, &l);
        free(kstr.s);
        kstr.s = NULL;          // avoid a double free in the cleanup path
        if (!s)
            goto done;
        kstr.s = s;
        kstr.l = l;
        kstr.m = l; // conservative estimate of the size allocated
        kputsn("", 0, &kstr); // ensure kstr.s is NUL-terminated
    }

    // Parse it line at a time
    do {
        /* 1.1 layout */
        if (kget_int32(&kstr, &pos, &e.refid)  == -1 ||
            kget_int32(&kstr, &pos, &e.start)  == -1 ||
            kget_int32(&kstr, &pos, &e.end)    == -1 ||
            kget_int64(&kstr, &pos, &e.offset) == -1 ||
            kget_int32(&kstr, &pos, &e.slice)  == -1 ||
            kget_int32(&kstr, &pos, &e.len)    == -1)
            goto done;

        // The third column was read into e.end; turn it into an end coordinate
        e.end += e.start-1;
        //printf("%d/%d..%d\n", e.refid, e.start, e.end);

        if (e.refid < -1) {
            fprintf(stderr, "Malformed index file, refid %d\n", e.refid);
            goto done;
        }

        if (e.refid != idx->refid) {
            if (fd->index_sz < e.refid+2) {
                size_t index_end = fd->index_sz * sizeof(*fd->index);
                cram_index *grown = realloc(fd->index,
                                            (e.refid+2) * sizeof(*fd->index));
                if (!grown)
                    goto done;  // fd->index is still valid and unchanged
                fd->index = grown;
                fd->index_sz = e.refid+2;
                memset(((char *)fd->index) + index_end, 0,
                       fd->index_sz * sizeof(*fd->index) - index_end);
            }
            idx = &fd->index[e.refid+1];
            idx->refid = e.refid;
            idx->start = INT_MIN;
            idx->end   = INT_MAX;
            idx->nslice = idx->nalloc = 0;
            idx->e = NULL;
            idx_stack[(idx_stack_ptr = 0)] = idx;
        }

        // Pop until we reach an entry whose range contains the new one.
        // The per-reference root spans INT_MIN..INT_MAX so this terminates.
        while (!(e.start >= idx->start && e.end <= idx->end)) {
            idx = idx_stack[--idx_stack_ptr];
        }

        // Now contains, so append
        if (idx->nslice+1 >= idx->nalloc) {
            int new_alloc = idx->nalloc ? idx->nalloc*2 : 16;
            cram_index *grown = realloc(idx->e, new_alloc * sizeof(*idx->e));
            if (!grown)
                goto done;
            idx->e = grown;
            idx->nalloc = new_alloc;
        }

        e.nalloc = e.nslice = 0; e.e = NULL;
        *(ep = &idx->e[idx->nslice++]) = e;
        idx = ep;

        if (++idx_stack_ptr >= idx_stack_alloc) {
            cram_index **grown = realloc(idx_stack,
                                         idx_stack_alloc*2*sizeof(*idx_stack));
            if (!grown)
                goto done;
            idx_stack = grown;
            idx_stack_alloc *= 2;
        }
        idx_stack[idx_stack_ptr] = idx;

        // Skip the remainder of the line
        while (pos < kstr.l && kstr.s[pos] != '\n')
            pos++;
        pos++;
    } while (pos < kstr.l);

    ret = 0;
    // dump_index(fd);

 done:
    if (fp) {
        if (hclose(fp) != 0)
            ret = -1;
    }
    free(idx_stack);
    free(kstr.s);
    return ret;
}
char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of) { uint8_t *s = bam1_seq(b), *t = bam1_qual(b); int i; const bam1_core_t *c = &b->core; kstring_t str; str.l = str.m = 0; str.s = 0; kputsn(bam1_qname(b), c->l_qname-1, &str); kputc('\t', &str); if (of == BAM_OFDEC) { kputw(c->flag, &str); kputc('\t', &str); } else if (of == BAM_OFHEX) ksprintf(&str, "0x%x\t", c->flag); else { // BAM_OFSTR for (i = 0; i < 16; ++i) if ((c->flag & 1<<i) && bam_flag2char_table[i]) kputc(bam_flag2char_table[i], &str); kputc('\t', &str); } if (c->tid < 0) kputsn("*\t", 2, &str); else { if (header) kputs(header->target_name[c->tid] , &str); else kputw(c->tid, &str); kputc('\t', &str); } kputw(c->pos + 1, &str); kputc('\t', &str); kputw(c->qual, &str); kputc('\t', &str); if (c->n_cigar == 0) kputc('*', &str); else { for (i = 0; i < c->n_cigar; ++i) { kputw(bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, &str); kputc("MIDNSHP"[bam1_cigar(b)[i]&BAM_CIGAR_MASK], &str); } } kputc('\t', &str); if (c->mtid < 0) kputsn("*\t", 2, &str); else if (c->mtid == c->tid) kputsn("=\t", 2, &str); else { if (header) kputs(header->target_name[c->mtid], &str); else kputw(c->mtid, &str); kputc('\t', &str); } kputw(c->mpos + 1, &str); kputc('\t', &str); kputw(c->isize, &str); kputc('\t', &str); if (c->l_qseq) { for (i = 0; i < c->l_qseq; ++i) kputc(bam_nt16_rev_table[bam1_seqi(s, i)], &str); kputc('\t', &str); if (t[0] == 0xff) kputc('*', &str); else for (i = 0; i < c->l_qseq; ++i) kputc(t[i] + 33, &str); } else kputsn("*\t*", 3, &str); s = bam1_aux(b); while (s < b->data + b->data_len) { uint8_t type, key[2]; key[0] = s[0]; key[1] = s[1]; s += 2; type = *s; ++s; kputc('\t', &str); kputsn((char*)key, 2, &str); kputc(':', &str); if (type == 'A') { kputsn("A:", 2, &str); kputc(*s, &str); ++s; } else if (type == 'C') { kputsn("i:", 2, &str); kputw(*s, &str); ++s; } else if (type == 'c') { kputsn("i:", 2, &str); kputw(*(int8_t*)s, &str); ++s; } else if (type == 'S') { kputsn("i:", 2, &str); 
kputw(*(uint16_t*)s, &str); s += 2; } else if (type == 's') { kputsn("i:", 2, &str); kputw(*(int16_t*)s, &str); s += 2; } else if (type == 'I') { kputsn("i:", 2, &str); kputuw(*(uint32_t*)s, &str); s += 4; } else if (type == 'i') { kputsn("i:", 2, &str); kputw(*(int32_t*)s, &str); s += 4; } else if (type == 'f') { ksprintf(&str, "f:%g", *(float*)s); s += 4; } else if (type == 'd') { ksprintf(&str, "d:%lg", *(double*)s); s += 8; } else if (type == 'Z' || type == 'H') { kputc(type, &str); kputc(':', &str); while (*s) kputc(*s++, &str); ++s; } } return str.s; }
// Transform a bam1_t record into a string with the FASTQ representation of it // @returns false for error, true for success static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t *state) { int i; int32_t qlen = b->core.l_qseq; assert(qlen >= 0); uint8_t *seq; uint8_t *qual = bam_get_qual(b); const uint8_t *oq = NULL; if (state->use_oq) { oq = bam_aux_get(b, "OQ"); if (oq) oq++; // skip tag type } bool has_qual = (qual[0] != 0xff || (state->use_oq && oq)); // test if there is quality linebuf->l = 0; // Write read name readpart readpart = which_readpart(b); kputc(state->filetype == FASTA? '>' : '@', linebuf); kputs(bam_get_qname(b), linebuf); // Add the /1 /2 if requested if (state->has12) { if (readpart == READ_1) kputs("/1", linebuf); else if (readpart == READ_2) kputs("/2", linebuf); } if (state->copy_tags) { for (i = 0; copied_tags[i]; ++i) { uint8_t *s; if ((s = bam_aux_get(b, copied_tags[i])) != 0) { kputc('\t', linebuf); kputsn(copied_tags[i], 2, linebuf); kputsn(":Z:", 3, linebuf); kputs(bam_aux2Z(s), linebuf); } } } kputc('\n', linebuf); seq = bam_get_seq(b); if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented for (i = qlen-1; i > -1; --i) { char c = seq_nt16_str[seq_comp_table[bam_seqi(seq,i)]]; kputc(c, linebuf); } } else { for (i = 0; i < qlen; ++i) { char c = seq_nt16_str[bam_seqi(seq,i)]; kputc(c, linebuf); } } kputc('\n', linebuf); if (state->filetype == FASTQ) { // Write quality kputs("+\n", linebuf); if (has_qual) { if (state->use_oq && oq) { if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented for (i = qlen-1; i > -1; --i) { kputc(oq[i], linebuf); } } else { kputs((char*)oq, linebuf); } } else { if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented for (i = qlen-1; i > -1; --i) { kputc(33 + qual[i], linebuf); } } else { for (i = 0; i < qlen; ++i) { kputc(33 + qual[i], linebuf); } } } } else { for (i = 0; i < qlen; ++i) { kputc(33 + state->def_qual, linebuf); } } kputc('\n', 
linebuf); } return true; }
/*
 * Remove alleles from a record and trim every per-allele annotation.
 *
 * Bit i of rm_mask selects allele i for removal (allele 0 is the first
 * entry of line->d.allele; the ALT list is rebuilt from alleles 1..n-1).
 * When nothing is removed the record is left untouched.  Otherwise:
 *   - the allele string is rebuilt and installed with bcf_update_alleles_str
 *   - INFO fields declared Number=A, R or G are trimmed
 *   - GT allele indexes are renumbered through the old->new map
 *   - FORMAT fields declared Number=A, R or G are trimmed per sample
 *     (haploid or diploid layouts are assumed for Number=G)
 *
 * Failures to read or update a field are reported on stderr and terminate
 * the process with exit(1).
 */
void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int rm_mask)
{
    int *map = (int*) calloc(line->n_allele, sizeof(int));

    // create map of indexes from old to new ALT numbering and modify ALT
    kstring_t str = {0,0,0};
    kputs(line->d.allele[0], &str);

    int nrm = 0, i,j;  // i: ori alleles, j: new alleles
    for (i=1, j=1; i<line->n_allele; i++)
    {
        if ( rm_mask & 1<<i )
        {
            // remove this allele
            line->d.allele[i] = NULL;
            nrm++;
            continue;
        }
        kputc(',', &str);
        kputs(line->d.allele[i], &str);
        map[i] = j;
        j++;
    }
    if ( !nrm ) { free(map); free(str.s); return; }

    // Expected vector lengths before (ori) and after (new) the removal
    int nR_ori = line->n_allele;
    int nR_new = line->n_allele-nrm;
    assert(nR_new > 0); // should not be able to remove reference allele
    int nA_ori = nR_ori-1;
    int nA_new = nR_new-1;

    int nG_ori = nR_ori*(nR_ori + 1)/2;
    int nG_new = nR_new*(nR_new + 1)/2;

    bcf_update_alleles_str(header, line, str.s);

    // remove from Number=G, Number=R and Number=A INFO fields.
    uint8_t *dat = NULL;
    int mdat = 0, ndat = 0, mdat_bytes = 0, nret;
    for (i=0; i<line->n_info; i++)
    {
        bcf_info_t *info = &line->d.info[i];
        int vlen = bcf_hdr_id2length(header,BCF_HL_INFO,info->key);

        if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change

        int type = bcf_hdr_id2type(header,BCF_HL_INFO,info->key);
        if ( type==BCF_HT_FLAG ) continue;
        int size = 1;
        if ( type==BCF_HT_REAL || type==BCF_HT_INT ) size = 4;

        // dat is shared between element sizes, so its capacity is tracked in bytes
        mdat = mdat_bytes / size;
        nret = bcf_get_info_values(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void**)&dat, &mdat, type);
        mdat_bytes = mdat * size;
        if ( nret<0 )
        {
            fprintf(stderr,"[%s:%d %s] Could not access INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
                    bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret);
            exit(1);
        }
        if ( type==BCF_HT_STR )
        {
            str.l = 0;
            char *ss = (char*) dat, *se = (char*) dat;
            if ( vlen==BCF_VL_A || vlen==BCF_VL_R )
            {
                int nexp, inc = 0;
                if ( vlen==BCF_VL_A )
                {
                    nexp = nA_ori;
                    inc  = 1;
                }
                else
                    nexp = nR_ori;
                for (j=0; j<nexp; j++)
                {
                    if ( !*se ) break;
                    while ( *se && *se!=',' ) se++;
                    if ( rm_mask & 1<<(j+inc) )
                    {
                        if ( *se ) se++;
                        ss = se;
                        continue;
                    }
                    if ( str.l ) kputc(',',&str);
                    kputsn(ss,se-ss,&str);
                    if ( *se ) se++;
                    ss = se;
                }
                assert( j==nexp );
            }
            else    // Number=G, assuming diploid genotype
            {
                int k = 0, n = 0;
                for (j=0; j<nR_ori; j++)
                {
                    for (k=0; k<=j; k++)
                    {
                        if ( !*se ) break;
                        while ( *se && *se!=',' ) se++;
                        n++;
                        if ( rm_mask & 1<<j || rm_mask & 1<<k )
                        {
                            if ( *se ) se++;
                            ss = se;
                            continue;
                        }
                        if ( str.l ) kputc(',',&str);
                        kputsn(ss,se-ss,&str);
                        if ( *se ) se++;
                        ss = se;
                    }
                    if ( !*se ) break;
                }
                // Comparison, not assignment: verify all genotype values were seen
                assert( n==nG_ori );
            }

            nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)str.s, str.l, type);
            if ( nret<0 )
            {
                fprintf(stderr,"[%s:%d %s] Could not update INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
                        bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret);
                exit(1);
            }
            continue;
        }

        if ( vlen==BCF_VL_A || vlen==BCF_VL_R )
        {
            int inc = 0, ntop;
            if ( vlen==BCF_VL_A )
            {
                assert( nret==nA_ori );
                ntop = nA_ori;
                ndat = nA_new;
                inc  = 1;
            }
            else
            {
                assert( nret==nR_ori );
                ntop = nR_ori;
                ndat = nR_new;
            }
            int k = 0;

            // Compact the vector in place; memcpy keeps the exact bit patterns
            #define BRANCH(type_t,is_vector_end) \
            { \
                type_t *ptr = (type_t*) dat; \
                int size = sizeof(type_t); \
                for (j=0; j<ntop; j++) /* j:ori, k:new */ \
                { \
                    if ( is_vector_end ) { memcpy(dat+k*size, dat+j*size, size); break; } \
                    if ( rm_mask & 1<<(j+inc) ) continue; \
                    if ( j!=k ) memcpy(dat+k*size, dat+j*size, size); \
                    k++; \
                } \
            }
            switch (type)
            {
                case BCF_HT_INT:  BRANCH(int32_t,ptr[j]==bcf_int32_vector_end); break;
                case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr[j])); break;
            }
            #undef BRANCH
        }
        else    // Number=G
        {
            assert( nret==nG_ori );
            int k, l_ori = -1, l_new = 0;
            ndat = nG_new;

            // On vector end, leave BOTH loops (j is forced past the limit)
            #define BRANCH(type_t,is_vector_end) \
            { \
                type_t *ptr = (type_t*) dat; \
                int size = sizeof(type_t); \
                for (j=0; j<nR_ori; j++) \
                { \
                    for (k=0; k<=j; k++) \
                    { \
                        l_ori++; \
                        if ( is_vector_end ) { memcpy(dat+l_new*size, dat+l_ori*size, size); j = nR_ori; break; } \
                        if ( rm_mask & 1<<j || rm_mask & 1<<k ) continue; \
                        if ( l_ori!=l_new ) memcpy(dat+l_new*size, dat+l_ori*size, size); \
                        l_new++; \
                    } \
                } \
            }
            switch (type)
            {
                case BCF_HT_INT:  BRANCH(int32_t,ptr[l_ori]==bcf_int32_vector_end); break;
                case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr[l_ori])); break;
            }
            #undef BRANCH
        }

        nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)dat, ndat, type);
        if ( nret<0 )
        {
            fprintf(stderr,"[%s:%d %s] Could not update INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
                    bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret);
            exit(1);
        }
    }

    // Update GT fields, the allele indexes might have changed
    for (i=1; i<line->n_allele; i++) if ( map[i]!=i ) break;
    if ( i<line->n_allele )
    {
        mdat = mdat_bytes / 4;  // sizeof(int32_t)
        nret = bcf_get_genotypes(header,line,(void**)&dat,&mdat);
        mdat_bytes = mdat * 4;
        if ( nret>0 )
        {
            nret /= line->n_sample;
            int32_t *ptr = (int32_t*) dat;
            for (i=0; i<line->n_sample; i++)
            {
                for (j=0; j<nret; j++)
                {
                    if ( ptr[j]==bcf_gt_missing ) continue;
                    if ( ptr[j]==bcf_int32_vector_end ) break;
                    int al = bcf_gt_allele(ptr[j]);
                    assert( al<nR_ori && map[al]>=0 );
                    // re-encode with the new index, keeping the low (phase) bit
                    ptr[j] = (map[al]+1)<<1 | (ptr[j]&1);
                }
                ptr += nret;
            }
            bcf_update_genotypes(header, line, (void*)dat, nret*line->n_sample);
        }
    }

    // Remove from Number=G, Number=R and Number=A FORMAT fields.
    // Assuming haploid or diploid GTs
    for (i=0; i<line->n_fmt; i++)
    {
        bcf_fmt_t *fmt = &line->d.fmt[i];
        int vlen = bcf_hdr_id2length(header,BCF_HL_FMT,fmt->id);

        if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change

        int type = bcf_hdr_id2type(header,BCF_HL_FMT,fmt->id);
        if ( type==BCF_HT_FLAG ) continue;

        int size = 1;
        if ( type==BCF_HT_REAL || type==BCF_HT_INT ) size = 4;

        mdat = mdat_bytes / size;
        nret = bcf_get_format_values(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void**)&dat, &mdat, type);
        mdat_bytes = mdat * size;
        if ( nret<0 )
        {
            fprintf(stderr,"[%s:%d %s] Could not access FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
                    bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret);
            exit(1);
        }

        if ( type==BCF_HT_STR )
        {
            int size = nret/line->n_sample;     // number of bytes per sample
            str.l = 0;
            if ( vlen==BCF_VL_A || vlen==BCF_VL_R )
            {
                int nexp, inc = 0;
                if ( vlen==BCF_VL_A )
                {
                    nexp = nA_ori;
                    inc  = 1;
                }
                else
                    nexp = nR_ori;
                for (j=0; j<line->n_sample; j++)
                {
                    char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss;
                    int k_src = 0, k_dst = 0, l = str.l;
                    for (k_src=0; k_src<nexp; k_src++)
                    {
                        if ( ptr>=se || !*ptr) break;
                        while ( ptr<se && *ptr && *ptr!=',' ) ptr++;
                        if ( rm_mask & 1<<(k_src+inc) )
                        {
                            ss = ++ptr;
                            continue;
                        }
                        if ( k_dst ) kputc(',',&str);
                        kputsn(ss,ptr-ss,&str);
                        ss = ++ptr;
                        k_dst++;
                    }
                    assert( k_src==nexp );
                    // pad with NULs so every sample keeps the same width
                    l = str.l - l;
                    for (; l<size; l++) kputc(0, &str);
                }
            }
            else    // Number=G, diploid or haploid
            {
                for (j=0; j<line->n_sample; j++)
                {
                    char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss;
                    int k_src = 0, k_dst = 0, l = str.l;
                    int nexp = 0; // diploid or haploid?
                    while ( ptr<se )
                    {
                        if ( !*ptr ) break;
                        if ( *ptr==',' ) nexp++;
                        ptr++;
                    }
                    if ( ptr!=ss ) nexp++;
                    assert( nexp==nG_ori || nexp==nR_ori );
                    ptr = ss;
                    if ( nexp==nG_ori ) // diploid
                    {
                        int ia, ib;
                        for (ia=0; ia<nR_ori; ia++)
                        {
                            for (ib=0; ib<=ia; ib++)
                            {
                                if ( ptr>=se || !*ptr ) break;
                                while ( ptr<se && *ptr && *ptr!=',' ) ptr++;
                                if ( rm_mask & 1<<ia || rm_mask & 1<<ib )
                                {
                                    ss = ++ptr;
                                    continue;
                                }
                                if ( k_dst ) kputc(',',&str);
                                kputsn(ss,ptr-ss,&str);
                                ss = ++ptr;
                                k_dst++;
                            }
                            if ( ptr>=se || !*ptr ) break;
                        }
                    }
                    else    // haploid
                    {
                        for (k_src=0; k_src<nR_ori; k_src++)
                        {
                            if ( ptr>=se || !*ptr ) break;
                            while ( ptr<se && *ptr && *ptr!=',' ) ptr++;
                            if ( rm_mask & 1<<k_src )
                            {
                                ss = ++ptr;
                                continue;
                            }
                            if ( k_dst ) kputc(',',&str);
                            kputsn(ss,ptr-ss,&str);
                            ss = ++ptr;
                            k_dst++;
                        }
                        assert( k_src==nR_ori );
                    }
                    // Pad both the diploid and the haploid case to the fixed
                    // per-sample width, as the Number=A/R branch does
                    l = str.l - l;
                    for (; l<size; l++) kputc(0, &str);
                }
            }
            nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)str.s, str.l, type);
            if ( nret<0 )
            {
                fprintf(stderr,"[%s:%d %s] Could not update FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
                        bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret);
                exit(1);
            }
            continue;
        }

        int nori = nret / line->n_sample;
        if ( vlen==BCF_VL_A || vlen==BCF_VL_R || (vlen==BCF_VL_G && nori==nR_ori) ) // Number=A, R or haploid Number=G
        {
            int ntop, nnew, inc = 0;
            if ( vlen==BCF_VL_A )
            {
                assert( nori==nA_ori );     // todo: will fail if all values are missing
                ntop = nA_ori;
                nnew = nA_new;
                inc  = 1;
            }
            else
            {
                assert( nori==nR_ori );     // todo: will fail if all values are missing
                ntop = nR_ori;
                nnew = nR_new;
            }
            ndat = nnew*line->n_sample;

            // The destination stride is the NEW per-sample length (nnew).
            // Source and destination blocks differ for every sample after the
            // first, so the copy is decided on the addresses, not the indexes.
            #define BRANCH(type_t,is_vector_end) \
            { \
                for (j=0; j<line->n_sample; j++) \
                { \
                    type_t *ptr_src = ((type_t*)dat) + j*nori; \
                    type_t *ptr_dst = ((type_t*)dat) + j*nnew; \
                    int size = sizeof(type_t); \
                    int k_src, k_dst = 0; \
                    for (k_src=0; k_src<ntop; k_src++) \
                    { \
                        if ( is_vector_end ) { memcpy(ptr_dst+k_dst, ptr_src+k_src, size); break; } \
                        if ( rm_mask & 1<<(k_src+inc) ) continue; \
                        if ( ptr_dst+k_dst != ptr_src+k_src ) memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
                        k_dst++; \
                    } \
                } \
            }
            switch (type)
            {
                case BCF_HT_INT:  BRANCH(int32_t,ptr_src[k_src]==bcf_int32_vector_end); break;
                case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr_src[k_src])); break;
            }
            #undef BRANCH
        }
        else    // Number=G, diploid or mixture of haploid+diploid
        {
            assert( nori==nG_ori );
            ndat = nG_new*line->n_sample;

            #define BRANCH(type_t,is_vector_end) \
            { \
                for (j=0; j<line->n_sample; j++) \
                { \
                    type_t *ptr_src = ((type_t*)dat) + j*nori; \
                    type_t *ptr_dst = ((type_t*)dat) + j*nG_new; \
                    int size = sizeof(type_t); \
                    int ia, ib, k_dst = 0, k_src; \
                    int nset = 0;   /* haploid or diploid? */ \
                    for (k_src=0; k_src<nG_ori; k_src++) { if ( is_vector_end ) break; nset++; } \
                    if ( nset==nR_ori ) /* haploid */ \
                    { \
                        for (k_src=0; k_src<nR_ori; k_src++) \
                        { \
                            if ( rm_mask & 1<<k_src ) continue; \
                            if ( ptr_dst+k_dst != ptr_src+k_src ) memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
                            k_dst++; \
                        } \
                        memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
                    } \
                    else /* diploid */ \
                    { \
                        k_src = -1; \
                        for (ia=0; ia<nR_ori; ia++) \
                        { \
                            for (ib=0; ib<=ia; ib++) \
                            { \
                                k_src++; \
                                if ( is_vector_end ) { memcpy(ptr_dst+k_dst, ptr_src+k_src, size); ia = nR_ori; break; }  \
                                if ( rm_mask & 1<<ia || rm_mask & 1<<ib ) continue; \
                                if ( ptr_dst+k_dst != ptr_src+k_src ) memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
                                k_dst++; \
                            } \
                        } \
                    } \
                } \
            }
            switch (type)
            {
                case BCF_HT_INT:  BRANCH(int32_t,ptr_src[k_src]==bcf_int32_vector_end); break;
                case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr_src[k_src])); break;
            }
            #undef BRANCH
        }
        nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)dat, ndat, type);
        if ( nret<0 )
        {
            fprintf(stderr,"[%s:%d %s] Could not update FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
                    bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret);
            exit(1);
        }
    }
    free(dat);
    free(str.s);
    free(map);
}
/*
 * Append a character vector to s: bcf_enc_size() is called first with the
 * length l and BCF_BT_CHAR, then the l bytes starting at a are appended
 * unchanged.
 */
void bcf_enc_vchar(kstring_t *s, int l, char *a)
{
    /* length/type prefix */
    bcf_enc_size(s, l, BCF_BT_CHAR);

    /* payload, copied as-is */
    kputsn(a, l, s);
}
static hFILE * s3_rewrite(const char *s3url, const char *mode, va_list *argsp) { const char *bucket, *path; char *header_list[4], **header = header_list; kstring_t url = { 0, 0, NULL }; kstring_t profile = { 0, 0, NULL }; kstring_t host_base = { 0, 0, NULL }; kstring_t token_hdr = { 0, 0, NULL }; s3_auth_data *ad = calloc(1, sizeof(*ad)); if (!ad) return NULL; ad->mode = strchr(mode, 'r') ? 'r' : 'w'; // Our S3 URL format is s3[+SCHEME]://[ID[:SECRET[:TOKEN]]@]BUCKET/PATH if (s3url[2] == '+') { bucket = strchr(s3url, ':') + 1; kputsn(&s3url[3], bucket - &s3url[3], &url); } else { kputs("https:", &url); bucket = &s3url[3]; } while (*bucket == '/') kputc(*bucket++, &url); path = bucket + strcspn(bucket, "/?#@"); if (*path == '@') { const char *colon = strpbrk(bucket, ":@"); if (*colon != ':') { urldecode_kput(bucket, colon - bucket, &profile); } else { const char *colon2 = strpbrk(&colon[1], ":@"); urldecode_kput(bucket, colon - bucket, &ad->id); urldecode_kput(&colon[1], colon2 - &colon[1], &ad->secret); if (*colon2 == ':') urldecode_kput(&colon2[1], path - &colon2[1], &ad->token); } bucket = &path[1]; path = bucket + strcspn(bucket, "/?#"); } else { // If the URL has no ID[:SECRET]@, consider environment variables. const char *v; if ((v = getenv("AWS_ACCESS_KEY_ID")) != NULL) kputs(v, &ad->id); if ((v = getenv("AWS_SECRET_ACCESS_KEY")) != NULL) kputs(v, &ad->secret); if ((v = getenv("AWS_SESSION_TOKEN")) != NULL) kputs(v, &ad->token); if ((v = getenv("AWS_DEFAULT_PROFILE")) != NULL) kputs(v, &profile); else if ((v = getenv("AWS_PROFILE")) != NULL) kputs(v, &profile); else kputs("default", &profile); } if (ad->id.l == 0) { const char *v = getenv("AWS_SHARED_CREDENTIALS_FILE"); parse_ini(v? 
v : "~/.aws/credentials", profile.s, "aws_access_key_id", &ad->id, "aws_secret_access_key", &ad->secret, "aws_session_token", &ad->token, NULL); } if (ad->id.l == 0) parse_ini("~/.s3cfg", profile.s, "access_key", &ad->id, "secret_key", &ad->secret, "access_token", &ad->token, "host_base", &host_base, NULL); if (ad->id.l == 0) parse_simple("~/.awssecret", &ad->id, &ad->secret); if (host_base.l == 0) kputs("s3.amazonaws.com", &host_base); // Use virtual hosted-style access if possible, otherwise path-style. if (is_dns_compliant(bucket, path)) { kputsn(bucket, path - bucket, &url); kputc('.', &url); kputs(host_base.s, &url); } else { kputs(host_base.s, &url); kputc('/', &url); kputsn(bucket, path - bucket, &url); } kputs(path, &url); if (ad->token.l > 0) { kputs("X-Amz-Security-Token: ", &token_hdr); kputs(ad->token.s, &token_hdr); *header++ = token_hdr.s; } ad->bucket = strdup(bucket); if (!ad->bucket) goto fail; *header = NULL; hFILE *fp = hopen(url.s, mode, "va_list", argsp, "httphdr:v", header_list, "httphdr_callback", auth_header_callback, "httphdr_callback_data", ad, NULL); if (!fp) goto fail; free(url.s); free(profile.s); free(host_base.s); free(token_hdr.s); return fp; fail: free(url.s); free(profile.s); free(host_base.s); free(token_hdr.s); free_auth_data(ad); return NULL; }
/*
 * Concatenate the input files named in args->fnames into args->out_fh.
 *
 * Three modes, chosen from args:
 *   - args->phased_concat: files are read through the synced reader with at
 *     most two of them open at a time; records are handed to phased_push()
 *     and flushed with phased_flush() as readers are retired.
 *   - args->files set: overlapping files are merged through the synced
 *     reader; with args->remove_dups only the first reader's record at a
 *     site is written.
 *   - otherwise: files are appended one after another.  Chromosome blocks
 *     must be contiguous and positions non-decreasing; violations are fatal.
 *
 * All failures are reported through error().
 */
static void concat(args_t *args)
{
    int i;
    if ( args->phased_concat )  // phased concat
    {
        // keep only two open files at a time
        while ( args->ifname < args->nfnames )
        {
            int new_file = 0;
            while ( args->files->nreaders < 2 && args->ifname < args->nfnames )
            {
                if ( !bcf_sr_add_reader(args->files,args->fnames[args->ifname]) )
                    error("Failed to open %s: %s\n", args->fnames[args->ifname],bcf_sr_strerror(args->files->errnum));
                new_file = 1;

                args->ifname++;
                if ( args->start_pos[args->ifname-1]==-1 ) break;   // new chromosome, start with only one file open
                if ( args->ifname < args->nfnames && args->start_pos[args->ifname]==-1 ) break; // next file starts on a different chromosome
            }

            // is there a line from the previous run? Seek the newly opened reader to that position
            int seek_pos = -1;
            int seek_chr = -1;
            if ( bcf_sr_has_line(args->files,0) )
            {
                bcf1_t *line = bcf_sr_get_line(args->files,0);
                bcf_sr_seek(args->files, bcf_seqname(args->files->readers[0].header,line), line->pos);
                seek_pos = line->pos;
                seek_chr = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[0].header,line));
            }
            else if ( new_file )
                bcf_sr_seek(args->files,NULL,0);  // set to start

            while ( bcf_sr_next_line(args->files) )
            {
                if ( !bcf_sr_has_line(args->files,0) )  // no input from the first reader
                {
                    // We are assuming that there is a perfect overlap, sites which are not present in both files are dropped
                    if ( ! bcf_sr_region_done(args->files,0) ) continue;

                    phased_flush(args);
                    bcf_sr_remove_reader(args->files, 0);
                }

                // Get a line to learn about current position
                for (i=0; i<args->files->nreaders; i++)
                    if ( bcf_sr_has_line(args->files,i) ) break;

                bcf1_t *line = bcf_sr_get_line(args->files,i);

                // This can happen after bcf_sr_seek: indel may start before the coordinate which we seek to.
                if ( seek_chr>=0 && seek_pos>line->pos && seek_chr==bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[i].header,line)) ) continue;
                seek_pos = seek_chr = -1;

                // Check if the position overlaps with the next, yet unopened, reader
                int must_seek = 0;
                while ( args->ifname < args->nfnames && args->start_pos[args->ifname]!=-1 && line->pos >= args->start_pos[args->ifname] )
                {
                    must_seek = 1;
                    if ( !bcf_sr_add_reader(args->files,args->fnames[args->ifname]) )
                        error("Failed to open %s: %s\n", args->fnames[args->ifname],bcf_sr_strerror(args->files->errnum));
                    args->ifname++;
                }
                if ( must_seek )
                {
                    bcf_sr_seek(args->files, bcf_seqname(args->files->readers[i].header,line), line->pos);
                    seek_pos = line->pos;
                    seek_chr = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[i].header,line));
                    continue;
                }

                // We are assuming that there is a perfect overlap, sites which are not present in both files are dropped
                if ( args->files->nreaders>1 && !bcf_sr_has_line(args->files,1) && !bcf_sr_region_done(args->files,1) ) continue;

                phased_push(args, bcf_sr_get_line(args->files,0), args->files->nreaders>1 ? bcf_sr_get_line(args->files,1) : NULL);
            }

            if ( args->files->nreaders )
            {
                phased_flush(args);
                while ( args->files->nreaders ) bcf_sr_remove_reader(args->files, 0);
            }
        }
    }
    else if ( args->files )  // combining overlapping files, using synced reader
    {
        while ( bcf_sr_next_line(args->files) )
        {
            for (i=0; i<args->files->nreaders; i++)
            {
                bcf1_t *line = bcf_sr_get_line(args->files,i);
                if ( !line ) continue;
                bcf_translate(args->out_hdr, args->files->readers[i].header, line);
                // Report write failures, as the plain concatenation branch does
                if ( bcf_write1(args->out_fh, args->out_hdr, line)!=0 ) error("Failed to write\n");
                if ( args->remove_dups ) break;
            }
        }
    }
    else    // concatenating
    {
        kstring_t tmp = {0,0,0};
        // prev_pos holds the last position seen on the current chromosome
        int prev_chr_id = -1, prev_pos = -1;
        bcf1_t *line = bcf_init();
        for (i=0; i<args->nfnames; i++)
        {
            htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("Failed to open: %s\n", args->fnames[i]);
            bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]);
            if ( !fp->is_bin && args->output_type&FT_VCF )
            {
                line->max_unpack = BCF_UN_STR;
                // if VCF is on both input and output, avoid VCF to BCF conversion
                while ( hts_getline(fp, KS_SEP_LINE, &fp->line) >=0 )
                {
                    // The first column (up to the first tab) is the sequence name
                    char *str = fp->line.s;
                    while ( *str && *str!='\t' ) str++;
                    tmp.l = 0;
                    kputsn(fp->line.s,str-fp->line.s,&tmp);
                    int chr_id = bcf_hdr_name2id(args->out_hdr, tmp.s);
                    if ( chr_id<0 ) error("The sequence \"%s\" not defined in the header: %s\n(Quick workaround: index the file.)\n", tmp.s, args->fnames[i]);
                    if ( prev_chr_id!=chr_id )
                    {
                        prev_pos = -1;
                        if ( args->seen_seq[chr_id] )
                            error("\nThe chromosome block %s is not contiguous, consider running with -a.\n", tmp.s);
                    }
                    // A line without any tab has no position column to parse
                    if ( !*str ) error("Could not parse line: %s\n", fp->line.s);
                    char *end;
                    int pos = strtol(str+1,&end,10) - 1;
                    if ( end==str+1 ) error("Could not parse line: %s\n", fp->line.s);
                    if ( prev_pos > pos )
                        error("The chromosome block %s is not sorted, consider running with -a.\n", tmp.s);
                    prev_pos = pos;     // remember it, otherwise the check above never fires
                    args->seen_seq[chr_id] = 1;
                    prev_chr_id = chr_id;

                    if ( vcf_write_line(args->out_fh, &fp->line)!=0 )
                        error("Failed to write %lu bytes\n", (unsigned long)fp->line.l);
                }
            }
            else
            {
                // BCF conversion is required
                line->max_unpack = 0;
                while ( bcf_read(fp, hdr, line)==0 )
                {
                    bcf_translate(args->out_hdr, hdr, line);

                    if ( prev_chr_id!=line->rid )
                    {
                        prev_pos = -1;
                        if ( args->seen_seq[line->rid] )
                            error("\nThe chromosome block %s is not contiguous, consider running with -a.\n", bcf_seqname(args->out_hdr, line));
                    }
                    if ( prev_pos > line->pos )
                        error("The chromosome block %s is not sorted, consider running with -a.\n", bcf_seqname(args->out_hdr, line));
                    prev_pos = line->pos;
                    args->seen_seq[line->rid] = 1;
                    prev_chr_id = line->rid;

                    if ( bcf_write(args->out_fh, args->out_hdr, line)!=0 ) error("Failed to write\n");
                }
            }
            bcf_hdr_destroy(hdr);
            hts_close(fp);
        }
        bcf_destroy(line);
        free(tmp.s);
    }
}
/*
 * Append a vector of n floats to s: bcf_enc_size() is called first with the
 * count n and BCF_BT_FLOAT, then the values are appended as raw bytes in
 * host memory layout, four bytes per value.
 */
void bcf_enc_vfloat(kstring_t *s, int n, float *a)
{
    const char *raw = (char*)a;

    /* count/type prefix */
    bcf_enc_size(s, n, BCF_BT_FLOAT);

    /* payload: 4 bytes for each of the n values */
    kputsn(raw, n * 4, s);
}