int process_region_precise(args_t *args, char *seq, regitr_t *itr) { int k = 1; uint32_t start = itr->reg[itr->i].start, end = itr->reg[itr->i].end; while ( itr->i+k<itr->n && start==itr->reg[itr->i+k].start && end==itr->reg[itr->i+k].end ) k++; int ret = ploidy_query(args->ploidy, seq, start, args->sex2ploidy, NULL, NULL); assert(ret); memset(args->counts,0,args->ncounts*sizeof(int)); // Select 'nsites' sites spaced so that they evenly cover the whole region // to get a representative sample. We index-jump as we should be checking // a few sites only. int i, rid = -1, pos, prev_pos = -1, ismpl; for (i=0; i<args->nsites; i++) { rid = -1; pos = ((i+1.0)/(args->nsites+1))*(end - start) + start; if ( i>0 && pos <= prev_pos ) continue; // the vcf is too sparse if ( bcf_sr_seek(args->sr,seq,pos)!=0 ) return k; // sequence not present if ( !bcf_sr_next_line(args->sr) ) return k; // no sites found bcf1_t *rec = bcf_sr_get_line(args->sr,0); if ( rid==-1 ) rid = rec->rid; if ( rid!=rec->rid || rec->pos > end ) break; prev_pos = rec->pos; int ngts = bcf_get_genotypes(args->hdr,rec,&args->gts,&args->ngts); ngts /= args->nsample; for (ismpl=0; ismpl<args->nsample; ismpl++) { int32_t *gts = args->gts + ngts*ismpl; int igt, ploidy = 0; for (igt=0; igt<ngts; igt++) { if ( gts[igt]==bcf_int32_vector_end || bcf_gt_is_missing(gts[igt]) ) break; else ploidy++; } args->counts[ismpl*(args->max_ploidy+1) + ploidy]++; if ( args->verbose ) fprintf(stderr,"%s:%d\t%s\tploidy=%d\n", seq,rec->pos+1,args->hdr->samples[ismpl],ploidy); } } for (ismpl=0; ismpl<args->nsample; ismpl++) { float sum = 0, *probs = args->sex2prob + ismpl*args->nsex; int *counts = args->counts + ismpl*(args->max_ploidy+1); for (i=0; i<args->max_ploidy+1; i++) sum += counts[i]; if ( !sum ) continue; for (i=0; i<args->nsex; i++) { int ploidy = args->sex2ploidy[i]; probs[i] *= counts[ploidy]/sum; } } return k; }
static void init_region(args_t *args, char *line) { char *ss, *se = line; while ( *se && !isspace(*se) && *se!=':' ) se++; int from = 0, to = 0; char tmp, *tmp_ptr = NULL; if ( *se ) { tmp = *se; *se = 0; tmp_ptr = se; ss = ++se; from = strtol(ss,&se,10); if ( ss==se || !*se || *se!='-' ) from = 0; else { from--; ss = ++se; to = strtol(ss,&se,10); if ( ss==se || (*se && !isspace(*se)) ) { from = 0; to = 0; } else to--; } } args->rid = bcf_hdr_name2id(args->hdr,line); if ( args->rid<0 ) fprintf(pysamerr,"Warning: Sequence \"%s\" not in %s\n", line,args->fname); args->fa_buf.l = 0; args->fa_length = 0; args->fa_end_pos = to; args->fa_ori_pos = from; args->fa_src_pos = from; args->fa_mod_off = 0; args->fa_frz_pos = -1; args->fa_case = -1; args->vcf_rbuf.n = 0; bcf_sr_seek(args->files,line,args->fa_ori_pos); if ( tmp_ptr ) *tmp_ptr = tmp; fprintf(args->fp_out,">%s\n",line); if (args->chain_fname ) { args->chain = init_chain(args->chain, args->fa_ori_pos); } else { args->chain = NULL; } }
static void concat(args_t *args) { int i; if ( args->phased_concat ) // phased concat { // keep only two open files at a time while ( args->ifname < args->nfnames ) { int new_file = 0; while ( args->files->nreaders < 2 && args->ifname < args->nfnames ) { if ( !bcf_sr_add_reader(args->files,args->fnames[args->ifname]) ) error("Failed to open %s: %s\n", args->fnames[args->ifname],bcf_sr_strerror(args->files->errnum)); new_file = 1; args->ifname++; if ( args->start_pos[args->ifname-1]==-1 ) break; // new chromosome, start with only one file open if ( args->ifname < args->nfnames && args->start_pos[args->ifname]==-1 ) break; // next file starts on a different chromosome } // is there a line from the previous run? Seek the newly opened reader to that position int seek_pos = -1; int seek_chr = -1; if ( bcf_sr_has_line(args->files,0) ) { bcf1_t *line = bcf_sr_get_line(args->files,0); bcf_sr_seek(args->files, bcf_seqname(args->files->readers[0].header,line), line->pos); seek_pos = line->pos; seek_chr = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[0].header,line)); } else if ( new_file ) bcf_sr_seek(args->files,NULL,0); // set to start int nret; while ( (nret = bcf_sr_next_line(args->files)) ) { if ( !bcf_sr_has_line(args->files,0) ) // no input from the first reader { // We are assuming that there is a perfect overlap, sites which are not present in both files are dropped if ( ! bcf_sr_region_done(args->files,0) ) continue; phased_flush(args); bcf_sr_remove_reader(args->files, 0); } // Get a line to learn about current position for (i=0; i<args->files->nreaders; i++) if ( bcf_sr_has_line(args->files,i) ) break; bcf1_t *line = bcf_sr_get_line(args->files,i); // This can happen after bcf_sr_seek: indel may start before the coordinate which we seek to. if ( seek_chr>=0 && seek_pos>line->pos && seek_chr==bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[i].header,line)) ) continue; seek_pos = seek_chr = -1; // Check if the position overlaps with the next, yet unopened, reader int must_seek = 0; while ( args->ifname < args->nfnames && args->start_pos[args->ifname]!=-1 && line->pos >= args->start_pos[args->ifname] ) { must_seek = 1; if ( !bcf_sr_add_reader(args->files,args->fnames[args->ifname]) ) error("Failed to open %s: %s\n", args->fnames[args->ifname],bcf_sr_strerror(args->files->errnum)); args->ifname++; } if ( must_seek ) { bcf_sr_seek(args->files, bcf_seqname(args->files->readers[i].header,line), line->pos); seek_pos = line->pos; seek_chr = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[i].header,line)); continue; } // We are assuming that there is a perfect overlap, sites which are not present in both files are dropped if ( args->files->nreaders>1 && !bcf_sr_has_line(args->files,1) && !bcf_sr_region_done(args->files,1) ) continue; phased_push(args, bcf_sr_get_line(args->files,0), args->files->nreaders>1 ? bcf_sr_get_line(args->files,1) : NULL); } if ( args->files->nreaders ) { phased_flush(args); while ( args->files->nreaders ) bcf_sr_remove_reader(args->files, 0); } } } else if ( args->files ) // combining overlapping files, using synced reader { while ( bcf_sr_next_line(args->files) ) { for (i=0; i<args->files->nreaders; i++) { bcf1_t *line = bcf_sr_get_line(args->files,i); if ( !line ) continue; bcf_translate(args->out_hdr, args->files->readers[i].header, line); bcf_write1(args->out_fh, args->out_hdr, line); if ( args->remove_dups ) break; } } } else // concatenating { kstring_t tmp = {0,0,0}; int prev_chr_id = -1, prev_pos; bcf1_t *line = bcf_init(); for (i=0; i<args->nfnames; i++) { htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("Failed to open: %s\n", args->fnames[i]); bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]); if ( !fp->is_bin && args->output_type&FT_VCF ) { line->max_unpack = BCF_UN_STR; // if VCF is on both input and output, avoid VCF to BCF conversion while ( hts_getline(fp, KS_SEP_LINE, &fp->line) >=0 ) { char *str = fp->line.s; while ( *str && *str!='\t' ) str++; tmp.l = 0; kputsn(fp->line.s,str-fp->line.s,&tmp); int chr_id = bcf_hdr_name2id(args->out_hdr, tmp.s); if ( chr_id<0 ) error("The sequence \"%s\" not defined in the header: %s\n(Quick workaround: index the file.)\n", tmp.s, args->fnames[i]); if ( prev_chr_id!=chr_id ) { prev_pos = -1; if ( args->seen_seq[chr_id] ) error("\nThe chromosome block %s is not contiguous, consider running with -a.\n", tmp.s); } char *end; int pos = strtol(str+1,&end,10) - 1; if ( end==str+1 ) error("Could not parse line: %s\n", fp->line.s); if ( prev_pos > pos ) error("The chromosome block %s is not sorted, consider running with -a.\n", tmp.s); args->seen_seq[chr_id] = 1; prev_chr_id = chr_id; if ( vcf_write_line(args->out_fh, &fp->line)!=0 ) error("Failed to write %d bytes\n", fp->line.l); } } else { // BCF conversion is required line->max_unpack = 0; while ( bcf_read(fp, hdr, line)==0 ) { bcf_translate(args->out_hdr, hdr, line); if ( prev_chr_id!=line->rid ) { prev_pos = -1; if ( args->seen_seq[line->rid] ) error("\nThe chromosome block %s is not contiguous, consider running with -a.\n", bcf_seqname(args->out_hdr, line)); } if ( prev_pos > line->pos ) error("The chromosome block %s is not sorted, consider running with -a.\n", bcf_seqname(args->out_hdr, line)); args->seen_seq[line->rid] = 1; prev_chr_id = line->rid; if ( bcf_write(args->out_fh, args->out_hdr, line)!=0 ) error("Failed to write\n"); } } bcf_hdr_destroy(hdr); hts_close(fp); } bcf_destroy(line); free(tmp.s); } }
int process_region_guess(args_t *args, char *seq, regitr_t *itr) { int kitr = 1; uint32_t start = 0, end = INT_MAX; reg_stats_t *stats = NULL; // set the start and the end position if ( itr ) { start = itr->reg[itr->i].start; end = itr->reg[itr->i].end; // flush all records with the same coordinates while ( itr->i+kitr<itr->n && start==itr->reg[itr->i+kitr].start && end==itr->reg[itr->i+kitr].end ) kitr++; int min,max,ret = ploidy_query(args->ploidy, seq, start, args->sex2ploidy, &min, &max); assert(ret); stats = expand_regs(args, seq,start,end); } else { // background region int spos, epos; const char *ptr = hts_parse_reg(args->background, &spos, &epos); if ( !ptr ) error("Could not parse the region: %s\n", args->background); seq = (char*) malloc(ptr - args->background + 1); memcpy(seq,args->background,ptr-args->background); seq[ptr-args->background] = 0; start = spos; end = epos; } if ( bcf_sr_seek(args->sr,seq,start)!=0 ) { // sequence not present if ( !itr ) free(seq); return kitr; } int ismpl, rid = bcf_hdr_name2id(args->hdr,seq); if ( !itr ) free(seq); while ( bcf_sr_next_line(args->sr) ) { bcf1_t *rec = bcf_sr_get_line(args->sr,0); if ( rec->rid!=rid || rec->pos > end ) break; if ( args->guess & GUESS_GT ) // use GTs to guess the ploidy { bcf_fmt_t *fmt = bcf_get_fmt(args->hdr, rec, "GT"); if ( !fmt ) continue; for (ismpl=0; ismpl<args->nsample; ismpl++) { count_t *counts = stats ? &stats->counts[ismpl] : &args->bg_counts[ismpl]; int gt = bcf_gt_type(fmt, ismpl, NULL,NULL); if ( gt==GT_UNKN ) counts->nmiss++; else if ( gt==GT_HET_RA || gt==GT_HET_AA ) counts->nhet++; else counts->nhom++; } } else // use PLs to guess the ploidy { int gl2pl = args->guess & GUESS_PL ? 1 : -1; int npl = bcf_get_format_int32(args->hdr,rec,args->guess&GUESS_PL?"PL":"GL",&args->pls,&args->npls); if ( npl<=0 ) continue; npl /= args->nsample; for (ismpl=0; ismpl<args->nsample; ismpl++) { int32_t *ptr = args->pls + ismpl*npl; int phom = INT_MAX, phet = INT_MAX, ial, jal, k = 0; for (ial=0; ial<rec->n_allele; ial++) { for (jal=0; jal<ial; jal++) { if ( ptr[k] == bcf_int32_missing || ptr[k] == bcf_int32_vector_end ) break; ptr[k] *= gl2pl; if ( phet > ptr[k] ) phet = ptr[k]; k++; } if ( ptr[k] == bcf_int32_missing || ptr[k] == bcf_int32_vector_end ) break; ptr[k] *= gl2pl; if ( phom > ptr[k] ) phom = ptr[k]; k++; } count_t *counts = stats ? &stats->counts[ismpl] : &args->bg_counts[ismpl]; if ( k == rec->n_allele ) counts->nhom++; // haploid else if ( phet == phom || k != rec->n_allele*(rec->n_allele+1)/2 ) counts->nmiss++; else if ( phet < phom ) counts->nhet++; else counts->nhom++; } } } return kitr; }