/*
 * Emit one record, either directly or through the gVCF writer.
 *
 * conf  configuration; conf->gvcf is non-NULL when gVCF output is enabled
 * fp    output file handle
 * hdr   header the record is written with
 * rec   record to output, or NULL to signal that there is no further record
 *
 * Without gVCF, a non-NULL record is written as is. With gVCF, the record
 * (or NULL) is handed to gvcf_write() and whatever record it returns is
 * then written.
 */
static void flush_bcf_records(mplp_conf_t *conf, htsFile *fp, bcf_hdr_t *hdr, bcf1_t *rec)
{
    if ( conf->gvcf )
    {
        if ( rec )
        {
            int ref_only;
            switch ( rec->n_allele )
            {
                case 1:
                    // REF is the only allele
                    ref_only = 1;
                    break;
                case 2:
                {
                    // a lone symbolic <*> ALT does not make the site a variant
                    const char *alt = rec->d.allele[1];
                    ref_only = alt[0]=='<' && alt[1]=='*' && alt[2]=='>';
                    break;
                }
                default:
                    ref_only = 0;
                    break;
            }
            bcf1_t *out = gvcf_write(conf->gvcf, fp, hdr, rec, ref_only);
            if ( out ) bcf_write1(fp, hdr, out);
        }
        else
            gvcf_write(conf->gvcf, fp, hdr, NULL, 0);
        return;
    }

    // plain output: nothing is buffered, write the record straight away
    if ( rec ) bcf_write1(fp, hdr, rec);
}
/*
 * Take n records off the front of the ring buffer and write them out.
 *
 * Unless soft filtering is on, a record carrying the IndelGap filter (when
 * args->indel_gap is set) or the SnpGap filter (when args->snp_gap is set)
 * is dropped instead of being written.
 */
static void flush_buffer(args_t *args, int n)
{
    int remaining;
    for (remaining = n; remaining > 0; remaining--)
    {
        int slot = rbuf_shift(&args->rbuf);
        bcf1_t *rec = args->rbuf_lines[slot];

        int drop = 0;
        if ( !args->soft_filter )
        {
            int f;
            for (f = 0; !drop && f < rec->d.n_flt; f++)
            {
                int id = rec->d.flt[f];
                drop = ( args->indel_gap && id==args->IndelGap_id )
                    || ( args->snp_gap && id==args->SnpGap_id );
            }
        }
        if ( drop ) continue;

        bcf_write1(args->out_fh, args->hdr, rec);
    }
}
//write out variants to out file int flush(int pos,htsFile *outf,bcf_hdr_t *hdr_out) { int n = 0; while(_buf.size()>0 && (pos - _buf.front()->pos) > _w ) { // cerr << _last_pos<<"<="<<_buf.front()->pos<<endl; assert(_last_pos<=_buf.front()->pos); if( _last_pos!=_buf.front()->pos ) _seen.clear(); // bcf1_t *tmp = _buf.front(); //capitalises ref/alt. this should now be fixed upstream. // int i=0; // while(tmp->d.allele[0][i]) { // tmp->d.allele[0][i]=toupper(tmp->d.allele[0][i]); // i++; // } // i=0; // while(tmp->d.allele[1][i]) { // tmp->d.allele[1][i]=toupper(tmp->d.allele[1][i]); // i++; // } // bcf_update_alleles(hdr_out,tmp,(const char**)tmp->d.allele,tmp->n_allele); string variant=(string)_buf.front()->d.allele[0] +"."+ (string)_buf.front()->d.allele[1]; if(_seen.count(variant)) { _ndup++; } else { _seen.insert(variant); bcf_write1(outf, hdr_out, _buf.front()); } _last_pos=_buf.front()->pos; bcf_destroy1( _buf.front() ); _buf.pop_front(); n++; } return(n); }
/*
 * Callback that writes a record for a target line passed in via `regs`.
 *
 * regs  regions structure; regs->line holds the raw tab-delimited target line
 * data  the args_t of the caller, passed as an opaque pointer
 *
 * The alleles string is located by skipping (targets_als - 1) tab-separated
 * columns of the target line; it is then set on the reusable record
 * args->missed_line together with the sequence id and start position, and the
 * record is written to args->out_fh.
 */
static void print_missed_line(bcf_sr_regions_t *regs, void *data)
{
    args_t *args = (args_t*) data;
    if ( args->flag & CF_GVCF ) error("todo: Combine --gvcf and --insert-missed\n");

    call_t *call = &args->aux;

    // advance past the leading columns; stops early if the line ends first
    char *als = regs->line.s;
    int ntabs;
    for (ntabs = 0; ntabs < call->srs->targets_als-1 && *als; als++)
    {
        if ( *als=='\t' ) ntabs++;
    }
    if ( !*als ) error("Could not parse: [%s] (%d)\n", regs->line.s,call->srs->targets_als);

    bcf1_t *missed = args->missed_line;
    missed->rid = bcf_hdr_name2id(call->hdr,regs->seq_names[regs->prev_seq]);
    missed->pos = regs->start;
    bcf_update_alleles_str(call->hdr, missed,als);
    bcf_write1(args->out_fh, call->hdr, missed);
}
int main_vcfcall(int argc, char *argv[]) { char *samples_fname = NULL; args_t args; memset(&args, 0, sizeof(args_t)); args.argc = argc; args.argv = argv; args.aux.prior_type = -1; args.aux.indel_frac = -1; args.aux.theta = 1e-3; args.aux.pref = 0.5; args.aux.min_perm_p = 0.01; args.aux.min_lrt = 1; args.flag = CF_ACGT_ONLY; args.output_fname = "-"; args.output_type = FT_VCF; args.aux.trio_Pm_SNPs = 1 - 1e-8; args.aux.trio_Pm_ins = args.aux.trio_Pm_del = 1 - 1e-9; int i, c, samples_is_file = 0; static struct option loptions[] = { {"help",0,0,'h'}, {"gvcf",1,0,'g'}, {"format-fields",1,0,'f'}, {"output",1,0,'o'}, {"output-type",1,0,'O'}, {"regions",1,0,'r'}, {"regions-file",1,0,'R'}, {"samples",1,0,'s'}, {"samples-file",1,0,'S'}, {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, {"keep-alts",0,0,'A'}, {"insert-missed",0,0,'i'}, {"skip-Ns",0,0,'N'}, // now the new default {"keep-masked-refs",0,0,'M'}, {"skip-variants",1,0,'V'}, {"variants-only",0,0,'v'}, {"consensus-caller",0,0,'c'}, {"constrain",1,0,'C'}, {"multiallelic-caller",0,0,'m'}, {"pval-threshold",1,0,'p'}, {"prior",1,0,'P'}, {"chromosome-X",0,0,'X'}, {"chromosome-Y",0,0,'Y'}, {"novel-rate",1,0,'n'}, {0,0,0,0} }; char *tmp = NULL; while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:XYn:P:f:ig:", loptions, NULL)) >= 0) { switch (c) { case 'g': args.flag |= CF_GVCF; args.gvcf.min_dp = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse, expected integer argument: -g %s\n", optarg); break; case 'f': args.aux.output_tags |= parse_format_flag(optarg); break; case 'M': args.flag &= ~CF_ACGT_ONLY; break; // keep sites where REF is N case 'N': args.flag |= CF_ACGT_ONLY; break; // omit sites where first base in REF is N (the new default) case 'A': args.aux.flag |= CALL_KEEPALT; break; case 'c': args.flag |= CF_CCALL; break; // the original EM based calling method case 'i': args.flag |= CF_INS_MISSED; break; case 'v': args.aux.flag |= CALL_VARONLY; break; case 'o': args.output_fname = optarg; break; 
case 'O': switch (optarg[0]) { case 'b': args.output_type = FT_BCF_GZ; break; case 'u': args.output_type = FT_BCF; break; case 'z': args.output_type = FT_VCF_GZ; break; case 'v': args.output_type = FT_VCF; break; default: error("The output type \"%s\" not recognised\n", optarg); } break; case 'C': if ( !strcasecmp(optarg,"alleles") ) args.aux.flag |= CALL_CONSTR_ALLELES; else if ( !strcasecmp(optarg,"trio") ) args.aux.flag |= CALL_CONSTR_TRIO; else error("Unknown argument to -C: \"%s\"\n", optarg); break; case 'X': args.aux.flag |= CALL_CHR_X; break; case 'Y': args.aux.flag |= CALL_CHR_Y; break; case 'V': if ( !strcasecmp(optarg,"snps") ) args.flag |= CF_INDEL_ONLY; else if ( !strcasecmp(optarg,"indels") ) args.flag |= CF_NO_INDEL; else error("Unknown skip category \"%s\" (-S argument must be \"snps\" or \"indels\")\n", optarg); break; case 'm': args.flag |= CF_MCALL; break; // multiallelic calling method case 'p': args.aux.pref = atof(optarg); break; case 'P': args.aux.theta = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse, expected float argument: -P %s\n", optarg); break; case 'n': parse_novel_rate(&args,optarg); break; case 'r': args.regions = optarg; break; case 'R': args.regions = optarg; args.regions_is_file = 1; break; case 't': args.targets = optarg; break; case 'T': args.targets = optarg; args.targets_is_file = 1; break; case 's': samples_fname = optarg; break; case 'S': samples_fname = optarg; samples_is_file = 1; break; default: usage(&args); } } if ( optind>=argc ) { if ( !isatty(fileno((FILE *)stdin)) ) args.bcf_fname = "-"; // reading from stdin else usage(&args); } else args.bcf_fname = argv[optind++]; // Sanity check options and initialize if ( samples_fname ) { args.samples = read_samples(&args.aux, samples_fname, samples_is_file, &args.nsamples); args.aux.ploidy = (uint8_t*) calloc(args.nsamples+1, 1); args.aux.all_diploid = 1; for (i=0; i<args.nsamples; i++) { args.aux.ploidy[i] = args.samples[i][strlen(args.samples[i]) + 1]; if ( 
args.aux.ploidy[i]!=2 ) args.aux.all_diploid = 0; } } if ( args.flag & CF_GVCF ) { // Force some flags to avoid unnecessary branching args.aux.flag &= ~CALL_KEEPALT; args.aux.flag |= CALL_VARONLY; } if ( (args.flag & CF_CCALL ? 1 : 0) + (args.flag & CF_MCALL ? 1 : 0) + (args.flag & CF_QCALL ? 1 : 0) > 1 ) error("Only one of -c or -m options can be given\n"); if ( !(args.flag & CF_CCALL) && !(args.flag & CF_MCALL) && !(args.flag & CF_QCALL) ) error("Expected -c or -m option\n"); if ( args.aux.n_perm && args.aux.ngrp1_samples<=0 ) error("Expected -1 with -U\n"); // not sure about this, please fix if ( args.aux.flag & CALL_CONSTR_ALLELES ) { if ( !args.targets ) error("Expected -t or -T with \"-C alleles\"\n"); if ( !(args.flag & CF_MCALL) ) error("The \"-C alleles\" mode requires -m\n"); } if ( args.aux.flag & CALL_CHR_X && args.aux.flag & CALL_CHR_Y ) error("Only one of -X or -Y should be given\n"); if ( args.flag & CF_INS_MISSED && !(args.aux.flag&CALL_CONSTR_ALLELES) ) error("The -i option requires -C alleles\n"); init_data(&args); while ( bcf_sr_next_line(args.aux.srs) ) { bcf1_t *bcf_rec = args.aux.srs->readers[0].buffer[0]; if ( args.samples_map ) bcf_subset(args.aux.hdr, bcf_rec, args.nsamples, args.samples_map); bcf_unpack(bcf_rec, BCF_UN_STR); // Skip unwanted sites if ( args.aux.flag & CALL_VARONLY ) { int is_ref = 0; if ( bcf_rec->n_allele==1 ) is_ref = 1; // not a variant else if ( bcf_rec->n_allele==2 ) { // second allele is mpileup's X, not a variant if ( bcf_rec->d.allele[1][0]=='X' ) is_ref = 1; else if ( bcf_rec->d.allele[1][0]=='<' && bcf_rec->d.allele[1][1]=='X' && bcf_rec->d.allele[1][2]=='>' ) is_ref = 1; } if ( is_ref ) { // gVCF output if ( args.flag & CF_GVCF ) gvcf_write(args.out_fh, &args.gvcf, args.aux.hdr, bcf_rec, 1); continue; } } if ( (args.flag & CF_INDEL_ONLY) && bcf_is_snp(bcf_rec) ) continue; // not an indel if ( (args.flag & CF_NO_INDEL) && !bcf_is_snp(bcf_rec) ) continue; // not a SNP if ( (args.flag & CF_ACGT_ONLY) && 
(bcf_rec->d.allele[0][0]=='N' || bcf_rec->d.allele[0][0]=='n') ) continue; // REF[0] is 'N' bcf_unpack(bcf_rec, BCF_UN_ALL); // Various output modes: QCall output (todo) if ( args.flag & CF_QCALL ) { qcall(&args.aux, bcf_rec); continue; } // Calling modes which output VCFs int ret; if ( args.flag & CF_MCALL ) ret = mcall(&args.aux, bcf_rec); else ret = ccall(&args.aux, bcf_rec); if ( ret==-1 ) error("Something is wrong\n"); // gVCF output if ( args.flag & CF_GVCF ) { gvcf_write(args.out_fh, &args.gvcf, args.aux.hdr, bcf_rec, ret?0:1); continue; } // Normal output if ( (args.aux.flag & CALL_VARONLY) && ret==0 ) continue; // not a variant bcf_write1(args.out_fh, args.aux.hdr, bcf_rec); } if ( args.flag & CF_GVCF ) gvcf_write(args.out_fh, &args.gvcf, args.aux.hdr, NULL, 0); if ( args.flag & CF_INS_MISSED ) bcf_sr_regions_flush(args.aux.srs->targets); destroy_data(&args); return 0; }
/*
 * Walk the synchronized readers and output the sites selected by
 * args->isec_op.
 *
 * Three kinds of output are produced, depending on the settings:
 *  - a single VCF/BCF stream (to -o or standard output) with the records of
 *    reader args->iwrite, when only one file is to be written and no prefix
 *    is given, or when a targets list is used with a single reader;
 *  - a tab-delimited list of sites (CHROM, POS, REF, ALT, presence string)
 *    written to args->fh_sites;
 *  - per-file outputs args->fh_out[] when args->prefix is set.
 *
 * Records failing a per-file filter (args->flt[i]) are treated as absent
 * from that file. Any failure to write or close an output is fatal.
 */
void isec_vcf(args_t *args)
{
    bcf_srs_t *files = args->files;
    kstring_t str = {0,0,0};
    htsFile *out_fh = NULL;
    const char *out_name = args->output_fname ? args->output_fname : "standard output";

    // When only one VCF is output, print VCF to pysam_stdout or -o file
    int out_std = 0;
    if ( args->nwrite==1 && !args->prefix ) out_std = 1;
    if ( args->targets_list && files->nreaders==1 ) out_std = 1;
    if ( out_std )
    {
        out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode(args->output_type));
        if ( out_fh == NULL ) error("Can't write to %s: %s\n", args->output_fname? args->output_fname : "standard output", strerror(errno));
        if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
        if (args->record_cmd_line) bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec");
        if ( bcf_hdr_write(out_fh, files->readers[args->iwrite].header)!=0 )
            error("[%s] Error: cannot write the header to %s\n", __func__, out_name);
    }
    if ( !args->nwrite && !out_std && !args->prefix )
        fprintf(pysam_stderr,"Note: -w option not given, printing list of sites...\n");

    int n;
    while ( (n=bcf_sr_next_line(files)) )
    {
        bcf_sr_t *reader = NULL;
        bcf1_t *line = NULL;
        int i, ret = 0;
        for (i=0; i<files->nreaders; i++)
        {
            if ( !bcf_sr_has_line(files,i) ) continue;

            if ( args->nflt && args->flt[i] )
            {
                bcf1_t *rec = bcf_sr_get_line(files, i);
                int pass = filter_test(args->flt[i], rec, NULL);
                if ( args->flt_logic[i] & FLT_EXCLUDE ) pass = pass ? 0 : 1;
                if ( !pass )
                {
                    // a filtered record counts as missing from this file
                    files->has_line[i] = 0;
                    n--;
                    continue;
                }
            }

            if ( !line )
            {
                line = files->readers[i].buffer[0];
                reader = &files->readers[i];
            }
            // Presence bitmask, consulted only for OP_VENN (two files). Bits
            // are recorded only while the shift is defined for an int.
            if ( (size_t)i < 8*sizeof(ret)-1 ) ret |= 1<<i;
        }

        switch (args->isec_op)
        {
            case OP_COMPLEMENT:
                if ( n!=1 || !bcf_sr_has_line(files,0) ) continue;
                break;
            case OP_EQUAL:
                if ( n != args->isec_n ) continue;
                break;
            case OP_PLUS:
                if ( n < args->isec_n ) continue;
                break;
            case OP_MINUS:
                if ( n > args->isec_n ) continue;
                break;
            case OP_EXACT:
                for (i=0; i<files->nreaders; i++)
                    if ( files->has_line[i] != args->isec_exact[i] ) break;
                if ( i<files->nreaders ) continue;
                break;
        }

        if ( out_std )
        {
            if ( bcf_sr_has_line(files,args->iwrite)
                 && bcf_write1(out_fh, files->readers[args->iwrite].header, files->readers[args->iwrite].buffer[0])!=0 )
                error("[%s] Error: cannot write to %s\n", __func__, out_name);
            continue;
        }
        else if ( args->fh_sites )
        {
            str.l = 0;
            kputs(reader->header->id[BCF_DT_CTG][line->rid].key, &str); kputc('\t', &str);
            kputw(line->pos+1, &str); kputc('\t', &str);
            if (line->n_allele > 0) kputs(line->d.allele[0], &str);
            else kputc('.', &str);
            kputc('\t', &str);
            if (line->n_allele > 1) kputs(line->d.allele[1], &str);
            else kputc('.', &str);
            for (i=2; i<line->n_allele; i++)
            {
                kputc(',', &str);
                kputs(line->d.allele[i], &str);
            }
            kputc('\t', &str);
            for (i=0; i<files->nreaders; i++)
                kputc(bcf_sr_has_line(files,i)?'1':'0', &str);
            kputc('\n', &str);
            if ( fwrite(str.s,sizeof(char),str.l,args->fh_sites)!=str.l )
                error("[%s] Error: failed to write the list of sites\n", __func__);
        }

        if ( args->prefix )
        {
            if ( args->isec_op==OP_VENN && ret==3 )
            {
                // site shared by both files: each file's own record goes to its "shared" output
                if ( (!args->nwrite || args->write[0])
                     && bcf_write1(args->fh_out[2], bcf_sr_get_header(files,0), bcf_sr_get_line(files,0))!=0 )
                    error("[%s] Error: cannot write\n", __func__);
                if ( (!args->nwrite || args->write[1])
                     && bcf_write1(args->fh_out[3], bcf_sr_get_header(files,1), bcf_sr_get_line(files,1))!=0 )
                    error("[%s] Error: cannot write\n", __func__);
            }
            else
            {
                for (i=0; i<files->nreaders; i++)
                {
                    if ( !bcf_sr_has_line(files,i) ) continue;
                    if ( args->write && !args->write[i] ) continue;
                    if ( bcf_write1(args->fh_out[i], files->readers[i].header, files->readers[i].buffer[0])!=0 )
                        error("[%s] Error: cannot write\n", __func__);
                }
            }
        }
    }
    free(str.s);
    // closing flushes buffered output, so a failure here means lost data
    if ( out_fh && hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__, out_name);
}
/*
 * Concatenate the input files into args->out_fh. One of three modes is used:
 *
 *  - args->phased_concat: at most two readers are kept open at a time and
 *    overlapping records are passed in pairs to phased_push(), with
 *    phased_flush() called whenever the first reader is exhausted;
 *  - args->files set: the synchronized reader merges possibly overlapping
 *    files; with args->remove_dups only the first record at a site is kept;
 *  - otherwise: files are appended one after another. Chromosome blocks must
 *    be contiguous and positions within a block non-decreasing, otherwise
 *    the run is aborted with a hint to use -a.
 */
static void concat(args_t *args)
{
    int i;
    if ( args->phased_concat )  // phased concat
    {
        // keep only two open files at a time
        while ( args->ifname < args->nfnames )
        {
            int new_file = 0;
            while ( args->files->nreaders < 2 && args->ifname < args->nfnames )
            {
                if ( !bcf_sr_add_reader(args->files,args->fnames[args->ifname]) ) error("Failed to open %s: %s\n", args->fnames[args->ifname],bcf_sr_strerror(args->files->errnum));
                new_file = 1;

                args->ifname++;
                if ( args->start_pos[args->ifname-1]==-1 ) break;   // new chromosome, start with only one file open
                if ( args->ifname < args->nfnames && args->start_pos[args->ifname]==-1 ) break; // next file starts on a different chromosome
            }

            // is there a line from the previous run? Seek the newly opened reader to that position
            int seek_pos = -1;
            int seek_chr = -1;
            if ( bcf_sr_has_line(args->files,0) )
            {
                bcf1_t *line = bcf_sr_get_line(args->files,0);
                bcf_sr_seek(args->files, bcf_seqname(args->files->readers[0].header,line), line->pos);
                seek_pos = line->pos;
                seek_chr = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[0].header,line));
            }
            else if ( new_file )
                bcf_sr_seek(args->files,NULL,0);  // set to start

            int nret;
            while ( (nret = bcf_sr_next_line(args->files)) )
            {
                if ( !bcf_sr_has_line(args->files,0) )  // no input from the first reader
                {
                    // We are assuming that there is a perfect overlap, sites which are not present in both files are dropped
                    if ( ! bcf_sr_region_done(args->files,0) ) continue;

                    phased_flush(args);
                    bcf_sr_remove_reader(args->files, 0);
                }

                // Get a line to learn about current position
                for (i=0; i<args->files->nreaders; i++)
                    if ( bcf_sr_has_line(args->files,i) ) break;
                bcf1_t *line = bcf_sr_get_line(args->files,i);

                // This can happen after bcf_sr_seek: indel may start before the coordinate which we seek to.
                if ( seek_chr>=0 && seek_pos>line->pos && seek_chr==bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[i].header,line)) ) continue;
                seek_pos = seek_chr = -1;

                // Check if the position overlaps with the next, yet unopened, reader
                int must_seek = 0;
                while ( args->ifname < args->nfnames && args->start_pos[args->ifname]!=-1 && line->pos >= args->start_pos[args->ifname] )
                {
                    must_seek = 1;
                    if ( !bcf_sr_add_reader(args->files,args->fnames[args->ifname]) ) error("Failed to open %s: %s\n", args->fnames[args->ifname],bcf_sr_strerror(args->files->errnum));
                    args->ifname++;
                }
                if ( must_seek )
                {
                    bcf_sr_seek(args->files, bcf_seqname(args->files->readers[i].header,line), line->pos);
                    seek_pos = line->pos;
                    seek_chr = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[i].header,line));
                    continue;
                }

                // We are assuming that there is a perfect overlap, sites which are not present in both files are dropped
                if ( args->files->nreaders>1 && !bcf_sr_has_line(args->files,1) && !bcf_sr_region_done(args->files,1) ) continue;

                phased_push(args, bcf_sr_get_line(args->files,0), args->files->nreaders>1 ? bcf_sr_get_line(args->files,1) : NULL);
            }

            if ( args->files->nreaders )
            {
                phased_flush(args);
                while ( args->files->nreaders )
                    bcf_sr_remove_reader(args->files, 0);
            }
        }
    }
    else if ( args->files )  // combining overlapping files, using synced reader
    {
        while ( bcf_sr_next_line(args->files) )
        {
            for (i=0; i<args->files->nreaders; i++)
            {
                bcf1_t *line = bcf_sr_get_line(args->files,i);
                if ( !line ) continue;
                bcf_translate(args->out_hdr, args->files->readers[i].header, line);
                bcf_write1(args->out_fh, args->out_hdr, line);
                if ( args->remove_dups ) break;
            }
        }
    }
    else    // concatenating
    {
        kstring_t tmp = {0,0,0};
        // prev_pos tracks the last position written within the current
        // chromosome block; it is what the sortedness check compares against
        int prev_chr_id = -1, prev_pos = -1;
        bcf1_t *line = bcf_init();
        for (i=0; i<args->nfnames; i++)
        {
            htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("Failed to open: %s\n", args->fnames[i]);
            bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]);
            if ( !fp->is_bin && args->output_type&FT_VCF )
            {
                line->max_unpack = BCF_UN_STR;
                // if VCF is on both input and output, avoid VCF to BCF conversion
                while ( hts_getline(fp, KS_SEP_LINE, &fp->line) >=0 )
                {
                    // the first tab-delimited column is the sequence name
                    char *str = fp->line.s;
                    while ( *str && *str!='\t' ) str++;
                    tmp.l = 0;
                    kputsn(fp->line.s,str-fp->line.s,&tmp);
                    int chr_id = bcf_hdr_name2id(args->out_hdr, tmp.s);
                    if ( chr_id<0 ) error("The sequence \"%s\" not defined in the header: %s\n(Quick workaround: index the file.)\n", tmp.s, args->fnames[i]);
                    if ( prev_chr_id!=chr_id )
                    {
                        prev_pos = -1;
                        if ( args->seen_seq[chr_id] )
                            error("\nThe chromosome block %s is not contiguous, consider running with -a.\n", tmp.s);
                    }
                    char *end;
                    int pos = strtol(str+1,&end,10) - 1;
                    if ( end==str+1 ) error("Could not parse line: %s\n", fp->line.s);
                    if ( prev_pos > pos )
                        error("The chromosome block %s is not sorted, consider running with -a.\n", tmp.s);
                    args->seen_seq[chr_id] = 1;
                    prev_chr_id = chr_id;
                    prev_pos = pos;

                    // line.l is a size_t, print it with a matching conversion
                    if ( vcf_write_line(args->out_fh, &fp->line)!=0 ) error("Failed to write %lu bytes\n", (unsigned long)fp->line.l);
                }
            }
            else
            {
                // BCF conversion is required
                line->max_unpack = 0;
                while ( bcf_read(fp, hdr, line)==0 )
                {
                    bcf_translate(args->out_hdr, hdr, line);

                    if ( prev_chr_id!=line->rid )
                    {
                        prev_pos = -1;
                        if ( args->seen_seq[line->rid] )
                            error("\nThe chromosome block %s is not contiguous, consider running with -a.\n", bcf_seqname(args->out_hdr, line));
                    }
                    if ( prev_pos > line->pos )
                        error("The chromosome block %s is not sorted, consider running with -a.\n", bcf_seqname(args->out_hdr, line));
                    args->seen_seq[line->rid] = 1;
                    prev_chr_id = line->rid;
                    prev_pos = line->pos;

                    if ( bcf_write(args->out_fh, args->out_hdr, line)!=0 ) error("Failed to write\n");
                }
            }
            bcf_hdr_destroy(hdr);
            hts_close(fp);
        }
        bcf_destroy(line);
        free(tmp.s);
    }
}
/*
 * Walk the synchronized readers and output the sites selected by
 * args->isec_op.
 *
 * Output goes to one of:
 *  - standard output as VCF/BCF with the records of reader args->iwrite,
 *    when exactly one file is to be written or a targets list is used with
 *    a single reader;
 *  - args->fh_sites as a tab-delimited list of sites (CHROM, POS, REF, ALT,
 *    presence string);
 *  - the per-file outputs args->fh_out[] when args->prefix is set.
 *
 * If standard output cannot be opened, a message is printed to stderr and
 * the function returns without processing any record.
 */
void isec_vcf(args_t *args)
{
    bcf_srs_t *files = args->files;
    kstring_t str = {0,0,0};
    htsFile *out_fh = NULL;

    // When only one VCF is output, print VCF to stdout
    int out_std = 0;
    if ( args->nwrite==1 ) out_std = 1;
    if ( args->targets_list && files->nreaders==1 ) out_std = 1;
    if ( out_std )
    {
        out_fh = hts_open("-",hts_bcf_wmode(args->output_type));
        if ( !out_fh )
        {
            // nothing can be written; bail out rather than dereference NULL below
            fprintf(stderr,"Can't write to standard output\n");
            return;
        }
        bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec");
        bcf_hdr_write(out_fh, files->readers[args->iwrite].header);
    }
    if ( !args->nwrite && !out_std && !args->prefix )
        fprintf(stderr,"Note: -w option not given, printing list of sites...\n");

    int n;
    while ( (n=bcf_sr_next_line(files)) )
    {
        bcf_sr_t *reader = NULL;
        bcf1_t *line = NULL;
        int i, ret = 0;
        for (i=0; i<files->nreaders; i++)
        {
            if ( !bcf_sr_has_line(files,i) ) continue;
            if ( !line )
            {
                line = files->readers[i].buffer[0];
                reader = &files->readers[i];
            }
            // Presence bitmask, consulted only for OP_VENN (two files). Bits
            // are recorded only while the shift is defined for an int.
            if ( (size_t)i < 8*sizeof(ret)-1 ) ret |= 1<<i;
        }

        switch (args->isec_op)
        {
            case OP_COMPLEMENT:
                if ( n!=1 || !bcf_sr_has_line(files,0) ) continue;
                break;
            case OP_EQUAL:
                if ( n != args->isec_n ) continue;
                break;
            case OP_PLUS:
                if ( n < args->isec_n ) continue;
                break;
            case OP_MINUS:
                if ( n > args->isec_n ) continue;
        }

        if ( out_std )
        {
            if ( bcf_sr_has_line(files,args->iwrite) )
                bcf_write1(out_fh, files->readers[args->iwrite].header, files->readers[args->iwrite].buffer[0]);
            continue;
        }
        else if ( args->fh_sites )
        {
            str.l = 0;
            kputs(reader->header->id[BCF_DT_CTG][line->rid].key, &str); kputc('\t', &str);
            kputw(line->pos+1, &str); kputc('\t', &str);
            if (line->n_allele > 0) kputs(line->d.allele[0], &str);
            else kputc('.', &str);
            kputc('\t', &str);
            if (line->n_allele > 1) kputs(line->d.allele[1], &str);
            else kputc('.', &str);
            for (i=2; i<line->n_allele; i++)
            {
                kputc(',', &str);
                kputs(line->d.allele[i], &str);
            }
            kputc('\t', &str);
            for (i=0; i<files->nreaders; i++)
                kputc(bcf_sr_has_line(files,i)?'1':'0', &str);
            kputc('\n', &str);
            fwrite(str.s,sizeof(char),str.l,args->fh_sites);
        }

        if ( args->prefix )
        {
            if ( args->isec_op==OP_VENN )
                // ret is 1, 2 or 3 with two files: private to the first, private to the second, shared
                bcf_write1(args->fh_out[ret-1], reader->header, line);
            else
            {
                for (i=0; i<files->nreaders; i++)
                {
                    if ( !bcf_sr_has_line(files,i) ) continue;
                    if ( args->write && !args->write[i] ) continue;
                    bcf_write1(args->fh_out[i], files->readers[i].header, files->readers[i].buffer[0]);
                }
            }
        }
    }
    if ( str.s ) free(str.s);
    if ( out_fh ) hts_close(out_fh);
}
//{{{ void get_bcf_query_result(uint32_t *mask, void get_bcf_query_result(uint32_t *mask, uint32_t mask_len, struct gqt_query *q, char **id_query_list, uint32_t *id_lens, uint32_t num_qs, uint32_t num_fields, char *vid_file_name, char *bcf_file_name, int bcf_output) { /* The VID file contains the line numbers of the variants after they have * been sorted. To reach back into the BCF file to print the metadata * associated with the variants marked in the mask, we need to create a * sorted list of line numbers we want. So first we intersect the VID file * and the mask, then sort it. */ /* FILE *vid_f = fopen(vid_file_name, "rb"); if (!vid_f) err(EX_NOINPUT, "Cannot read file\"%s\"", vid_file_name); uint32_t *vids = (uint32_t *) malloc(num_fields*sizeof(uint32_t)); if (!vids ) err(EX_OSERR, "malloc error"); size_t fr = fread(vids, sizeof(uint32_t), num_fields, vid_f); check_file_read(vid_file_name, vid_f, num_fields, fr); fclose(vid_f); */ struct vid_file *vid_f = open_vid_file(vid_file_name); load_vid_data(vid_f); uint32_t i, j, masked_vid_count = 0; for (i = 0; i < mask_len; ++i) masked_vid_count += popcount(mask[i]); uint32_t *masked_vids = (uint32_t *) malloc(masked_vid_count*sizeof(uint32_t)); if (!masked_vids ) err(EX_OSERR, "malloc error"); uint32_t masked_vid_i = 0; for (i = 0; i < mask_len; ++i) { uint32_t bytes = mask[i]; if (bytes == 0) continue; /* skip a bunch of ops if you can */ for (j = 0; j < 32; j++) { if (bytes & (1 << (31 - j))) { masked_vids[masked_vid_i] = vid_f->vids[i*32 + j]; masked_vid_i+=1; } } if (masked_vid_i == masked_vid_count) break; } destroy_vid_file(vid_f); qsort(masked_vids, masked_vid_count, sizeof(uint32_t), compare_uint32_t); htsFile *fp = hts_open(bcf_file_name,"rb"); bcf_hdr_t *hdr = bcf_hdr_read(fp); bcf1_t *line = bcf_init1(); //bcf_hdr_set_samples(hdr, print_name_csv, 0); htsFile *out; if (!bcf_output) out = hts_open("-", "w"); else out = hts_open("-", "wb"); int r = bcf_hdr_write(out, hdr); uint32_t bcf_line_i = 0; 
masked_vid_i = 0; while ( bcf_read(fp, hdr, line) != -1) { if (masked_vids[masked_vid_i] == bcf_line_i) { r = bcf_unpack(line, BCF_UN_ALL); r = bcf_write1(out, hdr, line); masked_vid_i+=1; } if (masked_vid_i == masked_vid_count) break; bcf_line_i += 1; } hts_close(out); hts_close(fp); }
int main_plugin(int argc, char *argv[]) { int c; args_t *args = (args_t*) calloc(1,sizeof(args_t)); args->argc = argc; args->argv = argv; args->files = bcf_sr_init(); args->output_fname = "-"; args->output_type = FT_VCF; args->nplugin_paths = -1; int regions_is_file = 0, targets_is_file = 0, plist_only = 0; if ( argc==1 ) usage(args); char *plugin_name = NULL; if ( argv[1][0]!='-' ) { plugin_name = argv[1]; argc--; argv++; } static struct option loptions[] = { {"verbose",0,0,'v'}, {"help",0,0,'h'}, {"list-plugins",0,0,'l'}, {"output",1,0,'o'}, {"output-type",1,0,'O'}, {"include",1,0,'i'}, {"exclude",1,0,'e'}, {"regions",1,0,'r'}, {"regions-file",1,0,'R'}, {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, {0,0,0,0} }; while ((c = getopt_long(argc, argv, "h?o:O:r:R:li:e:v",loptions,NULL)) >= 0) { switch (c) { case 'v': args->verbose = 1; break; case 'o': args->output_fname = optarg; break; case 'O': switch (optarg[0]) { case 'b': args->output_type = FT_BCF_GZ; break; case 'u': args->output_type = FT_BCF; break; case 'z': args->output_type = FT_VCF_GZ; break; case 'v': args->output_type = FT_VCF; break; default: error("The output type \"%s\" not recognised\n", optarg); }; break; case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; case 't': args->targets_list = optarg; break; case 'T': args->targets_list = optarg; targets_is_file = 1; break; case 'l': plist_only = 1; break; case '?': case 'h': load_plugin(args, plugin_name, 1, &args->plugin); fprintf(stderr,"%s",args->plugin.usage()); return 0; break; default: error("Unknown argument: %s\n", optarg); } } if ( plist_only ) return list_plugins(args); char *fname = NULL; if ( optind>=argc || argv[optind][0]=='-' ) { if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin else usage(args); 
args->plugin.argc = argc - optind + 1; args->plugin.argv = argv + optind - 1; } else { fname = argv[optind]; args->plugin.argc = argc - optind; args->plugin.argv = argv + optind; } optind = 0; args->plugin.argv[0] = plugin_name; load_plugin(args, plugin_name, 1, &args->plugin); if ( args->regions_list ) { if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); } if ( args->targets_list ) { if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets_list); args->files->collapse |= COLLAPSE_SOME; } if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open or the file not indexed: %s\n", fname); init_data(args); while ( bcf_sr_next_line(args->files) ) { bcf1_t *line = bcf_sr_get_line(args->files,0); if ( args->filter ) { int pass = filter_test(args->filter, line, NULL); if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1; if ( !pass ) continue; } line = args->plugin.process(line); if ( line ) bcf_write1(args->out_fh, args->hdr_out, line); } destroy_data(args); bcf_sr_destroy(args->files); free(args); return 0; }
int run(int argc, char **argv) { char *trio_samples = NULL, *trio_file = NULL, *rules_fname = NULL, *rules_string = NULL; memset(&args,0,sizeof(args_t)); args.mode = 0; args.output_fname = "-"; static struct option loptions[] = { {"trio",1,0,'t'}, {"trio-file",1,0,'T'}, {"delete",0,0,'d'}, {"list",1,0,'l'}, {"count",0,0,'c'}, {"rules",1,0,'r'}, {"rules-file",1,0,'R'}, {"output",required_argument,NULL,'o'}, {"output-type",required_argument,NULL,'O'}, {0,0,0,0} }; int c; while ((c = getopt_long(argc, argv, "?ht:T:l:cdr:R:o:O:",loptions,NULL)) >= 0) { switch (c) { case 'o': args.output_fname = optarg; break; case 'O': switch (optarg[0]) { case 'b': args.output_type = FT_BCF_GZ; break; case 'u': args.output_type = FT_BCF; break; case 'z': args.output_type = FT_VCF_GZ; break; case 'v': args.output_type = FT_VCF; break; default: error("The output type \"%s\" not recognised\n", optarg); }; break; case 'R': rules_fname = optarg; break; case 'r': rules_string = optarg; break; case 'd': args.mode |= MODE_DELETE; break; case 'c': args.mode |= MODE_COUNT; break; case 'l': if ( !strcmp("+",optarg) ) args.mode |= MODE_LIST_GOOD; else if ( !strcmp("x",optarg) ) args.mode |= MODE_LIST_BAD; else error("The argument not recognised: --list %s\n", optarg); break; case 't': trio_samples = optarg; break; case 'T': trio_file = optarg; break; case 'h': case '?': default: error("%s",usage()); break; } } if ( rules_fname ) args.rules = regidx_init(rules_fname, parse_rules, NULL, sizeof(rule_t), &args); else args.rules = init_rules(&args, rules_string); if ( !args.rules ) return -1; args.itr = regitr_init(args.rules); args.itr_ori = regitr_init(args.rules); char *fname = NULL; if ( optind>=argc || argv[optind][0]=='-' ) { if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin else error("%s",usage()); } else fname = argv[optind]; if ( !trio_samples && !trio_file ) error("Expected the -t/T option\n"); if ( !args.mode ) error("Expected one of the -c, -d or -l options\n"); if 
( args.mode&MODE_DELETE && !(args.mode&(MODE_LIST_GOOD|MODE_LIST_BAD)) ) args.mode |= MODE_LIST_GOOD|MODE_LIST_BAD; args.sr = bcf_sr_init(); if ( !bcf_sr_add_reader(args.sr, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args.sr->errnum)); args.hdr = bcf_sr_get_header(args.sr, 0); args.out_fh = hts_open(args.output_fname,hts_bcf_wmode(args.output_type)); if ( args.out_fh == NULL ) error("Can't write to \"%s\": %s\n", args.output_fname, strerror(errno)); bcf_hdr_write(args.out_fh, args.hdr); int i, n = 0; char **list; if ( trio_samples ) { args.ntrios = 1; args.trios = (trio_t*) calloc(1,sizeof(trio_t)); list = hts_readlist(trio_samples, 0, &n); if ( n!=3 ) error("Expected three sample names with -t\n"); args.trios[0].imother = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[0]); args.trios[0].ifather = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[1]); args.trios[0].ichild = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[2]); for (i=0; i<n; i++) free(list[i]); free(list); } if ( trio_file ) { list = hts_readlist(trio_file, 1, &n); args.ntrios = n; args.trios = (trio_t*) calloc(n,sizeof(trio_t)); for (i=0; i<n; i++) { char *ss = list[i], *se; se = strchr(ss, ','); if ( !se ) error("Could not parse %s: %s\n",trio_file, ss); *se = 0; args.trios[i].imother = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, ss); if ( args.trios[i].imother<0 ) error("No such sample: \"%s\"\n", ss); ss = ++se; se = strchr(ss, ','); if ( !se ) error("Could not parse %s\n",trio_file); *se = 0; args.trios[i].ifather = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, ss); if ( args.trios[i].ifather<0 ) error("No such sample: \"%s\"\n", ss); ss = ++se; if ( *ss=='\0' ) error("Could not parse %s\n",trio_file); args.trios[i].ichild = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, ss); if ( args.trios[i].ichild<0 ) error("No such sample: \"%s\"\n", ss); free(list[i]); } free(list); } while ( bcf_sr_next_line(args.sr) ) { bcf1_t *line = bcf_sr_get_line(args.sr,0); line = process(line); if ( line ) { if ( 
line->errcode ) error("TODO: Unchecked error (%d), exiting\n",line->errcode); bcf_write1(args.out_fh, args.hdr, line); } } fprintf(stderr,"# [1]nOK\t[2]nBad\t[3]nSkipped\t[4]Trio\n"); for (i=0; i<args.ntrios; i++) { trio_t *trio = &args.trios[i]; fprintf(stderr,"%d\t%d\t%d\t%s,%s,%s\n", trio->nok,trio->nbad,args.nrec-(trio->nok+trio->nbad), bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->imother), bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->ifather), bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->ichild) ); } free(args.gt_arr); free(args.trios); regitr_destroy(args.itr); regitr_destroy(args.itr_ori); regidx_destroy(args.rules); bcf_sr_destroy(args.sr); if ( hts_close(args.out_fh)!=0 ) error("Error: close failed\n"); return 0; }
/*
 * Performs a multi-file pileup and writes either VCF/BCF records with
 * genotype likelihoods (when conf->flag has MPLP_BCF set) or the textual
 * pileup format.
 *
 * All input files are opened and their headers read; the header of the
 * first file is used for target names throughout.  When conf->reg is set,
 * an index is loaded for every file and the pileup is restricted to that
 * region.  Failures to open a file, read a header, load an index, parse the
 * region or write the output terminate the program with exit(1).
 *
 * @param conf  configuration for this pileup
 * @param n     number of files specified in fn
 * @param fn    filenames
 * @return the last value returned by bam_mplp_auto(), i.e. a value <= 0
 */
static int mpileup(mplp_conf_t *conf, int n, char **fn)
{
    extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list);
    extern void bcf_call_del_rghash(void *rghash);
    mplp_aux_t **data;
    // ref_len starts at 0 so that it is never read uninitialised when no
    // reference FASTA is available (it is passed to pileup_seq() below)
    int i, tid, pos, *n_plp, tid0 = -1, beg0 = 0, end0 = 1u<<29, ref_len = 0, ref_tid = -1, max_depth, max_indel_depth;
    const bam_pileup1_t **plp;
    bam_mplp_t iter;
    bam_hdr_t *h = NULL; /* header of first file in input list */
    char *ref;
    void *rghash = NULL;
    FILE *pileup_fp = NULL;

    bcf_callaux_t *bca = NULL;
    bcf_callret1_t *bcr = NULL;
    bcf_call_t bc;
    htsFile *bcf_fp = NULL;
    bcf_hdr_t *bcf_hdr = NULL;

    bam_sample_t *sm = NULL;
    kstring_t buf;
    mplp_pileup_t gplp;

    // Reject an empty input list before allocating anything
    if (n <= 0) {
        fprintf(stderr,"[%s] no input file/data given\n", __func__);
        exit(1);
    }

    memset(&gplp, 0, sizeof(mplp_pileup_t));
    memset(&buf, 0, sizeof(kstring_t));
    memset(&bc, 0, sizeof(bcf_call_t));
    data = calloc(n, sizeof(mplp_aux_t*));
    plp = calloc(n, sizeof(bam_pileup1_t*));
    n_plp = calloc(n, sizeof(int));
    sm = bam_smpl_init();

    // read the header of each file in the list and initialize data
    for (i = 0; i < n; ++i) {
        bam_hdr_t *h_tmp;
        data[i] = calloc(1, sizeof(mplp_aux_t));
        data[i]->fp = sam_open(fn[i], "rb");
        if ( !data[i]->fp )
        {
            fprintf(stderr, "[%s] failed to open %s: %s\n", __func__, fn[i], strerror(errno));
            exit(1);
        }
        hts_set_fai_filename(data[i]->fp, conf->fai_fname);
        data[i]->conf = conf;
        h_tmp = sam_hdr_read(data[i]->fp);
        if ( !h_tmp ) {
            fprintf(stderr,"[%s] fail to read the header of %s\n", __func__, fn[i]);
            exit(1);
        }
        data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet
        bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text);
        // Collect read group IDs with PL (platform) listed in pl_list (note: fragile, strstr search)
        rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list);
        if (conf->reg) {
            hts_idx_t *idx = sam_index_load(data[i]->fp, fn[i]);
            if (idx == 0) {
                fprintf(stderr, "[%s] fail to load index for %s\n", __func__, fn[i]);
                exit(1);
            }
            if ( (data[i]->iter=sam_itr_querys(idx, data[i]->h, conf->reg)) == 0) {
                fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, conf->reg);
                exit(1);
            }
            // The region bounds of the first file define the output window
            if (i == 0) tid0 = data[i]->iter->tid, beg0 = data[i]->iter->beg, end0 = data[i]->iter->end;
            hts_idx_destroy(idx);
        }
        if (i == 0) h = h_tmp; /* save the header of first file in list */
        else {
            // FIXME: to check consistency
            bam_hdr_destroy(h_tmp);
        }
    }
    // allocate data storage proportionate to number of samples being studied sm->n
    gplp.n = sm->n;
    gplp.n_plp = calloc(sm->n, sizeof(int));
    gplp.m_plp = calloc(sm->n, sizeof(int));
    gplp.plp = calloc(sm->n, sizeof(bam_pileup1_t*));

    fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n);
    // write the VCF header
    if (conf->flag & MPLP_BCF)
    {
        const char *mode;
        if ( conf->flag & MPLP_VCF )
            mode = (conf->flag&MPLP_NO_COMP)? "wu" : "wz";   // uncompressed VCF or compressed VCF
        else
            mode = (conf->flag&MPLP_NO_COMP)? "wub" : "wb";  // uncompressed BCF or compressed BCF

        bcf_fp = bcf_open(conf->output_fname? conf->output_fname : "-", mode);
        if (bcf_fp == NULL) {
            fprintf(stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno));
            exit(1);
        }

        bcf_hdr = bcf_hdr_init("w");
        kstring_t str = {0,0,0};

        ksprintf(&str, "##samtoolsVersion=%s+htslib-%s\n",samtools_version(),hts_version());
        bcf_hdr_append(bcf_hdr, str.s);

        str.l = 0;
        ksprintf(&str, "##samtoolsCommand=samtools mpileup");
        for (i=1; i<conf->argc; i++) ksprintf(&str, " %s", conf->argv[i]);
        kputc('\n', &str);
        bcf_hdr_append(bcf_hdr, str.s);

        if (conf->fai_fname)
        {
            str.l = 0;
            ksprintf(&str, "##reference=file://%s\n", conf->fai_fname);
            bcf_hdr_append(bcf_hdr, str.s);
        }

        // todo: use/write new BAM header manipulation routines, fill also UR, M5
        for (i=0; i<h->n_targets; i++)
        {
            str.l = 0;
            ksprintf(&str, "##contig=<ID=%s,length=%d>", h->target_name[i], h->target_len[i]);
            bcf_hdr_append(bcf_hdr, str.s);
        }
        free(str.s);
        bcf_hdr_append(bcf_hdr,"##ALT=<ID=X,Description=\"Represents allele(s) other than observed.\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=INDEL,Number=0,Type=Flag,Description=\"Indicates that the variant is an INDEL.\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=IDV,Number=1,Type=Integer,Description=\"Maximum number of reads supporting an indel\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=IMF,Number=1,Type=Float,Description=\"Maximum fraction of reads supporting an indel\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Raw read depth\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias for filtering splice-site artefacts in RNA-seq data (bigger is better)\",Version=\"3\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias (bigger is better)\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=MQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias (bigger is better)\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=BQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias (bigger is better)\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=MQSB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)\">");
#if CDF_MWU_TESTS
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=RPB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias [CDF] (bigger is better)\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=MQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias [CDF] (bigger is better)\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=BQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias [CDF] (bigger is better)\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=MQSB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias [CDF] (bigger is better)\">");
#endif
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=SGB,Number=1,Type=Float,Description=\"Segregation based metric.\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=MQ0F,Number=1,Type=Float,Description=\"Fraction of MQ0 reads (smaller is better)\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=I16,Number=16,Type=Float,Description=\"Auxiliary tag used for calling, see description of bcf_callret1_t in bam2bcf.h\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=QS,Number=R,Type=Float,Description=\"Auxiliary tag used for calling\">");
        bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=PL,Number=G,Type=Integer,Description=\"List of Phred-scaled genotype likelihoods\">");
        if ( conf->fmt_flag&B2B_FMT_DP )
            bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Number of high-quality bases\">");
        if ( conf->fmt_flag&B2B_FMT_DV )
            bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=DV,Number=1,Type=Integer,Description=\"Number of high-quality non-reference bases\">");
        if ( conf->fmt_flag&B2B_FMT_DPR )
            bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">");
        if ( conf->fmt_flag&B2B_INFO_DPR )
            bcf_hdr_append(bcf_hdr,"##INFO=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">");
        if ( conf->fmt_flag&B2B_FMT_DP4 )
            bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=DP4,Number=4,Type=Integer,Description=\"Number of high-quality ref-fwd, ref-reverse, alt-fwd and alt-reverse bases\">");
        if ( conf->fmt_flag&B2B_FMT_SP )
            bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=SP,Number=1,Type=Integer,Description=\"Phred-scaled strand bias P-value\">");

        for (i=0; i<sm->n; i++)
            bcf_hdr_add_sample(bcf_hdr, sm->smpl[i]);
        bcf_hdr_add_sample(bcf_hdr, NULL);
        bcf_hdr_write(bcf_fp, bcf_hdr);

        bca = bcf_call_init(-1., conf->min_baseQ);
        bcr = calloc(sm->n, sizeof(bcf_callret1_t));
        bca->rghash = rghash;
        bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ;
        bca->min_frac = conf->min_frac;
        bca->min_support = conf->min_support;
        bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE;

        bc.bcf_hdr = bcf_hdr;
        bc.n = sm->n;
        bc.PL = malloc(15 * sm->n * sizeof(*bc.PL));
        if (conf->fmt_flag)
        {
            assert( sizeof(float)==sizeof(int32_t) );
            bc.DP4 = malloc(sm->n * sizeof(int32_t) * 4);
            bc.fmt_arr = malloc(sm->n * sizeof(float)); // all fmt_flag fields
            if ( conf->fmt_flag&(B2B_INFO_DPR|B2B_FMT_DPR) )
            {
                // first B2B_MAX_ALLELES fields for total numbers, the rest per-sample
                bc.DPR = malloc((sm->n+1)*B2B_MAX_ALLELES*sizeof(int32_t));
                for (i=0; i<sm->n; i++)
                    bcr[i].DPR = bc.DPR + (i+1)*B2B_MAX_ALLELES;
            }
        }
    }
    else {
        pileup_fp = conf->output_fname? fopen(conf->output_fname, "w") : stdout;

        if (pileup_fp == NULL) {
            fprintf(stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname, strerror(errno));
            exit(1);
        }
    }

    if (tid0 >= 0 && conf->fai) { // region is set
        ref = faidx_fetch_seq(conf->fai, h->target_name[tid0], 0, 0x7fffffff, &ref_len);
        ref_tid = tid0;
        for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid0;
    } else ref_tid = -1, ref = 0;

    // begin pileup
    iter = bam_mplp_init(n, mplp_func, (void**)data);
    if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(iter);
    max_depth = conf->max_depth;
    if (max_depth * sm->n > 1<<20)
        fprintf(stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__);
    if (max_depth * sm->n < 8000) {
        max_depth = 8000 / sm->n;
        fprintf(stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth);
    }
    max_indel_depth = conf->max_indel_depth * sm->n;
    bam_mplp_set_maxcnt(iter, max_depth);
    bcf1_t *bcf_rec = bcf_init1();
    int ret;
    while ( (ret=bam_mplp_auto(iter, &tid, &pos, n_plp, plp)) > 0) {
        if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested
        if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue;
        if (tid != ref_tid) {
            // Moved on to another sequence: swap the cached reference
            free(ref); ref = 0;
            if (conf->fai) ref = faidx_fetch_seq(conf->fai, h->target_name[tid], 0, 0x7fffffff, &ref_len);
            for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid;
            ref_tid = tid;
        }
        if (conf->flag & MPLP_BCF) {
            int total_depth, ref0, ref16;
            for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i];
            group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG);
            ref0 = (ref && pos < ref_len)? ref[pos] : 'N';
            // Index through unsigned char: plain char may be signed
            ref16 = seq_nt16_table[(unsigned char)ref0];
            bcf_callaux_clean(bca, &bc);
            for (i = 0; i < gplp.n; ++i)
                bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i);
            bc.tid = tid; bc.pos = pos;
            bcf_call_combine(gplp.n, bcr, bca, ref16, &bc);
            bcf_clear1(bcf_rec);
            bcf_call2bcf(&bc, bcf_rec, bcr, conf->fmt_flag, 0, 0);
            if (bcf_write1(bcf_fp, bcf_hdr, bcf_rec) != 0) {
                fprintf(stderr, "[%s] failed to write a record to %s\n", __func__, conf->output_fname? conf->output_fname : "standard output");
                exit(1);
            }
            // call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring?
            if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0)
            {
                bcf_callaux_clean(bca, &bc);
                for (i = 0; i < gplp.n; ++i)
                    bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i);
                if (bcf_call_combine(gplp.n, bcr, bca, -1, &bc) >= 0) {
                    bcf_clear1(bcf_rec);
                    bcf_call2bcf(&bc, bcf_rec, bcr, conf->fmt_flag, bca, ref);
                    if (bcf_write1(bcf_fp, bcf_hdr, bcf_rec) != 0) {
                        fprintf(stderr, "[%s] failed to write a record to %s\n", __func__, conf->output_fname? conf->output_fname : "standard output");
                        exit(1);
                    }
                }
            }
        } else {
            fprintf(pileup_fp, "%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N');
            for (i = 0; i < n; ++i) {
                int j, cnt;
                // Depth column: only bases reaching min_baseQ are counted
                for (j = cnt = 0; j < n_plp[i]; ++j) {
                    const bam_pileup1_t *p = plp[i] + j;
                    if (bam_get_qual(p->b)[p->qpos] >= conf->min_baseQ) ++cnt;
                }
                fprintf(pileup_fp, "\t%d\t", cnt);
                if (n_plp[i] == 0) {
                    fputs("*\t*", pileup_fp);
                    if (conf->flag & MPLP_PRINT_MAPQ) fputs("\t*", pileup_fp);
                    if (conf->flag & MPLP_PRINT_POS) fputs("\t*", pileup_fp);
                } else {
                    for (j = 0; j < n_plp[i]; ++j) {
                        const bam_pileup1_t *p = plp[i] + j;
                        if (bam_get_qual(p->b)[p->qpos] >= conf->min_baseQ)
                            pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, ref);
                    }
                    putc('\t', pileup_fp);
                    for (j = 0; j < n_plp[i]; ++j) {
                        const bam_pileup1_t *p = plp[i] + j;
                        int c = bam_get_qual(p->b)[p->qpos];
                        if (c >= conf->min_baseQ) {
                            c = c + 33 < 126? c + 33 : 126;
                            putc(c, pileup_fp);
                        }
                    }
                    if (conf->flag & MPLP_PRINT_MAPQ) {
                        putc('\t', pileup_fp);
                        for (j = 0; j < n_plp[i]; ++j) {
                            const bam_pileup1_t *p = plp[i] + j;
                            int c = bam_get_qual(p->b)[p->qpos];
                            if ( c < conf->min_baseQ ) continue;
                            c = plp[i][j].b->core.qual + 33;
                            if (c > 126) c = 126;
                            putc(c, pileup_fp);
                        }
                    }
                    if (conf->flag & MPLP_PRINT_POS) {
                        // Apply the same base-quality filter as the columns
                        // above so that all per-read columns stay aligned
                        int nprinted = 0;
                        putc('\t', pileup_fp);
                        for (j = 0; j < n_plp[i]; ++j) {
                            const bam_pileup1_t *p = plp[i] + j;
                            if (bam_get_qual(p->b)[p->qpos] < conf->min_baseQ) continue;
                            if (nprinted++ > 0) putc(',', pileup_fp);
                            fprintf(pileup_fp, "%d", p->qpos + 1); // FIXME: printf() is very slow...
                        }
                    }
                }
            }
            putc('\n', pileup_fp);
        }
    }

    // clean up
    free(bc.tmp.s);
    bcf_destroy1(bcf_rec);
    if (bcf_fp)
    {
        hts_close(bcf_fp);
        bcf_hdr_destroy(bcf_hdr);
        bcf_call_destroy(bca);
        free(bc.PL);
        free(bc.DP4);
        free(bc.DPR);
        free(bc.fmt_arr);
        free(bcr);
    }
    // stdout is left open; only a file opened here is closed
    if (pileup_fp && conf->output_fname) fclose(pileup_fp);
    bam_smpl_destroy(sm); free(buf.s);
    for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]);
    free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp);
    bcf_call_del_rghash(rghash);
    bam_mplp_destroy(iter);
    bam_hdr_destroy(h);
    for (i = 0; i < n; ++i) {
        sam_close(data[i]->fp);
        if (data[i]->iter) hts_itr_destroy(data[i]->iter);
        free(data[i]);
    }
    free(data); free(plp); free(ref); free(n_plp);
    return ret;
}
/**
 * Writes one BCF record for every kept site of the chunk in `pars`.
 *
 * Does nothing when doBcf is 0.  On the first call the output file, header
 * and reusable record are created and the header is written through
 * print_bcf_header().
 *
 * For each site with a non-zero keepSites entry the record gets: the
 * major/minor alleles, a fixed QUAL of 29, the PASS filter, INFO/NS,
 * INFO/DP and FORMAT/DP (only when pars->counts is set), INFO/AF (only when
 * the frequency results are available), FORMAT/GT from the genotype calls,
 * and FORMAT/GL plus FORMAT/PL from the genotype likelihoods.
 *
 * Genotype calls (extras[10]) and genotype likelihoods (extras[5]) are
 * required; this is enforced with assert().  A failed record write
 * terminates the program with a non-zero exit status.
 *
 * @param pars  per-chunk data; sites, alleles, counts and the extras[] results
 */
void abcWriteBcf::print(funkyPars *pars){
  if(doBcf==0)
    return;

  kstring_t buf;
  if(fp==NULL){
    // First call: open the output and emit the header
    buf.s=NULL;buf.l=buf.m=0;
    fp=aio::openFileHts(outfiles,".bcf");
    hdr = bcf_hdr_init("w");
    rec = bcf_init1();
    print_bcf_header(fp,hdr,args,buf,header);
  }

  lh3struct *lh3 = (lh3struct*) pars->extras[5];
  freqStruct *freq = (freqStruct *) pars->extras[6];
  genoCalls *geno = (genoCalls *) pars->extras[10];

  for(int s=0;s<pars->numSites;s++){
    if(pars->keepSites[s]==0)
      continue;
    const int nsmpl = bcf_hdr_nsamples(hdr);

    rec->rid = bcf_hdr_name2id(hdr,header->target_name[pars->refId]);
    rec->pos = pars->posi[s];//<- maybe one index?
    //    bcf_update_id(hdr, rec, "rs6054257");
    char majmin[4]={intToRef[pars->major[s]],',',intToRef[pars->minor[s]],'\0'};
    bcf_update_alleles_str(hdr, rec, majmin);
    rec->qual = 29;

    // .. FILTER
    int32_t tmpi = bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS");
    bcf_update_filter(hdr, rec, &tmpi, 1);

    // .. INFO
    tmpi = pars->keepSites[s];
    bcf_update_info_int32(hdr, rec, "NS", &tmpi, 1);

    if(pars->counts){
      // Total depth: four base counts per individual
      int depth = 0;
      for(int i=0; i<4*pars->nInd; i++)
        depth += pars->counts[s][i];
      tmpi = depth;
      bcf_update_info_int32(hdr, rec, "DP", &tmpi, 1);
    }
    if(freq){
      float tmpf = freq->freq_EM[s];
      bcf_update_info_float(hdr, rec, "AF", &tmpf, 1);
    }

    // .. FORMAT
    assert(geno);
    if(geno){
      int32_t *gts = (int32_t*)malloc(nsmpl*2*sizeof(int32_t));
      for(int i=0; i<pars->nInd;i++){
        // Call 0 -> 0/0, call 1 -> 0/1, anything else -> 1/1
        if(geno->dat[s][i]==0){
          gts[2*i+0] = bcf_gt_unphased(0);
          gts[2*i+1] = bcf_gt_unphased(0);
        }else if(geno->dat[s][i]==1){
          gts[2*i+0] = bcf_gt_unphased(0);
          gts[2*i+1] = bcf_gt_unphased(1);
        } else{
          gts[2*i+0] = bcf_gt_unphased(1);
          gts[2*i+1] = bcf_gt_unphased(1);
        }
      }
      bcf_update_genotypes(hdr, rec, gts, nsmpl*2);
      free(gts);
    }

    if(pars->counts){
      int32_t *dps = (int32_t*)malloc(sizeof(int32_t)*nsmpl);
      suint *ary=pars->counts[s];
      // Per-sample depth: sum the four counts of sample i.  The previous
      // code summed ary[0..3] and so reported the first individual's depth
      // for every sample.
      for(int i=0;i<nsmpl;i++)
        dps[i] = ary[4*i+0]+ary[4*i+1]+ary[4*i+2]+ary[4*i+3];
      bcf_update_format_int32(hdr, rec, "DP", dps,nsmpl );
      free(dps);
    }

    assert(lh3);
    if(lh3){
      float *gls = (float*)malloc(3*nsmpl*sizeof(float));
      int32_t *pls = (int32_t*)malloc(3*nsmpl*sizeof(int32_t));
      double *ary = lh3->lh3[s];
      for(int i=0;i<nsmpl;i++)
        for(int j=0;j<3;j++){
          // GL is the log10 likelihood, PL is -10*log10 likelihood.
          // -log10(exp(x))*10 equals -10*x/ln(10); this form avoids exp()
          // underflow, and the cast now applies to the scaled value rather
          // than to the unscaled one (which truncated PL to multiples of 10).
          gls[i*3+j] = ary[i*3+j]/M_LN10;
          pls[i*3+j] = (int32_t)(-10.0*ary[i*3+j]/M_LN10);
          //	  fprintf(stderr,"pl:%d raw:%f\n",pls[i*3+j],ary[i*3+j]);
        }
      bcf_update_format_float(hdr, rec, "GL", gls,3*nsmpl );
      bcf_update_format_int32(hdr, rec, "PL", pls,3*nsmpl );
      free(gls);
      free(pls);
    }

    if ( bcf_write1(fp, hdr, rec)!=0 ){
      fprintf(stderr,"Failed to write BCF record\n");
      exit(1);  // a failed write must not report success
    }
    //    fprintf(stderr,"------\n");
    bcf_clear1(rec);
  }
}
int main_vcfview(int argc, char *argv[]) { int c; args_t *args = (args_t*) calloc(1,sizeof(args_t)); args->argc = argc; args->argv = argv; args->files = bcf_sr_init(); args->clevel = -1; args->print_header = 1; args->update_info = 1; args->output_type = FT_VCF; int targets_is_file = 0, regions_is_file = 0; static struct option loptions[] = { {"genotype",1,0,'g'}, {"compression-level",1,0,'l'}, {"header-only",0,0,'h'}, {"no-header",0,0,'H'}, {"exclude",1,0,'e'}, {"include",1,0,'i'}, {"trim-alt-alleles",0,0,'a'}, {"no-update",0,0,'I'}, {"drop-genotypes",0,0,'G'}, {"private",0,0,'x'}, {"exclude-private",0,0,'X'}, {"uncalled",0,0,'u'}, {"exclude-uncalled",0,0,'U'}, {"apply-filters",1,0,'f'}, {"known",0,0,'k'}, {"novel",0,0,'n'}, {"min-alleles",1,0,'m'}, {"max-alleles",1,0,'M'}, {"samples",1,0,'s'}, {"samples-file",1,0,'S'}, {"force-samples",0,0,1}, {"output-type",1,0,'O'}, {"output-file",1,0,'o'}, {"types",1,0,'v'}, {"exclude-types",1,0,'V'}, {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, {"regions",1,0,'r'}, {"regions-file",1,0,'R'}, {"min-ac",1,0,'c'}, {"max-ac",1,0,'C'}, {"min-af",1,0,'q'}, {"max-af",1,0,'Q'}, {"phased",0,0,'p'}, {"exclude-phased",0,0,'P'}, {0,0,0,0} }; char *tmp; while ((c = getopt_long(argc, argv, "l:t:T:r:R:o:O:s:S:Gf:knv:V:m:M:auUhHc:C:Ii:e:xXpPq:Q:g:",loptions,NULL)) >= 0) { char allele_type[8] = "nref"; switch (c) { case 'O': switch (optarg[0]) { case 'b': args->output_type = FT_BCF_GZ; break; case 'u': args->output_type = FT_BCF; break; case 'z': args->output_type = FT_VCF_GZ; break; case 'v': args->output_type = FT_VCF; break; default: error("The output type \"%s\" not recognised\n", optarg); }; break; case 'l': args->clevel = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --compression-level %s\n", optarg); args->output_type |= FT_GZ; break; case 'o': args->fn_out = optarg; break; case 'H': args->print_header = 0; break; case 'h': args->header_only = 1; break; case 't': args->targets_list = optarg; break; case 'T': 
args->targets_list = optarg; targets_is_file = 1; break; case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; case 's': args->sample_names = optarg; break; case 'S': args->sample_names = optarg; args->sample_is_file = 1; break; case 1 : args->force_samples = 1; break; case 'a': args->trim_alts = 1; args->calc_ac = 1; break; case 'I': args->update_info = 0; break; case 'G': args->sites_only = 1; break; case 'f': args->files->apply_filters = optarg; break; case 'k': args->known = 1; break; case 'n': args->novel = 1; break; case 'm': args->min_alleles = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --min-alleles %s\n", optarg); break; case 'M': args->max_alleles = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --max-alleles %s\n", optarg); break; case 'v': args->include_types = optarg; break; case 'V': args->exclude_types = optarg; break; case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'c': { args->min_ac_type = ALLELE_NONREF; if ( sscanf(optarg,"%d:%s",&args->min_ac, allele_type)!=2 && sscanf(optarg,"%d",&args->min_ac)!=1 ) error("Error: Could not parse --min-ac %s\n", optarg); set_allele_type(&args->min_ac_type, allele_type); args->calc_ac = 1; break; } case 'C': { args->max_ac_type = ALLELE_NONREF; if ( sscanf(optarg,"%d:%s",&args->max_ac, allele_type)!=2 && sscanf(optarg,"%d",&args->max_ac)!=1 ) error("Error: Could not parse --max-ac %s\n", optarg); set_allele_type(&args->max_ac_type, allele_type); args->calc_ac = 1; break; } case 'q': { args->min_af_type = ALLELE_NONREF; if ( sscanf(optarg,"%f:%s",&args->min_af, allele_type)!=2 && sscanf(optarg,"%f",&args->min_af)!=1 ) error("Error: Could not parse --min_af %s\n", optarg); set_allele_type(&args->min_af_type, allele_type); args->calc_ac = 1; break; } case 'Q': { args->max_af_type = ALLELE_NONREF; 
if ( sscanf(optarg,"%f:%s",&args->max_af, allele_type)!=2 && sscanf(optarg,"%f",&args->max_af)!=1 ) error("Error: Could not parse --min_af %s\n", optarg); set_allele_type(&args->max_af_type, allele_type); args->calc_ac = 1; break; } case 'x': args->private_vars |= FLT_INCLUDE; args->calc_ac = 1; break; case 'X': args->private_vars |= FLT_EXCLUDE; args->calc_ac = 1; break; case 'u': args->uncalled |= FLT_INCLUDE; args->calc_ac = 1; break; case 'U': args->uncalled |= FLT_EXCLUDE; args->calc_ac = 1; break; case 'p': args->phased |= FLT_INCLUDE; break; // phased case 'P': args->phased |= FLT_EXCLUDE; break; // exclude-phased case 'g': { if ( !strcasecmp(optarg,"hom") ) args->gt_type = GT_NEED_HOM; else if ( !strcasecmp(optarg,"het") ) args->gt_type = GT_NEED_HET; else if ( !strcasecmp(optarg,"miss") ) args->gt_type = GT_NEED_MISSING; else if ( !strcasecmp(optarg,"^hom") ) args->gt_type = GT_NO_HOM; else if ( !strcasecmp(optarg,"^het") ) args->gt_type = GT_NO_HET; else if ( !strcasecmp(optarg,"^miss") ) args->gt_type = GT_NO_MISSING; else error("The argument to -g not recognised. 
Expected one of hom/het/miss/^hom/^het/^miss, got \"%s\".\n", optarg); break; } case '?': usage(args); default: error("Unknown argument: %s\n", optarg); } } if ( args->filter_logic == (FLT_EXCLUDE|FLT_INCLUDE) ) error("Only one of -i or -e can be given.\n"); if ( args->private_vars > FLT_EXCLUDE ) error("Only one of -x or -X can be given.\n"); if ( args->uncalled > FLT_EXCLUDE ) error("Only one of -u or -U can be given.\n"); if ( args->phased > FLT_EXCLUDE ) error("Only one of -p or -P can be given.\n"); if ( args->sample_names && args->update_info) args->calc_ac = 1; char *fname = NULL; if ( optind>=argc ) { if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin else usage(args); } else fname = argv[optind]; // read in the regions from the command line if ( args->regions_list ) { if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); } else if ( optind+1 < argc ) { int i; kstring_t tmp = {0,0,0}; kputs(argv[optind+1],&tmp); for (i=optind+2; i<argc; i++) { kputc(',',&tmp); kputs(argv[i],&tmp); } if ( bcf_sr_set_regions(args->files, tmp.s, 0)<0 ) error("Failed to read the regions: %s\n", tmp.s); free(tmp.s); } if ( args->targets_list ) { if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets_list); } if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); init_data(args); bcf_hdr_t *out_hdr = args->hnull ? args->hnull : (args->hsub ? 
args->hsub : args->hdr); if (args->print_header) bcf_hdr_write(args->out, out_hdr); else if ( args->output_type & FT_BCF ) error("BCF output requires header, cannot proceed with -H\n"); if (!args->header_only) { while ( bcf_sr_next_line(args->files) ) { bcf1_t *line = args->files->readers[0].buffer[0]; if ( line->errcode && out_hdr!=args->hdr ) error("Undefined tags in the header, cannot proceed in the sample subset mode.\n"); if ( subset_vcf(args, line) ) bcf_write1(args->out, out_hdr, line); } } hts_close(args->out); destroy_data(args); bcf_sr_destroy(args->files); free(args); return 0; }