/**
 * Constructor.
 *
 * Creates the variant manipulator, configures the alignment HMM, loads the
 * FASTA index for the reference (the process exits with status 1 if the
 * index cannot be loaded) and then creates the candidate region extractor,
 * the candidate motif picker and the flank detector.
 *
 * @ref_fasta_file reference sequence FASTA file.
 * @debug          debug flag, stored on this object and forwarded to the helpers.
 */
VNTRAnnotator::VNTRAnnotator(std::string& ref_fasta_file, bool debug)
{
    vm = new VariantManip(ref_fasta_file.c_str());

    // Parameters handed to the alignment HMM below.
    float hmm_delta = 0.0001;
    float hmm_epsilon = 0.05;
    float hmm_tau = 0.01;
    float hmm_eta = 0.01;
    float hmm_mismatch_penalty = 3;

    ahmm = new AHMM(false);
    ahmm->set_delta(hmm_delta);
    ahmm->set_epsilon(hmm_epsilon);
    ahmm->set_tau(hmm_tau);
    ahmm->set_eta(hmm_eta);
    ahmm->set_mismatch_penalty(hmm_mismatch_penalty);
    ahmm->initialize_T();

    // Without the genome index nothing else can work, so bail out early.
    fai = fai_load(ref_fasta_file.c_str());
    if (!fai)
    {
        fprintf(stderr, "[%s:%d %s] Cannot load genome index: %s\n", __FILE__, __LINE__, __FUNCTION__, ref_fasta_file.c_str());
        exit(1);
    }

    cre = new CandidateRegionExtractor(ref_fasta_file, debug);
    cmp = new CandidateMotifPicker(debug);
    fd = new FlankDetector(ref_fasta_file, debug);

    this->debug = debug;
    qual.assign(256, 'K');
}
/**
 * Stores k, loads the FASTA index for refpath and builds the BED member
 * from bedpath, that index and the padding.
 *
 * NOTE(review): `bed` is initialised from the member `fai`, which relies on
 * `fai` being declared before `bed` in the enclosing type -- confirm.
 * NOTE(review): the result of fai_load() is not checked here.
 */
params(int8_t k, char *bedpath, char *refpath, int padding=DEFAULT_PADDING)
    : k(k),
      fai(fai_load(refpath)),
      bed(bedpath, fai, padding)
{
}
/*
 * Entry point of the `faidx` command.
 *
 * With only a FASTA file argument the index is built. With further
 * arguments each one is treated as a region: a ">region" header is printed
 * followed by the fetched sequence, 60 characters per line.
 *
 * Returns 0 on success and 1 on a usage error, when the index cannot be
 * built or loaded, or when at least one region could not be fetched.
 */
int faidx_main(int argc, char *argv[])
{
    enum { LINE_WIDTH = 60 };
    faidx_t *fai;
    int i, status = 0;

    if (argc < 2) {
        fprintf(stderr, "Usage: faidx <in.fasta> [<reg> [...]]\n");
        return 1;
    }

    if (argc == 2) {
        /* Index-only mode: surface a failed build through the exit status. */
        return fai_build(argv[1]) < 0 ? 1 : 0;
    }

    fai = fai_load(argv[1]);
    if (fai == 0) return 1;

    for (i = 2; i != argc; ++i) {
        int j, len = 0;
        char *seq;

        printf(">%s\n", argv[i]);
        seq = fai_fetch(fai, argv[i], &len);
        if (seq == NULL) {
            /* Never index into a missing sequence; report and move on. */
            fprintf(stderr, "[faidx] failed to fetch region %s\n", argv[i]);
            status = 1;
            continue;
        }
        for (j = 0; j < len; j += LINE_WIDTH) {
            int chunk = len - j < LINE_WIDTH ? len - j : LINE_WIDTH;
            fwrite(seq + j, 1, (size_t)chunk, stdout);
            putchar('\n');
        }
        free(seq);
    }

    fai_destroy(fai);
    return status;
}
/* check match between reference and bam files. prints an error * message and return non-zero on mismatch */ int checkref(char *fasta_file, char *bam_file) { int i = -1; bam_header_t *header; faidx_t *fai; char *ref; int ref_len = -1; bamFile bam_fp; if (! file_exists(fasta_file)) { LOG_FATAL("Fsata file %s does not exist. Exiting...\n", fasta_file); return 1; } if (0 != strcmp(bam_file, "-") && ! file_exists(bam_file)) { LOG_FATAL("BAM file %s does not exist. Exiting...\n", bam_file); return 1; } bam_fp = strcmp(bam_file, "-") == 0 ? bam_dopen(fileno(stdin), "r") : bam_open(bam_file, "r"); header = bam_header_read(bam_fp); if (!header) { LOG_FATAL("Failed to read BAM header from %s\n", bam_file); return 1; } fai = fai_load(fasta_file); if (!fai) { LOG_FATAL("Failed to fasta index for %s\n", fasta_file); return 1; } for (i=0; i < header->n_targets; i++) { LOG_DEBUG("BAM header target %d of %d: name=%s len=%d\n", i+1, header->n_targets, header->target_name[i], header->target_len[i]); ref = faidx_fetch_seq(fai, header->target_name[i], 0, 0x7fffffff, &ref_len); if (NULL == ref) { LOG_FATAL("Failed to fetch sequence %s from fasta file\n", header->target_name[i]); return -1; } if (header->target_len[i] != ref_len) { LOG_FATAL("Sequence length mismatch for sequence %s (%dbp in fasta; %dbp in bam)\n", header->target_name[i], header->target_len[i], ref_len); return -1; } free(ref); } fai_destroy(fai); bam_header_destroy(header); bam_close(bam_fp); return 0; }
bool IndexedFastaReader::Open(const std::string &filename) { faidx_t* handle = fai_load(filename.c_str()); if (handle == nullptr) return false; else { filename_ = filename; handle_ = handle; return true; } }
/**
 * Constructor.
 *
 * Stores the debug flag, creates the AHMM, LFHMM and RFHMM models with one
 * shared set of parameters, sizes the quality string and loads the FASTA
 * index of the reference. The process exits with status 1 if the index
 * cannot be loaded.
 *
 * @ref_fasta_file reference sequence FASTA file.
 * @debug          debug flag stored on this object.
 */
FlankDetector::FlankDetector(std::string& ref_fasta_file, bool debug)
{
    this->debug = debug;

    // One parameter set, applied identically to all three models.
    float p_delta = 0.0001;
    float p_epsilon = 0.0005;
    float p_tau = 0.01;
    float p_eta = 0.01;
    float p_mismatch_penalty = 3;

    // Alignment model.
    ahmm = new AHMM(false);
    ahmm->set_delta(p_delta);
    ahmm->set_epsilon(p_epsilon);
    ahmm->set_tau(p_tau);
    ahmm->set_eta(p_eta);
    ahmm->set_mismatch_penalty(p_mismatch_penalty);
    ahmm->initialize_T();

    // LFHMM model.
    lfhmm = new LFHMM(false);
    lfhmm->set_delta(p_delta);
    lfhmm->set_epsilon(p_epsilon);
    lfhmm->set_tau(p_tau);
    lfhmm->set_eta(p_eta);
    lfhmm->set_mismatch_penalty(p_mismatch_penalty);
    lfhmm->initialize_T();

    // RFHMM model.
    rfhmm = new RFHMM(false);
    rfhmm->set_delta(p_delta);
    rfhmm->set_epsilon(p_epsilon);
    rfhmm->set_tau(p_tau);
    rfhmm->set_eta(p_eta);
    rfhmm->set_mismatch_penalty(p_mismatch_penalty);
    rfhmm->initialize_T();

    qual.assign(1000, 'K');

    // Reference access.
    fai = fai_load(ref_fasta_file.c_str());
    if (!fai)
    {
        fprintf(stderr, "[%s:%d %s] Cannot load genome index: %s\n", __FILE__, __LINE__, __FUNCTION__, ref_fasta_file.c_str());
        exit(1);
    }
}
/**
 * Constructor.
 *
 * When a reference path is given, its FASTA index is loaded and the object
 * is marked as having a reference; the process exits with status 1 if the
 * index cannot be loaded. When the path is empty, the object is put into an
 * explicit "no reference" state.
 *
 * @ref_fasta_file reference sequence FASTA file, or "" for no reference.
 */
VariantManip::VariantManip(std::string ref_fasta_file)
{
    // Start from a defined "no reference" state so that neither member is
    // left unassigned by this constructor when the path is empty.
    fai = NULL;
    reference_present = false;

    if (ref_fasta_file.empty())
    {
        return;
    }

    fai = fai_load(ref_fasta_file.c_str());
    if (fai==NULL)
    {
        fprintf(stderr, "[%s:%d %s] Cannot load genome index: %s\n", __FILE__, __LINE__, __FUNCTION__, ref_fasta_file.c_str());
        exit(1);
    }

    reference_present = true;
}
/**
 * Constructor.
 *
 * Creates the variant manipulator, loads the FASTA index of the reference
 * (the process exits with status 1 if it cannot be loaded), sets the
 * maximum motif length to 10, stores the debug flag and creates the motif
 * tree.
 *
 * @ref_fasta_file reference sequence FASTA file.
 * @debug          debug flag, stored and forwarded to the motif tree.
 */
CandidateRegionExtractor::CandidateRegionExtractor(std::string& ref_fasta_file, bool debug)
{
    const char* const ref_path = ref_fasta_file.c_str();

    vm = new VariantManip(ref_path);

    fai = fai_load(ref_path);
    if (!fai)
    {
        fprintf(stderr, "[%s:%d %s] Cannot load genome index: %s\n", __FILE__, __LINE__, __FUNCTION__, ref_fasta_file.c_str());
        exit(1);
    }

    max_mlen = 10;
    this->debug = debug;

    mt = new MotifTree(max_mlen, debug);
}
int add_dindel(const char *bam_in, const char *bam_out, const char *ref) { data_t_dindel tmp; int count = 0; bam1_t *b = NULL; if ((tmp.in = samopen(bam_in, "rb", 0)) == 0) { LOG_FATAL("Failed to open BAM file %s\n", bam_in); return 1; } if ((tmp.fai = fai_load(ref)) == 0) { LOG_FATAL("Failed to open reference file %s\n", ref); return 1; } /*warn_old_fai(ref);*/ if (!bam_out || bam_out[0] == '-') { tmp.out = bam_dopen(fileno(stdout), "w"); } else { tmp.out = bam_open(bam_out, "w"); } bam_header_write(tmp.out, tmp.in->header); b = bam_init1(); tmp.tid = -1; tmp.hpcount = 0; tmp.rlen = 0; while (samread(tmp.in, b) >= 0) { count++; dindel_fetch_func(b, &tmp); } bam_destroy1(b); if (tmp.hpcount) free(tmp.hpcount); samclose(tmp.in); bam_close(tmp.out); fai_destroy(tmp.fai); LOG_VERBOSE("Processed %d reads\n", count); return 0; }
/*
 * Index test for one FASTA/FASTQ file.
 *
 * Copies `filename` to "<filename>.tmp" while counting lines that start
 * with '>' and lines that consist of a single '+', builds and loads a faidx
 * index for the copy, and checks that faidx_fetch_nseq() and faidx_nseq()
 * both report the expected number of sequences. Problems are reported
 * through fail(); after a failure that makes the remaining steps
 * meaningless the function returns instead of continuing.
 *
 * NOTE(review): lines are read in chunks of at most 499 characters, so a
 * longer line is seen as several chunks -- confirm the test inputs never
 * exceed that.
 */
static void faidx1(const char *filename)
{
    int n, n_exp = 0, n_fq_exp = 0;
    char tmpfilename[FILENAME_MAX], line[500];
    FILE *fin, *fout;
    faidx_t *fai;

    fin = fopen(filename, "rb");
    if (fin == NULL) {
        fail("can't open %s\n", filename);
        return;
    }

    /* Bounded formatting: a long input path must not overflow the buffer. */
    n = snprintf(tmpfilename, sizeof tmpfilename, "%s.tmp", filename);
    if (n < 0 || (size_t) n >= sizeof tmpfilename) {
        fail("temporary name for %s is too long\n", filename);
        fclose(fin);
        return;
    }

    fout = fopen(tmpfilename, "wb");
    if (fout == NULL) {
        fail("can't create temporary %s\n", tmpfilename);
        fclose(fin);
        return;
    }

    while (fgets(line, sizeof line, fin)) {
        if (line[0] == '>') n_exp++;
        if (line[0] == '+' && line[1] == '\n') n_fq_exp++;
        fputs(line, fout);
    }
    fclose(fin);
    if (fclose(fout) != 0) {
        fail("can't write temporary %s\n", tmpfilename);
        return;
    }

    if (n_exp == 0 && n_fq_exp != 0) {
        // probably a fastq file
        n_exp = n_fq_exp;
    }

    if (fai_build(tmpfilename) < 0) {
        fail("can't index %s", tmpfilename);
        return;
    }
    fai = fai_load(tmpfilename);
    if (fai == NULL) {
        fail("can't load faidx file %s", tmpfilename);
        return;
    }

    n = faidx_fetch_nseq(fai);
    if (n != n_exp)
        fail("%s: faidx_fetch_nseq returned %d, expected %d", filename, n, n_exp);

    n = faidx_nseq(fai);
    if (n != n_exp)
        fail("%s: faidx_nseq returned %d, expected %d", filename, n, n_exp);

    fai_destroy(fai);
}
/*
 * Entry point of the vcfcov command: annotates the records of a VCF file
 * with kmer coverage taken from one or more graph files.
 *
 * Steps, in the order the code performs them:
 *   1. Parse the options with getopt_long_only(): output path (o), output
 *      type (O), force (f), memory (m), number of kmers (n), reference (r),
 *      maximum allele length (L), maximum number of variants (N) and the
 *      low-memory flag (M).
 *   2. Apply defaults; a reference and at least a VCF plus one graph file
 *      are required.
 *   3. Load the reference index with fai_load() and open the input VCF and
 *      its header.
 *   4. Open the graph files, check compatibility and decide the hash size.
 *   5. Open the output file and allocate the graph.
 *   6. In low-memory mode, first run vcfcov_file() with load_kmers_only set
 *      to load kmers from the VCF and reference, then close and re-open the
 *      VCF.
 *   7. Load the graph files, add samples, ##SAMPLE lines and the two FORMAT
 *      fields to a copy of the VCF header, and write that header.
 *   8. Run vcfcov_file() to write the annotated records, print statistics
 *      and release all resources.
 *
 * Returns EXIT_SUCCESS; failures are reported through die(),
 * cmd_print_usage() or abort().
 *
 * NOTE(review): in the "Test we can close and reopen files" branch the VCF
 * handle and header are re-assigned without a visible hts_close() /
 * bcf_hdr_destroy() of the previous ones -- looks like a leak, confirm.
 * NOTE(review): when a SAMPLE header record already exists, the mean
 * coverage is computed as sumcov / nkmers without the nkmers == 0 guard
 * that the other branch has -- confirm nkmers cannot be zero there.
 * NOTE(review): the result of hts_open() for the output file is not checked
 * before it is used.
 * NOTE(review): hdrstr is 200 bytes and is filled with sprintf() using the
 * sample name -- confirm sample names are bounded.
 */
int ctx_vcfcov(int argc, char **argv) { struct MemArgs memargs = MEM_ARGS_INIT; const char *out_path = NULL, *out_type = NULL; uint32_t max_allele_len = 0, max_gt_vars = 0; char *ref_path = NULL; bool low_mem = false; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; size_t i; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'o': cmd_check(!out_path, cmd); out_path = optarg; break; case 'O': cmd_check(!out_type, cmd); out_type = optarg; break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'r': cmd_check(!ref_path, cmd); ref_path = optarg; break; case 'L': cmd_check(!max_allele_len,cmd); max_allele_len = cmd_uint32(cmd,optarg); break; case 'N': cmd_check(!max_gt_vars,cmd); max_gt_vars = cmd_uint32(cmd,optarg); break; case 'M': cmd_check(!low_mem, cmd); low_mem = true; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" "SUBCMD" -h` for help. 
Bad option: %s", argv[optind-1]); default: abort(); } } // Defaults for unset values if(out_path == NULL) out_path = "-"; if(ref_path == NULL) cmd_print_usage("Require a reference (-r,--ref <ref.fa>)"); if(optind+2 > argc) cmd_print_usage("Require VCF and graph files"); if(!max_allele_len) max_allele_len = DEFAULT_MAX_ALLELE_LEN; if(!max_gt_vars) max_gt_vars = DEFAULT_MAX_GT_VARS; status("[vcfcov] max allele length: %u; max number of variants: %u", max_allele_len, max_gt_vars); // open ref // index fasta with: samtools faidx ref.fa faidx_t *fai = fai_load(ref_path); if(fai == NULL) die("Cannot load ref index: %s / %s.fai", ref_path, ref_path); // Open input VCF file const char *vcf_path = argv[optind++]; htsFile *vcffh = hts_open(vcf_path, "r"); if(vcffh == NULL) die("Cannot open VCF file: %s", vcf_path); bcf_hdr_t *vcfhdr = bcf_hdr_read(vcffh); if(vcfhdr == NULL) die("Cannot read VCF header: %s", vcf_path); // Test we can close and reopen files if(low_mem) { if((vcffh = hts_open(vcf_path, "r")) == NULL) die("Cannot re-open VCF file: %s", vcf_path); if((vcfhdr = bcf_hdr_read(vcffh)) == NULL) die("Cannot re-read VCF header: %s", vcf_path); } // // Open graph files // const size_t num_gfiles = argc - optind; char **graph_paths = argv + optind; ctx_assert(num_gfiles > 0); GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader)); size_t ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0; ncols = graph_files_open(graph_paths, gfiles, num_gfiles, &ctx_max_kmers, &ctx_sum_kmers); // Check graph + paths are compatible graphs_gpaths_compatible(gfiles, num_gfiles, NULL, 0, -1); // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem; bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Covg)*8 * ncols; kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, low_mem ? 
-1 : (int64_t)ctx_max_kmers, ctx_sum_kmers, true, &graph_mem); cmd_check_mem_limit(memargs.mem_to_use, graph_mem); // // Open output file // // v=>vcf, z=>compressed vcf, b=>bcf, bu=>uncompressed bcf int mode = vcf_misc_get_outtype(out_type, out_path); futil_create_output(out_path); htsFile *outfh = hts_open(out_path, modes_htslib[mode]); status("[vcfcov] Output format: %s", hsmodes_htslib[mode]); // Allocate memory dBGraph db_graph; db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, ncols, 1, kmers_in_hash, DBG_ALLOC_COVGS); // // Set up tag names // // *R => ref, *A => alt sprintf(kcov_ref_tag, "K%zuR", db_graph.kmer_size); // mean coverage sprintf(kcov_alt_tag, "K%zuA", db_graph.kmer_size); // #SAMPLE=<ID=...,K29KCOV=...,K29NK=...,K29RLK> // - K29_kcov is empirical kmer coverage // - K29_nkmers is the number of kmers in the sample // - mean_read_length is the mean read length in bases char sample_kcov_tag[20], sample_nk_tag[20], sample_rlk_tag[20]; sprintf(sample_kcov_tag, "K%zu_kcov", db_graph.kmer_size); // mean coverage sprintf(sample_nk_tag, "K%zu_nkmers", db_graph.kmer_size); sprintf(sample_rlk_tag, "mean_read_length"); // // Load kmers if we are using --low-mem // VcfCovStats st; memset(&st, 0, sizeof(st)); VcfCovPrefs prefs = {.kcov_ref_tag = kcov_ref_tag, .kcov_alt_tag = kcov_alt_tag, .max_allele_len = max_allele_len, .max_gt_vars = max_gt_vars, .load_kmers_only = false}; if(low_mem) { status("[vcfcov] Loading kmers from VCF+ref"); prefs.load_kmers_only = true; vcfcov_file(vcffh, vcfhdr, NULL, NULL, vcf_path, fai, NULL, &prefs, &st, &db_graph); // Close files hts_close(vcffh); bcf_hdr_destroy(vcfhdr); // Re-open files if((vcffh = hts_open(vcf_path, "r")) == NULL) die("Cannot re-open VCF file: %s", vcf_path); if((vcfhdr = bcf_hdr_read(vcffh)) == NULL) die("Cannot re-read VCF header: %s", vcf_path); prefs.load_kmers_only = false; } // // Load graphs // GraphLoadingStats gstats; memset(&gstats, 0, sizeof(gstats)); GraphLoadingPrefs gprefs = 
graph_loading_prefs(&db_graph); gprefs.must_exist_in_graph = low_mem; for(i = 0; i < num_gfiles; i++) { graph_load(&gfiles[i], gprefs, &gstats); graph_file_close(&gfiles[i]); } ctx_free(gfiles); hash_table_print_stats(&db_graph.ht); // // Set up VCF header / graph matchup // size_t *samplehdrids = ctx_malloc(db_graph.num_of_cols * sizeof(size_t)); // Add samples to vcf header bcf_hdr_t *outhdr = bcf_hdr_dup(vcfhdr); bcf_hrec_t *hrec; int sid; char hdrstr[200]; for(i = 0; i < db_graph.num_of_cols; i++) { char *sname = db_graph.ginfo[i].sample_name.b; if((sid = bcf_hdr_id2int(outhdr, BCF_DT_SAMPLE, sname)) < 0) { bcf_hdr_add_sample(outhdr, sname); sid = bcf_hdr_id2int(outhdr, BCF_DT_SAMPLE, sname); } samplehdrids[i] = sid; // Add SAMPLE field hrec = bcf_hdr_get_hrec(outhdr, BCF_HL_STR, "ID", sname, "SAMPLE"); if(hrec == NULL) { sprintf(hdrstr, "##SAMPLE=<ID=%s,%s=%"PRIu64",%s=%"PRIu64",%s=%zu>", sname, sample_kcov_tag, gstats.nkmers[i] ? gstats.sumcov[i] / gstats.nkmers[i] : 0, sample_nk_tag, gstats.nkmers[i], sample_rlk_tag, (size_t)db_graph.ginfo[i].mean_read_length); bcf_hdr_append(outhdr, hdrstr); } else { // mean kcovg sprintf(hdrstr, "%"PRIu64, gstats.sumcov[i] / gstats.nkmers[i]); vcf_misc_add_update_hrec(hrec, sample_kcov_tag, hdrstr); // num kmers sprintf(hdrstr, "%"PRIu64, gstats.nkmers[i]); vcf_misc_add_update_hrec(hrec, sample_nk_tag, hdrstr); // mean read length in kmers sprintf(hdrstr, "%zu", (size_t)db_graph.ginfo[i].mean_read_length); vcf_misc_add_update_hrec(hrec, sample_rlk_tag, hdrstr); } status("[vcfcov] Colour %zu: %s [VCF column %zu]", i, sname, samplehdrids[i]); } // Add genotype format fields // One field per alternative allele sprintf(hdrstr, "##FORMAT=<ID=%s,Number=A,Type=Integer," "Description=\"Coverage on ref (k=%zu): sum(kmer_covs) / exp_num_kmers\">\n", kcov_ref_tag, db_graph.kmer_size); bcf_hdr_append(outhdr, hdrstr); sprintf(hdrstr, "##FORMAT=<ID=%s,Number=A,Type=Integer," "Description=\"Coverage on alt (k=%zu): sum(kmer_covs) / 
exp_num_kmers\">\n", kcov_alt_tag, db_graph.kmer_size); bcf_hdr_append(outhdr, hdrstr); bcf_hdr_set_version(outhdr, "VCFv4.2"); // Add command string to header vcf_misc_hdr_add_cmd(outhdr, cmd_get_cmdline(), cmd_get_cwd()); if(bcf_hdr_write(outfh, outhdr) != 0) die("Cannot write header to: %s", futil_outpath_str(out_path)); status("[vcfcov] Reading %s and adding coverage", vcf_path); // Reset stats and get coverage memset(&st, 0, sizeof(st)); vcfcov_file(vcffh, vcfhdr, outfh, outhdr, vcf_path, fai, samplehdrids, &prefs, &st, &db_graph); // Print statistics char ns0[50], ns1[50]; status("[vcfcov] Read %s VCF lines", ulong_to_str(st.nvcf_lines, ns0)); status("[vcfcov] Read %s ALTs", ulong_to_str(st.nalts_read, ns0)); status("[vcfcov] Used %s kmers", ulong_to_str(st.ngt_kmers, ns0)); status("[vcfcov] ALTs used: %s / %s (%.2f%%)", ulong_to_str(st.nalts_loaded, ns0), ulong_to_str(st.nalts_read, ns1), st.nalts_read ? (100.0*st.nalts_loaded) / st.nalts_read : 0.0); status("[vcfcov] ALTs too long (>%ubp): %s / %s (%.2f%%)", max_allele_len, ulong_to_str(st.nalts_too_long, ns0), ulong_to_str(st.nalts_read, ns1), st.nalts_read ? (100.0*st.nalts_too_long) / st.nalts_read : 0.0); status("[vcfcov] ALTs too dense (>%u within %zubp): %s / %s (%.2f%%)", max_gt_vars, db_graph.kmer_size, ulong_to_str(st.nalts_no_covg, ns0), ulong_to_str(st.nalts_read, ns1), st.nalts_read ? (100.0*st.nalts_no_covg) / st.nalts_read : 0.0); status("[vcfcov] ALTs printed with coverage: %s / %s (%.2f%%)", ulong_to_str(st.nalts_with_covg, ns0), ulong_to_str(st.nalts_read, ns1), st.nalts_read ? (100.0*st.nalts_with_covg) / st.nalts_read : 0.0); status("[vcfcov] Saved to: %s\n", out_path); ctx_free(samplehdrids); graph_loading_stats_destroy(&gstats); bcf_hdr_destroy(vcfhdr); bcf_hdr_destroy(outhdr); hts_close(vcffh); hts_close(outfh); fai_destroy(fai); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
int main(int argc , char *argv[]) { FILE *fin , *fout ; vcf_info vcf ; faidx_t *fai ; char buffer[32]; char *s = malloc(1024*sizeof(char)); if( argc != 4 ){ fprintf(stderr,"sv2vcf [ref.fa] [input] [output] \n"); return 1 ; } fai = fai_load(argv[1]); if(!fai){ fprintf(stderr,"can't open the file:%s.fai\n",argv[1]); return 1; } fin = fopen(argv[2],"r"); if(!fin){ fprintf(stderr,"can't open the file:%s\n", argv[2]); return 1 ; } fout = fopen(argv[3],"w"); if(!fout){ fprintf(stderr,"can't open the file:%s\n", argv[3]); return 1 ; } fprintf(fout,"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILT\tINFO\n"); while(!feof(fin)){ char *tmp ; int len ; fscanf(fin,"%s\t",buffer); if(!strcmp(buffer,"TRSdL")){ fscanf(fin,"%s\t",buffer); fscanf(fin,"%d\t%d\n",&vcf.pos[0],&vcf.pos[1]); int k ; for( k = 0 ; k < 2 ; k++){ sprintf(s,"%s:%d-%d",buffer,vcf.pos[k],vcf.pos[k]); tmp = fai_fetch(fai,s,&len); vcf.mut[k] = *tmp ; free(tmp); } fprintf(fout,"%s\t%d\tbnd_V\t%c\t[%s:%d[%c\t6\tPASS\tSVTYPE=BND;MATEID=bnd_W;EVENT=PRO\n",buffer,vcf.pos[0],vcf.mut[0],buffer,vcf.pos[1],vcf.mut[0]); fprintf(fout,"%s\t%d\tbnd_W\t%c\t[%s:%d[%c\t6\tPASS\tSVTYPE=BND;MATEID=bnd_V;EVENT=PRO\n",buffer,vcf.pos[1],vcf.mut[1],buffer,vcf.pos[0],vcf.mut[1]); }else if(!strcmp(buffer,"TRSdR")){ fscanf(fin,"%s\t",buffer); fscanf(fin,"%d\t%d\n",&vcf.pos[0],&vcf.pos[1]); int k ; for( k = 0 ; k < 2 ; k++){ sprintf(s,"%s:%d-%d",buffer,vcf.pos[k],vcf.pos[k]); tmp = fai_fetch(fai,s,&len); vcf.mut[k] = *tmp ; free(tmp); } fprintf(fout,"%s\t%d\tbnd_V\t%c\t%s:%d]%c]\t6\tPASS\tSVTYPE=BND;MATEID=bnd_W;EVENT=PRO\n",buffer,vcf.pos[0],vcf.mut[0],buffer,vcf.pos[1],vcf.mut[0]); fprintf(fout,"%s\t%d\tbnd_W\t%c\t%s:%d]%c]\t6\tPASS\tSVTYPE=BND;MATEID=bnd_V;EVENT=PRO\n",buffer,vcf.pos[1],vcf.mut[1],buffer,vcf.pos[0],vcf.mut[1]); }else if((tmp = strstr(buffer,"TandemCNV"))){ tmp = tmp + 9 ; vcf.type = atoi(tmp); fscanf(fin,"%s\t",buffer); if(vcf.type == 1 ) { fscanf(fin,"%*[^\n]\n"); continue ; } fscanf(fin,"%d\t%d\n",&vcf.pos[0],&vcf.pos[1]); int 
k ; for( k = 0 ; k < 2 ; k++){ sprintf(s,"%s:%d-%d",buffer,vcf.pos[k],vcf.pos[k]); tmp = fai_fetch(fai,s,&len); vcf.mut[k] = *tmp ; free(tmp); } if(vcf.type == 2){ fprintf(fout,"%s\t%d\tbnd_V\t%c\t%c]%s:%d]\t6\tPASS\tSVTYPE=BND;MATEID=bnd_W;EVENT=PRO\n",buffer,vcf.pos[1],vcf.mut[1],vcf.mut[1],buffer,vcf.pos[1]); fprintf(fout,"%s\t%d\tbnd_W\t%c\t%c]%s:%d]\t6\tPASS\tSVTYPE=BND;MATEID=bnd_V;EVENT=PRO\n",buffer,vcf.pos[1],vcf.mut[1],vcf.mut[1],buffer,vcf.pos[1]); }else { fprintf(fout,"%s\t%d\tbnd_V\t%c\t[%s:%d[%c\t6\tPASS\tSVTYPE=BND;MATEID=bnd_W;EVENT=PRO\n",buffer,vcf.pos[0],vcf.mut[0],buffer,vcf.pos[0],vcf.mut[0]); fprintf(fout,"%s\t%d\tbnd_W\t%c\t[%s:%d[%c\t6\tPASS\tSVTYPE=BND;MATEID=bnd_V;EVENT=PRO\n",buffer,vcf.pos[0],vcf.mut[0],buffer,vcf.pos[0],vcf.mut[0]); } }else if((tmp = strstr( buffer,"CNV"))){ tmp = tmp + 3; vcf.type = atoi(tmp); fscanf(fin,"%s\t",buffer); fscanf(fin,"%d\t%d\t%d\n",&vcf.pos[0],&vcf.pos[1],&vcf.pos[2]); if( vcf.type == 1 ){ sprintf(s,"%s:%d-%d",buffer,vcf.pos[0],vcf.pos[0]); tmp = fai_fetch(fai,s,&len); vcf.mut[0] = *tmp ; free(tmp); sprintf(s,"%s:%d-%d",buffer,vcf.pos[2],vcf.pos[2]); tmp = fai_fetch(fai,s,&len); vcf.mut[2] = *tmp ; free(tmp); sprintf(s,"%s:%d-%d",buffer,vcf.pos[1],vcf.pos[1]); tmp = fai_fetch(fai,s,&len); vcf.mut[1] = *tmp ; free(tmp); sprintf(s,"%s:%d-%d",buffer,vcf.pos[2]+1,vcf.pos[2]+1); tmp = fai_fetch(fai,s,&len); vcf.mut[3] = *tmp ; free(tmp); fprintf(fout,"%s\t%d\tbnd_V\t%c\t]%s:%d]%c\t6\tPASS\tSVTYPE=BND;MATEID=bnd_W;EVENT=PRO\n",buffer,vcf.pos[0],vcf.mut[0],buffer,vcf.pos[2],vcf.mut[0]); fprintf(fout,"%s\t%d\tbnd_W\t%c\t%c[%s:%d[\t6\tPASS\tSVTYPE=BND;MATEID=bnd_V;EVENT=PRO\n",buffer,vcf.pos[2],vcf.mut[2],vcf.mut[2],buffer,vcf.pos[0]); fprintf(fout,"%s\t%d\tbnd_U\t%c\t%c[%s:%d[\t6\tPASS\tSVTYPE=BND;MATEID=bnd_X;EVENT=PRO\n",buffer,vcf.pos[1],vcf.mut[1],vcf.mut[1],buffer,vcf.pos[2]+1); 
fprintf(fout,"%s\t%d\tbnd_X\t%c\t]%s:%d]%c\t6\tPASS\tSVTYPE=BND;MATEID=bnd_U;EVENT=PRO\n",buffer,vcf.pos[2]+1,vcf.mut[3],buffer,vcf.pos[1],vcf.mut[3]); }else if( vcf.type == 2 ){ sprintf(s,"%s:%d-%d",buffer,vcf.pos[0],vcf.pos[0]); tmp = fai_fetch(fai,s,&len); vcf.mut[0] = *tmp ; free(tmp); sprintf(s,"%s:%d-%d",buffer,vcf.pos[2]+1,vcf.pos[2]+1); tmp = fai_fetch(fai,s,&len); vcf.mut[3] = *tmp ; free(tmp); sprintf(s,"%s:%d-%d",buffer,vcf.pos[1],vcf.pos[1]); tmp = fai_fetch(fai,s,&len); vcf.mut[1] = *tmp ; free(tmp); sprintf(s,"%s:%d-%d",buffer,vcf.pos[2],vcf.pos[2]); tmp = fai_fetch(fai,s,&len); vcf.mut[2] = *tmp ; free(tmp); fprintf(fout,"%s\t%d\tbnd_V\t%c\t[%s:%d[%c\t6\tPASS\tSVTYPE=BND;MATEID=bnd_W;EVENT=PRO\n",buffer,vcf.pos[0],vcf.mut[0],buffer,vcf.pos[2]+1,vcf.mut[0]); fprintf(fout,"%s\t%d\tbnd_W\t%c\t[%s:%d[%c\t6\tPASS\tSVTYPE=BND;MATEID=bnd_U;EVENT=PRO\n",buffer,vcf.pos[2]+1,vcf.mut[3],buffer,vcf.pos[0],vcf.mut[3]); fprintf(fout,"%s\t%d\tbnd_U\t%c\t%c]%s:%d]\t6\tPASS\tSVTYPE=BND;MATEID=bnd_X;EVENT=PRO\n",buffer,vcf.pos[1],vcf.mut[1],vcf.mut[1],buffer,vcf.pos[2]); fprintf(fout,"%s\t%d\tbnd_X\t%c\t%c]%s:%d]\t6\tPASS\tSVTYPE=BND;MATEID=bnd_U;EVENT=PRO\n",buffer,vcf.pos[2],vcf.mut[2],vcf.mut[2],buffer,vcf.pos[1]); }else if(vcf.type == 3){ sprintf(s,"%s:%d-%d",buffer,vcf.pos[0],vcf.pos[0]); tmp = fai_fetch(fai,s,&len); vcf.mut[0] = *tmp ; free(tmp); sprintf(s,"%s:%d-%d",buffer,vcf.pos[1],vcf.pos[1]); tmp = fai_fetch(fai,s,&len); vcf.mut[1] = *tmp ; free(tmp); sprintf(s,"%s:%d-%d",buffer,vcf.pos[0]+1,vcf.pos[0]+1); tmp = fai_fetch(fai,s,&len); vcf.mut[2] = *tmp ; free(tmp); sprintf(s,"%s:%d-%d",buffer,vcf.pos[2],vcf.pos[2]); tmp = fai_fetch(fai,s,&len); vcf.mut[3] = *tmp ; free(tmp); fprintf(fout,"%s\t%d\tbnd_V\t%c\t%c[%s:%d[\t6\tPASS\tSVTYPE=BND;MATEID=bnd_W;EVENT=PRO\n",buffer,vcf.pos[0],vcf.mut[0],vcf.mut[0],buffer,vcf.pos[1]); 
fprintf(fout,"%s\t%d\tbnd_W\t%c\t]%s:%d]%c\t6\tPASS\tSVTYPE=BND;MATEID=bnd_V;EVENT=PRO\n",buffer,vcf.pos[1],vcf.mut[1],buffer,vcf.pos[0],vcf.mut[1]); fprintf(fout,"%s\t%d\tbnd_U\t%c\t]%s:%d]%c\t6\tPASS\tSVTYPE=BND;MATEID=bnd_X;EVENT=PRO\n",buffer,vcf.pos[0]+1,vcf.mut[2],buffer,vcf.pos[2],vcf.mut[2]); fprintf(fout,"%s\t%d\tbnd_X\t%c\t%c[%s:%d[\t6\tPASS\tSVTYPE=BND;MATEID=bnd_W;EVENT=PRO\n",buffer,vcf.pos[2],vcf.mut[3],vcf.mut[3],buffer,vcf.pos[0]+1); }else if(vcf.type == 4){ sprintf(s,"%s:%d-%d",buffer,vcf.pos[0],vcf.pos[0]); tmp = fai_fetch(fai,s,&len); vcf.mut[0] = *tmp ; free(tmp); sprintf(s,"%s:%d-%d",buffer,vcf.pos[1],vcf.pos[1]); tmp = fai_fetch(fai,s,&len); vcf.mut[1] = *tmp ; free(tmp); sprintf(s,"%s:%d-%d",buffer,vcf.pos[0]+1,vcf.pos[0]+1); tmp = fai_fetch(fai,s,&len); vcf.mut[2] = *tmp ; free(tmp); sprintf(s,"%s:%d-%d",buffer,vcf.pos[2],vcf.pos[2]); tmp = fai_fetch(fai,s,&len); vcf.mut[3] = *tmp ; free(tmp); fprintf(fout,"%s\t%d\tbnd_V\t%c\t%c]%s:%d]\t6\tPASS\tSVTYPE=BND;MATEID=bnd_W;EVENT=PRO\n",buffer,vcf.pos[0],vcf.mut[0],vcf.mut[0],buffer,vcf.pos[2]); fprintf(fout,"%s\t%d\tbnd_W\t%c\t%c]%s:%d]\t6\tPASS\tSVTYPE=BND;MATEID=bnd_V;EVENT=PRO\n",buffer,vcf.pos[2],vcf.mut[3],vcf.mut[3],buffer,vcf.pos[0]); fprintf(fout,"%s\t%d\tbnd_U\t%c\t[%s:%d[%c\t6\tPASS\tSVTYPE=BND;MATEID=bnd_X;EVENT=PRO\n",buffer,vcf.pos[0]+1,vcf.mut[2],buffer,vcf.pos[1],vcf.mut[2]); fprintf(fout,"%s\t%d\tbnd_X\t%c\t[%s:%d[%c\t6\tPASS\tSVTYPE=BND;MATEID=bnd_U;EVENT=PRO\n",buffer,vcf.pos[1],vcf.mut[3],buffer,vcf.pos[0]+1,vcf.mut[3]); } }else if((tmp =strstr(buffer,"TRS"))){ tmp = tmp + 3; vcf.type = atoi(tmp); fscanf(fin,"%s\t",buffer); if( vcf.type == 2 || vcf.type == 3 ){ fscanf(fin,"%d\t%d\t%d\t%d\n",&vcf.pos[1],&vcf.pos[2],&vcf.pos[5],&vcf.pos[6]); vcf.pos[0] = vcf.pos[1] - 1 ; vcf.pos[3] = vcf.pos[2] + 1 ; vcf.pos[4] = vcf.pos[5] - 1 ; vcf.pos[7] = vcf.pos[6] + 1 ; int k = 0 ; for( k = 0 ; k < 8 ; k++){ sprintf(s,"%s:%d-%d",buffer,vcf.pos[k],vcf.pos[k]); tmp = fai_fetch(fai,s,&len); 
vcf.mut[k] = *tmp ; free(tmp); } if(vcf.type == 2 ){ fprintf(fout,"%s\t%d\tbnd_V\t%c\t%c]%s:%d]\t6\tPASS\tSVTYPE=BND;MATEID=bnd_W;EVENT=PRO\n",buffer,vcf.pos[0],vcf.mut[0],vcf.mut[0],buffer,vcf.pos[6]); fprintf(fout,"%s\t%d\tbnd_W\t%c\t%c]%s:%d]\t6\tPASS\tSVTYPE=BND;MATEID=bnd_V;EVENT=PRO\n",buffer,vcf.pos[6],vcf.mut[6],vcf.mut[6],buffer,vcf.pos[0]); fprintf(fout,"%s\t%d\tbnd_U\t%c\t[%s:%d[%c\t6\tPASS\tSVTYPE=BND;MATEID=bnd_X;EVENT=PRO\n",buffer,vcf.pos[3],vcf.mut[3],buffer,vcf.pos[5],vcf.mut[3]); fprintf(fout,"%s\t%d\tbnd_X\t%c\t[%s:%d[%c\t6\tPASS\tSVTYPE=BND;MATEID=bnd_U;EVENT=PRO\n",buffer,vcf.pos[5],vcf.mut[5],buffer,vcf.pos[3],vcf.mut[5]); fprintf(fout,"%s\t%d\tbnd_E\t%c\t]%s:%d]%c\t6\tPASS\tSVTYPE=BND;MATEID=bnd_D;EVENT=PRO\n",buffer,vcf.pos[1],vcf.mut[1],buffer,vcf.pos[4],vcf.mut[1]); fprintf(fout,"%s\t%d\tbnd_D\t%c\t%c[%s:%d[\t6\tPASS\tSVTYPE=BND;MATEID=bnd_E;EVENT=PRO\n",buffer,vcf.pos[4],vcf.mut[4],vcf.mut[4],buffer,vcf.pos[1]); fprintf(fout,"%s\t%d\tbnd_Z\t%c\t%c[%s:%d[\t6\tPASS\tSVTYPE=BND;MATEID=bnd_H;EVENT=PRO\n",buffer,vcf.pos[2],vcf.mut[2],vcf.mut[2],buffer,vcf.pos[7]); fprintf(fout,"%s\t%d\tbnd_H\t%c\t]%s:%d]%c\t6\tPASS\tSVTYPE=BND;MATEID=bnd_Z;EVENT=PRO\n",buffer,vcf.pos[7],vcf.mut[7],buffer,vcf.pos[2],vcf.mut[7]); }else { fprintf(fout,"%s\t%d\tbnd_V\t%c\t%c[%s:%d[\t6\tPASS\tSVTYPE=BND;MATEID=bnd_W;EVENT=PRO\n",buffer,vcf.pos[0],vcf.mut[0],vcf.mut[0],buffer,vcf.pos[5]); fprintf(fout,"%s\t%d\tbnd_W\t%c\t]%s:%d]%c\t6\tPASS\tSVTYPE=BND;MATEID=bnd_V;EVENT=PRO\n",buffer,vcf.pos[5],vcf.mut[5],buffer,vcf.pos[0],vcf.mut[5]); fprintf(fout,"%s\t%d\tbnd_U\t%c\t]%s:%d]%c\t6\tPASS\tSVTYPE=BND;MATEID=bnd_X;EVENT=PRO\n",buffer,vcf.pos[3],vcf.mut[3],buffer,vcf.pos[6],vcf.mut[3]); fprintf(fout,"%s\t%d\tbnd_X\t%c\t%c[%s:%d[\t6\tPASS\tSVTYPE=BND;MATEID=bnd_U;EVENT=PRO\n",buffer,vcf.pos[6],vcf.mut[6],vcf.mut[6],buffer,vcf.pos[3]); 
fprintf(fout,"%s\t%d\tbnd_E\t%c\t%c]%s:%d]\t6\tPASS\tSVTYPE=BND;MATEID=bnd_D;EVENT=PRO\n",buffer,vcf.pos[2],vcf.mut[2],vcf.mut[2],buffer,vcf.pos[4]); fprintf(fout,"%s\t%d\tbnd_D\t%c\t%c]%s:%d]\t6\tPASS\tSVTYPE=BND;MATEID=bnd_E;EVENT=PRO\n",buffer,vcf.pos[4],vcf.mut[4],vcf.mut[4],buffer,vcf.pos[2]); fprintf(fout,"%s\t%d\tbnd_Z\t%c\t[%s:%d[%c\t6\tPASS\tSVTYPE=BND;MATEID=bnd_H;EVENT=PRO\n",buffer,vcf.pos[1],vcf.mut[1],buffer,vcf.pos[7],vcf.mut[1]); fprintf(fout,"%s\t%d\tbnd_H\t%c\t[%s:%d[%c\t6\tPASS\tSVTYPE=BND;MATEID=bnd_Z;EVENT=PRO\n",buffer,vcf.pos[7],vcf.mut[7],buffer,vcf.pos[1],vcf.mut[7]); } }else if(vcf.type == 1){ fscanf(fin,"%d\t%d\n",&vcf.pos[1],&vcf.pos[2]); vcf.pos[0] = vcf.pos[1] - 1 ; vcf.pos[3] = vcf.pos[2] + 1 ; int k ; for( k = 0 ; k < 4 ; k++){ sprintf(s,"%s:%d-%d",buffer,vcf.pos[k],vcf.pos[k]); tmp = fai_fetch(fai,s,&len); vcf.mut[k] = *tmp ; free(tmp); } fprintf(fout,"%s\t%d\tbnd_V\t%c\t%c[%s:%d[\t6\tPASS\tSVTYPE=BND;MATEID=bnd_W;EVENT=PRO\n",buffer,vcf.pos[0],vcf.mut[0],vcf.mut[0],buffer,vcf.pos[2]); fprintf(fout,"%s\t%d\tbnd_W\t%c\t]%s:%d]%c\t6\tPASS\tSVTYPE=BND;MATEID=bnd_W;EVENT=PRO\n",buffer,vcf.pos[2],vcf.mut[2],buffer,vcf.pos[0],vcf.mut[2]); fprintf(fout,"%s\t%d\tbnd_U\t%c\t]%s:%d]%c\t6\tPASS\tSVTYPE=BND;MATEID=bnd_X;EVENT=PRO\n",buffer,vcf.pos[1],vcf.mut[1],buffer,vcf.pos[3],vcf.mut[1]); fprintf(fout,"%s\t%d\tbnd_X\t%c\t%c[%s:%d[\t6\tPASS\tSVTYPE=BND;MATEID=bnd_W;EVENT=PRO\n",buffer,vcf.pos[3],vcf.mut[3],vcf.mut[3],buffer,vcf.pos[1]); }else{ fscanf(fin,"%*[^\n]\n"); } }else{ fscanf(fin,"%*[^\n]\n"); } } free(s); fclose(fin); fclose(fout); return 0; }
/*
 * Entry point of the mpileup command.
 *
 * Fills an mplp_conf_t with the default settings, overrides them from the
 * command line via getopt_long(), validates the combination of options,
 * builds the list of input files (either from the file given with -b or
 * from the remaining arguments), calls mpileup() and releases everything
 * that was allocated here.
 *
 * Returns the value returned by mpileup(), or 1 when option parsing or
 * validation fails; some failures terminate through error() or exit().
 *
 * Notable rules visible in the code:
 *   - a FASTA reference (-f) is required unless --no-reference is given;
 *   - -B cannot be combined with -E;
 *   - --gvcf switches on the DP format flag if it is not already set;
 *   - "-o" is treated as an integer gap-open value when its argument
 *     parses completely as a number, otherwise as the output file name;
 *   - a leading '^' on the -t/-T argument is skipped and leaves bed_logic
 *     unset; otherwise bed_logic is set to 1.
 *
 * NOTE(review): the long option name "min-ireads " contains a trailing
 * space -- looks like a typo that keeps --min-ireads from matching, confirm.
 * NOTE(review): -r / -R assign a fresh strdup() to reg_fname without freeing
 * a previous value, so repeating the option appears to leak.
 * NOTE(review): in the -t case the result of regidx_init() is passed to
 * regitr_init() and regidx_insert_list() without a NULL check.
 */
int bam_mpileup(int argc, char *argv[]) { int c; const char *file_list = NULL; char **fn = NULL; int nfiles = 0, use_orphan = 0, noref = 0; mplp_conf_t mplp; memset(&mplp, 0, sizeof(mplp_conf_t)); mplp.min_baseQ = 13; mplp.capQ_thres = 0; mplp.max_depth = 250; mplp.max_indel_depth = 250; mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100; mplp.min_frac = 0.002; mplp.min_support = 1; mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_SMART_OVERLAPS; mplp.argc = argc; mplp.argv = argv; mplp.rflag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP; mplp.output_fname = NULL; mplp.output_type = FT_VCF; mplp.record_cmd_line = 1; mplp.n_threads = 0; mplp.bsmpl = bam_smpl_init(); static const struct option lopts[] = { {"rf", required_argument, NULL, 1}, // require flag {"ff", required_argument, NULL, 2}, // filter flag {"incl-flags", required_argument, NULL, 1}, {"excl-flags", required_argument, NULL, 2}, {"output", required_argument, NULL, 3}, {"open-prob", required_argument, NULL, 4}, {"ignore-RG", no_argument, NULL, 5}, {"ignore-rg", no_argument, NULL, 5}, {"gvcf", required_argument, NULL, 'g'}, {"no-reference", no_argument, NULL, 7}, {"no-version", no_argument, NULL, 8}, {"threads",required_argument,NULL,9}, {"illumina1.3+", no_argument, NULL, '6'}, {"count-orphans", no_argument, NULL, 'A'}, {"bam-list", required_argument, NULL, 'b'}, {"no-BAQ", no_argument, NULL, 'B'}, {"no-baq", no_argument, NULL, 'B'}, {"adjust-MQ", required_argument, NULL, 'C'}, {"adjust-mq", required_argument, NULL, 'C'}, {"max-depth", required_argument, NULL, 'd'}, {"redo-BAQ", no_argument, NULL, 'E'}, {"redo-baq", no_argument, NULL, 'E'}, {"fasta-ref", required_argument, NULL, 'f'}, {"read-groups", required_argument, NULL, 'G'}, {"region", required_argument, NULL, 'r'}, {"regions", required_argument, NULL, 'r'}, {"regions-file", required_argument, NULL, 'R'}, {"targets", required_argument, NULL, 't'}, {"targets-file", required_argument, NULL, 'T'}, {"min-MQ", required_argument, NULL, 
'q'}, {"min-mq", required_argument, NULL, 'q'}, {"min-BQ", required_argument, NULL, 'Q'}, {"min-bq", required_argument, NULL, 'Q'}, {"ignore-overlaps", no_argument, NULL, 'x'}, {"output-type", required_argument, NULL, 'O'}, {"samples", required_argument, NULL, 's'}, {"samples-file", required_argument, NULL, 'S'}, {"annotate", required_argument, NULL, 'a'}, {"ext-prob", required_argument, NULL, 'e'}, {"gap-frac", required_argument, NULL, 'F'}, {"tandem-qual", required_argument, NULL, 'h'}, {"skip-indels", no_argument, NULL, 'I'}, {"max-idepth", required_argument, NULL, 'L'}, {"min-ireads ", required_argument, NULL, 'm'}, {"per-sample-mF", no_argument, NULL, 'p'}, {"per-sample-mf", no_argument, NULL, 'p'}, {"platforms", required_argument, NULL, 'P'}, {NULL, 0, NULL, 0} }; while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:Bd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:",lopts,NULL)) >= 0) { switch (c) { case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break; case 1 : mplp.rflag_require = bam_str2flag(optarg); if ( mplp.rflag_require<0 ) { fprintf(stderr,"Could not parse --rf %s\n", optarg); return 1; } break; case 2 : mplp.rflag_filter = bam_str2flag(optarg); if ( mplp.rflag_filter<0 ) { fprintf(stderr,"Could not parse --ff %s\n", optarg); return 1; } break; case 3 : mplp.output_fname = optarg; break; case 4 : mplp.openQ = atoi(optarg); break; case 5 : bam_smpl_ignore_readgroups(mplp.bsmpl); break; case 'g': mplp.gvcf = gvcf_init(optarg); if ( !mplp.gvcf ) error("Could not parse: --gvcf %s\n", optarg); break; case 'f': mplp.fai = fai_load(optarg); if (mplp.fai == NULL) return 1; mplp.fai_fname = optarg; break; case 7 : noref = 1; break; case 8 : mplp.record_cmd_line = 0; break; case 9 : mplp.n_threads = strtol(optarg, 0, 0); break; case 'd': mplp.max_depth = atoi(optarg); break; case 'r': mplp.reg_fname = strdup(optarg); break; case 'R': mplp.reg_fname = strdup(optarg); mplp.reg_is_file = 1; break; case 't': // In the original version the whole BAM was streamed which is 
inefficient // with few BED intervals and big BAMs. Todo: devise a heuristic to determine // best strategy, that is streaming or jumping. if ( optarg[0]=='^' ) optarg++; else mplp.bed_logic = 1; mplp.bed = regidx_init(NULL,regidx_parse_reg,NULL,0,NULL); mplp.bed_itr = regitr_init(mplp.bed); if ( regidx_insert_list(mplp.bed,optarg,',') !=0 ) { fprintf(stderr,"Could not parse the targets: %s\n", optarg); exit(EXIT_FAILURE); } break; case 'T': if ( optarg[0]=='^' ) optarg++; else mplp.bed_logic = 1; mplp.bed = regidx_init(optarg,NULL,NULL,0,NULL); if (!mplp.bed) { fprintf(stderr, "bcftools mpileup: Could not read file \"%s\"", optarg); return 1; } break; case 'P': mplp.pl_list = strdup(optarg); break; case 'p': mplp.flag |= MPLP_PER_SAMPLE; break; case 'B': mplp.flag &= ~MPLP_REALN; break; case 'I': mplp.flag |= MPLP_NO_INDEL; break; case 'E': mplp.flag |= MPLP_REDO_BAQ; break; case '6': mplp.flag |= MPLP_ILLUMINA13; break; case 's': if ( bam_smpl_add_samples(mplp.bsmpl,optarg,0)<0 ) error("Could not read samples: %s\n",optarg); break; case 'S': if ( bam_smpl_add_samples(mplp.bsmpl,optarg,1)<0 ) error("Could not read samples: %s\n",optarg); break; case 'O': switch (optarg[0]) { case 'b': mplp.output_type = FT_BCF_GZ; break; case 'u': mplp.output_type = FT_BCF; break; case 'z': mplp.output_type = FT_VCF_GZ; break; case 'v': mplp.output_type = FT_VCF; break; default: error("[error] The option \"-O\" changed meaning when mpileup moved to bcftools. Did you mean: \"bcftools mpileup --output-type\" or \"samtools mpileup --output-BP\"?\n"); } break; case 'C': mplp.capQ_thres = atoi(optarg); break; case 'q': mplp.min_mq = atoi(optarg); break; case 'Q': mplp.min_baseQ = atoi(optarg); break; case 'b': file_list = optarg; break; case 'o': { char *end; long value = strtol(optarg, &end, 10); // Distinguish between -o INT and -o FILE (a bit of a hack!) 
if (*end == '\0') mplp.openQ = value; else mplp.output_fname = optarg; } break; case 'e': mplp.extQ = atoi(optarg); break; case 'h': mplp.tandemQ = atoi(optarg); break; case 'A': use_orphan = 1; break; case 'F': mplp.min_frac = atof(optarg); break; case 'm': mplp.min_support = atoi(optarg); break; case 'L': mplp.max_indel_depth = atoi(optarg); break; case 'G': bam_smpl_add_readgroups(mplp.bsmpl, optarg, 1); break; case 'a': if (optarg[0]=='?') { list_annotations(stderr); return 1; } mplp.fmt_flag |= parse_format_flag(optarg); break; default: fprintf(stderr,"Invalid option: '%c'\n", c); return 1; } } if ( mplp.gvcf && !(mplp.fmt_flag&B2B_FMT_DP) ) { fprintf(stderr,"[warning] The -t DP option is required with --gvcf, switching on.\n"); mplp.fmt_flag |= B2B_FMT_DP; } if ( mplp.flag&(MPLP_BCF|MPLP_VCF|MPLP_NO_COMP) ) { if ( mplp.flag&MPLP_VCF ) { if ( mplp.flag&MPLP_NO_COMP ) mplp.output_type = FT_VCF; else mplp.output_type = FT_VCF_GZ; } else if ( mplp.flag&MPLP_BCF ) { if ( mplp.flag&MPLP_NO_COMP ) mplp.output_type = FT_BCF; else mplp.output_type = FT_BCF_GZ; } } if ( !(mplp.flag&MPLP_REALN) && mplp.flag&MPLP_REDO_BAQ ) { fprintf(stderr,"Error: The -B option cannot be combined with -E\n"); return 1; } if (use_orphan) mplp.flag &= ~MPLP_NO_ORPHAN; if (argc == 1) { print_usage(stderr, &mplp); return 1; } if (!mplp.fai && !noref) { fprintf(stderr,"Error: mpileup requires the --fasta-ref option by default; use --no-reference to run without a fasta reference\n"); return 1; } int ret,i; if (file_list) { if ( read_file_list(file_list,&nfiles,&fn) ) return 1; mplp.files = fn; mplp.nfiles = nfiles; } else { mplp.nfiles = argc - optind; mplp.files = (char**) malloc(mplp.nfiles*sizeof(char*)); for (i=0; i<mplp.nfiles; i++) mplp.files[i] = strdup(argv[optind+i]); } ret = mpileup(&mplp); for (i=0; i<mplp.nfiles; i++) free(mplp.files[i]); free(mplp.files); free(mplp.reg_fname); free(mplp.pl_list); if (mplp.fai) fai_destroy(mplp.fai); if (mplp.bed) regidx_destroy(mplp.bed); if 
(mplp.bed_itr) regitr_destroy(mplp.bed_itr); if (mplp.reg) regidx_destroy(mplp.reg); bam_smpl_destroy(mplp.bsmpl); return ret; }
int bam_fillmd(int argc, char *argv[]) { int c, flt_flag, tid = -2, ret, len, is_bam_out, is_uncompressed, max_nm, is_realn, capQ, baq_flag; samFile *fp = NULL, *fpout = NULL; bam_hdr_t *header = NULL; faidx_t *fai = NULL; char *ref = NULL, mode_w[8], *ref_file; bam1_t *b = NULL; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0), { NULL, 0, NULL, 0 } }; flt_flag = UPDATE_NM | UPDATE_MD; is_bam_out = is_uncompressed = is_realn = max_nm = capQ = baq_flag = 0; strcpy(mode_w, "w"); while ((c = getopt_long(argc, argv, "EqreuNhbSC:n:Ad", lopts, NULL)) >= 0) { switch (c) { case 'r': is_realn = 1; break; case 'e': flt_flag |= USE_EQUAL; break; case 'd': flt_flag |= DROP_TAG; break; case 'q': flt_flag |= BIN_QUAL; break; case 'h': flt_flag |= HASH_QNM; break; case 'N': flt_flag &= ~(UPDATE_MD|UPDATE_NM); break; case 'b': is_bam_out = 1; break; case 'u': is_uncompressed = is_bam_out = 1; break; case 'S': break; case 'n': max_nm = atoi(optarg); break; case 'C': capQ = atoi(optarg); break; case 'A': baq_flag |= 1; break; case 'E': baq_flag |= 2; break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n\n", c); /* else fall-through */ case '?': return calmd_usage(); } } if (is_bam_out) strcat(mode_w, "b"); else strcat(mode_w, "h"); if (is_uncompressed) strcat(mode_w, "0"); if (optind + (ga.reference == NULL) >= argc) return calmd_usage(); fp = sam_open_format(argv[optind], "r", &ga.in); if (fp == NULL) { print_error_errno("calmd", "Failed to open input file '%s'", argv[optind]); return 1; } header = sam_hdr_read(fp); if (header == NULL || header->n_targets == 0) { fprintf(stderr, "[bam_fillmd] input SAM does not have header. 
Abort!\n"); goto fail; } fpout = sam_open_format("-", mode_w, &ga.out); if (fpout == NULL) { print_error_errno("calmd", "Failed to open output"); goto fail; } if (sam_hdr_write(fpout, header) < 0) { print_error_errno("calmd", "Failed to write sam header"); goto fail; } ref_file = argc > optind + 1 ? argv[optind+1] : ga.reference; fai = fai_load(ref_file); if (!fai) { print_error_errno("calmd", "Failed to open reference file '%s'", ref_file); goto fail; } b = bam_init1(); if (!b) { fprintf(stderr, "[bam_fillmd] Failed to allocate bam struct\n"); goto fail; } while ((ret = sam_read1(fp, header, b)) >= 0) { if (b->core.tid >= 0) { if (tid != b->core.tid) { free(ref); ref = fai_fetch(fai, header->target_name[b->core.tid], &len); tid = b->core.tid; if (ref == 0) { // FIXME: Should this always be fatal? fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n", header->target_name[tid]); if (is_realn || capQ > 10) goto fail; // Would otherwise crash } } if (is_realn) sam_prob_realn(b, ref, len, baq_flag); if (capQ > 10) { int q = sam_cap_mapq(b, ref, len, capQ); if (b->core.qual > q) b->core.qual = q; } if (ref) bam_fillmd1_core(b, ref, len, flt_flag, max_nm); } if (sam_write1(fpout, header, b) < 0) { print_error_errno("calmd", "failed to write to output file"); goto fail; } } if (ret < -1) { fprintf(stderr, "[bam_fillmd] Error reading input.\n"); goto fail; } bam_destroy1(b); bam_hdr_destroy(header); free(ref); fai_destroy(fai); sam_close(fp); if (sam_close(fpout) < 0) { fprintf(stderr, "[bam_fillmd] error when closing output file\n"); return 1; } return 0; fail: free(ref); if (b) bam_destroy1(b); if (header) bam_hdr_destroy(header); if (fai) fai_destroy(fai); if (fp) sam_close(fp); if (fpout) sam_close(fpout); return 1; }
/**
 * Collects GC-content samples for fragments at observed (foreground) read
 * positions and at randomly drawn (background) positions.
 *
 * Visible behaviour: loads the FASTA index for ref_filename (aborting through
 * Logger::abort when it cannot be opened), dumps at most 10000000 positions
 * from foreground_position_table, sorts them with ReadPosSeqnameCmp and walks
 * them, fetching and lower-casing the reference sequence whenever the
 * sequence name changes. Positions are skipped when the sequence is missing,
 * shorter than median_frag_len, or when the position count exceeds 4. Each
 * sample is pushed as a (GC fraction over median_frag_len bases, 1 / sb)
 * pair, where sb is the product of the two seqbias estimates, or 1.0 when
 * seqbias[0] is null.
 *
 * NOTE(review): seqbias[1] is dereferenced whenever seqbias[0] is non-null -
 * confirm callers always supply both or neither.
 * NOTE(review): the sampling range [start + L, end - median_frag_len] is not
 * checked for start + L <= end - median_frag_len before the distribution is
 * constructed - confirm this holds for every dumped position.
 * NOTE(review): this view of the constructor is truncated inside a disabled
 * "#if 0" debugging block; whatever follows is not documented here.
 */
GCBias::GCBias(const char* ref_filename, PosTable& foreground_position_table, pos_t median_frag_len, sequencing_bias* seqbias[2], const char* task_name) { faidx_t* ref_file = fai_load(ref_filename); if (!ref_file) { Logger::abort("Can't open fasta file '%s'.", ref_filename); } std::vector<ReadPos> foreground_positions; const size_t max_dump = 10000000; foreground_position_table.dump(foreground_positions, max_dump); std::sort(foreground_positions.begin(), foreground_positions.end(), ReadPosSeqnameCmp()); Logger::push_task(task_name, foreground_positions.size()); LoggerTask& task = Logger::get_task(task_name); typedef std::pair<float, float> WeightedGC; std::vector<WeightedGC> foreground_gc, background_gc; int seqlen = 0; SeqName curr_seqname; char* seq = NULL; twobitseq tbseq; twobitseq tbseqrc; rng_t rng; pos_t L = seqbias[0] ? seqbias[0]->getL() : 0; std::vector<ReadPos>::iterator i; for (i = foreground_positions.begin(); i != foreground_positions.end(); ++i) { if (i->seqname != curr_seqname) { free(seq); seq = faidx_fetch_seq(ref_file, i->seqname.get().c_str(), 0, INT_MAX, &seqlen); Logger::debug("read sequence %s.", i->seqname.get().c_str()); if (seq == NULL) { Logger::warn("warning: reference sequence not found, skipping."); } else { for (char* c = seq; *c; c++) *c = tolower(*c); tbseq = seq; tbseqrc = tbseq; tbseqrc.revcomp(); } curr_seqname = i->seqname; } if (seq == NULL || (pos_t) tbseq.size() < median_frag_len) continue; // fragments with many copies tend to have too much weight when training // leading to somewhat less than stable results. if (i->count > 4) continue; // sample background position boost::random::uniform_int_distribution<pos_t> random_uniform( i->start + L, i->end - median_frag_len); pos_t pos = random_uniform(rng); float gc = (float) gc_count(seq + pos, median_frag_len) / median_frag_len; float sb = seqbias[0] ? 
seqbias[0]->get_bias(tbseq, pos - L) * seqbias[1]->get_bias(tbseqrc, seqlen - pos - 1 - L) : 1.0; background_gc.push_back(WeightedGC(gc, 1.0 / sb)); // sample foreground position if (i->strand == 0) { if (i->pos >= i->start && i->pos + median_frag_len - 1 <= i->end) { float sb = seqbias[0] ? seqbias[0]->get_bias(tbseq, i->pos - L) * seqbias[1]->get_bias(tbseqrc, seqlen - i->pos - 1 - L) : 1.0; foreground_gc.push_back( WeightedGC((float) gc_count(seq + i->pos, median_frag_len) / median_frag_len, 1.0 / sb)); } } else { if (i->pos - median_frag_len >= i->start && i->pos <= i->end) { float sb = seqbias[0] ? seqbias[0]->get_bias(tbseq, i->pos - median_frag_len - L) * seqbias[1]->get_bias(tbseqrc, seqlen - i->pos - median_frag_len - 1 - L) : 1.0; foreground_gc.push_back( WeightedGC((float) gc_count(seq + i->pos - median_frag_len, median_frag_len) / median_frag_len, 1.0 /sb)); } } task.inc(); } free(seq); fai_destroy(ref_file); #if 0 FILE* out = fopen("gcbias.tsv", "w"); fprintf(out, "group\tgc\tweight\n"); BOOST_FOREACH (WeightedGC& value, foreground_gc) { fprintf(out, "foreground\t%f\t%f\n", (double) value.first, (double) value.second); }
int scorereads_main(int argc, char** argv) { parse_scorereads_options(argc, argv); omp_set_num_threads(opt::num_threads); Fast5Map name_map(opt::reads_file); ModelMap models; if (!opt::models_fofn.empty()) models = read_models_fofn(opt::models_fofn); // Open the BAM and iterate over reads // load bam file htsFile* bam_fh = sam_open(opt::bam_file.c_str(), "r"); assert(bam_fh != NULL); // load bam index file std::string index_filename = opt::bam_file + ".bai"; hts_idx_t* bam_idx = bam_index_load(index_filename.c_str()); assert(bam_idx != NULL); // read the bam header bam_hdr_t* hdr = sam_hdr_read(bam_fh); // load reference fai file faidx_t *fai = fai_load(opt::genome_file.c_str()); hts_itr_t* itr; // If processing a region of the genome, only emit events aligned to this window int clip_start = -1; int clip_end = -1; if(opt::region.empty()) { // TODO: is this valid? itr = sam_itr_queryi(bam_idx, HTS_IDX_START, 0, 0); } else { fprintf(stderr, "Region: %s\n", opt::region.c_str()); itr = sam_itr_querys(bam_idx, hdr, opt::region.c_str()); hts_parse_reg(opt::region.c_str(), &clip_start, &clip_end); } #ifndef H5_HAVE_THREADSAFE if(opt::num_threads > 1) { fprintf(stderr, "You enabled multi-threading but you do not have a threadsafe HDF5\n"); fprintf(stderr, "Please recompile nanopolish's built-in libhdf5 or run with -t 1\n"); exit(1); } #endif // Initialize iteration std::vector<bam1_t*> records(opt::batch_size, NULL); for(size_t i = 0; i < records.size(); ++i) { records[i] = bam_init1(); } int result; size_t num_reads_realigned = 0; size_t num_records_buffered = 0; do { assert(num_records_buffered < records.size()); // read a record into the next slot in the buffer result = sam_itr_next(bam_fh, itr, records[num_records_buffered]); num_records_buffered += result >= 0; // realign if we've hit the max buffer size or reached the end of file if(num_records_buffered == records.size() || result < 0) { #pragma omp parallel for schedule(dynamic) for(size_t i = 0; i < 
num_records_buffered; ++i) { bam1_t* record = records[i]; size_t read_idx = num_reads_realigned + i; if( (record->core.flag & BAM_FUNMAP) == 0) { //load read std::string read_name = bam_get_qname(record); std::string fast5_path = name_map.get_path(read_name); SquiggleRead sr(read_name, fast5_path); // TODO: early exit when have processed all of the reads in readnames if (!opt::readnames.empty() && std::find(opt::readnames.begin(), opt::readnames.end(), read_name) == opt::readnames.end() ) continue; for(size_t strand_idx = 0; strand_idx < NUM_STRANDS; ++strand_idx) { std::vector<EventAlignment> ao = alignment_from_read(sr, strand_idx, read_idx, models, fai, hdr, record, clip_start, clip_end); if (ao.size() == 0) continue; // Update pore model based on alignment if ( opt::calibrate ) recalibrate_model(sr, strand_idx, ao, false); double score = model_score(sr, strand_idx, fai, ao, 500); if (score > 0) continue; #pragma omp critical(print) std::cout << read_name << " " << ( strand_idx ? "complement" : "template" ) << " " << sr.pore_model[strand_idx].name << " " << score << std::endl; } } } num_reads_realigned += num_records_buffered; num_records_buffered = 0; } } while(result >= 0); // cleanup records for(size_t i = 0; i < records.size(); ++i) { bam_destroy1(records[i]); } // cleanup sam_itr_destroy(itr); bam_hdr_destroy(hdr); fai_destroy(fai); sam_close(bam_fh); hts_idx_destroy(bam_idx); return 0; }
int main(int argc, char *argv[]) { int c; const char* normal_sample_id = _default_normal_sample_id; const char* tumor_sample_id = _default_tumor_sample_id; const char *fn_fa = 0; pu_data2_t *d = (pu_data2_t*)calloc(1, sizeof(pu_data2_t)); d->min_somatic_qual=15; d->tid = -1; d->mask = BAM_DEF_MASK; d->mapQ = 0; d->c = sniper_maqcns_init(); int use_priors = 1; d->include_loh = 1; d->include_gor = 1; d->use_joint_priors = 0; d->somatic_mutation_rate = 0.01; const char *output_format = "classic"; while ((c = getopt(argc, argv, "n:t:vf:T:N:r:I:q:Q:pLGJs:F:")) >= 0) { switch (c) { case 'f': fn_fa = optarg; break; case 'T': d->c->theta = atof(optarg); break; case 'N': d->c->n_hap = atoi(optarg); break; case 'r': d->c->het_rate = atof(optarg); break; case 'q': d->mapQ = atoi(optarg); break; case 'Q': d->min_somatic_qual = atoi(optarg); break; case 'F': output_format = optarg; break; case 'p': use_priors = 0; break; case 'J': d->use_joint_priors = 1; break; case 's': d->somatic_mutation_rate = atof(optarg); d->use_joint_priors = 1; break; case 'v': version_info(); exit(0); break; case 't': tumor_sample_id = optarg; break; case 'n': normal_sample_id = optarg; break; case 'L': d->include_loh = 0; break; case 'G': d->include_gor = 0; break; default: fprintf(stderr, "Unrecognizd option '-%c'.\n", c); return 1; } } if (optind == argc) { usage(argv[0], d); sniper_maqcns_destroy(d->c); free(d); return 1; } if (fn_fa) { d->fai = fai_load(fn_fa); } else { fprintf(stderr, "You MUST specify a reference sequence. It isn't optional.\n"); sniper_maqcns_destroy(d->c); free(d); exit(1); } if(d->use_joint_priors) { fprintf(stderr,"Using priors accounting for somatic mutation rate. 
Prior probability of a somatic mutation is %f\n",d->somatic_mutation_rate); make_joint_prior(d->somatic_mutation_rate); } sniper_maqcns_prepare(d->c); fprintf(stderr,"Preparing to snipe some somatics\n"); if(use_priors) { fprintf(stderr,"Using prior probabilities\n"); makeSoloPrior(); } bamFile fp1, fp2; qAddTableInit(); fp1 = (strcmp(argv[optind], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[optind], "r"); fprintf(stderr, "Normal bam is %s\n", argv[optind+1]); fprintf(stderr, "Tumor bam is %s\n", argv[optind]); d->h1 = bam_header_read(fp1); sam_header_parse_rg(d->h1); fp2 = bam_open(argv[optind+1], "r"); d->h2 = bam_header_read(fp2); sam_header_parse_rg(d->h2); FILE* snp_fh = fopen(argv[optind+2], "w"); /* this will exit if the format name is invalid */ output_formatter_t fmt = output_formatter_create(output_format, snp_fh); d->output_formatter = &fmt; if(snp_fh) { header_data_t hdr; hdr.refseq = fn_fa; hdr.normal_sample_id = normal_sample_id; hdr.tumor_sample_id = tumor_sample_id; d->output_formatter->header_fn(snp_fh, &hdr); bam_sspileup_file(fp1, fp2, d->mask, d->mapQ, glf_somatic, d, snp_fh); } else { fprintf(stderr, "Unable to open snp file!!!!!!!!!\n"); exit(1); } bam_close(fp1); bam_close(fp2); bam_header_destroy(d->h1); bam_header_destroy(d->h2); if (d->fai) fai_destroy(d->fai); sniper_maqcns_destroy(d->c); free(d->ref); free(d); fclose(snp_fh); return 0; }
void train_one_round(const Fast5Map& name_map, size_t round) { const PoreModelMap& current_models = PoreModelSet::get_models(opt::trained_model_type); // Initialize the training summary stats for each kmer for each model ModelTrainingMap model_training_data; for(auto current_model_iter = current_models.begin(); current_model_iter != current_models.end(); current_model_iter++) { // one summary entry per kmer in the model std::vector<StateSummary> summaries(current_model_iter->second.get_num_states()); model_training_data[current_model_iter->first] = summaries; } // Open the BAM and iterate over reads // load bam file htsFile* bam_fh = sam_open(opt::bam_file.c_str(), "r"); assert(bam_fh != NULL); // load bam index file std::string index_filename = opt::bam_file + ".bai"; hts_idx_t* bam_idx = bam_index_load(index_filename.c_str()); assert(bam_idx != NULL); // read the bam header bam_hdr_t* hdr = sam_hdr_read(bam_fh); // load reference fai file faidx_t *fai = fai_load(opt::genome_file.c_str()); hts_itr_t* itr; // If processing a region of the genome, only emit events aligned to this window int clip_start = -1; int clip_end = -1; if(opt::region.empty()) { // TODO: is this valid? 
itr = sam_itr_queryi(bam_idx, HTS_IDX_START, 0, 0); } else { fprintf(stderr, "Region: %s\n", opt::region.c_str()); itr = sam_itr_querys(bam_idx, hdr, opt::region.c_str()); hts_parse_reg(opt::region.c_str(), &clip_start, &clip_end); } #ifndef H5_HAVE_THREADSAFE if(opt::num_threads > 1) { fprintf(stderr, "You enabled multi-threading but you do not have a threadsafe HDF5\n"); fprintf(stderr, "Please recompile nanopolish's built-in libhdf5 or run with -t 1\n"); exit(1); } #endif // Initialize iteration std::vector<bam1_t*> records(opt::batch_size, NULL); for(size_t i = 0; i < records.size(); ++i) { records[i] = bam_init1(); } int result; size_t num_reads_realigned = 0; size_t num_records_buffered = 0; Progress progress("[methyltrain]"); do { assert(num_records_buffered < records.size()); // read a record into the next slot in the buffer result = sam_itr_next(bam_fh, itr, records[num_records_buffered]); num_records_buffered += result >= 0; // realign if we've hit the max buffer size or reached the end of file if(num_records_buffered == records.size() || result < 0) { #pragma omp parallel for for(size_t i = 0; i < num_records_buffered; ++i) { bam1_t* record = records[i]; size_t read_idx = num_reads_realigned + i; if( (record->core.flag & BAM_FUNMAP) == 0) { add_aligned_events(name_map, fai, hdr, record, read_idx, clip_start, clip_end, round, model_training_data); } } num_reads_realigned += num_records_buffered; num_records_buffered = 0; } if(opt::progress) { fprintf(stderr, "Realigned %zu reads in %.1lfs\r", num_reads_realigned, progress.get_elapsed_seconds()); } } while(result >= 0); assert(num_records_buffered == 0); progress.end(); // open the summary file std::stringstream summary_fn; summary_fn << "methyltrain" << opt::out_suffix << ".summary"; FILE* summary_fp = fopen(summary_fn.str().c_str(), "w"); fprintf(summary_fp, "model_short_name\tkmer\tnum_matches\tnum_skips\t" "num_stays\tnum_events_for_training\twas_trained\t" "trained_level_mean\ttrained_level_stdv\n"); 
// open the tsv file with the raw training data std::stringstream training_fn; training_fn << "methyltrain" << opt::out_suffix << ".round" << round << ".events.tsv"; std::ofstream training_ofs(training_fn.str()); // write out a header for the training data StateTrainingData::write_header(training_ofs); // iterate over models: template, complement_pop1, complement_pop2 for(auto model_training_iter = model_training_data.begin(); model_training_iter != model_training_data.end(); model_training_iter++) { // Initialize the trained model from the input model auto current_model_iter = current_models.find(model_training_iter->first); assert(current_model_iter != current_models.end()); std::string model_name = model_training_iter->first; std::string model_short_name = current_model_iter->second.metadata.get_short_name(); // Initialize the new model from the current model PoreModel updated_model = current_model_iter->second; uint32_t k = updated_model.k; const std::vector<StateSummary>& summaries = model_training_iter->second; // Generate the complete set of kmers std::string gen_kmer(k, 'A'); std::vector<std::string> all_kmers; for(size_t ki = 0; ki < summaries.size(); ++ki) { all_kmers.push_back(gen_kmer); mtrain_alphabet->lexicographic_next(gen_kmer); } assert(gen_kmer == std::string(k, 'A')); assert(all_kmers.front() == std::string(k, 'A')); assert(all_kmers.back() == std::string(k, 'T')); // Update means for each kmer #pragma omp parallel for for(size_t ki = 0; ki < summaries.size(); ++ki) { assert(ki < all_kmers.size()); std::string kmer = all_kmers[ki]; // write the observed values to a tsv file #pragma omp critical { for(size_t ei = 0; ei < summaries[ki].events.size(); ++ei) { summaries[ki].events[ei].write_tsv(training_ofs, model_short_name, kmer); } } bool is_m_kmer = kmer.find('M') != std::string::npos; bool update_kmer = opt::training_target == TT_ALL_KMERS || (is_m_kmer && opt::training_target == TT_METHYLATED_KMERS) || (!is_m_kmer && opt::training_target == 
TT_UNMETHYLATED_KMERS); bool trained = false; // only train if there are a sufficient number of events for this kmer if(update_kmer && summaries[ki].events.size() >= opt::min_number_of_events_to_train) { // train a mixture model where a minority of k-mers aren't methylated ParamMixture mixture; float incomplete_methylation_rate = 0.05f; std::string um_kmer = mtrain_alphabet->unmethylate(kmer); size_t um_ki = mtrain_alphabet->kmer_rank(um_kmer.c_str(), k); // Initialize the training parameters. If this is a kmer containing // a methylation site we train a two component mixture, otherwise // just fit a gaussian float major_weight = is_m_kmer ? 1 - incomplete_methylation_rate : 1.0f; mixture.log_weights.push_back(log(major_weight)); mixture.params.push_back(current_model_iter->second.get_parameters(ki)); if(is_m_kmer) { // add second unmethylated component mixture.log_weights.push_back(std::log(incomplete_methylation_rate)); mixture.params.push_back(current_model_iter->second.get_parameters(um_ki)); } if(opt::verbose > 1) { fprintf(stderr, "INIT__MIX %s\t%s\t[%.2lf %.2lf %.2lf]\t[%.2lf %.2lf %.2lf]\n", model_training_iter->first.c_str(), kmer.c_str(), std::exp(mixture.log_weights[0]), mixture.params[0].level_mean, mixture.params[0].level_stdv, std::exp(mixture.log_weights[1]), mixture.params[1].level_mean, mixture.params[1].level_stdv); } ParamMixture trained_mixture = train_gaussian_mixture(summaries[ki].events, mixture); if(opt::verbose > 1) { fprintf(stderr, "TRAIN_MIX %s\t%s\t[%.2lf %.2lf %.2lf]\t[%.2lf %.2lf %.2lf]\n", model_training_iter->first.c_str(), kmer.c_str(), std::exp(trained_mixture.log_weights[0]), trained_mixture.params[0].level_mean, trained_mixture.params[0].level_stdv, std::exp(trained_mixture.log_weights[1]), trained_mixture.params[1].level_mean, trained_mixture.params[1].level_stdv); } #pragma omp critical updated_model.states[ki] = trained_mixture.params[0]; if (model_stdv()) { ParamMixture ig_mixture; // weights ig_mixture.log_weights = 
trained_mixture.log_weights; // states ig_mixture.params.emplace_back(trained_mixture.params[0]); if(is_m_kmer) { ig_mixture.params.emplace_back(current_model_iter->second.get_parameters(um_ki)); } // run training auto trained_ig_mixture = train_invgaussian_mixture(summaries[ki].events, ig_mixture); LOG("methyltrain", debug) << "IG_INIT__MIX " << model_training_iter->first.c_str() << " " << kmer.c_str() << " [" << std::fixed << std::setprecision(5) << ig_mixture.params[0].sd_mean << " " << ig_mixture.params[1].sd_mean << "]" << std::endl << "IG_TRAIN_MIX " << model_training_iter->first.c_str() << " " << kmer.c_str() << " [" << trained_ig_mixture.params[0].sd_mean << " " << trained_ig_mixture.params[1].sd_mean << "]" << std::endl; // update state #pragma omp critical { updated_model.states[ki] = trained_ig_mixture.params[0]; } } trained = true; } #pragma omp critical { fprintf(summary_fp, "%s\t%s\t%d\t%d\t%d\t%zu\t%d\t%.2lf\t%.2lf\n", model_short_name.c_str(), kmer.c_str(), summaries[ki].num_matches, summaries[ki].num_skips, summaries[ki].num_stays, summaries[ki].events.size(), trained, updated_model.states[ki].level_mean, updated_model.states[ki].level_stdv); } // add the updated model into the collection (or replace what is already there) PoreModelSet::insert_model(opt::trained_model_type, updated_model); } } // cleanup records for(size_t i = 0; i < records.size(); ++i) { bam_destroy1(records[i]); } // cleanup sam_itr_destroy(itr); bam_hdr_destroy(hdr); fai_destroy(fai); sam_close(bam_fh); hts_idx_destroy(bam_idx); fclose(summary_fp); }
int extract_main(int argc, char *argv[]) { char *opref = NULL, *oname, *p; int c, i; Config config; //Defaults config.keepCpG = 1; config.keepCHG = 0; config.keepCHH = 0; config.minMapq = 10; config.minPhred = 5; config.keepDupes = 0; config.keepSingleton = 0, config.keepDiscordant = 0; config.merge = 0; config.maxDepth = 2000; config.fai = NULL; config.fp = NULL; config.bai = NULL; config.reg = NULL; config.bedName = NULL; config.bed = NULL; config.fraction = 0; config.counts = 0; config.logit = 0; for(i=0; i<16; i++) config.bounds[i] = 0; static struct option lopts[] = { {"opref", 1, NULL, 'o'}, {"fraction", 0, NULL, 'f'}, {"counts", 0, NULL, 'c'}, {"logit", 0, NULL, 'm'}, {"noCpG", 0, NULL, 1}, {"CHG", 0, NULL, 2}, {"CHH", 0, NULL, 3}, {"keepDupes", 0, NULL, 4}, {"keepSingleton",0, NULL, 5}, {"keepDiscordant",0,NULL, 6}, {"OT", 1, NULL, 7}, {"OB", 1, NULL, 8}, {"CTOT", 1, NULL, 9}, {"CTOB", 1, NULL, 10}, {"mergeContext", 0, NULL, 11}, {"help", 0, NULL, 'h'}, {0, 0, NULL, 0} }; while((c = getopt_long(argc, argv, "q:p:r:l:o:D:f:c:m:", lopts,NULL)) >=0) { switch(c) { case 'h' : extract_usage(); return 0; case 'o' : opref = strdup(optarg); break; case 'D' : config.maxDepth = atoi(optarg); break; case 'r': config.reg = strdup(optarg); break; case 'l' : config.bedName = optarg; break; case 1 : config.keepCpG = 0; break; case 2 : config.keepCHG = 1; break; case 3 : config.keepCHH = 1; break; case 4 : config.keepDupes = 1; break; case 5 : config.keepSingleton = 1; break; case 6 : config.keepDiscordant = 1; break; case 7 : parseBounds(optarg, config.bounds, 0); break; case 8 : parseBounds(optarg, config.bounds, 1); break; case 9 : parseBounds(optarg, config.bounds, 2); break; case 10 : parseBounds(optarg, config.bounds, 3); break; case 11 : config.merge = 1; break; case 'q' : config.minMapq = atoi(optarg); break; case 'p' : config.minPhred = atoi(optarg); break; case 'm' : config.logit = 1; break; case 'f' : config.fraction = 1; break; case 'c' : config.counts = 1; 
break; case '?' : default : fprintf(stderr, "Invalid option '%c'\n", c); extract_usage(); return 1; } } if(argc == 1) { extract_usage(); return 0; } if(argc-optind != 2) { fprintf(stderr, "You must supply a reference genome in fasta format and an input BAM file!!!\n"); extract_usage(); return -1; } //Are the options reasonable? if(config.minPhred < 1) { fprintf(stderr, "-p %i is invalid. resetting to 1, which is the lowest possible value.\n", config.minPhred); config.minPhred = 1; } if(config.minMapq < 0) { fprintf(stderr, "-q %i is invalid. Resetting to 0, which is the lowest possible value.\n", config.minMapq); config.minMapq = 0; } if(config.fraction+config.counts+config.logit > 1) { fprintf(stderr, "More than one of --fraction, --counts, and --logit were specified. These are mutually exclusive.\n"); extract_usage(); return 1; } //Has more than one output format been requested? if(config.fraction + config.counts + config.logit > 1) { fprintf(stderr, "You may specify AT MOST one of -c/--counts, -f/--fraction, or -m/--logit.\n"); return -6; } //Is there still a metric to output? if(!(config.keepCpG + config.keepCHG + config.keepCHH)) { fprintf(stderr, "You haven't specified any metrics to output!\nEither don't use the --noCpG option or specify --CHG and/or --CHH.\n"); return -1; } //Open the files if((config.fai = fai_load(argv[optind])) == NULL) { fprintf(stderr, "Couldn't open the index for %s!\n", argv[optind]); extract_usage(); return -2; } if((config.fp = hts_open(argv[optind+1], "rb")) == NULL) { fprintf(stderr, "Couldn't open %s for reading!\n", argv[optind+1]); return -4; } if((config.bai = sam_index_load(config.fp, argv[optind+1])) == NULL) { fprintf(stderr, "Couldn't load the index for %s, will attempt to build it.\n", argv[optind+1]); if(bam_index_build(argv[optind+1], 0) < 0) { fprintf(stderr, "Couldn't build the index for %s! 
File corrupted?\n", argv[optind+1]); return -5; } if((config.bai = sam_index_load(config.fp, argv[optind+1])) == NULL) { fprintf(stderr, "Still couldn't load the index, quiting.\n"); return -5; } } //Output files config.output_fp = malloc(sizeof(FILE *) * 3); assert(config.output_fp); if(opref == NULL) { opref = strdup(argv[optind+1]); assert(opref); p = strrchr(opref, '.'); if(p != NULL) *p = '\0'; fprintf(stderr, "writing to prefix:'%s'\n", opref); } if(config.fraction) { oname = malloc(sizeof(char) * (strlen(opref)+19)); } else if(config.counts) { oname = malloc(sizeof(char) * (strlen(opref)+21)); } else if(config.logit) { oname = malloc(sizeof(char) * (strlen(opref)+20)); } else { oname = malloc(sizeof(char) * (strlen(opref)+14)); } assert(oname); if(config.keepCpG) { if(config.fraction) { sprintf(oname, "%s_CpG.meth.bedGraph", opref); } else if(config.counts) { sprintf(oname, "%s_CpG.counts.bedGraph", opref); } else if(config.logit) { sprintf(oname, "%s_CpG.logit.bedGraph", opref); } else { sprintf(oname, "%s_CpG.bedGraph", opref); } config.output_fp[0] = fopen(oname, "w"); if(config.output_fp[0] == NULL) { fprintf(stderr, "Couldn't open the output CpG metrics file for writing! Insufficient permissions?\n"); return -3; } printHeader(config.output_fp[0], "CpG", opref, config); } if(config.keepCHG) { if(config.fraction) { sprintf(oname, "%s_CHG.meth.bedGraph", opref); } else if(config.counts) { sprintf(oname, "%s_CHG.counts.bedGraph", opref); } else if(config.logit) { sprintf(oname, "%s_CHG.logit.bedGraph", opref); } else { sprintf(oname, "%s_CHG.bedGraph", opref); } config.output_fp[1] = fopen(oname, "w"); if(config.output_fp[1] == NULL) { fprintf(stderr, "Couldn't open the output CHG metrics file for writing! 
Insufficient permissions?\n"); return -3; } printHeader(config.output_fp[1], "CHG", opref, config); } if(config.keepCHH) { if(config.fraction) { sprintf(oname, "%s_CHH.meth.bedGraph", opref); } else if(config.counts) { sprintf(oname, "%s_CHH.counts.bedGraph", opref); } else if(config.logit) { sprintf(oname, "%s_CHH.logit.bedGraph", opref); } else { sprintf(oname, "%s_CHH.bedGraph", opref); } config.output_fp[2] = fopen(oname, "w"); if(config.output_fp[2] == NULL) { fprintf(stderr, "Couldn't open the output CHH metrics file for writing! Insufficient permissions?\n"); return -3; } printHeader(config.output_fp[2], "CHH", opref, config); } //Run the pileup extractCalls(&config); //Close things up hts_close(config.fp); fai_destroy(config.fai); if(config.keepCpG) fclose(config.output_fp[0]); if(config.keepCHG) fclose(config.output_fp[1]); if(config.keepCHH) fclose(config.output_fp[2]); hts_idx_destroy(config.bai); free(opref); if(config.reg) free(config.reg); if(config.bed) destroyBED(config.bed); free(oname); free(config.output_fp); return 0; }
int main_pad2unpad(int argc, char *argv[]) { samFile *in = 0, *out = 0; bam_hdr_t *h = 0, *h_fix = 0; faidx_t *fai = 0; int c, compress_level = -1, is_long_help = 0; char in_mode[5], out_mode[6], *fn_out = 0, *fn_list = 0; int ret=0; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 'T'), { NULL, 0, NULL, 0 } }; /* parse command-line options */ strcpy(in_mode, "r"); strcpy(out_mode, "w"); while ((c = getopt_long(argc, argv, "SCso:u1T:?", lopts, NULL)) >= 0) { switch (c) { case 'S': break; case 'C': hts_parse_format(&ga.out, "cram"); break; case 's': assert(compress_level == -1); hts_parse_format(&ga.out, "sam"); break; case 'o': fn_out = strdup(optarg); break; case 'u': compress_level = 0; if (ga.out.format == unknown_format) hts_parse_format(&ga.out, "bam"); break; case '1': compress_level = 1; if (ga.out.format == unknown_format) hts_parse_format(&ga.out, "bam"); break; case '?': is_long_help = 1; break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n\n", c); return usage(is_long_help); } } if (argc == optind) return usage(is_long_help); strcat(out_mode, "h"); if (compress_level >= 0) { char tmp[2]; tmp[0] = compress_level + '0'; tmp[1] = '\0'; strcat(out_mode, tmp); } // Load FASTA reference (also needed for SAM -> BAM if missing header) if (ga.reference) { fn_list = samfaipath(ga.reference); fai = fai_load(ga.reference); } // open file handlers if ((in = sam_open_format(argv[optind], in_mode, &ga.in)) == 0) { fprintf(stderr, "[depad] failed to open \"%s\" for reading.\n", argv[optind]); ret = 1; goto depad_end; } if (fn_list && hts_set_fai_filename(in, fn_list) != 0) { fprintf(stderr, "[depad] failed to load reference file \"%s\".\n", fn_list); ret = 1; goto depad_end; } if ((h = sam_hdr_read(in)) == 0) { fprintf(stderr, "[depad] failed to read the header from \"%s\".\n", argv[optind]); ret = 1; goto depad_end; } 
if (fai) { h_fix = fix_header(h, fai); } else { fprintf(stderr, "[depad] Warning - reference lengths will not be corrected without FASTA reference\n"); h_fix = h; } char wmode[2]; strcat(out_mode, sam_open_mode(wmode, fn_out, NULL)==0 ? wmode : "b"); if ((out = sam_open_format(fn_out? fn_out : "-", out_mode, &ga.out)) == 0) { fprintf(stderr, "[depad] failed to open \"%s\" for writing.\n", fn_out? fn_out : "standard output"); ret = 1; goto depad_end; } // Reference-based CRAM won't work unless we also create a new reference. // We could embed this, but for now we take the easy option. if (ga.out.format == cram) hts_set_opt(out, CRAM_OPT_NO_REF, 1); if (sam_hdr_write(out, h_fix) != 0) { fprintf(stderr, "[depad] failed to write header.\n"); ret = 1; goto depad_end; } // Do the depad ret = bam_pad2unpad(in, out, h, fai); depad_end: // close files, free and return if (fai) fai_destroy(fai); if (h) bam_hdr_destroy(h); sam_close(in); sam_close(out); free(fn_list); free(fn_out); return ret; }
int main(int argc, char **argv) { htsFile *in = NULL; htsFile *out = NULL; char *in_name = "-"; char *out_name = "-"; char *ref_name = NULL; char *ref_seq = NULL; char modew[8] = "w"; faidx_t *fai = NULL; bam_hdr_t *hdr = NULL; bam1_t *rec = NULL; int c, res, last_ref = -1, ref_len = 0; int adjust = 0, extended = 0, recalc = 0, flags = 0; while ((c = getopt(argc, argv, "aef:hi:o:r")) >= 0) { switch (c) { case 'a': adjust = 1; break; case 'e': extended = 1; break; case 'f': ref_name = optarg; break; case 'h': usage(argv[0]); return EXIT_SUCCESS; case 'i': in_name = optarg; break; case 'o': out_name = optarg; break; case 'r': recalc = 1; break; default: usage(argv[0]); return EXIT_FAILURE; } } if (!ref_name) { usage(argv[0]); return EXIT_FAILURE; } flags = (adjust ? 1 : 0) | (extended ? 2 : 0) | (recalc ? 4 : 0); fai = fai_load(ref_name); if (!fai) { fprintf(stderr, "Couldn't load reference %s\n", ref_name); goto fail; } rec = bam_init1(); if (!rec) { perror(NULL); goto fail; } in = hts_open(in_name, "r"); if (!in) { fprintf(stderr, "Couldn't open %s : %s\n", in_name, strerror(errno)); goto fail; } hdr = sam_hdr_read(in); if (!hdr) { fprintf(stderr, "Couldn't read header for %s\n", in_name); goto fail; } out = hts_open(out_name, modew); if (!out) { fprintf(stderr, "Couldn't open %s : %s\n", out_name, strerror(errno)); goto fail; } if (sam_hdr_write(out, hdr) < 0) { fprintf(stderr, "Couldn't write header to %s : %s\n", out_name, strerror(errno)); goto fail; } while ((res = sam_read1(in, hdr, rec)) >= 0) { if (rec->core.tid >= hdr->n_targets) { fprintf(stderr, "Invalid BAM reference id %d\n", rec->core.tid); goto fail; } if (last_ref != rec->core.tid && rec->core.tid >= 0) { free(ref_seq); ref_seq = faidx_fetch_seq(fai, hdr->target_name[rec->core.tid], 0, INT_MAX, &ref_len); if (!ref_seq) { fprintf(stderr, "Couldn't get reference %s\n", hdr->target_name[rec->core.tid]); goto fail; } last_ref = rec->core.tid; } if (rec->core.tid >= 0) { res = sam_prob_realn(rec, 
ref_seq, ref_len, flags); if (res <= -4) { fprintf(stderr, "Error running sam_prob_realn : %s\n", strerror(errno)); goto fail; } } if (sam_write1(out, hdr, rec) < 0) { fprintf(stderr, "Error writing to %s\n", out_name); goto fail; } } res = hts_close(in); in = NULL; if (res < 0) { fprintf(stderr, "Error closing %s\n", in_name); goto fail; } res = hts_close(out); out = NULL; if (res < 0) { fprintf(stderr, "Error closing %s\n", out_name); goto fail; } bam_hdr_destroy(hdr); bam_destroy1(rec); free(ref_seq); fai_destroy(fai); return EXIT_SUCCESS; fail: if (hdr) bam_hdr_destroy(hdr); if (rec) bam_destroy1(rec); if (in) hts_close(in); if (out) hts_close(out); free(ref_seq); fai_destroy(fai); return EXIT_FAILURE; }
int main(int argc, const char *argv[]) { // // Parse parameters // Parameters parameters = Parameters(argc, argv); // // Open log file // FILE* logFile = fopen((parameters.outFilePrefix + ".log").c_str(), "w"); Output2FILE::Stream() = logFile; LOG(logINFO) << "HapMuC ver1.1" << std::endl; // // Open reference genome // faidx_t *fai = NULL; fai = fai_load(parameters.refFileName.c_str()); if (!fai) { LOG(logERROR) << "Cannot open reference sequence file." << std::endl; exit(1); } // // Prepare bam files // MyBam tumorBam = MyBam(parameters.tumorBam); MyBam normalBam = MyBam(parameters.normalBam); BamReader tumorBamReader = BamReader(&tumorBam, fai, parameters.maxReads); BamReader normalBamReader = BamReader(&normalBam, fai, parameters.maxReads); // // Prepare main algorithm // MutationCaller mutationCaller = MutationCaller(tumorBamReader, normalBamReader, parameters, fai); // // Open input file and output file // std::ifstream inputWindowStream(parameters.windowFile.c_str()); std::ofstream outStream((parameters.outFilePrefix + ".calls.txt").c_str()); // // Output column names // outStream << MutationCallResult::getHeader() << std::endl; std::string line; while (getline(inputWindowStream, line)) { // // Parse a candidate window // CandidateWindow window = CandidateWindow(line); LOG(logINFO) << "************************* target: " << window.info.chr << " " << window.info.start << " " << window.target << " *************************" << std::endl; try { // // Evaluate whether the candidate somatic mutation exists or not // by calculating Bayes factor // MutationCallResult result = mutationCaller.call(window); // // Output the result // outStream << result.getOutput() << std::endl; LOG(logDEBUG) << result.getOutput() << std::endl; } catch (std::string &s) { LOG(logERROR) << "something unexpected happened. exit." << std::endl; LOG(logERROR) << s << std::endl; exit(1); } } outStream.close(); inputWindowStream.close(); fai_destroy(fai); return 0; }
int methyltest_main(int argc, char** argv) { parse_methyltest_options(argc, argv); omp_set_num_threads(opt::num_threads); Fast5Map name_map(opt::reads_file); ModelMap models = read_models_fofn(opt::models_fofn, mtest_alphabet); // Open the BAM and iterate over reads // load bam file htsFile* bam_fh = sam_open(opt::bam_file.c_str(), "r"); assert(bam_fh != NULL); // load bam index file std::string index_filename = opt::bam_file + ".bai"; hts_idx_t* bam_idx = bam_index_load(index_filename.c_str()); assert(bam_idx != NULL); // read the bam header bam_hdr_t* hdr = sam_hdr_read(bam_fh); // load reference fai file faidx_t *fai = fai_load(opt::genome_file.c_str()); hts_itr_t* itr; // If processing a region of the genome, only emit events aligned to this window int clip_start = -1; int clip_end = -1; if(opt::region.empty()) { // TODO: is this valid? itr = sam_itr_queryi(bam_idx, HTS_IDX_START, 0, 0); } else { fprintf(stderr, "Region: %s\n", opt::region.c_str()); itr = sam_itr_querys(bam_idx, hdr, opt::region.c_str()); hts_parse_reg(opt::region.c_str(), &clip_start, &clip_end); } #ifndef H5_HAVE_THREADSAFE if(opt::num_threads > 1) { fprintf(stderr, "You enabled multi-threading but you do not have a threadsafe HDF5\n"); fprintf(stderr, "Please recompile nanopolish's built-in libhdf5 or run with -t 1\n"); exit(1); } #endif // Initialize writers OutputHandles handles; handles.site_writer = fopen(std::string(opt::bam_file + ".methyltest.sites.bed").c_str(), "w"); handles.read_writer = fopen(std::string(opt::bam_file + ".methyltest.reads.tsv").c_str(), "w"); handles.strand_writer = fopen(std::string(opt::bam_file + ".methyltest.strand.tsv").c_str(), "w"); // Write a header to the reads.tsv file fprintf(handles.read_writer, "name\tsum_ll_ratio\tn_cpg\tcomplement_model\ttags\n"); // strand header fprintf(handles.strand_writer, "name\tsum_ll_ratio\tn_cpg\tmodel\n"); // Initialize iteration std::vector<bam1_t*> records(opt::batch_size, NULL); for(size_t i = 0; i < records.size(); 
++i) { records[i] = bam_init1(); } int result; size_t num_reads_processed = 0; size_t num_records_buffered = 0; Progress progress("[methyltest]"); do { assert(num_records_buffered < records.size()); // read a record into the next slot in the buffer result = sam_itr_next(bam_fh, itr, records[num_records_buffered]); num_records_buffered += result >= 0; // realign if we've hit the max buffer size or reached the end of file if(num_records_buffered == records.size() || result < 0) { #pragma omp parallel for for(size_t i = 0; i < num_records_buffered; ++i) { bam1_t* record = records[i]; size_t read_idx = num_reads_processed + i; if( (record->core.flag & BAM_FUNMAP) == 0) { calculate_methylation_for_read(models, name_map, fai, hdr, record, read_idx, handles); } } num_reads_processed += num_records_buffered; num_records_buffered = 0; } } while(result >= 0); assert(num_records_buffered == 0); progress.end(); // cleanup records for(size_t i = 0; i < records.size(); ++i) { bam_destroy1(records[i]); } // cleanup fclose(handles.site_writer); fclose(handles.read_writer); fclose(handles.strand_writer); sam_itr_destroy(itr); bam_hdr_destroy(hdr); fai_destroy(fai); sam_close(bam_fh); hts_idx_destroy(bam_idx); return EXIT_SUCCESS; }
int bam_mpileup(int argc, char *argv[]) { int c; const char *file_list = NULL; char **fn = NULL; int nfiles = 0, use_orphan = 0; mplp_conf_t mplp; memset(&mplp, 0, sizeof(mplp_conf_t)); #define MPLP_PRINT_POS 0x4000 mplp.max_mq = 60; mplp.min_baseQ = 13; mplp.capQ_thres = 0; mplp.max_depth = 250; mplp.max_indel_depth = 250; mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100; mplp.min_frac = 0.002; mplp.min_support = 1; mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN; while ((c = getopt(argc, argv, "Agf:r:l:M:q:Q:uaRC:BDSd:L:b:P:o:e:h:Im:F:EG:6Os")) >= 0) { switch (c) { case 'f': mplp.fai = fai_load(optarg); if (mplp.fai == 0) return 1; break; case 'd': mplp.max_depth = atoi(optarg); break; case 'r': mplp.reg = strdup(optarg); break; case 'l': mplp.bed = bed_read(optarg); break; case 'P': mplp.pl_list = strdup(optarg); break; case 'g': mplp.flag |= MPLP_GLF; break; case 'u': mplp.flag |= MPLP_NO_COMP | MPLP_GLF; break; case 'a': mplp.flag |= MPLP_NO_ORPHAN | MPLP_REALN; break; case 'B': mplp.flag &= ~MPLP_REALN; break; case 'D': mplp.flag |= MPLP_FMT_DP; break; case 'S': mplp.flag |= MPLP_FMT_SP; break; case 'I': mplp.flag |= MPLP_NO_INDEL; break; case 'E': mplp.flag |= MPLP_EXT_BAQ; break; case '6': mplp.flag |= MPLP_ILLUMINA13; break; case 'R': mplp.flag |= MPLP_IGNORE_RG; break; case 's': mplp.flag |= MPLP_PRINT_MAPQ; break; case 'O': mplp.flag |= MPLP_PRINT_POS; break; case 'C': mplp.capQ_thres = atoi(optarg); break; case 'M': mplp.max_mq = atoi(optarg); break; case 'q': mplp.min_mq = atoi(optarg); break; case 'Q': mplp.min_baseQ = atoi(optarg); break; case 'b': file_list = optarg; break; case 'o': mplp.openQ = atoi(optarg); break; case 'e': mplp.extQ = atoi(optarg); break; case 'h': mplp.tandemQ = atoi(optarg); break; case 'A': use_orphan = 1; break; case 'F': mplp.min_frac = atof(optarg); break; case 'm': mplp.min_support = atoi(optarg); break; case 'L': mplp.max_indel_depth = atoi(optarg); break; case 'G': { FILE *fp_rg; char buf[1024]; mplp.rghash = 
bcf_str2id_init(); if ((fp_rg = fopen(optarg, "r")) == 0) fprintf(stderr, "(%s) Fail to open file %s. Continue anyway.\n", __func__, optarg); while (!feof(fp_rg) && fscanf(fp_rg, "%s", buf) > 0) // this is not a good style, but forgive me... bcf_str2id_add(mplp.rghash, strdup(buf)); fclose(fp_rg); } break; } } if (use_orphan) mplp.flag &= ~MPLP_NO_ORPHAN; if (argc == 1) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: samtools mpileup [options] in1.bam [in2.bam [...]]\n\n"); fprintf(stderr, "Input options:\n\n"); fprintf(stderr, " -6 assume the quality is in the Illumina-1.3+ encoding\n"); fprintf(stderr, " -A count anomalous read pairs\n"); fprintf(stderr, " -B disable BAQ computation\n"); fprintf(stderr, " -b FILE list of input BAM files [null]\n"); fprintf(stderr, " -C INT parameter for adjusting mapQ; 0 to disable [0]\n"); fprintf(stderr, " -d INT max per-BAM depth to avoid excessive memory usage [%d]\n", mplp.max_depth); fprintf(stderr, " -E extended BAQ for higher sensitivity but lower specificity\n"); fprintf(stderr, " -f FILE faidx indexed reference sequence file [null]\n"); fprintf(stderr, " -G FILE exclude read groups listed in FILE [null]\n"); fprintf(stderr, " -l FILE list of positions (chr pos) or regions (BED) [null]\n"); fprintf(stderr, " -M INT cap mapping quality at INT [%d]\n", mplp.max_mq); fprintf(stderr, " -r STR region in which pileup is generated [null]\n"); fprintf(stderr, " -R ignore RG tags\n"); fprintf(stderr, " -q INT skip alignments with mapQ smaller than INT [%d]\n", mplp.min_mq); fprintf(stderr, " -Q INT skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp.min_baseQ); fprintf(stderr, "\nOutput options:\n\n"); fprintf(stderr, " -D output per-sample DP in BCF (require -g/-u)\n"); fprintf(stderr, " -g generate BCF output (genotype likelihoods)\n"); fprintf(stderr, " -O output base positions on reads (disabled by -g/-u)\n"); fprintf(stderr, " -s output mapping quality (disabled by -g/-u)\n"); fprintf(stderr, " -S output per-sample 
strand bias P-value in BCF (require -g/-u)\n"); fprintf(stderr, " -u generate uncompress BCF output\n"); fprintf(stderr, "\nSNP/INDEL genotype likelihoods options (effective with `-g' or `-u'):\n\n"); fprintf(stderr, " -e INT Phred-scaled gap extension seq error probability [%d]\n", mplp.extQ); fprintf(stderr, " -F FLOAT minimum fraction of gapped reads for candidates [%g]\n", mplp.min_frac); fprintf(stderr, " -h INT coefficient for homopolymer errors [%d]\n", mplp.tandemQ); fprintf(stderr, " -I do not perform indel calling\n"); fprintf(stderr, " -L INT max per-sample depth for INDEL calling [%d]\n", mplp.max_indel_depth); fprintf(stderr, " -m INT minimum gapped reads for indel candidates [%d]\n", mplp.min_support); fprintf(stderr, " -o INT Phred-scaled gap open sequencing error probability [%d]\n", mplp.openQ); fprintf(stderr, " -P STR comma separated list of platforms for indels [all]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Notes: Assuming diploid individuals.\n\n"); return 1; } if (file_list) { if ( read_file_list(file_list,&nfiles,&fn) ) return 1; mpileup(&mplp,nfiles,fn); for (c=0; c<nfiles; c++) free(fn[c]); free(fn); } else mpileup(&mplp, argc - optind, argv + optind); if (mplp.rghash) bcf_str2id_thorough_destroy(mplp.rghash); free(mplp.reg); free(mplp.pl_list); if (mplp.fai) fai_destroy(mplp.fai); if (mplp.bed) bed_destroy(mplp.bed); return 0; }
int bam_mpileup(int argc, char *argv[]) { int c; const char *file_list = NULL; char **fn = NULL; int nfiles = 0, use_orphan = 0; mplp_conf_t mplp; memset(&mplp, 0, sizeof(mplp_conf_t)); mplp.min_baseQ = 13; mplp.capQ_thres = 0; mplp.max_depth = 250; mplp.max_indel_depth = 250; mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100; mplp.min_frac = 0.002; mplp.min_support = 1; mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_SMART_OVERLAPS; mplp.argc = argc; mplp.argv = argv; mplp.rflag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP; mplp.output_fname = NULL; static const struct option lopts[] = { {"rf", required_argument, NULL, 1}, // require flag {"ff", required_argument, NULL, 2}, // filter flag {"incl-flags", required_argument, NULL, 1}, {"excl-flags", required_argument, NULL, 2}, {"output", required_argument, NULL, 3}, {"open-prob", required_argument, NULL, 4}, {"illumina1.3+", no_argument, NULL, '6'}, {"count-orphans", no_argument, NULL, 'A'}, {"bam-list", required_argument, NULL, 'b'}, {"no-BAQ", no_argument, NULL, 'B'}, {"no-baq", no_argument, NULL, 'B'}, {"adjust-MQ", required_argument, NULL, 'C'}, {"adjust-mq", required_argument, NULL, 'C'}, {"max-depth", required_argument, NULL, 'd'}, {"redo-BAQ", no_argument, NULL, 'E'}, {"redo-baq", no_argument, NULL, 'E'}, {"fasta-ref", required_argument, NULL, 'f'}, {"exclude-RG", required_argument, NULL, 'G'}, {"exclude-rg", required_argument, NULL, 'G'}, {"positions", required_argument, NULL, 'l'}, {"region", required_argument, NULL, 'r'}, {"ignore-RG", no_argument, NULL, 'R'}, {"ignore-rg", no_argument, NULL, 'R'}, {"min-MQ", required_argument, NULL, 'q'}, {"min-mq", required_argument, NULL, 'q'}, {"min-BQ", required_argument, NULL, 'Q'}, {"min-bq", required_argument, NULL, 'Q'}, {"ignore-overlaps", no_argument, NULL, 'x'}, {"BCF", no_argument, NULL, 'g'}, {"bcf", no_argument, NULL, 'g'}, {"VCF", no_argument, NULL, 'v'}, {"vcf", no_argument, NULL, 'v'}, {"output-BP", no_argument, NULL, 'O'}, {"output-bp", 
no_argument, NULL, 'O'}, {"output-MQ", no_argument, NULL, 's'}, {"output-mq", no_argument, NULL, 's'}, {"output-tags", required_argument, NULL, 't'}, {"uncompressed", no_argument, NULL, 'u'}, {"ext-prob", required_argument, NULL, 'e'}, {"gap-frac", required_argument, NULL, 'F'}, {"tandem-qual", required_argument, NULL, 'h'}, {"skip-indels", no_argument, NULL, 'I'}, {"max-idepth", required_argument, NULL, 'L'}, {"min-ireads ", required_argument, NULL, 'm'}, {"per-sample-mF", no_argument, NULL, 'p'}, {"per-sample-mf", no_argument, NULL, 'p'}, {"platforms", required_argument, NULL, 'P'}, {NULL, 0, NULL, 0} }; while ((c = getopt_long(argc, argv, "Agf:r:l:q:Q:uRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsVvxt:",lopts,NULL)) >= 0) { switch (c) { case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break; case 1 : mplp.rflag_require = bam_str2flag(optarg); if ( mplp.rflag_require<0 ) { fprintf(stderr,"Could not parse --rf %s\n", optarg); return 1; } break; case 2 : mplp.rflag_filter = bam_str2flag(optarg); if ( mplp.rflag_filter<0 ) { fprintf(stderr,"Could not parse --ff %s\n", optarg); return 1; } break; case 3 : mplp.output_fname = optarg; break; case 4 : mplp.openQ = atoi(optarg); break; case 'f': mplp.fai = fai_load(optarg); if (mplp.fai == 0) return 1; mplp.fai_fname = optarg; break; case 'd': mplp.max_depth = atoi(optarg); break; case 'r': mplp.reg = strdup(optarg); break; case 'l': // In the original version the whole BAM was streamed which is inefficient // with few BED intervals and big BAMs. Todo: devise a heuristic to determine // best strategy, that is streaming or jumping. 
mplp.bed = bed_read(optarg); if (!mplp.bed) { print_error_errno("Could not read file \"%s\"", optarg); return 1; } break; case 'P': mplp.pl_list = strdup(optarg); break; case 'p': mplp.flag |= MPLP_PER_SAMPLE; break; case 'g': mplp.flag |= MPLP_BCF; break; case 'v': mplp.flag |= MPLP_BCF | MPLP_VCF; break; case 'u': mplp.flag |= MPLP_NO_COMP | MPLP_BCF; break; case 'B': mplp.flag &= ~MPLP_REALN; break; case 'D': mplp.fmt_flag |= B2B_FMT_DP; fprintf(stderr, "[warning] samtools mpileup option `-D` is functional, but deprecated. Please switch to `-t DP` in future.\n"); break; case 'S': mplp.fmt_flag |= B2B_FMT_SP; fprintf(stderr, "[warning] samtools mpileup option `-S` is functional, but deprecated. Please switch to `-t SP` in future.\n"); break; case 'V': mplp.fmt_flag |= B2B_FMT_DV; fprintf(stderr, "[warning] samtools mpileup option `-V` is functional, but deprecated. Please switch to `-t DV` in future.\n"); break; case 'I': mplp.flag |= MPLP_NO_INDEL; break; case 'E': mplp.flag |= MPLP_REDO_BAQ; break; case '6': mplp.flag |= MPLP_ILLUMINA13; break; case 'R': mplp.flag |= MPLP_IGNORE_RG; break; case 's': mplp.flag |= MPLP_PRINT_MAPQ; break; case 'O': mplp.flag |= MPLP_PRINT_POS; break; case 'C': mplp.capQ_thres = atoi(optarg); break; case 'q': mplp.min_mq = atoi(optarg); break; case 'Q': mplp.min_baseQ = atoi(optarg); break; case 'b': file_list = optarg; break; case 'o': { char *end; long value = strtol(optarg, &end, 10); // Distinguish between -o INT and -o FILE (a bit of a hack!) 
if (*end == '\0') mplp.openQ = value; else mplp.output_fname = optarg; } break; case 'e': mplp.extQ = atoi(optarg); break; case 'h': mplp.tandemQ = atoi(optarg); break; case 'A': use_orphan = 1; break; case 'F': mplp.min_frac = atof(optarg); break; case 'm': mplp.min_support = atoi(optarg); break; case 'L': mplp.max_indel_depth = atoi(optarg); break; case 'G': { FILE *fp_rg; char buf[1024]; mplp.rghash = khash_str2int_init(); if ((fp_rg = fopen(optarg, "r")) == 0) fprintf(stderr, "(%s) Fail to open file %s. Continue anyway.\n", __func__, optarg); while (!feof(fp_rg) && fscanf(fp_rg, "%s", buf) > 0) // this is not a good style, but forgive me... khash_str2int_inc(mplp.rghash, strdup(buf)); fclose(fp_rg); } break; case 't': mplp.fmt_flag |= parse_format_flag(optarg); break; default: fprintf(stderr,"Invalid option: '%c'\n", c); return 1; } } if ( !(mplp.flag&MPLP_REALN) && mplp.flag&MPLP_REDO_BAQ ) { fprintf(stderr,"Error: The -B option cannot be combined with -E\n"); return 1; } if (use_orphan) mplp.flag &= ~MPLP_NO_ORPHAN; if (argc == 1) { print_usage(stderr, &mplp); return 1; } int ret; if (file_list) { if ( read_file_list(file_list,&nfiles,&fn) ) return 1; ret = mpileup(&mplp,nfiles,fn); for (c=0; c<nfiles; c++) free(fn[c]); free(fn); } else ret = mpileup(&mplp, argc - optind, argv + optind); if (mplp.rghash) khash_str2int_destroy_free(mplp.rghash); free(mplp.reg); free(mplp.pl_list); if (mplp.fai) fai_destroy(mplp.fai); if (mplp.bed) bed_destroy(mplp.bed); return ret; }