int main(int argc, char **argv) { if (argc < 4) errx(1, "usage\t:%s <bam> <split out> <discord out> (optional #threads)", argv[0]); char *bam_file_name = argv[1]; char *split_file_name = argv[2]; char *disc_file_name = argv[3]; int threads = 2; if (argc == 5) { threads = atoi(argv[4]); } samFile *disc = sam_open(disc_file_name, "wb"); samFile *split = sam_open(split_file_name, "wb"); samFile *in = sam_open(bam_file_name, "rb"); if(in == NULL) errx(1, "Unable to open BAM/SAM file."); // TODO: handle cram. if (threads > 1) { bgzf_mt(in->fp.bgzf, threads, 256); } hts_idx_t *idx = sam_index_load(in, bam_file_name); if(idx == NULL) errx(1,"Unable to open BAM/SAM index."); bam_hdr_t *hdr = sam_hdr_read(in); int r = sam_hdr_write(disc, hdr); r = sam_hdr_write(split, hdr); bam1_t *aln = bam_init1(); int ret; while(ret = sam_read1(in, hdr, aln) >= 0) { if (((aln->core.flag) & 1294) == 0) r = sam_write1(disc, hdr, aln); uint8_t *sa = bam_aux_get(aln, "SA"); if (sa != 0) { char *sa_tag = strdup(bam_aux2Z(sa)); if ( count_tags(sa_tag) == 1) { char *chrm, strand, *cigar; uint32_t pos; split_sa_tag(sa_tag, &chrm, &pos, &strand, &cigar); struct line sa, al; calcOffsets(cigar, pos, strand, &sa); sa.chrm = chrm; sa.strand = strand; calcAlnOffsets(bam_get_cigar(aln), aln->core.n_cigar, aln->core.pos, bam_is_rev(aln) ? '-' : '+', &al); al.chrm = hdr->target_name[aln->core.tid]; al.strand = bam_is_rev(aln) ? 
'-' : '+'; struct line *left = &al, *right = &sa; if (left->SQO > right->SQO) { left = &sa; right = &al; } int overlap = MAX(1 + MIN(left->EQO, right->EQO) - MAX(left->SQO, right->SQO), 0); int alen1 = 1 + left->EQO - left->SQO; int alen2 = 1 + right->EQO - right->SQO; int mno = MIN(alen1-overlap, alen2-overlap); if (mno < MIN_NON_OVERLAP) continue; if ( (strcmp(left->chrm, right->chrm) == 0) && (left->strand == right->strand) ) { int leftDiag, rightDiag, insSize; if (left->strand == '-') { leftDiag = left->rapos - left->sclip; rightDiag = (right->rapos + right->raLen) - (right->sclip + right->qaLen); insSize = rightDiag - leftDiag; } else { leftDiag = (left->rapos + left->raLen) - (left->sclip + left->qaLen); rightDiag = right->rapos - right->sclip; insSize = leftDiag - rightDiag; } int desert = right->SQO - left->EQO - 1; if ((abs(insSize) < MIN_INDEL_SIZE) || ((desert > 0) && ( (desert - (int)MAX(0, insSize)) > MAX_UNMAPPED_BASES))) continue; } char *qname = bam_get_qname(aln); if ((aln->core.flag & 64) == 64) qname[0] = 'A'; else qname[0] = 'B'; r = sam_write1(split, hdr, aln); } free(sa_tag); } } bam_destroy1(aln); hts_idx_destroy(idx); bam_hdr_destroy(hdr); sam_close(in); sam_close(disc); sam_close(split); if(ret < -1) { errx(1, "lumpy_filter: error reading bam: %s\n", bam_file_name); } }
int extract_main(int argc, char *argv[]) { char *opref = NULL, *oname, *p; int c, i; Config config; //Defaults config.keepCpG = 1; config.keepCHG = 0; config.keepCHH = 0; config.minMapq = 10; config.minPhred = 5; config.keepDupes = 0; config.keepSingleton = 0, config.keepDiscordant = 0; config.merge = 0; config.maxDepth = 2000; config.fai = NULL; config.fp = NULL; config.bai = NULL; config.reg = NULL; config.bedName = NULL; config.bed = NULL; config.fraction = 0; config.counts = 0; config.logit = 0; for(i=0; i<16; i++) config.bounds[i] = 0; static struct option lopts[] = { {"opref", 1, NULL, 'o'}, {"fraction", 0, NULL, 'f'}, {"counts", 0, NULL, 'c'}, {"logit", 0, NULL, 'm'}, {"noCpG", 0, NULL, 1}, {"CHG", 0, NULL, 2}, {"CHH", 0, NULL, 3}, {"keepDupes", 0, NULL, 4}, {"keepSingleton",0, NULL, 5}, {"keepDiscordant",0,NULL, 6}, {"OT", 1, NULL, 7}, {"OB", 1, NULL, 8}, {"CTOT", 1, NULL, 9}, {"CTOB", 1, NULL, 10}, {"mergeContext", 0, NULL, 11}, {"help", 0, NULL, 'h'}, {0, 0, NULL, 0} }; while((c = getopt_long(argc, argv, "q:p:r:l:o:D:f:c:m:", lopts,NULL)) >=0) { switch(c) { case 'h' : extract_usage(); return 0; case 'o' : opref = strdup(optarg); break; case 'D' : config.maxDepth = atoi(optarg); break; case 'r': config.reg = strdup(optarg); break; case 'l' : config.bedName = optarg; break; case 1 : config.keepCpG = 0; break; case 2 : config.keepCHG = 1; break; case 3 : config.keepCHH = 1; break; case 4 : config.keepDupes = 1; break; case 5 : config.keepSingleton = 1; break; case 6 : config.keepDiscordant = 1; break; case 7 : parseBounds(optarg, config.bounds, 0); break; case 8 : parseBounds(optarg, config.bounds, 1); break; case 9 : parseBounds(optarg, config.bounds, 2); break; case 10 : parseBounds(optarg, config.bounds, 3); break; case 11 : config.merge = 1; break; case 'q' : config.minMapq = atoi(optarg); break; case 'p' : config.minPhred = atoi(optarg); break; case 'm' : config.logit = 1; break; case 'f' : config.fraction = 1; break; case 'c' : config.counts = 1; 
break; case '?' : default : fprintf(stderr, "Invalid option '%c'\n", c); extract_usage(); return 1; } } if(argc == 1) { extract_usage(); return 0; } if(argc-optind != 2) { fprintf(stderr, "You must supply a reference genome in fasta format and an input BAM file!!!\n"); extract_usage(); return -1; } //Are the options reasonable? if(config.minPhred < 1) { fprintf(stderr, "-p %i is invalid. resetting to 1, which is the lowest possible value.\n", config.minPhred); config.minPhred = 1; } if(config.minMapq < 0) { fprintf(stderr, "-q %i is invalid. Resetting to 0, which is the lowest possible value.\n", config.minMapq); config.minMapq = 0; } if(config.fraction+config.counts+config.logit > 1) { fprintf(stderr, "More than one of --fraction, --counts, and --logit were specified. These are mutually exclusive.\n"); extract_usage(); return 1; } //Has more than one output format been requested? if(config.fraction + config.counts + config.logit > 1) { fprintf(stderr, "You may specify AT MOST one of -c/--counts, -f/--fraction, or -m/--logit.\n"); return -6; } //Is there still a metric to output? if(!(config.keepCpG + config.keepCHG + config.keepCHH)) { fprintf(stderr, "You haven't specified any metrics to output!\nEither don't use the --noCpG option or specify --CHG and/or --CHH.\n"); return -1; } //Open the files if((config.fai = fai_load(argv[optind])) == NULL) { fprintf(stderr, "Couldn't open the index for %s!\n", argv[optind]); extract_usage(); return -2; } if((config.fp = hts_open(argv[optind+1], "rb")) == NULL) { fprintf(stderr, "Couldn't open %s for reading!\n", argv[optind+1]); return -4; } if((config.bai = sam_index_load(config.fp, argv[optind+1])) == NULL) { fprintf(stderr, "Couldn't load the index for %s, will attempt to build it.\n", argv[optind+1]); if(bam_index_build(argv[optind+1], 0) < 0) { fprintf(stderr, "Couldn't build the index for %s! 
File corrupted?\n", argv[optind+1]); return -5; } if((config.bai = sam_index_load(config.fp, argv[optind+1])) == NULL) { fprintf(stderr, "Still couldn't load the index, quiting.\n"); return -5; } } //Output files config.output_fp = malloc(sizeof(FILE *) * 3); assert(config.output_fp); if(opref == NULL) { opref = strdup(argv[optind+1]); assert(opref); p = strrchr(opref, '.'); if(p != NULL) *p = '\0'; fprintf(stderr, "writing to prefix:'%s'\n", opref); } if(config.fraction) { oname = malloc(sizeof(char) * (strlen(opref)+19)); } else if(config.counts) { oname = malloc(sizeof(char) * (strlen(opref)+21)); } else if(config.logit) { oname = malloc(sizeof(char) * (strlen(opref)+20)); } else { oname = malloc(sizeof(char) * (strlen(opref)+14)); } assert(oname); if(config.keepCpG) { if(config.fraction) { sprintf(oname, "%s_CpG.meth.bedGraph", opref); } else if(config.counts) { sprintf(oname, "%s_CpG.counts.bedGraph", opref); } else if(config.logit) { sprintf(oname, "%s_CpG.logit.bedGraph", opref); } else { sprintf(oname, "%s_CpG.bedGraph", opref); } config.output_fp[0] = fopen(oname, "w"); if(config.output_fp[0] == NULL) { fprintf(stderr, "Couldn't open the output CpG metrics file for writing! Insufficient permissions?\n"); return -3; } printHeader(config.output_fp[0], "CpG", opref, config); } if(config.keepCHG) { if(config.fraction) { sprintf(oname, "%s_CHG.meth.bedGraph", opref); } else if(config.counts) { sprintf(oname, "%s_CHG.counts.bedGraph", opref); } else if(config.logit) { sprintf(oname, "%s_CHG.logit.bedGraph", opref); } else { sprintf(oname, "%s_CHG.bedGraph", opref); } config.output_fp[1] = fopen(oname, "w"); if(config.output_fp[1] == NULL) { fprintf(stderr, "Couldn't open the output CHG metrics file for writing! 
Insufficient permissions?\n"); return -3; } printHeader(config.output_fp[1], "CHG", opref, config); } if(config.keepCHH) { if(config.fraction) { sprintf(oname, "%s_CHH.meth.bedGraph", opref); } else if(config.counts) { sprintf(oname, "%s_CHH.counts.bedGraph", opref); } else if(config.logit) { sprintf(oname, "%s_CHH.logit.bedGraph", opref); } else { sprintf(oname, "%s_CHH.bedGraph", opref); } config.output_fp[2] = fopen(oname, "w"); if(config.output_fp[2] == NULL) { fprintf(stderr, "Couldn't open the output CHH metrics file for writing! Insufficient permissions?\n"); return -3; } printHeader(config.output_fp[2], "CHH", opref, config); } //Run the pileup extractCalls(&config); //Close things up hts_close(config.fp); fai_destroy(config.fai); if(config.keepCpG) fclose(config.output_fp[0]); if(config.keepCHG) fclose(config.output_fp[1]); if(config.keepCHH) fclose(config.output_fp[2]); hts_idx_destroy(config.bai); free(opref); if(config.reg) free(config.reg); if(config.bed) destroyBED(config.bed); free(oname); free(config.output_fp); return 0; }
void train_one_round(const Fast5Map& name_map, size_t round) { const PoreModelMap& current_models = PoreModelSet::get_models(opt::trained_model_type); // Initialize the training summary stats for each kmer for each model ModelTrainingMap model_training_data; for(auto current_model_iter = current_models.begin(); current_model_iter != current_models.end(); current_model_iter++) { // one summary entry per kmer in the model std::vector<StateSummary> summaries(current_model_iter->second.get_num_states()); model_training_data[current_model_iter->first] = summaries; } // Open the BAM and iterate over reads // load bam file htsFile* bam_fh = sam_open(opt::bam_file.c_str(), "r"); assert(bam_fh != NULL); // load bam index file std::string index_filename = opt::bam_file + ".bai"; hts_idx_t* bam_idx = bam_index_load(index_filename.c_str()); assert(bam_idx != NULL); // read the bam header bam_hdr_t* hdr = sam_hdr_read(bam_fh); // load reference fai file faidx_t *fai = fai_load(opt::genome_file.c_str()); hts_itr_t* itr; // If processing a region of the genome, only emit events aligned to this window int clip_start = -1; int clip_end = -1; if(opt::region.empty()) { // TODO: is this valid? 
itr = sam_itr_queryi(bam_idx, HTS_IDX_START, 0, 0); } else { fprintf(stderr, "Region: %s\n", opt::region.c_str()); itr = sam_itr_querys(bam_idx, hdr, opt::region.c_str()); hts_parse_reg(opt::region.c_str(), &clip_start, &clip_end); } #ifndef H5_HAVE_THREADSAFE if(opt::num_threads > 1) { fprintf(stderr, "You enabled multi-threading but you do not have a threadsafe HDF5\n"); fprintf(stderr, "Please recompile nanopolish's built-in libhdf5 or run with -t 1\n"); exit(1); } #endif // Initialize iteration std::vector<bam1_t*> records(opt::batch_size, NULL); for(size_t i = 0; i < records.size(); ++i) { records[i] = bam_init1(); } int result; size_t num_reads_realigned = 0; size_t num_records_buffered = 0; Progress progress("[methyltrain]"); do { assert(num_records_buffered < records.size()); // read a record into the next slot in the buffer result = sam_itr_next(bam_fh, itr, records[num_records_buffered]); num_records_buffered += result >= 0; // realign if we've hit the max buffer size or reached the end of file if(num_records_buffered == records.size() || result < 0) { #pragma omp parallel for for(size_t i = 0; i < num_records_buffered; ++i) { bam1_t* record = records[i]; size_t read_idx = num_reads_realigned + i; if( (record->core.flag & BAM_FUNMAP) == 0) { add_aligned_events(name_map, fai, hdr, record, read_idx, clip_start, clip_end, round, model_training_data); } } num_reads_realigned += num_records_buffered; num_records_buffered = 0; } if(opt::progress) { fprintf(stderr, "Realigned %zu reads in %.1lfs\r", num_reads_realigned, progress.get_elapsed_seconds()); } } while(result >= 0); assert(num_records_buffered == 0); progress.end(); // open the summary file std::stringstream summary_fn; summary_fn << "methyltrain" << opt::out_suffix << ".summary"; FILE* summary_fp = fopen(summary_fn.str().c_str(), "w"); fprintf(summary_fp, "model_short_name\tkmer\tnum_matches\tnum_skips\t" "num_stays\tnum_events_for_training\twas_trained\t" "trained_level_mean\ttrained_level_stdv\n"); 
// open the tsv file with the raw training data std::stringstream training_fn; training_fn << "methyltrain" << opt::out_suffix << ".round" << round << ".events.tsv"; std::ofstream training_ofs(training_fn.str()); // write out a header for the training data StateTrainingData::write_header(training_ofs); // iterate over models: template, complement_pop1, complement_pop2 for(auto model_training_iter = model_training_data.begin(); model_training_iter != model_training_data.end(); model_training_iter++) { // Initialize the trained model from the input model auto current_model_iter = current_models.find(model_training_iter->first); assert(current_model_iter != current_models.end()); std::string model_name = model_training_iter->first; std::string model_short_name = current_model_iter->second.metadata.get_short_name(); // Initialize the new model from the current model PoreModel updated_model = current_model_iter->second; uint32_t k = updated_model.k; const std::vector<StateSummary>& summaries = model_training_iter->second; // Generate the complete set of kmers std::string gen_kmer(k, 'A'); std::vector<std::string> all_kmers; for(size_t ki = 0; ki < summaries.size(); ++ki) { all_kmers.push_back(gen_kmer); mtrain_alphabet->lexicographic_next(gen_kmer); } assert(gen_kmer == std::string(k, 'A')); assert(all_kmers.front() == std::string(k, 'A')); assert(all_kmers.back() == std::string(k, 'T')); // Update means for each kmer #pragma omp parallel for for(size_t ki = 0; ki < summaries.size(); ++ki) { assert(ki < all_kmers.size()); std::string kmer = all_kmers[ki]; // write the observed values to a tsv file #pragma omp critical { for(size_t ei = 0; ei < summaries[ki].events.size(); ++ei) { summaries[ki].events[ei].write_tsv(training_ofs, model_short_name, kmer); } } bool is_m_kmer = kmer.find('M') != std::string::npos; bool update_kmer = opt::training_target == TT_ALL_KMERS || (is_m_kmer && opt::training_target == TT_METHYLATED_KMERS) || (!is_m_kmer && opt::training_target == 
TT_UNMETHYLATED_KMERS); bool trained = false; // only train if there are a sufficient number of events for this kmer if(update_kmer && summaries[ki].events.size() >= opt::min_number_of_events_to_train) { // train a mixture model where a minority of k-mers aren't methylated ParamMixture mixture; float incomplete_methylation_rate = 0.05f; std::string um_kmer = mtrain_alphabet->unmethylate(kmer); size_t um_ki = mtrain_alphabet->kmer_rank(um_kmer.c_str(), k); // Initialize the training parameters. If this is a kmer containing // a methylation site we train a two component mixture, otherwise // just fit a gaussian float major_weight = is_m_kmer ? 1 - incomplete_methylation_rate : 1.0f; mixture.log_weights.push_back(log(major_weight)); mixture.params.push_back(current_model_iter->second.get_parameters(ki)); if(is_m_kmer) { // add second unmethylated component mixture.log_weights.push_back(std::log(incomplete_methylation_rate)); mixture.params.push_back(current_model_iter->second.get_parameters(um_ki)); } if(opt::verbose > 1) { fprintf(stderr, "INIT__MIX %s\t%s\t[%.2lf %.2lf %.2lf]\t[%.2lf %.2lf %.2lf]\n", model_training_iter->first.c_str(), kmer.c_str(), std::exp(mixture.log_weights[0]), mixture.params[0].level_mean, mixture.params[0].level_stdv, std::exp(mixture.log_weights[1]), mixture.params[1].level_mean, mixture.params[1].level_stdv); } ParamMixture trained_mixture = train_gaussian_mixture(summaries[ki].events, mixture); if(opt::verbose > 1) { fprintf(stderr, "TRAIN_MIX %s\t%s\t[%.2lf %.2lf %.2lf]\t[%.2lf %.2lf %.2lf]\n", model_training_iter->first.c_str(), kmer.c_str(), std::exp(trained_mixture.log_weights[0]), trained_mixture.params[0].level_mean, trained_mixture.params[0].level_stdv, std::exp(trained_mixture.log_weights[1]), trained_mixture.params[1].level_mean, trained_mixture.params[1].level_stdv); } #pragma omp critical updated_model.states[ki] = trained_mixture.params[0]; if (model_stdv()) { ParamMixture ig_mixture; // weights ig_mixture.log_weights = 
trained_mixture.log_weights; // states ig_mixture.params.emplace_back(trained_mixture.params[0]); if(is_m_kmer) { ig_mixture.params.emplace_back(current_model_iter->second.get_parameters(um_ki)); } // run training auto trained_ig_mixture = train_invgaussian_mixture(summaries[ki].events, ig_mixture); LOG("methyltrain", debug) << "IG_INIT__MIX " << model_training_iter->first.c_str() << " " << kmer.c_str() << " [" << std::fixed << std::setprecision(5) << ig_mixture.params[0].sd_mean << " " << ig_mixture.params[1].sd_mean << "]" << std::endl << "IG_TRAIN_MIX " << model_training_iter->first.c_str() << " " << kmer.c_str() << " [" << trained_ig_mixture.params[0].sd_mean << " " << trained_ig_mixture.params[1].sd_mean << "]" << std::endl; // update state #pragma omp critical { updated_model.states[ki] = trained_ig_mixture.params[0]; } } trained = true; } #pragma omp critical { fprintf(summary_fp, "%s\t%s\t%d\t%d\t%d\t%zu\t%d\t%.2lf\t%.2lf\n", model_short_name.c_str(), kmer.c_str(), summaries[ki].num_matches, summaries[ki].num_skips, summaries[ki].num_stays, summaries[ki].events.size(), trained, updated_model.states[ki].level_mean, updated_model.states[ki].level_stdv); } // add the updated model into the collection (or replace what is already there) PoreModelSet::insert_model(opt::trained_model_type, updated_model); } } // cleanup records for(size_t i = 0; i < records.size(); ++i) { bam_destroy1(records[i]); } // cleanup sam_itr_destroy(itr); bam_hdr_destroy(hdr); fai_destroy(fai); sam_close(bam_fh); hts_idx_destroy(bam_idx); fclose(summary_fp); }
/* * Performs pileup * @param conf configuration for this pileup * @param n number of files specified in fn * @param fn filenames */ static int mpileup(mplp_conf_t *conf, int n, char **fn) { extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list); extern void bcf_call_del_rghash(void *rghash); mplp_aux_t **data; int i, tid, pos, *n_plp, beg0 = 0, end0 = INT_MAX, ref_len, max_depth, max_indel_depth; const bam_pileup1_t **plp; mplp_ref_t mp_ref = MPLP_REF_INIT; bam_mplp_t iter; bam_hdr_t *h = NULL; /* header of first file in input list */ char *ref; void *rghash = NULL; FILE *pileup_fp = NULL; bcf_callaux_t *bca = NULL; bcf_callret1_t *bcr = NULL; bcf_call_t bc; htsFile *bcf_fp = NULL; bcf_hdr_t *bcf_hdr = NULL; bam_sample_t *sm = NULL; kstring_t buf; mplp_pileup_t gplp; memset(&gplp, 0, sizeof(mplp_pileup_t)); memset(&buf, 0, sizeof(kstring_t)); memset(&bc, 0, sizeof(bcf_call_t)); data = calloc(n, sizeof(mplp_aux_t*)); plp = calloc(n, sizeof(bam_pileup1_t*)); n_plp = calloc(n, sizeof(int)); sm = bam_smpl_init(); if (n == 0) { fprintf(pysam_stderr,"[%s] no input file/data given\n", __func__); exit(EXIT_FAILURE); } // read the header of each file in the list and initialize data for (i = 0; i < n; ++i) { bam_hdr_t *h_tmp; data[i] = calloc(1, sizeof(mplp_aux_t)); data[i]->fp = sam_open_format(fn[i], "rb", &conf->ga.in); if ( !data[i]->fp ) { fprintf(pysam_stderr, "[%s] failed to open %s: %s\n", __func__, fn[i], strerror(errno)); exit(EXIT_FAILURE); } if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) { fprintf(pysam_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); exit(EXIT_FAILURE); } if (conf->fai_fname && hts_set_fai_filename(data[i]->fp, conf->fai_fname) != 0) { fprintf(pysam_stderr, "[%s] failed to process %s: %s\n", __func__, conf->fai_fname, strerror(errno)); exit(EXIT_FAILURE); } data[i]->conf = conf; data[i]->ref = &mp_ref; h_tmp = sam_hdr_read(data[i]->fp); if ( !h_tmp ) { fprintf(pysam_stderr,"[%s] fail to read the header of 
%s\n", __func__, fn[i]); exit(EXIT_FAILURE); } bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text); // Collect read group IDs with PL (platform) listed in pl_list (note: fragile, strstr search) rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list); if (conf->reg) { hts_idx_t *idx = sam_index_load(data[i]->fp, fn[i]); if (idx == NULL) { fprintf(pysam_stderr, "[%s] fail to load index for %s\n", __func__, fn[i]); exit(EXIT_FAILURE); } if ( (data[i]->iter=sam_itr_querys(idx, h_tmp, conf->reg)) == 0) { fprintf(pysam_stderr, "[E::%s] fail to parse region '%s' with %s\n", __func__, conf->reg, fn[i]); exit(EXIT_FAILURE); } if (i == 0) beg0 = data[i]->iter->beg, end0 = data[i]->iter->end; hts_idx_destroy(idx); } else data[i]->iter = NULL; if (i == 0) h = data[i]->h = h_tmp; // save the header of the first file else { // FIXME: check consistency between h and h_tmp bam_hdr_destroy(h_tmp); // we store only the first file's header; it's (alleged to be) // compatible with the i-th file's target_name lookup needs data[i]->h = h; } } // allocate data storage proportionate to number of samples being studied sm->n gplp.n = sm->n; gplp.n_plp = calloc(sm->n, sizeof(int)); gplp.m_plp = calloc(sm->n, sizeof(int)); gplp.plp = calloc(sm->n, sizeof(bam_pileup1_t*)); fprintf(pysam_stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n); // write the VCF header if (conf->flag & MPLP_BCF) { const char *mode; if ( conf->flag & MPLP_VCF ) mode = (conf->flag&MPLP_NO_COMP)? "wu" : "wz"; // uncompressed VCF or compressed VCF else mode = (conf->flag&MPLP_NO_COMP)? "wub" : "wb"; // uncompressed BCF or compressed BCF bcf_fp = bcf_open(conf->output_fname? conf->output_fname : "-", mode); if (bcf_fp == NULL) { fprintf(pysam_stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? 
conf->output_fname : "standard output", strerror(errno)); exit(EXIT_FAILURE); } // BCF header creation bcf_hdr = bcf_hdr_init("w"); kstring_t str = {0,0,NULL}; ksprintf(&str, "##samtoolsVersion=%s+htslib-%s\n",samtools_version(),hts_version()); bcf_hdr_append(bcf_hdr, str.s); str.l = 0; ksprintf(&str, "##samtoolsCommand=samtools mpileup"); for (i=1; i<conf->argc; i++) ksprintf(&str, " %s", conf->argv[i]); kputc('\n', &str); bcf_hdr_append(bcf_hdr, str.s); if (conf->fai_fname) { str.l = 0; ksprintf(&str, "##reference=file://%s\n", conf->fai_fname); bcf_hdr_append(bcf_hdr, str.s); } // Translate BAM @SQ tags to BCF ##contig tags // todo: use/write new BAM header manipulation routines, fill also UR, M5 for (i=0; i<h->n_targets; i++) { str.l = 0; ksprintf(&str, "##contig=<ID=%s,length=%d>", h->target_name[i], h->target_len[i]); bcf_hdr_append(bcf_hdr, str.s); } free(str.s); bcf_hdr_append(bcf_hdr,"##ALT=<ID=*,Description=\"Represents allele(s) other than observed.\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=INDEL,Number=0,Type=Flag,Description=\"Indicates that the variant is an INDEL.\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=IDV,Number=1,Type=Integer,Description=\"Maximum number of reads supporting an indel\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=IMF,Number=1,Type=Float,Description=\"Maximum fraction of reads supporting an indel\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Raw read depth\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias for filtering splice-site artefacts in RNA-seq data (bigger is better)\",Version=\"3\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias (bigger is better)\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=MQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias (bigger is better)\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=BQB,Number=1,Type=Float,Description=\"Mann-Whitney 
U test of Base Quality Bias (bigger is better)\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=MQSB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)\">"); #if CDF_MWU_TESTS bcf_hdr_append(bcf_hdr,"##INFO=<ID=RPB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias [CDF] (bigger is better)\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=MQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias [CDF] (bigger is better)\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=BQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias [CDF] (bigger is better)\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=MQSB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias [CDF] (bigger is better)\">"); #endif bcf_hdr_append(bcf_hdr,"##INFO=<ID=SGB,Number=1,Type=Float,Description=\"Segregation based metric.\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=MQ0F,Number=1,Type=Float,Description=\"Fraction of MQ0 reads (smaller is better)\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=I16,Number=16,Type=Float,Description=\"Auxiliary tag used for calling, see description of bcf_callret1_t in bam2bcf.h\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=QS,Number=R,Type=Float,Description=\"Auxiliary tag used for calling\">"); bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=PL,Number=G,Type=Integer,Description=\"List of Phred-scaled genotype likelihoods\">"); if ( conf->fmt_flag&B2B_FMT_DP ) bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Number of high-quality bases\">"); if ( conf->fmt_flag&B2B_FMT_DV ) bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=DV,Number=1,Type=Integer,Description=\"Number of high-quality non-reference bases\">"); if ( conf->fmt_flag&B2B_FMT_DPR ) bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">"); if ( conf->fmt_flag&B2B_INFO_DPR ) 
bcf_hdr_append(bcf_hdr,"##INFO=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">"); if ( conf->fmt_flag&B2B_FMT_DP4 ) bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=DP4,Number=4,Type=Integer,Description=\"Number of high-quality ref-fwd, ref-reverse, alt-fwd and alt-reverse bases\">"); if ( conf->fmt_flag&B2B_FMT_SP ) bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=SP,Number=1,Type=Integer,Description=\"Phred-scaled strand bias P-value\">"); if ( conf->fmt_flag&B2B_FMT_AD ) bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths\">"); if ( conf->fmt_flag&B2B_FMT_ADF ) bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=ADF,Number=R,Type=Integer,Description=\"Allelic depths on the forward strand\">"); if ( conf->fmt_flag&B2B_FMT_ADR ) bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=ADR,Number=R,Type=Integer,Description=\"Allelic depths on the reverse strand\">"); if ( conf->fmt_flag&B2B_INFO_AD ) bcf_hdr_append(bcf_hdr,"##INFO=<ID=AD,Number=R,Type=Integer,Description=\"Total allelic depths\">"); if ( conf->fmt_flag&B2B_INFO_ADF ) bcf_hdr_append(bcf_hdr,"##INFO=<ID=ADF,Number=R,Type=Integer,Description=\"Total allelic depths on the forward strand\">"); if ( conf->fmt_flag&B2B_INFO_ADR ) bcf_hdr_append(bcf_hdr,"##INFO=<ID=ADR,Number=R,Type=Integer,Description=\"Total allelic depths on the reverse strand\">"); for (i=0; i<sm->n; i++) bcf_hdr_add_sample(bcf_hdr, sm->smpl[i]); bcf_hdr_add_sample(bcf_hdr, NULL); bcf_hdr_write(bcf_fp, bcf_hdr); // End of BCF header creation // Initialise the calling algorithm bca = bcf_call_init(-1., conf->min_baseQ); bcr = calloc(sm->n, sizeof(bcf_callret1_t)); bca->rghash = rghash; bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ; bca->min_frac = conf->min_frac; bca->min_support = conf->min_support; bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE; bc.bcf_hdr = bcf_hdr; bc.n = sm->n; bc.PL = malloc(15 * sm->n * sizeof(*bc.PL)); if (conf->fmt_flag) { assert( 
sizeof(float)==sizeof(int32_t) ); bc.DP4 = malloc(sm->n * sizeof(int32_t) * 4); bc.fmt_arr = malloc(sm->n * sizeof(float)); // all fmt_flag fields if ( conf->fmt_flag&(B2B_INFO_DPR|B2B_FMT_DPR|B2B_INFO_AD|B2B_INFO_ADF|B2B_INFO_ADR|B2B_FMT_AD|B2B_FMT_ADF|B2B_FMT_ADR) ) { // first B2B_MAX_ALLELES fields for total numbers, the rest per-sample bc.ADR = (int32_t*) malloc((sm->n+1)*B2B_MAX_ALLELES*sizeof(int32_t)); bc.ADF = (int32_t*) malloc((sm->n+1)*B2B_MAX_ALLELES*sizeof(int32_t)); for (i=0; i<sm->n; i++) { bcr[i].ADR = bc.ADR + (i+1)*B2B_MAX_ALLELES; bcr[i].ADF = bc.ADF + (i+1)*B2B_MAX_ALLELES; } } } } else { pileup_fp = conf->output_fname? fopen(conf->output_fname, "w") : pysam_stdout; if (pileup_fp == NULL) { fprintf(pysam_stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname, strerror(errno)); exit(EXIT_FAILURE); } } // init pileup iter = bam_mplp_init(n, mplp_func, (void**)data); if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(iter); max_depth = conf->max_depth; if (max_depth * sm->n > 1<<20) fprintf(pysam_stderr, "(%s) Max depth is above 1M. 
Potential memory hog!\n", __func__); if (max_depth * sm->n < 8000) { max_depth = 8000 / sm->n; fprintf(pysam_stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth); } max_indel_depth = conf->max_indel_depth * sm->n; bam_mplp_set_maxcnt(iter, max_depth); bcf1_t *bcf_rec = bcf_init1(); int ret; // begin pileup while ( (ret=bam_mplp_auto(iter, &tid, &pos, n_plp, plp)) > 0) { if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue; mplp_get_ref(data[0], tid, &ref, &ref_len); //printf("tid=%d len=%d ref=%p/%s\n", tid, ref_len, ref, ref); if (conf->flag & MPLP_BCF) { int total_depth, _ref0, ref16; for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i]; group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG); _ref0 = (ref && pos < ref_len)? ref[pos] : 'N'; ref16 = seq_nt16_table[_ref0]; bcf_callaux_clean(bca, &bc); for (i = 0; i < gplp.n; ++i) bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i); bc.tid = tid; bc.pos = pos; bcf_call_combine(gplp.n, bcr, bca, ref16, &bc); bcf_clear1(bcf_rec); bcf_call2bcf(&bc, bcf_rec, bcr, conf->fmt_flag, 0, 0); bcf_write1(bcf_fp, bcf_hdr, bcf_rec); // call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring? if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) { bcf_callaux_clean(bca, &bc); for (i = 0; i < gplp.n; ++i) bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i); if (bcf_call_combine(gplp.n, bcr, bca, -1, &bc) >= 0) { bcf_clear1(bcf_rec); bcf_call2bcf(&bc, bcf_rec, bcr, conf->fmt_flag, bca, ref); bcf_write1(bcf_fp, bcf_hdr, bcf_rec); } } } else { fprintf(pileup_fp, "%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? 
ref[pos] : 'N'); for (i = 0; i < n; ++i) { int j, cnt; for (j = cnt = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; int c = p->qpos < p->b->core.l_qseq ? bam_get_qual(p->b)[p->qpos] : 0; if (c >= conf->min_baseQ) ++cnt; } fprintf(pileup_fp, "\t%d\t", cnt); if (n_plp[i] == 0) { fputs("*\t*", pileup_fp); if (conf->flag & MPLP_PRINT_MAPQ) fputs("\t*", pileup_fp); if (conf->flag & MPLP_PRINT_POS) fputs("\t*", pileup_fp); } else { for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; int c = p->qpos < p->b->core.l_qseq ? bam_get_qual(p->b)[p->qpos] : 0; if (c >= conf->min_baseQ) pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, ref); } putc('\t', pileup_fp); for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; int c = p->qpos < p->b->core.l_qseq ? bam_get_qual(p->b)[p->qpos] : 0; if (c >= conf->min_baseQ) { c = c + 33 < 126? c + 33 : 126; putc(c, pileup_fp); } } if (conf->flag & MPLP_PRINT_MAPQ) { putc('\t', pileup_fp); for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; int c = bam_get_qual(p->b)[p->qpos]; if ( c < conf->min_baseQ ) continue; c = plp[i][j].b->core.qual + 33; if (c > 126) c = 126; putc(c, pileup_fp); } } if (conf->flag & MPLP_PRINT_POS) { putc('\t', pileup_fp); int last = 0; for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; int c = bam_get_qual(p->b)[p->qpos]; if ( c < conf->min_baseQ ) continue; if (last++) putc(',', pileup_fp); fprintf(pileup_fp, "%d", plp[i][j].qpos + 1); // FIXME: fprintf(pysam_stdout, ) is very slow... 
} } } } putc('\n', pileup_fp); } } // clean up free(bc.tmp.s); bcf_destroy1(bcf_rec); if (bcf_fp) { hts_close(bcf_fp); bcf_hdr_destroy(bcf_hdr); bcf_call_destroy(bca); free(bc.PL); free(bc.DP4); free(bc.ADR); free(bc.ADF); free(bc.fmt_arr); free(bcr); } if (pileup_fp && conf->output_fname) fclose(pileup_fp); bam_smpl_destroy(sm); free(buf.s); for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]); free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp); bcf_call_del_rghash(rghash); bam_mplp_destroy(iter); bam_hdr_destroy(h); for (i = 0; i < n; ++i) { sam_close(data[i]->fp); if (data[i]->iter) hts_itr_destroy(data[i]->iter); free(data[i]); } free(data); free(plp); free(n_plp); free(mp_ref.ref[0]); free(mp_ref.ref[1]); return ret; }
int scorereads_main(int argc, char** argv) { parse_scorereads_options(argc, argv); omp_set_num_threads(opt::num_threads); Fast5Map name_map(opt::reads_file); ModelMap models; if (!opt::models_fofn.empty()) models = read_models_fofn(opt::models_fofn); // Open the BAM and iterate over reads // load bam file htsFile* bam_fh = sam_open(opt::bam_file.c_str(), "r"); assert(bam_fh != NULL); // load bam index file std::string index_filename = opt::bam_file + ".bai"; hts_idx_t* bam_idx = bam_index_load(index_filename.c_str()); assert(bam_idx != NULL); // read the bam header bam_hdr_t* hdr = sam_hdr_read(bam_fh); // load reference fai file faidx_t *fai = fai_load(opt::genome_file.c_str()); hts_itr_t* itr; // If processing a region of the genome, only emit events aligned to this window int clip_start = -1; int clip_end = -1; if(opt::region.empty()) { // TODO: is this valid? itr = sam_itr_queryi(bam_idx, HTS_IDX_START, 0, 0); } else { fprintf(stderr, "Region: %s\n", opt::region.c_str()); itr = sam_itr_querys(bam_idx, hdr, opt::region.c_str()); hts_parse_reg(opt::region.c_str(), &clip_start, &clip_end); } #ifndef H5_HAVE_THREADSAFE if(opt::num_threads > 1) { fprintf(stderr, "You enabled multi-threading but you do not have a threadsafe HDF5\n"); fprintf(stderr, "Please recompile nanopolish's built-in libhdf5 or run with -t 1\n"); exit(1); } #endif // Initialize iteration std::vector<bam1_t*> records(opt::batch_size, NULL); for(size_t i = 0; i < records.size(); ++i) { records[i] = bam_init1(); } int result; size_t num_reads_realigned = 0; size_t num_records_buffered = 0; do { assert(num_records_buffered < records.size()); // read a record into the next slot in the buffer result = sam_itr_next(bam_fh, itr, records[num_records_buffered]); num_records_buffered += result >= 0; // realign if we've hit the max buffer size or reached the end of file if(num_records_buffered == records.size() || result < 0) { #pragma omp parallel for schedule(dynamic) for(size_t i = 0; i < 
num_records_buffered; ++i) { bam1_t* record = records[i]; size_t read_idx = num_reads_realigned + i; if( (record->core.flag & BAM_FUNMAP) == 0) { //load read std::string read_name = bam_get_qname(record); std::string fast5_path = name_map.get_path(read_name); SquiggleRead sr(read_name, fast5_path); // TODO: early exit when have processed all of the reads in readnames if (!opt::readnames.empty() && std::find(opt::readnames.begin(), opt::readnames.end(), read_name) == opt::readnames.end() ) continue; for(size_t strand_idx = 0; strand_idx < NUM_STRANDS; ++strand_idx) { std::vector<EventAlignment> ao = alignment_from_read(sr, strand_idx, read_idx, models, fai, hdr, record, clip_start, clip_end); if (ao.size() == 0) continue; // Update pore model based on alignment if ( opt::calibrate ) recalibrate_model(sr, strand_idx, ao, false); double score = model_score(sr, strand_idx, fai, ao, 500); if (score > 0) continue; #pragma omp critical(print) std::cout << read_name << " " << ( strand_idx ? "complement" : "template" ) << " " << sr.pore_model[strand_idx].name << " " << score << std::endl; } } } num_reads_realigned += num_records_buffered; num_records_buffered = 0; } } while(result >= 0); // cleanup records for(size_t i = 0; i < records.size(); ++i) { bam_destroy1(records[i]); } // cleanup sam_itr_destroy(itr); bam_hdr_destroy(hdr); fai_destroy(fai); sam_close(bam_fh); hts_idx_destroy(bam_idx); return 0; }
/*
 * Release the shared `fholder` (declared outside this function): destroy
 * its index if present, close its hts file handle if present, then free
 * the holder itself and reset the pointer.
 *
 * Does nothing when `fholder` is NULL, so a second call is harmless.
 */
void bam_access_closehts(){
	/* The holder must be checked before its members are dereferenced. */
	if(fholder == NULL) return;
	if(fholder->idx) hts_idx_destroy(fholder->idx);
	if(fholder->in) hts_close(fholder->in);
	free(fholder);
	/* Clear the pointer so a later call cannot double-free or use it. */
	fholder = NULL;
	return;
}
int main_bedcov(int argc, char *argv[]) { gzFile fp; kstring_t str; kstream_t *ks; hts_idx_t **idx; aux_t **aux; int *n_plp, dret, i, n, c, min_mapQ = 0; int64_t *cnt; const bam_pileup1_t **plp; int usage = 0; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0), { NULL, 0, NULL, 0 } }; while ((c = getopt_long(argc, argv, "Q:", lopts, NULL)) >= 0) { switch (c) { case 'Q': min_mapQ = atoi(optarg); break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ case '?': usage = 1; break; } if (usage) break; } if (usage || optind + 2 > argc) { fprintf(stderr, "Usage: samtools bedcov [options] <in.bed> <in1.bam> [...]\n\n"); fprintf(stderr, "Options:\n"); fprintf(stderr, " -Q <int> mapping quality threshold [0]\n"); sam_global_opt_help(stderr, "-.--."); return 1; } memset(&str, 0, sizeof(kstring_t)); n = argc - optind - 1; aux = calloc(n, sizeof(aux_t*)); idx = calloc(n, sizeof(hts_idx_t*)); for (i = 0; i < n; ++i) { aux[i] = calloc(1, sizeof(aux_t)); aux[i]->min_mapQ = min_mapQ; aux[i]->fp = sam_open_format(argv[i+optind+1], "r", &ga.in); if (aux[i]->fp) idx[i] = sam_index_load(aux[i]->fp, argv[i+optind+1]); if (aux[i]->fp == 0 || idx[i] == 0) { fprintf(stderr, "ERROR: fail to open index BAM file '%s'\n", argv[i+optind+1]); return 2; } // TODO bgzf_set_cache_size(aux[i]->fp, 20); aux[i]->header = sam_hdr_read(aux[i]->fp); if (aux[i]->header == NULL) { fprintf(stderr, "ERROR: failed to read header for '%s'\n", argv[i+optind+1]); return 2; } } cnt = calloc(n, 8); fp = gzopen(argv[optind], "rb"); ks = ks_init(fp); n_plp = calloc(n, sizeof(int)); plp = calloc(n, sizeof(bam_pileup1_t*)); while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) { char *p, *q; int tid, beg, end, pos; bam_mplp_t mplp; for (p = q = str.s; *p && *p != '\t'; ++p); if (*p != '\t') goto bed_error; *p = 0; tid = bam_name2id(aux[0]->header, q); *p = '\t'; if (tid < 0) goto bed_error; for (q = 
p = p + 1; isdigit(*p); ++p); if (*p != '\t') goto bed_error; *p = 0; beg = atoi(q); *p = '\t'; for (q = p = p + 1; isdigit(*p); ++p); if (*p == '\t' || *p == 0) { int c = *p; *p = 0; end = atoi(q); *p = c; } else goto bed_error; for (i = 0; i < n; ++i) { if (aux[i]->iter) hts_itr_destroy(aux[i]->iter); aux[i]->iter = sam_itr_queryi(idx[i], tid, beg, end); } mplp = bam_mplp_init(n, read_bam, (void**)aux); bam_mplp_set_maxcnt(mplp, 64000); memset(cnt, 0, 8 * n); while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) if (pos >= beg && pos < end) for (i = 0; i < n; ++i) cnt[i] += n_plp[i]; for (i = 0; i < n; ++i) { kputc('\t', &str); kputl(cnt[i], &str); } puts(str.s); bam_mplp_destroy(mplp); continue; bed_error: fprintf(stderr, "Errors in BED line '%s'\n", str.s); } free(n_plp); free(plp); ks_destroy(ks); gzclose(fp); free(cnt); for (i = 0; i < n; ++i) { if (aux[i]->iter) hts_itr_destroy(aux[i]->iter); hts_idx_destroy(idx[i]); bam_hdr_destroy(aux[i]->header); sam_close(aux[i]->fp); free(aux[i]); } free(aux); free(idx); free(str.s); sam_global_args_free(&ga); return 0; }
/*
 * Print the records of an indexed file that fall into the given regions.
 *
 * args   - reads targets_fname (optional extra region filter),
 *          print_header and header_only
 * fname  - input file; its detected format selects the code path
 * regs   - array of nregs region strings; each string and the array
 *          itself are freed before returning
 * nregs  - number of entries in regs
 *
 * BCF input is queried through its index and re-written to stdout via
 * hts_open("-","w"). VCF, SAM and unrecognised formats are queried through
 * a tabix index and matching lines are printed with puts(). BAM input is
 * rejected with an error. When args->targets_fname is set, records that do
 * not overlap the loaded region index are skipped.
 *
 * Returns 0; failures are reported through error().
 */
static int query_regions(args_t *args, char *fname, char **regs, int nregs)
{
    int i;
    htsFile *fp = hts_open(fname,"r");
    if ( !fp ) error("Could not read %s\n", fname);
    enum htsExactFormat format = hts_get_format(fp)->format;

    regidx_t *reg_idx = NULL;
    if ( args->targets_fname )
    {
        reg_idx = regidx_init(args->targets_fname, NULL, NULL, 0, NULL);
        if ( !reg_idx ) error("Could not read %s\n", args->targets_fname);
    }

    if ( format == bcf )
    {
        htsFile *out = hts_open("-","w");
        /* the format string has no conversion, so no argument is passed */
        if ( !out ) error("Could not open stdout\n");
        hts_idx_t *idx = bcf_index_load(fname);
        if ( !idx ) error("Could not load .csi index of %s\n", fname);
        bcf_hdr_t *hdr = bcf_hdr_read(fp);
        if ( !hdr ) error("Could not read the header: %s\n", fname);
        if ( args->print_header )
            if ( bcf_hdr_write(out,hdr)!=0 ) error("Failed to write to %s\n", fname);
        if ( !args->header_only )
        {
            bcf1_t *rec = bcf_init();
            for (i=0; i<nregs; i++)
            {
                hts_itr_t *itr = bcf_itr_querys(idx,hdr,regs[i]);
                while ( bcf_itr_next(fp, itr, rec) >=0 )
                {
                    /* optional targets filter on the record's reference span */
                    if ( reg_idx && !regidx_overlap(reg_idx, bcf_seqname(hdr,rec),rec->pos,rec->pos+rec->rlen-1, NULL) ) continue;
                    if ( bcf_write(out,hdr,rec)!=0 ) error("Failed to write to %s\n", fname);
                }
                /* BCF iterator: released with the bcf_* call, not the tabix alias */
                bcf_itr_destroy(itr);
            }
            bcf_destroy(rec);
        }
        if ( hts_close(out) ) error("hts_close returned non-zero status for stdout\n");
        bcf_hdr_destroy(hdr);
        hts_idx_destroy(idx);
    }
    else if ( format==vcf || format==sam || format==unknown_format )
    {
        tbx_t *tbx = tbx_index_load(fname);
        if ( !tbx ) error("Could not load .tbi/.csi index of %s\n", fname);
        kstring_t str = {0,0,0};
        if ( args->print_header )
        {
            /* header = leading lines that start with the index's meta character */
            while ( hts_getline(fp, KS_SEP_LINE, &str) >= 0 )
            {
                if ( !str.l || str.s[0]!=tbx->conf.meta_char ) break;
                puts(str.s);
            }
        }
        if ( !args->header_only )
        {
            int nseq;
            const char **seq = NULL;
            /* sequence names are only needed for the targets filter */
            if ( reg_idx ) seq = tbx_seqnames(tbx, &nseq);
            for (i=0; i<nregs; i++)
            {
                hts_itr_t *itr = tbx_itr_querys(tbx, regs[i]);
                if ( !itr ) continue;
                while (tbx_itr_next(fp, tbx, itr, &str) >= 0)
                {
                    if ( reg_idx && !regidx_overlap(reg_idx,seq[itr->curr_tid],itr->curr_beg,itr->curr_end-1, NULL) ) continue;
                    puts(str.s);
                }
                tbx_itr_destroy(itr);
            }
            free(seq);
        }
        free(str.s);
        tbx_destroy(tbx);
    }
    else if ( format==bam )
        error("Please use \"samtools view\" for querying BAM files.\n");

    if ( reg_idx ) regidx_destroy(reg_idx);
    if ( hts_close(fp) ) error("hts_close returned non-zero status: %s\n", fname);

    /* this function owns the region strings and the array holding them */
    for (i=0; i<nregs; i++) free(regs[i]);
    free(regs);
    return 0;
}
int convert(int argc, char **argv) { if (argc < 2) return convert_help(); int c; char *in=NULL, *out=NULL, *bim=NULL, *vid=NULL, *tmp_dir=NULL, *ped=NULL; uint32_t num_fields, num_records, col = 2; int i_is_set = 0, o_is_set = 0, f_is_set = 0, b_is_set = 0, v_is_set = 0, t_is_set = 0, p_is_set = 0, r_is_set = 0; while((c = getopt (argc, argv, "hi:o:f:r:b:v:t:p:c:")) != -1) { switch (c) { case 'c': col = atoi(optarg); break; case 'p': p_is_set = 1; ped = optarg; break; case 't': t_is_set = 1; tmp_dir = optarg; break; case 'v': v_is_set = 1; vid = optarg; break; case 'b': b_is_set = 1; bim = optarg; break; case 'i': i_is_set = 1; in = optarg; break; case 'o': o_is_set = 1; out = optarg; break; case 'f': f_is_set = 1; num_fields = atoi(optarg); break; case 'r': r_is_set = 1; num_records = atoi(optarg); break; case 'h': convert_help(); return 1; case '?': if ( (optopt == 'i') || (optopt == 'f') || (optopt == 'r') || (optopt == 't') || (optopt == 's') || (optopt == 'p') || (optopt == 'c') || (optopt == 'o') ) fprintf (stderr, "Option -%c requires an argument.\n", optopt); else if (isprint (optopt)) fprintf (stderr, "Unknown option `-%c'.\n", optopt); else fprintf (stderr, "Unknown option character `\\x%x'.\n", optopt); default: convert_help(); return 1; } } char *type = argv[0]; if (i_is_set == 0) { printf("Input file is not set\n"); return convert_help(); } if (strcmp(type, "bcf") == 0) { if ( (f_is_set == 0) || (r_is_set == 0) ) { fprintf(stderr,"Attempting to autodetect num of records " "and fields from %s\n", in); //Try and auto detect the sizes, need the index tbx_t *tbx = NULL; hts_idx_t *idx = NULL; htsFile *fp = hts_open(in,"rb"); if ( !fp ) { fprintf(stderr,"Could not read %s\n", in); return 1; } bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) { fprintf(stderr,"Could not read the header: %s\n", in); return 1; } if (hts_get_format(fp)->format==vcf) { tbx = tbx_index_load(in); if ( !tbx ) { fprintf(stderr,"Could not load TBI index: %s\n", in); return 1; } } else 
if ( hts_get_format(fp)->format==bcf ) { idx = bcf_index_load(in); if ( !idx ) { fprintf(stderr,"Could not load CSI index: %s\n", in); return 1; } } else { fprintf(stderr, "Could not detect the file type as VCF or BCF: %s\n", in); return 1; } num_fields = hdr->n[BCF_DT_SAMPLE]; num_records = 0; const char **seq; int nseq; seq = tbx ? tbx_seqnames(tbx, &nseq) : bcf_index_seqnames(idx, hdr, &nseq); int i; uint32_t sum = 0; for (i = 0; i < nseq; ++i) { uint64_t records, v; hts_idx_get_stat(tbx ? tbx->idx: idx, i, &records, &v); num_records += records; } fprintf(stderr, "Number of records:%u\tNumber of fields:%u\n", num_records, num_fields); free(seq); hts_close(fp); bcf_hdr_destroy(hdr); if (idx) hts_idx_destroy(idx); if (tbx) tbx_destroy(tbx); } if (o_is_set == 0) { out = (char*)malloc(strlen(in) + 5); // 5 for ext and \0 strcpy(out,in); strcat(out, ".gqt"); } if (b_is_set == 0) { bim = (char*)malloc(strlen(in) + 5); // 5 for ext and \0 strcpy(bim,in); strcat(bim, ".bim"); } if (v_is_set == 0) { vid = (char*)malloc(strlen(in) + 5); // 5 for ext and \0 strcpy(vid,in); strcat(vid, ".vid"); } if (t_is_set == 0) { tmp_dir = (char*)malloc(3*sizeof(char)); // "./\0" strcpy(tmp_dir,"./"); } int r = bcf_wahbm(in, out, bim, vid, tmp_dir, num_fields, num_records); return r; } if (strcmp(type, "ped") == 0) { if (o_is_set == 0) { if (p_is_set == 1) { out = (char*)malloc(strlen(ped) + 4); // 4 for ext and \0 strcpy(out,ped); strcat(out, ".db"); } else { out = (char*)malloc(strlen(in) + 4); // 4 for ext and \0 strcpy(out,in); strcat(out, ".db"); } } fprintf(stderr, "Creating sample database %s\n", out); return ped_ped(in, ped, col, out); } return convert_help(); }