/* mpi_worker()
 * The main control for an MPI worker process.
 *
 * Repeatedly receives an HMM via p7_hmm_mpi_Recv() until that call returns
 * something other than eslOK. For each model received it optionally
 * recalibrates the model (--recal), runs process_workunit(), and sends one
 * packed message to rank 0 containing, in order:
 *   - the status code (int),
 *   - cfg->N scores (double),
 *   - cfg->N alignment lengths (int), only when -a is set.
 *
 * If recalibration or the work unit fails, the worker instead sends the
 * failing status followed by the eslERRBUFSIZE bytes of errbuf to rank 0,
 * releases everything it holds, and returns.
 *
 * Failure to size or allocate the working buffers is fatal: p7_Fail() is
 * called.
 */
static void
mpi_worker(ESL_GETOPTS *go, struct cfg_s *cfg)
{
  int      status;                 /* also written by ESL_ALLOC() */
  P7_HMM  *hmm       = NULL;
  char    *wbuf      = NULL;       /* pack/unpack buffer for MPI messages */
  double  *xv        = NULL;       /* result: array of N scores */
  int     *av        = NULL;       /* optional result: array of N alignment lengths */
  int      wn        = 0;          /* allocated size of wbuf, in bytes */
  int      do_alilen = esl_opt_GetBoolean(go, "-a");   /* option is fixed for the whole run */
  char     errbuf[eslERRBUFSIZE];
  int      pos;

  /* Worker initializes.
   * A failure to size the buffer used to be recorded and then ignored, letting
   * the worker carry on with wn == 0; treat it as fatal like the allocation
   * failures below.
   */
  if ((status = minimum_mpi_working_buffer(go, cfg->N, &wn)) != eslOK)
    p7_Fail("Failed to determine MPI working buffer size in mpi_worker");

  ESL_ALLOC(wbuf, wn * sizeof(char));
  /* NOTE(review): this is N doubles plus 2 *bytes*, not N+2 doubles. Kept as in
   * the original; confirm whether (cfg->N + 2) * sizeof(double) was intended.
   */
  ESL_ALLOC(xv,   cfg->N * sizeof(double) + 2);
  if (do_alilen)
    ESL_ALLOC(av, cfg->N * sizeof(int));

  /* Main worker loop */
  while (p7_hmm_mpi_Recv(0, 0, MPI_COMM_WORLD, &wbuf, &wn, &(cfg->abc), &hmm) == eslOK)
    {
      if (esl_opt_GetBoolean(go, "--recal"))
        {
          if ((status = recalibrate_model(go, cfg, errbuf, hmm)) != eslOK) goto CLEANERROR;
        }
      if ((status = process_workunit(go, cfg, errbuf, hmm, xv, av)) != eslOK) goto CLEANERROR;

      /* Pack status, scores and (optionally) alignment lengths into one message. */
      pos = 0;
      MPI_Pack(&status, 1,      MPI_INT,    wbuf, wn, &pos, MPI_COMM_WORLD);
      MPI_Pack(xv,      cfg->N, MPI_DOUBLE, wbuf, wn, &pos, MPI_COMM_WORLD);
      if (do_alilen)
        MPI_Pack(av,    cfg->N, MPI_INT,    wbuf, wn, &pos, MPI_COMM_WORLD);
      MPI_Send(wbuf, pos, MPI_PACKED, 0, 0, MPI_COMM_WORLD);

      p7_hmm_Destroy(hmm);
      hmm = NULL;   /* never keep a pointer to a destroyed model across the next receive */
    }

  free(wbuf);
  free(xv);
  if (av != NULL) free(av);
  return;

 CLEANERROR:
  /* Report the failing status and the error text to rank 0, then clean up. */
  pos = 0;
  MPI_Pack(&status, 1,             MPI_INT,  wbuf, wn, &pos, MPI_COMM_WORLD);
  MPI_Pack(errbuf,  eslERRBUFSIZE, MPI_CHAR, wbuf, wn, &pos, MPI_COMM_WORLD);
  MPI_Send(wbuf, pos, MPI_PACKED, 0, 0, MPI_COMM_WORLD);
  if (wbuf != NULL) free(wbuf);
  if (hmm  != NULL) p7_hmm_Destroy(hmm);
  if (xv   != NULL) free(xv);
  if (av   != NULL) free(av);
  return;

 ERROR:   /* target of ESL_ALLOC() on allocation failure */
  p7_Fail("Allocation error in mpi_worker");
}
/* serial_master()
 * The main control for the serial (non-MPI) version.
 *
 * Initializes the master configuration, allocates the result arrays, then
 * reads HMMs one at a time from cfg->hfp until eslEOF. For each model:
 *   - the null model cfg->bg is created on first use (uniform if --bgflat)
 *     and its length set from -L;
 *   - the model is recalibrated if --recal is set;
 *   - process_workunit() fills the results and output_result() reports them.
 * Any failure is fatal and reported through p7_Fail().
 */
static void
serial_master(ESL_GETOPTS *go, struct cfg_s *cfg)
{
  P7_HMM *model   = NULL;
  double *scores  = NULL;    /* results: array of N scores */
  int    *alilens = NULL;    /* optional results: array of N alignment lengths */
  char    errbuf[eslERRBUFSIZE];
  int     status;

  status = init_master_cfg(go, cfg, errbuf);
  if (status != eslOK) p7_Fail(errbuf);

  scores = malloc(sizeof(double) * cfg->N);
  if (scores == NULL) p7_Fail("allocation failed");

  /* Alignment lengths are only collected when -a is given. */
  if (esl_opt_GetBoolean(go, "-a"))
    {
      alilens = malloc(sizeof(int) * cfg->N);
      if (alilens == NULL) p7_Fail("allocation failed");
    }

  for (;;)
    {
      status = p7_hmmfile_Read(cfg->hfp, &(cfg->abc), &model);
      if (status == eslEOF) break;

      /* Translate each read failure into its specific message. */
      if (status != eslOK)
        {
          if      (status == eslEOD)       p7_Fail("read failed, HMM file %s may be truncated?", cfg->hmmfile);
          else if (status == eslEFORMAT)   p7_Fail("bad file format in HMM file %s", cfg->hmmfile);
          else if (status == eslEINCOMPAT) p7_Fail("HMM file %s contains different alphabets", cfg->hmmfile);
          else                             p7_Fail("Unexpected error in reading HMMs from %s", cfg->hmmfile);
        }

      /* The background model is built once, when the first HMM has been read. */
      if (cfg->bg == NULL)
        {
          cfg->bg = esl_opt_GetBoolean(go, "--bgflat") ? p7_bg_CreateUniform(cfg->abc)
                                                       : p7_bg_Create(cfg->abc);
          /* set the null model background length in both master and workers. */
          p7_bg_SetLength(cfg->bg, esl_opt_GetInteger(go, "-L"));
        }

      if (esl_opt_GetBoolean(go, "--recal") &&
          recalibrate_model(go, cfg, errbuf, model) != eslOK)
        p7_Fail(errbuf);

      status = process_workunit(go, cfg, errbuf, model, scores, alilens);
      if (status != eslOK) p7_Fail(errbuf);

      status = output_result(go, cfg, errbuf, model, scores, alilens);
      if (status != eslOK) p7_Fail(errbuf);

      p7_hmm_Destroy(model);
    }

  free(scores);
  free(alilens);   /* free(NULL) is a no-op when -a was not requested */
}
// Update the training data with aligned events from a read void add_aligned_events(const Fast5Map& name_map, const faidx_t* fai, const bam_hdr_t* hdr, const bam1_t* record, size_t read_idx, int region_start, int region_end, size_t round, ModelTrainingMap& training) { // Load a squiggle read for the mapped read std::string read_name = bam_get_qname(record); std::string fast5_path = name_map.get_path(read_name); // load read SquiggleRead sr(read_name, fast5_path); // replace the models that are built into the read with the current trained model sr.replace_models(opt::trained_model_type); for(size_t strand_idx = 0; strand_idx < NUM_STRANDS; ++strand_idx) { // skip if 1D reads and this is the wrong strand if(!sr.has_events_for_strand(strand_idx)) { continue; } // set k uint32_t k = sr.pore_model[strand_idx].k; // Align to the new model EventAlignmentParameters params; params.sr = &sr; params.fai = fai; params.hdr = hdr; params.record = record; params.strand_idx = strand_idx; params.alphabet = mtrain_alphabet; params.read_idx = read_idx; params.region_start = region_start; params.region_end = region_end; std::vector<EventAlignment> alignment_output = align_read_to_ref(params); if (alignment_output.size() == 0) return; // Update pore model based on alignment std::string curr_model = sr.pore_model[strand_idx].metadata.get_short_name(); double orig_score = -INFINITY; if (opt::output_scores) { orig_score = model_score(sr, strand_idx, fai, alignment_output, 500, NULL); #pragma omp critical(print) std::cout << round << " " << curr_model << " " << read_idx << " " << strand_idx << " Original " << orig_score << std::endl; } if ( opt::calibrate ) { double resid = 0.; recalibrate_model(sr, strand_idx, alignment_output, mtrain_alphabet, resid, true); if (opt::output_scores) { double rescaled_score = model_score(sr, strand_idx, fai, alignment_output, 500, NULL); #pragma omp critical(print) { std::cout << round << " " << curr_model << " " << read_idx << " " << strand_idx << " 
Rescaled " << rescaled_score << std::endl; std::cout << round << " " << curr_model << " " << read_idx << " " << strand_idx << " Delta " << rescaled_score-orig_score << std::endl; } } } // Get the training data for this model auto& emission_map = training[curr_model]; for(size_t i = 0; i < alignment_output.size(); ++i) { const EventAlignment& ea = alignment_output[i]; std::string model_kmer = ea.model_kmer; // Grab the previous/next model kmer from the alignment_output table. // If the read is from the same strand as the reference // the next kmer comes from the next alignment_output (and vice-versa) // other the indices are swapped int next_stride = ea.rc ? -1 : 1; std::string prev_kmer = ""; std::string next_kmer = ""; if(i > 0 && i < alignment_output.size() - 1) { // check that the event indices are correct for the next expected position assert(alignment_output[i + next_stride].event_idx - ea.event_idx == 1); assert(alignment_output[i - next_stride].event_idx - ea.event_idx == -1); // only set the previous/next when there was exactly one base of movement along the referenc if( std::abs(alignment_output[i + next_stride].ref_position - ea.ref_position) == 1) { next_kmer = alignment_output[i + next_stride].model_kmer; } if( std::abs(alignment_output[i - next_stride].ref_position - ea.ref_position) == 1) { prev_kmer = alignment_output[i - next_stride].model_kmer; } } // Get the rank of the kmer that we aligned to (on the sequencing strand, = model_kmer) uint32_t rank = mtrain_alphabet->kmer_rank(model_kmer.c_str(), k); assert(rank < emission_map.size()); auto& kmer_summary = emission_map[rank]; // We only use this event for training if its not at the end of the alignment // (to avoid bad alignments around the read edges) and if its not too short (to // avoid bad measurements from effecting the levels too much) bool use_for_training = i > opt::min_distance_from_alignment_end && i + opt::min_distance_from_alignment_end < alignment_output.size() && 
alignment_output[i].hmm_state == 'M' && sr.get_duration( alignment_output[i].event_idx, strand_idx) >= opt::min_event_duration && sr.get_fully_scaled_level(alignment_output[i].event_idx, strand_idx) >= 1.0; if(use_for_training) { StateTrainingData std(sr, ea, rank, prev_kmer, next_kmer); #pragma omp critical(kmer) kmer_summary.events.push_back(std); } if(ea.hmm_state == 'M') { #pragma omp atomic kmer_summary.num_matches += 1; } else if(ea.hmm_state == 'E') { #pragma omp atomic kmer_summary.num_stays += 1; } } } // for strands }
int scorereads_main(int argc, char** argv) { parse_scorereads_options(argc, argv); omp_set_num_threads(opt::num_threads); Fast5Map name_map(opt::reads_file); ModelMap models; if (!opt::models_fofn.empty()) models = read_models_fofn(opt::models_fofn); // Open the BAM and iterate over reads // load bam file htsFile* bam_fh = sam_open(opt::bam_file.c_str(), "r"); assert(bam_fh != NULL); // load bam index file std::string index_filename = opt::bam_file + ".bai"; hts_idx_t* bam_idx = bam_index_load(index_filename.c_str()); assert(bam_idx != NULL); // read the bam header bam_hdr_t* hdr = sam_hdr_read(bam_fh); // load reference fai file faidx_t *fai = fai_load(opt::genome_file.c_str()); hts_itr_t* itr; // If processing a region of the genome, only emit events aligned to this window int clip_start = -1; int clip_end = -1; if(opt::region.empty()) { // TODO: is this valid? itr = sam_itr_queryi(bam_idx, HTS_IDX_START, 0, 0); } else { fprintf(stderr, "Region: %s\n", opt::region.c_str()); itr = sam_itr_querys(bam_idx, hdr, opt::region.c_str()); hts_parse_reg(opt::region.c_str(), &clip_start, &clip_end); } #ifndef H5_HAVE_THREADSAFE if(opt::num_threads > 1) { fprintf(stderr, "You enabled multi-threading but you do not have a threadsafe HDF5\n"); fprintf(stderr, "Please recompile nanopolish's built-in libhdf5 or run with -t 1\n"); exit(1); } #endif // Initialize iteration std::vector<bam1_t*> records(opt::batch_size, NULL); for(size_t i = 0; i < records.size(); ++i) { records[i] = bam_init1(); } int result; size_t num_reads_realigned = 0; size_t num_records_buffered = 0; do { assert(num_records_buffered < records.size()); // read a record into the next slot in the buffer result = sam_itr_next(bam_fh, itr, records[num_records_buffered]); num_records_buffered += result >= 0; // realign if we've hit the max buffer size or reached the end of file if(num_records_buffered == records.size() || result < 0) { #pragma omp parallel for schedule(dynamic) for(size_t i = 0; i < 
num_records_buffered; ++i) { bam1_t* record = records[i]; size_t read_idx = num_reads_realigned + i; if( (record->core.flag & BAM_FUNMAP) == 0) { //load read std::string read_name = bam_get_qname(record); std::string fast5_path = name_map.get_path(read_name); SquiggleRead sr(read_name, fast5_path); // TODO: early exit when have processed all of the reads in readnames if (!opt::readnames.empty() && std::find(opt::readnames.begin(), opt::readnames.end(), read_name) == opt::readnames.end() ) continue; for(size_t strand_idx = 0; strand_idx < NUM_STRANDS; ++strand_idx) { std::vector<EventAlignment> ao = alignment_from_read(sr, strand_idx, read_idx, models, fai, hdr, record, clip_start, clip_end); if (ao.size() == 0) continue; // Update pore model based on alignment if ( opt::calibrate ) recalibrate_model(sr, strand_idx, ao, false); double score = model_score(sr, strand_idx, fai, ao, 500); if (score > 0) continue; #pragma omp critical(print) std::cout << read_name << " " << ( strand_idx ? "complement" : "template" ) << " " << sr.pore_model[strand_idx].name << " " << score << std::endl; } } } num_reads_realigned += num_records_buffered; num_records_buffered = 0; } } while(result >= 0); // cleanup records for(size_t i = 0; i < records.size(); ++i) { bam_destroy1(records[i]); } // cleanup sam_itr_destroy(itr); bam_hdr_destroy(hdr); fai_destroy(fai); sam_close(bam_fh); hts_idx_destroy(bam_idx); return 0; }