예제 #1
0
파일: hmmsim.c 프로젝트: EddyRivasLab/hmmer
/* mpi_worker()
 * The main control for an MPI worker process.
 */
static void
mpi_worker(ESL_GETOPTS *go, struct cfg_s *cfg)
{
  int             xstatus = eslOK;
  int             status;
  P7_HMM         *hmm     = NULL;
  char           *wbuf    = NULL;
  double         *xv      = NULL; /* result: array of N scores */
  int            *av      = NULL; /* optional result: array of N alignment lengths */
  int             wn      = 0;
  char            errbuf[eslERRBUFSIZE];
  int             pos;
 
  /* Worker initializes */
  if ((status = minimum_mpi_working_buffer(go, cfg->N, &wn)) != eslOK) xstatus = status;
  ESL_ALLOC(wbuf, wn * sizeof(char));
  ESL_ALLOC(xv,   cfg->N * sizeof(double) + 2);	
  if (esl_opt_GetBoolean(go, "-a"))
    ESL_ALLOC(av, cfg->N * sizeof(int));

  /* Main worker loop */
  while (p7_hmm_mpi_Recv(0, 0, MPI_COMM_WORLD, &wbuf, &wn, &(cfg->abc), &hmm) == eslOK) 
    {
      if (esl_opt_GetBoolean(go, "--recal")) {
	if (( status = recalibrate_model(go, cfg, errbuf, hmm))     != eslOK) goto CLEANERROR;
      }
      if ((status = process_workunit(go, cfg, errbuf, hmm, xv, av)) != eslOK) goto CLEANERROR;

      pos = 0;
      MPI_Pack(&status, 1,      MPI_INT,    wbuf, wn, &pos, MPI_COMM_WORLD);
      MPI_Pack(xv,      cfg->N, MPI_DOUBLE, wbuf, wn, &pos, MPI_COMM_WORLD);
      if (esl_opt_GetBoolean(go, "-a"))
	MPI_Pack(av,    cfg->N, MPI_INT,    wbuf, wn, &pos, MPI_COMM_WORLD);
      MPI_Send(wbuf, pos, MPI_PACKED, 0, 0, MPI_COMM_WORLD);
      p7_hmm_Destroy(hmm);
    }

  free(wbuf);
  free(xv);
  if (av != NULL) free(av);
  return;

 CLEANERROR:
  pos = 0;
  MPI_Pack(&status, 1,                MPI_INT,  wbuf, wn, &pos, MPI_COMM_WORLD);
  MPI_Pack(errbuf,  eslERRBUFSIZE,    MPI_CHAR, wbuf, wn, &pos, MPI_COMM_WORLD);
  MPI_Send(wbuf, pos, MPI_PACKED, 0, 0, MPI_COMM_WORLD);
  if (wbuf != NULL) free(wbuf);
  if (hmm  != NULL) p7_hmm_Destroy(hmm);
  if (xv   != NULL) free(xv);
  if (av   != NULL) free(av);
  return;

 ERROR:
  p7_Fail("Allocation error in mpi_worker");
}
예제 #2
0
파일: hmmsim.c 프로젝트: EddyRivasLab/hmmer
static void
serial_master(ESL_GETOPTS *go, struct cfg_s *cfg)
{
  P7_HMM     *hmm = NULL;     
  double     *xv  = NULL;	/* results: array of N scores */
  int        *av  = NULL;	/* optional results: array of N alignment lengths */
  char        errbuf[eslERRBUFSIZE];
  int         status;


  if ((status = init_master_cfg(go, cfg, errbuf)) != eslOK) p7_Fail(errbuf);
  if ((xv = malloc(sizeof(double) * cfg->N)) == NULL)       p7_Fail("allocation failed");
  if (esl_opt_GetBoolean(go, "-a") && 
      (av = malloc(sizeof(int)    * cfg->N)) == NULL)       p7_Fail("allocation failed");

  while ((status = p7_hmmfile_Read(cfg->hfp, &(cfg->abc), &hmm)) != eslEOF) 
    {
      if      (status == eslEOD)       p7_Fail("read failed, HMM file %s may be truncated?", cfg->hmmfile);
      else if (status == eslEFORMAT)   p7_Fail("bad file format in HMM file %s",             cfg->hmmfile);
      else if (status == eslEINCOMPAT) p7_Fail("HMM file %s contains different alphabets",   cfg->hmmfile);
      else if (status != eslOK)        p7_Fail("Unexpected error in reading HMMs from %s",   cfg->hmmfile);

      if (cfg->bg == NULL) {
        if (esl_opt_GetBoolean(go, "--bgflat")) cfg->bg = p7_bg_CreateUniform(cfg->abc);
        else                                    cfg->bg = p7_bg_Create(cfg->abc);
        p7_bg_SetLength(cfg->bg, esl_opt_GetInteger(go, "-L"));  /* set the null model background length in both master and workers. */
      }


      if (esl_opt_GetBoolean(go, "--recal")) {
        if (recalibrate_model(go, cfg, errbuf, hmm)      != eslOK) p7_Fail(errbuf);
      }
      if (process_workunit(go, cfg, errbuf, hmm, xv, av) != eslOK) p7_Fail(errbuf);
      if (output_result   (go, cfg, errbuf, hmm, xv, av) != eslOK) p7_Fail(errbuf);

      p7_hmm_Destroy(hmm);      
    }
  free(xv);
  if (av != NULL) free(av);
}
// Update the training data with aligned events from a read
void add_aligned_events(const Fast5Map& name_map,
                        const faidx_t* fai,
                        const bam_hdr_t* hdr,
                        const bam1_t* record,
                        size_t read_idx,
                        int region_start,
                        int region_end,
                        size_t round,
                        ModelTrainingMap& training)
{
    // Load a squiggle read for the mapped read
    std::string read_name = bam_get_qname(record);
    std::string fast5_path = name_map.get_path(read_name);

    // load read
    SquiggleRead sr(read_name, fast5_path);

    // replace the models that are built into the read with the current trained model
    sr.replace_models(opt::trained_model_type);

    for(size_t strand_idx = 0; strand_idx < NUM_STRANDS; ++strand_idx) {

        // skip if 1D reads and this is the wrong strand
        if(!sr.has_events_for_strand(strand_idx)) {
            continue;
        }

        // set k
        uint32_t k = sr.pore_model[strand_idx].k;

        // Align to the new model
        EventAlignmentParameters params;
        params.sr = &sr;
        params.fai = fai;
        params.hdr = hdr;
        params.record = record;
        params.strand_idx = strand_idx;

        params.alphabet = mtrain_alphabet;
        params.read_idx = read_idx;
        params.region_start = region_start;
        params.region_end = region_end;
        std::vector<EventAlignment> alignment_output = align_read_to_ref(params);
        if (alignment_output.size() == 0)
            return;

        // Update pore model based on alignment
        std::string curr_model = sr.pore_model[strand_idx].metadata.get_short_name();
        double orig_score = -INFINITY;

        if (opt::output_scores) {
            orig_score = model_score(sr, strand_idx, fai, alignment_output, 500, NULL);

            #pragma omp critical(print)
            std::cout << round << " " << curr_model << " " << read_idx << " " << strand_idx << " Original " << orig_score << std::endl;
        }

        if ( opt::calibrate ) {
            double resid = 0.;
            recalibrate_model(sr, strand_idx, alignment_output, mtrain_alphabet, resid, true);

            if (opt::output_scores) {
                double rescaled_score = model_score(sr, strand_idx, fai, alignment_output, 500, NULL);
                #pragma omp critical(print)
                {
                    std::cout << round << " " << curr_model << " " << read_idx << " " << strand_idx << " Rescaled " << rescaled_score << std::endl;
                    std::cout << round << " " << curr_model << " " << read_idx << " " << strand_idx << " Delta " << rescaled_score-orig_score << std::endl;
                }
            }
        }

        // Get the training data for this model
        auto& emission_map = training[curr_model];

        for(size_t i = 0; i < alignment_output.size(); ++i) {
            const EventAlignment& ea = alignment_output[i];
            std::string model_kmer = ea.model_kmer;

            // Grab the previous/next model kmer from the alignment_output table.
            // If the read is from the same strand as the reference
            // the next kmer comes from the next alignment_output (and vice-versa)
            // other the indices are swapped
            int next_stride = ea.rc ? -1 : 1;

            std::string prev_kmer = "";
            std::string next_kmer = "";

            if(i > 0 && i < alignment_output.size() - 1) {

                // check that the event indices are correct for the next expected position
                assert(alignment_output[i + next_stride].event_idx - ea.event_idx == 1);
                assert(alignment_output[i - next_stride].event_idx - ea.event_idx == -1);

                // only set the previous/next when there was exactly one base of movement along the referenc
                if( std::abs(alignment_output[i + next_stride].ref_position - ea.ref_position) == 1) {
                    next_kmer = alignment_output[i + next_stride].model_kmer;
                }

                if( std::abs(alignment_output[i - next_stride].ref_position - ea.ref_position) == 1) {
                    prev_kmer = alignment_output[i - next_stride].model_kmer;
                }
            }

            // Get the rank of the kmer that we aligned to (on the sequencing strand, = model_kmer)
            uint32_t rank = mtrain_alphabet->kmer_rank(model_kmer.c_str(), k);
            assert(rank < emission_map.size());
            auto& kmer_summary = emission_map[rank];

            // We only use this event for training if its not at the end of the alignment
            // (to avoid bad alignments around the read edges) and if its not too short (to
            // avoid bad measurements from effecting the levels too much)
            bool use_for_training = i > opt::min_distance_from_alignment_end &&
                i + opt::min_distance_from_alignment_end < alignment_output.size() &&
                alignment_output[i].hmm_state == 'M' &&
                sr.get_duration( alignment_output[i].event_idx, strand_idx) >= opt::min_event_duration &&
                sr.get_fully_scaled_level(alignment_output[i].event_idx, strand_idx) >= 1.0;

            if(use_for_training) {
                StateTrainingData std(sr, ea, rank, prev_kmer, next_kmer);
                #pragma omp critical(kmer)
                kmer_summary.events.push_back(std);
            }

            if(ea.hmm_state == 'M')  {
                #pragma omp atomic
                kmer_summary.num_matches += 1;
            } else if(ea.hmm_state == 'E') {
                #pragma omp atomic
                kmer_summary.num_stays += 1;
            }
        }
    } // for strands
}
int scorereads_main(int argc, char** argv)
{
    parse_scorereads_options(argc, argv);
    omp_set_num_threads(opt::num_threads);

    Fast5Map name_map(opt::reads_file);
    ModelMap models;
    if (!opt::models_fofn.empty())
        models = read_models_fofn(opt::models_fofn);
    
    // Open the BAM and iterate over reads

    // load bam file
    htsFile* bam_fh = sam_open(opt::bam_file.c_str(), "r");
    assert(bam_fh != NULL);

    // load bam index file
    std::string index_filename = opt::bam_file + ".bai";
    hts_idx_t* bam_idx = bam_index_load(index_filename.c_str());
    assert(bam_idx != NULL);

    // read the bam header
    bam_hdr_t* hdr = sam_hdr_read(bam_fh);
    
    // load reference fai file
    faidx_t *fai = fai_load(opt::genome_file.c_str());

    hts_itr_t* itr;

    // If processing a region of the genome, only emit events aligned to this window
    int clip_start = -1;
    int clip_end = -1;

    if(opt::region.empty()) {
        // TODO: is this valid?
        itr = sam_itr_queryi(bam_idx, HTS_IDX_START, 0, 0);
    } else {

        fprintf(stderr, "Region: %s\n", opt::region.c_str());
        itr = sam_itr_querys(bam_idx, hdr, opt::region.c_str());
        hts_parse_reg(opt::region.c_str(), &clip_start, &clip_end);
    }

#ifndef H5_HAVE_THREADSAFE
    if(opt::num_threads > 1) {
        fprintf(stderr, "You enabled multi-threading but you do not have a threadsafe HDF5\n");
        fprintf(stderr, "Please recompile nanopolish's built-in libhdf5 or run with -t 1\n");
        exit(1);
    }
#endif

    // Initialize iteration
    std::vector<bam1_t*> records(opt::batch_size, NULL);
    for(size_t i = 0; i < records.size(); ++i) {
        records[i] = bam_init1();
    }

    int result;
    size_t num_reads_realigned = 0;
    size_t num_records_buffered = 0;

    do {
        assert(num_records_buffered < records.size());
        
        // read a record into the next slot in the buffer
        result = sam_itr_next(bam_fh, itr, records[num_records_buffered]);
        num_records_buffered += result >= 0;

        // realign if we've hit the max buffer size or reached the end of file
        if(num_records_buffered == records.size() || result < 0) {
            #pragma omp parallel for schedule(dynamic)
            for(size_t i = 0; i < num_records_buffered; ++i) {
                bam1_t* record = records[i];
                size_t read_idx = num_reads_realigned + i;
                if( (record->core.flag & BAM_FUNMAP) == 0) {

                    //load read
                    std::string read_name = bam_get_qname(record);
                    std::string fast5_path = name_map.get_path(read_name);
                    SquiggleRead sr(read_name, fast5_path);

                    // TODO: early exit when have processed all of the reads in readnames
                    if (!opt::readnames.empty() && 
                         std::find(opt::readnames.begin(), opt::readnames.end(), read_name) == opt::readnames.end() )
                            continue;

                    for(size_t strand_idx = 0; strand_idx < NUM_STRANDS; ++strand_idx) {
                        std::vector<EventAlignment> ao = alignment_from_read(sr, strand_idx, read_idx,
                                                                             models, fai, hdr,
                                                                             record, clip_start, clip_end);
                        if (ao.size() == 0)
                            continue;

                        // Update pore model based on alignment
                        if ( opt::calibrate ) 
                            recalibrate_model(sr, strand_idx, ao, false);

                        double score = model_score(sr, strand_idx, fai, ao, 500);
                        if (score > 0) 
                            continue;
                        #pragma omp critical(print)
                        std::cout << read_name << " " << ( strand_idx ? "complement" : "template" ) 
                                  << " " << sr.pore_model[strand_idx].name << " " << score << std::endl;
                    } 
                }
            }

            num_reads_realigned += num_records_buffered;
            num_records_buffered = 0;
        }

    } while(result >= 0);
    
    // cleanup records
    for(size_t i = 0; i < records.size(); ++i) {
        bam_destroy1(records[i]);
    }

    // cleanup
    sam_itr_destroy(itr);
    bam_hdr_destroy(hdr);
    fai_destroy(fai);
    sam_close(bam_fh);
    hts_idx_destroy(bam_idx);
    return 0;
}