Ejemplo n.º 1
0
/**
 * R entry point: base-calls every row of a signal matrix with DPTreephaser.
 *
 * @param Rsignal      numeric matrix; rows are reads, columns are flows
 * @param RkeyFlow     integer vector of key flows handed to SetDataAndKeyNormalize
 * @param RflowCycle   flow order string; must be at least as long as the number of flows
 * @param Rcf          first model parameter passed to SetModelParameters
 * @param Rie          second model parameter passed to SetModelParameters
 * @param Rdr          third model parameter; only used by "dp-treephaser", the others pass 0
 * @param Rbasecaller  one of "treephaser-swan", "dp-treephaser", "treephaser-adaptive"
 * @return an R list with elements "seq", "predicted", "residual" and "hpFlow",
 *         or R_NilValue when the signal matrix has no rows.
 *
 * Invalid arguments and C++ exceptions are reported to R through Rf_error().
 */
RcppExport SEXP treePhaser(SEXP Rsignal, SEXP RkeyFlow, SEXP RflowCycle, SEXP Rcf, SEXP Rie, SEXP Rdr, SEXP Rbasecaller)
{
  SEXP ret = R_NilValue;
  char *exceptionMesg = NULL;

  try {
    RcppMatrix<double>   signal(Rsignal);
    RcppVector<int>      keyFlow(RkeyFlow);
    string flowCycle   = Rcpp::as<string>(RflowCycle);
    double cf          = Rcpp::as<double>(Rcf);
    double ie          = Rcpp::as<double>(Rie);
    double dr          = Rcpp::as<double>(Rdr);
    string basecaller  = Rcpp::as<string>(Rbasecaller);
  
    unsigned int nFlow = signal.cols();
    unsigned int nRead = signal.rows();

    // The algorithm choice does not change between reads, so resolve it once.
    const bool useDpTreephaser = (basecaller == "dp-treephaser");
    const bool useAdaptive     = (basecaller == "treephaser-adaptive");

    if(basecaller != "treephaser-swan" && !useDpTreephaser && !useAdaptive) {
      std::string exception = "base value for basecaller supplied: " + basecaller;
      exceptionMesg = copyMessageToR(exception.c_str());
    } else if (flowCycle.length() < nFlow) {
      std::string exception = "Flow cycle is shorter than number of flows to solve";
      exceptionMesg = copyMessageToR(exception.c_str());
    } else {

      // Prepare objects for holding and passing back results
      RcppMatrix<double>        predicted_out(nRead,nFlow);
      RcppMatrix<double>        residual_out(nRead,nFlow);
      RcppMatrix<int>           hpFlow_out(nRead,nFlow);
      std::vector< std::string> seq_out(nRead);

      // Set up key flow vector
      int nKeyFlow = keyFlow.size(); 
      vector <int> keyVec(nKeyFlow);
      for(int iFlow=0; iFlow < nKeyFlow; iFlow++)
        keyVec[iFlow] = keyFlow(iFlow);

      // Iterate over all reads
      vector <float> sigVec(nFlow);
      for(unsigned int iRead=0; iRead < nRead; iRead++) {
        for(unsigned int iFlow=0; iFlow < nFlow; iFlow++)
          sigVec[iFlow] = (float) signal(iRead,iFlow);
        BasecallerRead read;
        read.SetDataAndKeyNormalize(&(sigVec[0]), (int)nFlow, &(keyVec[0]), nKeyFlow-1);

        // A fresh solver per read, as before, so no solver state carries over between reads.
        DPTreephaser dpTreephaser(flowCycle.c_str(), flowCycle.length(), 8);
        if (useDpTreephaser)
          dpTreephaser.SetModelParameters(cf, ie, dr);
        else
          dpTreephaser.SetModelParameters(cf, ie, 0); // Adaptive normalization
          
        // Execute the iterative solving-normalization routine
        if (useDpTreephaser)
          dpTreephaser.NormalizeAndSolve4(read, nFlow);
        else if (useAdaptive)
          dpTreephaser.NormalizeAndSolve3(read, nFlow); // Adaptive normalization
        else
          dpTreephaser.NormalizeAndSolve5(read, nFlow); // sliding window adaptive normalization

        read.flowToString(flowCycle,seq_out[iRead]);
        for(unsigned int iFlow=0; iFlow < nFlow; iFlow++) {
          predicted_out(iRead,iFlow) = (double) read.prediction[iFlow];
          residual_out(iRead,iFlow)  = (double) read.normalizedMeasurements[iFlow] - read.prediction[iFlow];
          hpFlow_out(iRead,iFlow)    = (int)    read.solution[iFlow];
        }
      }

      // Store results once, after all reads are solved. Building the result set inside
      // the loop re-copied every output matrix per read. The nRead guard keeps the
      // previous behaviour of returning R_NilValue when there is nothing to solve.
      if (nRead > 0) {
        RcppResultSet rs;
        rs.add("seq",        seq_out);
        rs.add("predicted",  predicted_out);
        rs.add("residual",   residual_out);
        rs.add("hpFlow",     hpFlow_out);

        ret = rs.getReturnList();
      }
    }
  } catch(std::exception& ex) {
    exceptionMesg = copyMessageToR(ex.what());
  } catch(...) {
    exceptionMesg = copyMessageToR("unknown reason");
  }
    
  // Rf_error takes a printf-style format; never pass the message itself as the format,
  // since it may contain '%' (e.g. from the user-supplied basecaller name).
  if(exceptionMesg != NULL)
    Rf_error("%s", exceptionMesg);

  return ret;
}
Ejemplo n.º 2
0
    void AddElement(int tf, float *rawValues, uint16_t *corValues, int len, int x, int y, char *flowOrder,
                    const string & genome, const string & calls) {
        if ((!active) || (numRows <= 0))
            return;

        int numFlows = len;
        int flowOrderLength = strlen(flowOrder);

        //
        // Use alignments to generate "synchronized" reference and read ionograms
        //

        int numBases = min(genome.length(),calls.length());
        vector<int> refIonogram(numFlows, 0);
        vector<int> readIonogram(numFlows, 0);

        int numFlowsRead = 0;
        int numFlowsRef = 0;
        char gC = flowOrder[0];
        int gBC = 0;

        for (int iBase = 0; (iBase < numBases) && (numFlowsRead < numFlows) && (numFlowsRef < numFlows); iBase++) {

            // Conversion for reads (independent of reference)
            if (calls[iBase] != '-') {
                while ((calls[iBase] != flowOrder[numFlowsRead % flowOrderLength]) && (numFlowsRead < numFlows))
                    numFlowsRead++;
                if (numFlowsRead < numFlows)
                    readIonogram[numFlowsRead]++;
            }

            if (genome[iBase] != '-') {

                if (genome[iBase] != gC) {
                    // Since a new homopolymer begins, need to drop off the old one
                    while ((gC != flowOrder[numFlowsRef % flowOrderLength]) && (numFlowsRef < numFlows)) {
                        numFlowsRef++;
                        if (numFlowsRef < numFlows)
                            refIonogram[numFlowsRef] = 0;
                    }
                    if (numFlowsRef < numFlows)
                        refIonogram[numFlowsRef] = gBC;

                    gC = genome[iBase];
                    gBC = 0;
                }
                gBC++;

                if (genome[iBase] == calls[iBase])
                    numFlowsRef = numFlowsRead;
            }
        }


        for (int iFlow = 8; (iFlow < numFlowsRef-20) && (iFlow < numFlowsRead-20); iFlow++) {

            int baseIdx = 0;
            if      (flowOrder[iFlow % flowOrderLength] == 'C') baseIdx = 1;
            else if (flowOrder[iFlow % flowOrderLength] == 'G') baseIdx = 2;
            else if (flowOrder[iFlow % flowOrderLength] == 'T') baseIdx = 3;

            if (readIonogram[iFlow] == refIonogram[iFlow])
                tfCallCorrect2[tf][refIonogram[iFlow] + baseIdx*maxTFHPHist]++;
            else if (readIonogram[iFlow] > refIonogram[iFlow])
                tfCallOver2[tf][refIonogram[iFlow] + baseIdx*maxTFHPHist]++;
            else
                tfCallUnder2[tf][refIonogram[iFlow] + baseIdx*maxTFHPHist]++;
        }

        // Sparkline data
        for (int iFlow = 0; (iFlow < numFlowsRef) && (iFlow < numFlowsRead) && (iFlow < maxTFSparklineFlows); iFlow++) {
            tfCallTotal3[tf][iFlow]++;
            if (readIonogram[iFlow] == refIonogram[iFlow])
                tfCallCorrect3[tf][iFlow]++;
        }



        tfCount[tf]++;

        BasecallerRead well;
        well.SetDataAndKeyNormalize(rawValues, len, &(refIonogram[0]), 7);
        int lastCalledFlow = 0;
        for (int iFlow = 0; (iFlow < len); iFlow++) {
            well.solution[iFlow] = (char)((corValues[iFlow]+50)/100);
            if (well.solution[iFlow] > 0)
                lastCalledFlow = iFlow;
        }


        int cafieYinc = ceil(numRows / (double)numRegionRows);
        int cafieXinc = ceil(numCols / (double)numRegionCols);
        int iRegion = (y / cafieYinc) + (x / cafieXinc) * numRegionRows;

        DPTreephaser dpTreephaser(flowOrder, len, 8);
        dpTreephaser.SetModelParameters(CFbyRegion[iRegion], IEbyRegion[iRegion], 0);
        dpTreephaser.Simulate3(well, len);
        well.FitBaselineVector((len+49)/50,50);
        well.FitNormalizerVector((len+49)/50,50);

        for (int iFlow = 0; (iFlow < len) && (iFlow < numFlowsTFClassify); iFlow++) {
            avgTFSignal[tf][iFlow] += well.normalizedMeasurements[iFlow];
            avgTFCorrected[tf][iFlow] += corValues[iFlow] / 100.0;
            avgTFSignalSquared[tf][iFlow] += well.normalizedMeasurements[iFlow] * well.normalizedMeasurements[iFlow];

            int quantizedTFSignal = (int) rint(40.0 * well.normalizedMeasurements[iFlow]);
            quantizedTFSignal = min(max(quantizedTFSignal,0),maxTFSignalHist-1);
            tfSignalHist[tf][iFlow * maxTFSignalHist + quantizedTFSignal]++;
        }



    }
Ejemplo n.º 3
0
void BaseCallerLite::BasecallerWorker()
{

  while (true) {

    deque<int> wellX;
    deque<int> wellY;
    deque<vector<float> > wellMeasurements;

    pthread_mutex_lock(&wellsAccessMutex);

    if (nextRegionY >= numRegionsY) {
      pthread_mutex_unlock(&wellsAccessMutex);
      return;
    }

    int currentRegionX = nextRegionX;
    int currentRegionY = nextRegionY;
    int currentRegion = currentRegionX + numRegionsX * currentRegionY;


    int beginY = currentRegionY * regionYSize;
    int beginX = currentRegionX * regionXSize;
    int endY = min((currentRegionY+1) * regionYSize,rows);
    int endX = min((currentRegionX+1) * regionXSize,cols);
    wellsPtr->SetChunk(beginY, endY-beginY, beginX, endX-beginX, 0, numFlows);
    wellsPtr->ReadWells();
    for (int y = beginY; y < endY; y++) {
      for (int x = beginX; x < endX; x++) {
        if (!maskPtr->Match(x, y, MaskLib))
          continue;

        wellX.push_back(x);
        wellY.push_back(y);
        wellMeasurements.push_back(vector<float>());
        wellMeasurements.back().resize(numFlows);

        const WellData *w = wellsPtr->ReadXY(x, y);
        copy(w->flowValues, w->flowValues + numFlows, wellMeasurements.back().begin());
      }
    }

    if (currentRegionX == 0)
      printf("% 5d/% 5d: ", currentRegionY*regionYSize, rows);
    if (wellX.size() == 0)
      printf("  ");
    else if (wellX.size() < 750)
      printf(". ");
    else if (wellX.size() < 1500)
      printf("o ");
    else if (wellX.size() < 2250)
      printf("# ");
    else
      printf("$ ");

    nextRegionX++;
    if (nextRegionX == numRegionsX) {
      nextRegionX = 0;
      nextRegionY++;
      printf("\n");
    }
    fflush(NULL);

    pthread_mutex_unlock(&wellsAccessMutex);


    BasecallerRead currentRead;
    DPTreephaser dpTreephaser(flowOrder);
    dpTreephaser.SetModelParameters(CF, IE, 0);

    // Process the data
    deque<SFFEntry> libReads;

    deque<int>::iterator x = wellX.begin();
    deque<int>::iterator y = wellY.begin();
    deque<std::vector<float> >::iterator measurements = wellMeasurements.begin();

    for (; x != wellX.end() ; x++, y++, measurements++) {

      if (!maskPtr->Match(*x, *y, (MaskType)(MaskLib|MaskKeypass), MATCH_ALL))
        continue;

      libReads.push_back(SFFEntry());
      SFFEntry& readResults = libReads.back();
      stringstream wellNameStream;
      wellNameStream << runId << ":" << (*y) << ":" << (*x);
      readResults.name = wellNameStream.str();
      readResults.clip_qual_left = 4; // TODO
      readResults.clip_qual_right = 0;
      readResults.clip_adapter_left = 0;
      readResults.clip_adapter_right = 0;
      readResults.flowgram.resize(numFlows);

      int minReadLength = 8; // TODO

      currentRead.SetDataAndKeyNormalize(&(measurements->at(0)), numFlows, &libKeyFlows[0], libNumKeyFlows - 1);

      dpTreephaser.NormalizeAndSolve5(currentRead, numFlows); // sliding window adaptive normalization

      readResults.n_bases = 0;
      for (int iFlow = 0; iFlow < numFlows; iFlow++) {
        readResults.flowgram[iFlow] = 100 * currentRead.solution[iFlow];
        readResults.n_bases += currentRead.solution[iFlow];
      }

      if(readResults.n_bases < minReadLength) {
        libReads.pop_back();
        continue;
      }

      bool isFailKeypass = false;
      for (int iFlow = 0; iFlow < (libNumKeyFlows-1); iFlow++)
        if (libKeyFlows[iFlow] != currentRead.solution[iFlow])
          isFailKeypass = true;

      if(isFailKeypass) {
        libReads.pop_back();
        continue;
      }

      readResults.flow_index.reserve(readResults.n_bases);
      readResults.bases.reserve(readResults.n_bases);
      readResults.quality.reserve(readResults.n_bases);

      unsigned int prev_used_flow = 0;
      for (int iFlow = 0; iFlow < numFlows; iFlow++) {
        for (hpLen_t hp = 0; hp < currentRead.solution[iFlow]; hp++) {
          readResults.flow_index.push_back(1 + iFlow - prev_used_flow);
          readResults.bases.push_back(flowOrder[iFlow]);
          readResults.quality.push_back(20); // BaseCallerLite is stripped of QV generator
          prev_used_flow = iFlow + 1;
        }
      }

    }

    libSFF.WriteRegion(currentRegion,libReads);
  }
}
Ejemplo n.º 4
0
void * BasecallerWorker(void *input)
{
    BaseCallerContext& bc = *static_cast<BaseCallerContext*>(input);

    RawWells wells ("", bc.filename_wells.c_str());
    pthread_mutex_lock(&bc.mutex);
    wells.OpenForIncrementalRead();
    pthread_mutex_unlock(&bc.mutex);

    vector<float> residual(bc.flow_order.num_flows(), 0);
    vector<float> scaled_residual(bc.flow_order.num_flows(), 0);
    vector<float> wells_measurements(bc.flow_order.num_flows(), 0);
    vector<float> local_noise(bc.flow_order.num_flows(), 0);
    vector<float> minus_noise_overlap(bc.flow_order.num_flows(), 0);
    vector<float> homopolymer_rank(bc.flow_order.num_flows(), 0);
    vector<float> neighborhood_noise(bc.flow_order.num_flows(), 0);
    vector<float> phasing_parameters(3);
    vector<uint16_t>  flowgram(bc.flow_order.num_flows());
    vector<int16_t>   flowgram2(bc.flow_order.num_flows());
    vector<int16_t> filtering_details(13,0);

    vector<char> abParams;
    abParams.reserve(256);

    vector<uint8_t>   quality(3*bc.flow_order.num_flows());
    vector<int>       base_to_flow (3*bc.flow_order.num_flows());             //!< Flow of in-phase incorporation of each base.

    TreephaserSSE treephaser_sse(bc.flow_order, bc.windowSize);
    DPTreephaser  treephaser(bc.flow_order, bc.windowSize);
    treephaser.SetStateProgression(bc.diagonal_state_prog);
    treephaser.SkipRecalDuringNormalization(bc.skip_recal_during_norm);
    treephaser_sse.SkipRecalDuringNormalization(bc.skip_recal_during_norm);


    while (true) {

        //
        // Step 1. Retrieve next unprocessed region
        //

        pthread_mutex_lock(&bc.mutex);

        int current_region, begin_x, begin_y, end_x, end_y;
        if (not bc.chip_subset.GetCurrentRegionAndIncrement(current_region, begin_x, end_x, begin_y, end_y)) {
           wells.Close();
           pthread_mutex_unlock(&bc.mutex);
           return NULL;
        }

        int num_usable_wells = 0;
        for (int y = begin_y; y < end_y; ++y)
            for (int x = begin_x; x < end_x; ++x)
                if (bc.class_map[x + y * bc.chip_subset.GetChipSizeX()] >= 0)
                    num_usable_wells++;

        if      (begin_x == 0)            printf("\n% 5d/% 5d: ", begin_y, bc.chip_subset.GetChipSizeY());
        if      (num_usable_wells ==   0) printf("  ");
        else if (num_usable_wells <  750) printf(". ");
        else if (num_usable_wells < 1500) printf("o ");
        else if (num_usable_wells < 2250) printf("# ");
        else                              printf("##");
        fflush(NULL);

        if (begin_x == 0)
            SaveBaseCallerProgress(10 + (80*begin_y)/bc.chip_subset.GetChipSizeY(), bc.output_directory);

        pthread_mutex_unlock(&bc.mutex);

        // Process the data
        deque<ProcessedRead> lib_reads;                // Collection of template library reads
        deque<ProcessedRead> tf_reads;                 // Collection of test fragment reads
        deque<ProcessedRead> calib_reads;              // Collection of calibration library reads
        deque<ProcessedRead> unfiltered_reads;         // Random subset of lib_reads
        deque<ProcessedRead> unfiltered_trimmed_reads; // Random subset of lib_reads

        if (num_usable_wells == 0) { // There is nothing in this region. Don't even bother reading it
            bc.lib_writer.WriteRegion(current_region, lib_reads);
            if (bc.have_calibration_panel)
                bc.calib_writer.WriteRegion(current_region, calib_reads);
            if (bc.process_tfs)
                bc.tf_writer.WriteRegion(current_region, tf_reads);
            if (!bc.unfiltered_set.empty()) {
                bc.unfiltered_writer.WriteRegion(current_region,unfiltered_reads);
                bc.unfiltered_trimmed_writer.WriteRegion(current_region,unfiltered_trimmed_reads);
            }
            continue;
        }

        wells.SetChunk(begin_y, end_y-begin_y, begin_x, end_x-begin_x, 0, bc.flow_order.num_flows());
        wells.ReadWells();

        for (int y = begin_y; y < end_y; ++y)
            for (int x = begin_x; x < end_x; ++x) {   // Loop over wells within current region

                //
                // Step 2. Retrieve additional information needed to process this read
                //

                unsigned int read_index = x + y * bc.chip_subset.GetChipSizeX();
                int read_class = bc.class_map[read_index];
                if (read_class < 0)
                    continue;
                bool is_random_calibration_read = false;
                if (read_class == 2){
                  is_random_calibration_read = true;
                  read_class = 0; // Calibration reads are library beads;
                }
                bool is_random_unfiltered  = bc.unfiltered_set.count(read_index) > 0;

                if (not is_random_unfiltered and bc.only_process_unfiltered_set)
                  continue;

                bc.filters->SetValid(read_index); // Presume valid until some filter proves otherwise

                if (read_class == 0)
                    lib_reads.push_back(ProcessedRead(bc.barcodes->NoBarcodeReadGroup()));
                else
                    tf_reads.push_back(ProcessedRead(0));
                ProcessedRead& processed_read = (read_class==0) ? lib_reads.back() : tf_reads.back();

                // Respect filter decisions from Background Model
                if (bc.mask->Match(read_index, MaskFilteredBadResidual))
                    bc.filters->SetBkgmodelHighPPF(read_index, processed_read.filter);

                if (bc.mask->Match(read_index, MaskFilteredBadPPF))
                    bc.filters->SetBkgmodelPolyclonal(read_index, processed_read.filter);

                if (bc.mask->Match(read_index, MaskFilteredBadKey))
                    bc.filters->SetBkgmodelFailedKeypass(read_index, processed_read.filter);

                if (!is_random_unfiltered and !bc.filters->IsValid(read_index)) // No reason to waste more time
                    continue;

                float cf = bc.estimator.GetWellCF(x,y);
                float ie = bc.estimator.GetWellIE(x,y);
                float dr = bc.estimator.GetWellDR(x,y);

                for (int flow = 0; flow < bc.flow_order.num_flows(); ++flow)
                    wells_measurements[flow] = wells.At(y,x,flow);

                // Sanity check. If there are NaNs in this read, print warning
                vector<int> nanflow;
                for (int flow = 0; flow < bc.flow_order.num_flows(); ++flow) {
                    if (!isnan(wells_measurements[flow]))
                        continue;
                    wells_measurements[flow] = 0;
                    nanflow.push_back(flow);
                }
                if (nanflow.size() > 0) {
                    fprintf(stderr, "ERROR: BaseCaller read NaNs from wells file, x=%d y=%d flow=%d", x, y, nanflow[0]);
                    for (unsigned int flow=1; flow < nanflow.size(); flow++) {
                        fprintf(stderr, ",%d", nanflow[flow]);
                    }
                    fprintf(stderr, "\n");
                    fflush(stderr);
                }

                //
                // Step 3. Perform base calling and quality value calculation
                //

                BasecallerRead read;
                bool key_pass = true;
                if (bc.keynormalizer == "keynorm-new") {
                  key_pass = read.SetDataAndKeyNormalizeNew(&wells_measurements[0], wells_measurements.size(), bc.keys[read_class].flows(), bc.keys[read_class].flows_length() - 1, false);
                } else { // if (bc.keynormalizer == "keynorm-old") {
                  key_pass = read.SetDataAndKeyNormalize(&wells_measurements[0], wells_measurements.size(), bc.keys[read_class].flows(), bc.keys[read_class].flows_length() - 1);
                }

                // Get rid of outliers quickly
                bc.filters->FilterHighPPFAndPolyclonal (read_index, read_class, processed_read.filter, read.raw_measurements, bc.polyclonal_filter);
                if (not key_pass)
                  bc.filters->FilterFailedKeypass (read_index, read_class, processed_read.filter, read.sequence);
                if (!is_random_unfiltered and !bc.filters->IsValid(read_index)) // No reason to waste more time
                  continue;

                // Check if this read is either from the calibration panel or from the random calibration set
                if(bc.calibration_training and bc.have_calibration_panel) {
                  if (!is_random_calibration_read and !bc.calibration_barcodes->MatchesBarcodeSignal(read)) {
                	bc.filters->SetFiltered(read_index, read_class, processed_read.filter); // Set as filtered
                    continue;  // And move on along
                  }
                }

                // Equal recalibration opportunity for everybody! (except TFs!)
                const vector<vector<vector<float> > > * aPtr = 0;
                const vector<vector<vector<float> > > * bPtr = 0;
                if (bc.recalModel.is_enabled() && read_class == 0) { //do not recalibrate TF read bc.chip_subset.GetChipSizeX()
                  aPtr = bc.recalModel.getAs(x+bc.chip_subset.GetColOffset(), y+bc.chip_subset.GetRowOffset());
                  bPtr = bc.recalModel.getBs(x+bc.chip_subset.GetColOffset(), y+bc.chip_subset.GetRowOffset());
                }

                // Execute the iterative solving-normalization routine - switch by specified algorithm
                if (bc.dephaser == "treephaser-sse") {
                  treephaser_sse.SetAsBs(aPtr, bPtr);  // Set/delete recalibration model for this read
                  treephaser_sse.SetModelParameters(cf, ie); // sse version has no hookup for droop.
                  treephaser_sse.NormalizeAndSolve(read);
                  treephaser.SetModelParameters(cf, ie); // Adapter trimming uses the cpp treephaser

                } else { // Setup cpp treephaser
                  if (bc.skip_droop)
                    treephaser.SetModelParameters(cf, ie);
                  else
                    treephaser.SetModelParameters(cf, ie, dr);
                  treephaser.SetAsBs(aPtr, bPtr); // Set/delete recalibration model for this read

                  if (bc.dephaser == "dp-treephaser") {
                    // Single parameter gain estimation
                    treephaser.NormalizeAndSolve_GainNorm(read, bc.flow_order.num_flows());
                  } else if (bc.dephaser == "treephaser-adaptive") {
                    // Adaptive nortmalization - resolving read from start in each iteration
                    treephaser.NormalizeAndSolve_Adaptive(read, bc.flow_order.num_flows());
                  } else { //if (bc.dephaser == "treephaser-swan") {
                    // Default corresponding to (approximately) what the sse version is doing
                	// Adaptive normalization - sliding window without resolving start
                	treephaser.NormalizeAndSolve_SWnorm(read, bc.flow_order.num_flows());
                  }

                  // Need this function to calculate inphase population for cpp version
                  treephaser.ComputeQVmetrics(read);
                }

                // If recalibration is enabled, generate adjusted sequence and normalized_measurements, and recompute QV metrics
                bool calibrate_read = (bc.recalibration.is_enabled() && read_class == 0); //do not recalibrate TF read
                if (calibrate_read) {
                	// Change base sequence for low hps
                    bc.recalibration.CalibrateRead(x+bc.chip_subset.GetColOffset(),y+bc.chip_subset.GetRowOffset(),read.sequence, read.normalized_measurements, read.prediction, read.state_inphase);
                    if (bc.dephaser == "treephaser-sse")
                      treephaser_sse.ComputeQVmetrics(read);
                    else
                      treephaser.ComputeQVmetrics(read);
                } else if (bc.dephaser == "treephaser-sse") {
                  // in case we didn't calibrate low hps, still want to have QV metrics for sse output
                  treephaser_sse.ComputeQVmetrics(read);
                }

                // Misc data management: Generate residual, scaled_residual
                for (int flow = 0; flow < bc.flow_order.num_flows(); ++flow) {
                    residual[flow] = read.normalized_measurements[flow] - read.prediction[flow];
                    scaled_residual[flow] = residual[flow] / read.state_inphase[flow];
                }

                // Misc data management: Put base calls in proper string form
                processed_read.filter.n_bases = read.sequence.size();
                processed_read.filter.is_called = true;

                // Misc data management: Generate base_to_flow

                base_to_flow.clear();
                base_to_flow.reserve(processed_read.filter.n_bases);
                for (int base = 0, flow = 0; base < processed_read.filter.n_bases; ++base) {
                    while (flow < bc.flow_order.num_flows() and read.sequence[base] != bc.flow_order[flow])
                        flow++;
                    base_to_flow.push_back(flow);
                }


                // Misc data management: Populate some trivial read properties

                char read_name[256];
                sprintf(read_name, "%s:%05d:%05d", bc.run_id.c_str(), bc.chip_subset.GetRowOffset() + y, bc.chip_subset.GetColOffset() + x);
                processed_read.bam.Name = read_name;
                processed_read.bam.SetIsMapped(false);

                phasing_parameters[0] = cf;
                phasing_parameters[1] = ie;
                phasing_parameters[2] = dr;
                processed_read.bam.AddTag("ZP", phasing_parameters);


                // Calculation of quality values
                // Predictor 1 - Treephaser residual penalty
                // Predictor 2 - Local noise/flowalign - 'noise' in the input base's measured val.  Noise is max[abs(val - round(val))] within +-1 BASES
                // Predictor 3 - Read Noise/Overlap - mean & stdev of the 0-mers & 1-mers in the read
                // Predictor 3 (new) - Beverly Events
                // Predictor 4 - Transformed homopolymer length
                // Predictor 5 - Treephaser: Penalty indicating deletion after the called base
                // Predictor 6 - Neighborhood noise - mean of 'noise' +-5 BASES around a base.  Noise is mean{abs(val - round(val))}

                int num_predictor_bases = min(bc.flow_order.num_flows(), processed_read.filter.n_bases);

                PerBaseQual::PredictorLocalNoise(local_noise, num_predictor_bases, base_to_flow, read.normalized_measurements, read.prediction);
                PerBaseQual::PredictorNeighborhoodNoise(neighborhood_noise, num_predictor_bases, base_to_flow, read.normalized_measurements, read.prediction);
                //PerBaseQual::PredictorNoiseOverlap(minus_noise_overlap, num_predictor_bases, read.normalized_measurements, read.prediction);
                PerBaseQual::PredictorBeverlyEvents(minus_noise_overlap, num_predictor_bases, base_to_flow, scaled_residual);
                PerBaseQual::PredictorHomopolymerRank(homopolymer_rank, num_predictor_bases, read.sequence);

                quality.clear();
                bc.quality_generator.GenerateBaseQualities(processed_read.bam.Name, processed_read.filter.n_bases, bc.flow_order.num_flows(),
                        read.penalty_residual, local_noise, minus_noise_overlap, // <- predictors 1,2,3
                        homopolymer_rank, read.penalty_mismatch, neighborhood_noise, // <- predictors 4,5,6
                        base_to_flow, quality,
                        read.additive_correction,
                        read.multiplicative_correction,
                        read.state_inphase);

                //
                // Step 4a. Barcode classification of library reads
                //

                if (processed_read.filter.n_bases_filtered == -1)
                    processed_read.filter.n_bases_filtered = processed_read.filter.n_bases;

                processed_read.filter.n_bases_key = min(bc.keys[read_class].bases_length(), processed_read.filter.n_bases);
                processed_read.filter.n_bases_prefix = processed_read.filter.n_bases_key;

                processed_read.barcode_n_errors = 0;
                if (read_class == 0)
                {   // Library beads - first separate out calibration barcodes
                	processed_read.read_group_index = -1;
                	if (bc.have_calibration_panel){
                	  bc.calibration_barcodes->ClassifyAndTrimBarcode(read_index, processed_read, read, base_to_flow);
                	  processed_read.is_control_barcode = (processed_read.read_group_index >= 0);
                	}
                    if (processed_read.read_group_index < 0)
                      bc.barcodes->ClassifyAndTrimBarcode(read_index, processed_read, read, base_to_flow);
                }

                //
                // Step 4b. Custom mod: Trim extra bases after key and barcode. Make it look like barcode trimming.
                //

                if (bc.extra_trim_left > 0)
                    processed_read.filter.n_bases_prefix = min(processed_read.filter.n_bases_prefix + bc.extra_trim_left, processed_read.filter.n_bases);


                //
                // Step 4. Calculate/save read metrics and apply filters
                //

                bc.filters->FilterZeroBases     (read_index, read_class, processed_read.filter);
                bc.filters->FilterShortRead     (read_index, read_class, processed_read.filter);
                bc.filters->FilterFailedKeypass (read_index, read_class, processed_read.filter, read.sequence);
                bc.filters->FilterHighResidual  (read_index, read_class, processed_read.filter, residual);
                bc.filters->FilterBeverly       (read_index, read_class, processed_read.filter, scaled_residual, base_to_flow);
                bc.filters->FilterQuality       (read_index, read_class, processed_read.filter, quality);
                bc.filters->TrimAdapter         (read_index, read_class, processed_read, scaled_residual, base_to_flow, treephaser, read);
                bc.filters->TrimQuality         (read_index, read_class, processed_read.filter, quality);
                bc.filters->TrimAvalanche       (read_index, read_class, processed_read.filter, quality);

                //! New mechanism for dumping potentially useful metrics.
                if (bc.metric_saver->save_anything() and (is_random_unfiltered or !bc.metric_saver->save_subset_only())) {
                    pthread_mutex_lock(&bc.mutex);

                    bc.metric_saver->SaveRawMeasurements          (y,x,read.raw_measurements);
                    bc.metric_saver->SaveAdditiveCorrection       (y,x,read.additive_correction);
                    bc.metric_saver->SaveMultiplicativeCorrection (y,x,read.multiplicative_correction);
                    bc.metric_saver->SaveNormalizedMeasurements   (y,x,read.normalized_measurements);
                    bc.metric_saver->SavePrediction               (y,x,read.prediction);
                    bc.metric_saver->SaveStateInphase             (y,x,read.state_inphase);
                    bc.metric_saver->SaveStateTotal               (y,x,read.state_total);
                    bc.metric_saver->SavePenaltyResidual          (y,x,read.penalty_residual);
                    bc.metric_saver->SavePenaltyMismatch          (y,x,read.penalty_mismatch);
                    bc.metric_saver->SaveLocalNoise               (y,x,local_noise);
                    bc.metric_saver->SaveNoiseOverlap             (y,x,minus_noise_overlap);
                    bc.metric_saver->SaveHomopolymerRank          (y,x,homopolymer_rank);
                    bc.metric_saver->SaveNeighborhoodNoise        (y,x,neighborhood_noise);

                    pthread_mutex_unlock(&bc.mutex);
                }


                //
                // Step 4b. Add flow signal information to ZM tag in BAM record.
                //

                flowgram2.clear();
                int max_flow = min(bc.flow_order.num_flows(),16);
                if (processed_read.filter.n_bases_filtered > 0)
                    max_flow = min(bc.flow_order.num_flows(), base_to_flow[processed_read.filter.n_bases_filtered-1] + 16);

                vector<int> out_of_boud_flows;
                for (int flow = 0; flow < max_flow; ++flow){
                    float temp_flowgram = 128*read.normalized_measurements[flow];
                    if (temp_flowgram < -16383.0f or temp_flowgram > 16383.0f) {
                        out_of_boud_flows.push_back(flow);
                        temp_flowgram = min(max(-16383.0f,temp_flowgram), 16383.0f);
                    }
                    //flowgram2.push_back(2*(int16_t)(128*read.normalized_measurements[flow]));
                    flowgram2.push_back(2*(int16_t)temp_flowgram);
                }
                // Do not spam stderr
                /*if (out_of_boud_flows.size() > 0) {
                  cerr << "BaseCaller WARNING: Normalized signal out of bounds in well y="
                       << y << ", x=" << x << ", in flows ";
                  for (unsigned int flow = 0; flow < out_of_boud_flows.size()-1; ++flow)
                    cerr << out_of_boud_flows.at(flow) << ',';
                  cerr << out_of_boud_flows.at(out_of_boud_flows.size()-1) << endl;
                } */
                processed_read.bam.AddTag("ZM", flowgram2);
                //flowgram2.push_back(1*(int16_t)(256*read.normalized_measurements[flow]));
                //flowgram2.push_back(2*(int16_t)(128*read.normalized_measurements[flow]));
                //flowgram2.push_back(4*(int16_t)(64*read.normalized_measurements[flow]));
                //flowgram2.push_back(8*(int16_t)(32*read.normalized_measurements[flow]));

                //
                // Step 4c. Populate FZ tag in BAM record.
                //

                flowgram.clear();
                if (bc.flow_signals_type == "wells") {
                    for (int flow = 0; flow < bc.flow_order.num_flows(); ++flow)
                        flowgram.push_back(max(0,(int)(100.0*wells_measurements[flow]+0.5)));
                    processed_read.bam.AddTag("FZ", flowgram); // Will be phased out soon

                } else if (bc.flow_signals_type == "key-normalized") {
                    for (int flow = 0; flow < bc.flow_order.num_flows(); ++flow)
                        flowgram.push_back(max(0,(int)(100.0*read.raw_measurements[flow]+0.5)));
                    processed_read.bam.AddTag("FZ", flowgram); // Will be phased out soon

                } else if (bc.flow_signals_type == "adaptive-normalized") {
                    for (int flow = 0; flow < bc.flow_order.num_flows(); ++flow)
                        flowgram.push_back(max(0,(int)(100.0*read.normalized_measurements[flow]+0.5)));
                    processed_read.bam.AddTag("FZ", flowgram); // Will be phased out soon

                } else if (bc.flow_signals_type == "residual") {
                    for (int flow = 0; flow < bc.flow_order.num_flows(); ++flow)
                        flowgram.push_back(max(0,(int)(1000 + 100*residual[flow])));
                    processed_read.bam.AddTag("FZ", flowgram); // Will be phased out soon

                } else if (bc.flow_signals_type == "scaled-residual") { // This settings is necessary part of calibration training
                    for (int flow = 0; flow < bc.flow_order.num_flows(); ++flow) {
                        //between 0 and 98
                        float adjustment = min(0.49f, max(-0.49f, scaled_residual[flow]));
                        flowgram.push_back(max(0,(int)(49.5 + 100*adjustment)));
                    }
                    processed_read.bam.AddTag("FZ", flowgram);
                }

                //
                // Step 5. Pass basecalled reads to appropriate writers
                //

                // Create BAM entries
                if (processed_read.filter.n_bases > 0) {
                    processed_read.bam.QueryBases.reserve(processed_read.filter.n_bases);
                    processed_read.bam.Qualities.reserve(processed_read.filter.n_bases);
                    for (int base = processed_read.filter.n_bases_prefix; base < processed_read.filter.n_bases_filtered; ++base) {
                        processed_read.bam.QueryBases.push_back(read.sequence[base]);
                        processed_read.bam.Qualities.push_back(quality[base] + 33);
                    }
                    processed_read.bam.AddTag("ZF","i", base_to_flow[processed_read.filter.n_bases_prefix]);
                } else
                    processed_read.bam.AddTag("ZF","i", 0);

                // Randomly selected library beads - excluding calibration reads
                if (is_random_unfiltered and (not processed_read.is_control_barcode)) {
                    unfiltered_trimmed_reads.push_back(processed_read);
                    unfiltered_reads.push_back(processed_read);

                    ProcessedRead& untrimmed_read = unfiltered_reads.back();

                    processed_read.filter.GenerateZDVector(filtering_details);
                    untrimmed_read.bam.AddTag("ZD", filtering_details);

                    if (processed_read.filter.n_bases > 0) {
                        untrimmed_read.bam.QueryBases.reserve(processed_read.filter.n_bases);
                        untrimmed_read.bam.Qualities.reserve(processed_read.filter.n_bases);
                        for (int base = max(processed_read.filter.n_bases_filtered,processed_read.filter.n_bases_prefix); base < processed_read.filter.n_bases; ++base) {
                            untrimmed_read.bam.QueryBases.push_back(read.sequence[base]);
                            untrimmed_read.bam.Qualities.push_back(quality[base] + 33);
                        }
                    }

                    // Temporary workaround: provide fake FZ tag for unfiltered.trimmed and unfiltered.untrimmed sets.
                    if (bc.flow_signals_type == "none") {
                        flowgram.assign(1,0);
                        unfiltered_reads.back().bam.AddTag("FZ", flowgram);
                        unfiltered_trimmed_reads.back().bam.AddTag("FZ", flowgram);
                    }


                    // If this read was supposed to have "early filtering", make sure we emulate that here
                    if (processed_read.filter.n_bases_after_bkgmodel_bad_key >= 0 or
                            processed_read.filter.n_bases_after_bkgmodel_high_ppf >= 0 or
                            processed_read.filter.n_bases_after_bkgmodel_polyclonal >= 0 or
                            processed_read.filter.n_bases_after_high_ppf >= 0 or
                            processed_read.filter.n_bases_after_polyclonal >= 0)
                        processed_read.filter.n_bases = -1;
                }

                // Move read from lib_reads stack to calib_reads if necessary
                // This invalidates the processed_read reference and needs to be at the very end
                if (processed_read.is_control_barcode) {
                  calib_reads.push_back(processed_read);
                  lib_reads.pop_back();
                }
            }

        bc.lib_writer.WriteRegion(current_region, lib_reads);
        if (bc.have_calibration_panel)
            bc.calib_writer.WriteRegion(current_region, calib_reads);
        if (bc.process_tfs)
            bc.tf_writer.WriteRegion(current_region, tf_reads);
        if (!bc.unfiltered_set.empty()) {
            bc.unfiltered_writer.WriteRegion(current_region,unfiltered_reads);
            bc.unfiltered_trimmed_writer.WriteRegion(current_region,unfiltered_trimmed_reads);
        }
    }
}
Ejemplo n.º 5
0
/**
 * R entry point: solve a matrix of flow signals into base sequences with DPTreephaser.
 *
 * @param Rsignal         Numeric matrix; one row per read, one column per flow.
 * @param RkeyFlow        Integer vector of key flows, handed to
 *                        BasecallerRead::SetDataAndKeyNormalize (unused for "treephaser-solve").
 * @param RflowCycle      Flow order string; must be at least as long as the number of flows.
 * @param Rcf, Rie, Rdr   Numeric phasing parameter vectors passed to SetModelParameters.
 *                        If Rcf has length 1 the first element of each vector is used for
 *                        every read; otherwise element iRead is used for read iRead.
 * @param Rbasecaller     One of "treephaser-swan", "treephaser-solve", "dp-treephaser",
 *                        "treephaser-adaptive".
 * @param RdiagonalStates Integer; a value > 0 is passed as true to SetStateProgression.
 * @param RmodelFile      Recalibration model file; an empty string skips model initialization.
 * @param RmodelThreshold Integer threshold forwarded to RecalibrationModel::InitializeModel.
 * @param Rxval, Ryval    Integer vectors with per-read coordinates used to look up the
 *                        recalibration model (only read when the model is enabled).
 *
 * @return An R list with elements "seq", "predicted", "residual", "norm_additive" and
 *         "norm_multipl"; the four matrices are nRead x nFlow.
 *
 * Invalid arguments are reported through Rf_error() after all C++ objects of the try block
 * have been destroyed; C++ exceptions are forwarded with forward_exception_to_r().
 */
RcppExport SEXP treePhaser(SEXP Rsignal, SEXP RkeyFlow, SEXP RflowCycle,
                           SEXP Rcf, SEXP Rie, SEXP Rdr, SEXP Rbasecaller, SEXP RdiagonalStates,
                           SEXP RmodelFile, SEXP RmodelThreshold, SEXP Rxval, SEXP Ryval)
{
  SEXP ret = R_NilValue;
  char *exceptionMesg = NULL;

  try {
    Rcpp::NumericMatrix      signal(Rsignal);
    Rcpp::IntegerVector      keyFlow(RkeyFlow);
    string flowCycle       = Rcpp::as<string>(RflowCycle);
    Rcpp::NumericVector      cf_vec(Rcf);
    Rcpp::NumericVector      ie_vec(Rie);
    Rcpp::NumericVector      dr_vec(Rdr);
    string basecaller      = Rcpp::as<string>(Rbasecaller);
    unsigned int diagonalStates = Rcpp::as<int>(RdiagonalStates);

    // Recalibration Variables
    string model_file      = Rcpp::as<string>(RmodelFile);
    int model_threshold    = Rcpp::as<int>(RmodelThreshold);
    Rcpp::IntegerVector      x_values(Rxval);
    Rcpp::IntegerVector      y_values(Ryval);
    RecalibrationModel       recalModel;
    if (model_file.length() > 0) {
      recalModel.InitializeModel(model_file, model_threshold);
    }


    ion::FlowOrder flow_order(flowCycle, flowCycle.length());
    unsigned int nFlow = signal.cols();
    unsigned int nRead = signal.rows();

    // Resolve the requested solver once instead of comparing strings for every read.
    const bool use_swan       = (basecaller == "treephaser-swan");
    const bool use_solve_only = (basecaller == "treephaser-solve");
    const bool use_gain_norm  = (basecaller == "dp-treephaser");
    const bool use_adaptive   = (basecaller == "treephaser-adaptive");

    // A single cf value means one shared parameter set; anything else is indexed per read.
    const bool   per_read_phasing = (cf_vec.size() != 1);
    const size_t nPhasing         = per_read_phasing ? (size_t)nRead : (size_t)1;

    if (!(use_swan || use_solve_only || use_gain_norm || use_adaptive)) {
      std::string exception = "base value for basecaller supplied: " + basecaller;
      exceptionMesg = strdup(exception.c_str());
    } else if (flowCycle.length() < nFlow) {
      std::string exception = "Flow cycle is shorter than number of flows to solve";
      exceptionMesg = strdup(exception.c_str());
    } else if ((size_t)cf_vec.size() < nPhasing || (size_t)ie_vec.size() < nPhasing || (size_t)dr_vec.size() < nPhasing) {
      // These vectors would otherwise be indexed past their end inside the read loop.
      exceptionMesg = strdup("Phasing parameter vectors cf, ie and dr are too short for the number of reads");
    } else if (recalModel.is_enabled() && ((size_t)x_values.size() < nRead || (size_t)y_values.size() < nRead)) {
      // Coordinates are looked up for every read whenever the recalibration model is enabled.
      exceptionMesg = strdup("Recalibration requires one x and one y value per read");
    } else {

      // Prepare objects for holding and passing back results
      Rcpp::NumericMatrix        predicted_out(nRead,nFlow);
      Rcpp::NumericMatrix        residual_out(nRead,nFlow);
      Rcpp::NumericMatrix        norm_additive_out(nRead,nFlow);
      Rcpp::NumericMatrix        norm_multipl_out(nRead,nFlow);
      std::vector< std::string> seq_out(nRead);

      // Set up key flow vector
      int nKeyFlow = keyFlow.size();
      vector <int> keyVec(nKeyFlow);
      for(int iFlow=0; iFlow < nKeyFlow; iFlow++)
        keyVec[iFlow] = keyFlow(iFlow);

      // Solver state shared by all reads
      vector <float> sigVec(nFlow);
      BasecallerRead read;
      DPTreephaser dpTreephaser(flow_order);
      dpTreephaser.SetStateProgression((diagonalStates>0));

      // In contrast to pipeline, we always use droop here.
      // To have the same behavior of treephaser-swan as in the pipeline, supply dr=0
      if (!per_read_phasing)
        dpTreephaser.SetModelParameters((double)cf_vec(0), (double)ie_vec(0), (double)dr_vec(0));

      // Main loop iterating over reads and solving them
      for(unsigned int iRead=0; iRead < nRead; iRead++) {

        // Set phasing parameters for this read
        if (per_read_phasing)
          dpTreephaser.SetModelParameters((double)cf_vec(iRead), (double)ie_vec(iRead), (double)dr_vec(iRead));

        // And load recalibration model
        if (recalModel.is_enabled()) {
          int my_x = (int)x_values(iRead);
          int my_y = (int)y_values(iRead);
          const vector<vector<vector<float> > > * aPtr = recalModel.getAs(my_x, my_y);
          const vector<vector<vector<float> > > * bPtr = recalModel.getBs(my_x, my_y);
          if (aPtr == 0 or bPtr == 0) {
            cout << "Error finding a recalibration model for x: " << x_values(iRead) << " y: " << y_values(iRead);
            cout << endl;
          }
          // The pointers are handed over even when the lookup failed, as before.
          dpTreephaser.SetAsBs(aPtr, bPtr);
        }

        for(unsigned int iFlow=0; iFlow < nFlow; iFlow++)
          sigVec[iFlow] = (float) signal(iRead,iFlow);

        // "treephaser-solve" is the interface to just solve without any normalization
        if (use_solve_only) {
          read.SetData(sigVec, (int)nFlow);
        }
        else {
          read.SetDataAndKeyNormalize(&(sigVec[0]), (int)nFlow, &(keyVec[0]), nKeyFlow-1);
        }

        // Execute the iterative solving-normalization routine
        if (use_gain_norm) {
          dpTreephaser.NormalizeAndSolve_GainNorm(read, nFlow);
        }
        else if (use_solve_only) {
          dpTreephaser.Solve(read, nFlow);
        }
        else if (use_adaptive) {
          dpTreephaser.NormalizeAndSolve_Adaptive(read, nFlow); // Adaptive normalization
        }
        else {
          dpTreephaser.NormalizeAndSolve_SWnorm(read, nFlow); // sliding window adaptive normalization
        }

        // Copy the solution for this read into the output containers
        seq_out[iRead].assign(read.sequence.begin(), read.sequence.end());
        for(unsigned int iFlow=0; iFlow < nFlow; iFlow++) {
          predicted_out(iRead,iFlow)     = (double) read.prediction[iFlow];
          residual_out(iRead,iFlow)      = (double) read.normalized_measurements[iFlow] - read.prediction[iFlow];
          norm_multipl_out(iRead,iFlow)  = (double) read.multiplicative_correction.at(iFlow);
          norm_additive_out(iRead,iFlow) = (double) read.additive_correction.at(iFlow);
        }
      }

      // Store results
      ret = Rcpp::List::create(Rcpp::Named("seq")       = seq_out,
                               Rcpp::Named("predicted") = predicted_out,
                               Rcpp::Named("residual")  = residual_out,
                               Rcpp::Named("norm_additive") = norm_additive_out,
                               Rcpp::Named("norm_multipl")  = norm_multipl_out);
    }
  } catch(std::exception& ex) {
    forward_exception_to_r(ex);
  } catch(...) {
    ::Rf_error("c++ exception (unknown reason)");
  }

  if(exceptionMesg != NULL) {
    // Rf_error() does not return, so the heap copy has to be released before calling it.
    // The text is moved to a stack buffer first (messages longer than the buffer are truncated).
    char mesg[1024];
    strncpy(mesg, exceptionMesg, sizeof(mesg) - 1);
    mesg[sizeof(mesg) - 1] = '\0';
    free(exceptionMesg);
    // The message embeds caller-supplied text, so it must never be used as the format string.
    Rf_error("%s", mesg);
  }

  return ret;
}