Esempio n. 1
0
// Run dindel on a pair of samples.
//
// The base and variant haplotypes are concatenated, aligned to the reference,
// joined with flanking reference sequence and then realigned against the reads
// extracted from the base index (skipped in reference mode) and the variant index.
//
// Parameters:
//   id                 - identifier forwarded to DindelRealignWindow::run
//   base_haplotypes    - first set of input haplotypes
//   variant_haplotypes - second set of input haplotypes
//   parameters         - indices, reference table and thresholds used below
//   baseOut            - receives the raw VCF records of collection 0 (base)
//   variantOut         - receives the raw VCF records of collection 1 (variant)
//   callsOut           - receives variant records whose passStr is "PASS" and whose
//                        base record at the same index is "NoCall"/"NoSupp" or absent
//   pReadAlignments    - forwarded to DindelRealignWindow::run
//
// Returns:
//   DRC_AMBIGUOUS_ALIGNMENT - more than 10 candidate alignments were found
//   DRC_OVER_DEPTH          - read extraction reported failure, or the total number
//                             of reads exceeds parameters.maxReads
//   DRC_UNDER_DEPTH         - no reads were extracted
//   DRC_NO_ALIGNMENT        - fewer than 2 flanking haplotypes, or the first one is empty
//   DRC_OK                  - otherwise
//
// A std::string exception thrown while realigning is printed to stderr and the
// process exits with DRC_EXCEPTION.
DindelReturnCode DindelUtil::runDindelPairMatePair(const std::string& id,
                                                   const StringVector& base_haplotypes,
                                                   const StringVector& variant_haplotypes,
                                                   const GraphCompareParameters& parameters,
                                                   std::ostream& baseOut,
                                                   std::ostream& variantOut,
                                                   std::ostream& callsOut,
                                                   DindelReadReferenceAlignmentVector* pReadAlignments)
{
    PROFILE_FUNC("runDindelPairMatePair")

    StringVector inHaplotypes;
    inHaplotypes.insert(inHaplotypes.end(), base_haplotypes.begin(), base_haplotypes.end());
    inHaplotypes.insert(inHaplotypes.end(), variant_haplotypes.begin(), variant_haplotypes.end());

    //
    // First, extract the reads from the normal and variant data sets that match each haplotype
    //
    assert(inHaplotypes.size() > 0);

    // Get candidate alignments for the input haplotypes
    HapgenAlignmentVector candidateAlignments;

    // Choose the kmer size for alignment
    size_t align_kmer = 31;
    for(size_t i = 0; i < inHaplotypes.size(); ++i)
    {
        HapgenAlignmentVector thisCandidateAlignments;
        HapgenUtil::alignHaplotypeToReferenceKmer(align_kmer,
                                                  inHaplotypes[i],
                                                  parameters.referenceIndex,
                                                  parameters.pRefTable,
                                                  thisCandidateAlignments);

        candidateAlignments.insert(candidateAlignments.end(), thisCandidateAlignments.begin(), thisCandidateAlignments.end());
    }

    // Remove duplicate or bad alignment pairs
    HapgenUtil::coalesceAlignments(candidateAlignments);

    if(Verbosity::Instance().getPrintLevel() > 3)
        printf("runDindel -- %zu candidate alignments found\n", candidateAlignments.size());

    // Too many candidate locations: give up rather than realign against all of them
    size_t MAX_ALIGNMENTS = 10;
    if(candidateAlignments.size() > MAX_ALIGNMENTS)
        return DRC_AMBIGUOUS_ALIGNMENT;

    // Join each haplotype with flanking sequence from the reference genome for each alignment
    // This function also adds a haplotype (with flanking sequence) for the piece of the reference
    int FLANKING_SIZE = 0;
    if (parameters.dindelRealignParameters.realignMatePairs)
        FLANKING_SIZE = 1000;
    StringVector flankingHaplotypes;

    // This vector contains the internal portion of the haplotypes, without the flanking sequence
    // It is used to extract reads
    StringVector candidateHaplotypes;
    for(size_t i = 0; i < candidateAlignments.size(); ++i)
    {
        HapgenUtil::makeFlankingHaplotypes(candidateAlignments[i],
                                           parameters.pRefTable,
                                           FLANKING_SIZE,
                                           inHaplotypes,
                                           flankingHaplotypes,
                                           candidateHaplotypes);
    }

    if(Verbosity::Instance().getPrintLevel() > 3)
        printf("runDindel -- made %zu flanking haplotypes\n", candidateHaplotypes.size());

    // Normal reads
    SeqRecordVector normalReads;
    SeqRecordVector normalRCReads;

    // Remove non-unique candidate haplotypes
    std::sort(candidateHaplotypes.begin(), candidateHaplotypes.end());
    candidateHaplotypes.erase(std::unique(candidateHaplotypes.begin(), candidateHaplotypes.end()),
                              candidateHaplotypes.end());

    // Set the value to use for extracting reads that potentially match the haplotype
    // Do not use a kmer for extraction greater than this value
    size_t KMER_CEILING = 31;
    size_t extractionKmer = parameters.kmer < KMER_CEILING ? parameters.kmer : KMER_CEILING;

    bool extractOK = true;
    if(!parameters.bReferenceMode)
    {
        // Reads on the same strand as the haplotype
        extractOK = HapgenUtil::extractHaplotypeReads(candidateHaplotypes, parameters.baseIndex, extractionKmer,
                                                      false, parameters.maxReads, parameters.maxExtractionIntervalSize, &normalReads, NULL);

        if(!extractOK)
            return DRC_OVER_DEPTH;

        // Reads on the reverse strand
        extractOK = HapgenUtil::extractHaplotypeReads(candidateHaplotypes, parameters.baseIndex, extractionKmer,
                                                      true, parameters.maxReads, parameters.maxExtractionIntervalSize, &normalRCReads, NULL);

        if(!extractOK)
            return DRC_OVER_DEPTH;
    }

    // Variant reads
    SeqRecordVector variantReads;
    SeqRecordVector variantRCReads;

    extractOK = HapgenUtil::extractHaplotypeReads(candidateHaplotypes, parameters.variantIndex, extractionKmer,
                                                  false, parameters.maxReads, parameters.maxExtractionIntervalSize, &variantReads, NULL);

    if(!extractOK)
        return DRC_OVER_DEPTH;

    extractOK = HapgenUtil::extractHaplotypeReads(candidateHaplotypes, parameters.variantIndex, extractionKmer,
                                                  true, parameters.maxReads, parameters.maxExtractionIntervalSize, &variantRCReads, NULL);

    if(!extractOK)
        return DRC_OVER_DEPTH;

    size_t normal_reads = normalReads.size() + normalRCReads.size();
    size_t variant_reads = variantReads.size() + variantRCReads.size();
    size_t total_reads = normal_reads + variant_reads;

    if(Verbosity::Instance().getPrintLevel() > 3)
        printf("Extracted %zu normal reads, %zu variant reads\n", normal_reads, variant_reads);

    if(total_reads > parameters.maxReads)
        return DRC_OVER_DEPTH;

    if (total_reads == 0)
        return DRC_UNDER_DEPTH;

    // Generate the input haplotypes for dindel
    // We need at least 2 haplotypes (one is the reference)
    size_t totFlankingHaplotypes = flankingHaplotypes.size();

    if(totFlankingHaplotypes < 2)
        return DRC_NO_ALIGNMENT;

    // Ensure the reference haplotype is a non-empty string
    if(flankingHaplotypes[0].size() == 0)
        return DRC_NO_ALIGNMENT;

    // Make Dindel referenceMappings
    StringVector dindelHaplotypes;
    std::set<DindelReferenceMapping> refMappings;

    // Build one reference mapping per candidate alignment; mappings that compare
    // equal in the set are merged, keeping the highest alignment score.
    for(size_t i = 0; i < candidateAlignments.size(); ++i)
    {
        std::string upstream, defined, downstream;
        std::string refName = parameters.pRefTable->getRead(candidateAlignments[i].referenceID).id;

        HapgenUtil::extractReferenceSubstrings(candidateAlignments[i],parameters.pRefTable,
                                               FLANKING_SIZE, upstream, defined, downstream);

        std::string refSeq = upstream + defined + downstream;

        int refStart = candidateAlignments[i].position - int(upstream.size()) + 1;

        // Here the score is used as an estimate of how unique "defined" is in the reference sequence.
        // "defined" is not the reference sequence but a candidate haplotype.
        // It is conservative because the flanking sequence is not used in this estimation.
        DindelReferenceMapping rm(refName,
                                  refSeq,
                                  refStart,
                                  candidateAlignments[i].score+2*FLANKING_SIZE,
                                  candidateAlignments[i].isRC);

        std::set<DindelReferenceMapping>::iterator rmit = refMappings.find(rm);
        if(rmit == refMappings.end())
        {
            refMappings.insert(rm);
        }
        else
        {
            if(rm.referenceAlignmentScore > rmit->referenceAlignmentScore)
                rmit->referenceAlignmentScore = rm.referenceAlignmentScore;
        }
    }

    // RESET MAPPING SCORES
    // Every mapping score is overwritten with 1000 here, regardless of the value computed above.
    for(std::set<DindelReferenceMapping>::iterator it = refMappings.begin(); it != refMappings.end(); ++it)
        it->referenceAlignmentScore = 1000;

    // make flankingHaplotypes unique
    std::set< std::string > setFlanking(flankingHaplotypes.begin(), flankingHaplotypes.end());

    for(std::set< std::string >::const_iterator it = setFlanking.begin(); it != setFlanking.end(); ++it)
    {
        dindelHaplotypes.push_back(*it);
        //dindelRefMappings[i] = std::vector<DindelReferenceMapping>(refMappings.begin(),refMappings.end());
    }

    std::vector<DindelReferenceMapping> dRefMappings(refMappings.begin(),refMappings.end());
    DindelWindow dWindow(dindelHaplotypes, dRefMappings);

    //
    // Run Dindel
    //

    // Initialize VCF collections (index 0 = base, index 1 = variant)
    VCFCollection vcfCollections[2];

    // If in multisample mode, load the sample names into the VCFCollection
    if(parameters.variantIndex.pPopIdx != NULL)
    {
        for(size_t i = 0; i <= 1; ++i)
            vcfCollections[i].samples = parameters.variantIndex.pPopIdx->getSamples();
    }

    // In reference mode no base reads were extracted, so only the variant pass (i == 1) runs
    size_t start_i = parameters.bReferenceMode ? 1 : 0;

    DindelRealignWindowResult *pThisResult = NULL;
    DindelRealignWindowResult *pPreviousResult = NULL;

    for(size_t i = start_i; i <= 1; ++i)
    {
        SeqRecordVector& fwdReads = (i == 0) ? normalReads : variantReads;
        SeqRecordVector& rcReads = (i == 0) ? normalRCReads : variantRCReads;
        const BWTIndexSet* indices = &parameters.variantIndex;

        // Create dindel reads
        // Mates must be at the end of the array.
        std::vector<DindelRead> dReads;
        for(size_t j = 0; j < fwdReads.size(); ++j)
            dReads.push_back(convertToDindelRead(indices, fwdReads[j], true));

        for(size_t j = 0; j < rcReads.size(); ++j)
        {
            // Reverse-strand reads are flipped in place (sequence and qualities) before conversion
            rcReads[j].seq.reverseComplement();
            std::reverse(rcReads[j].qual.begin(), rcReads[j].qual.end());
            dReads.push_back(convertToDindelRead(indices, rcReads[j], false));
        }

        pThisResult = new DindelRealignWindowResult();

        try
        {
            DindelRealignWindow dRealignWindow(&dWindow, dReads, parameters.dindelRealignParameters);
            dRealignWindow.run("hmm", vcfCollections[i], pReadAlignments, id, pThisResult, pPreviousResult, parameters.pRefTable);
        }
        catch(const std::string& e)
        {
            std::cerr << "Dindel Exception: " << e << "\n";
            exit(DRC_EXCEPTION);
        }

        // The base-sample result is handed to the variant pass as the previous result
        if(i == 0)
            pPreviousResult = pThisResult;
    }

    // Copy raw VCFRecords to output
    for(size_t i = 0; i <= 1; ++i)
    {
        std::ostream& curr_out = i == 0 ? baseOut : variantOut;
        for(size_t j = 0; j < vcfCollections[i].records.size(); ++j)
            curr_out << vcfCollections[i].records[j] << "\n";
    }

    // Make comparative calls
    size_t VARIANT_IDX = 1;
    size_t BASE_IDX = 0;
    const size_t num_base_records = vcfCollections[BASE_IDX].records.size();
    for(size_t i = 0; i < vcfCollections[VARIANT_IDX].records.size(); ++i)
    {
        // A variant record with no base record at the same index (including the
        // case where the base collection is empty) counts as not called in the base.
        // The explicit bound avoids reading past the end of the base records.
        bool not_called_in_base = true;
        if(i < num_base_records)
            not_called_in_base = vcfCollections[BASE_IDX].records[i].passStr == "NoCall" ||
                                 vcfCollections[BASE_IDX].records[i].passStr == "NoSupp";

        bool called_in_variant = vcfCollections[VARIANT_IDX].records[i].passStr == "PASS";
        if(called_in_variant && not_called_in_base)
            callsOut << vcfCollections[VARIANT_IDX].records[i] << "\n";
    }

    baseOut.flush();
    variantOut.flush();

    // In reference mode pPreviousResult is still NULL; deleting NULL is a no-op
    delete pThisResult;
    delete pPreviousResult;

    return DRC_OK;
}
Esempio n. 2
0
// Main Program
//
// Runs an externally (LED) triggered acquisition on the ANL digitizer for a fixed
// wall-clock time and stores the unpacked events in a ROOT file.
//
// Command line:
//   args[1] - output ROOT file name
//   args[2] - run time in seconds
//
// Returns 1 when arguments are missing or the board could not be stopped/reset,
// the configuration error code when ConfigArgoBoard_ExtTrig fails, and 0 otherwise.
int main( int nArgs, char **args ) {

  // Read command line parameters
  if ( nArgs < 3 ) {
    std::cout << "Please specify output filename and run time (seconds)." << std::endl;
    return 1;
  }
  std::string outFileName(args[1]);
  int runTime(30);
  { std::stringstream tmp; tmp << std::string(args[2]); tmp >> runTime; }
  std::cout << "LED triggered run for " << runTime << " seconds: " << outFileName << std::endl;

  // Create the ROOT Application (to draw canvases)
  TApplication *theApp = new TApplication("LEDRun",&nArgs,args);
  //gStyle->SetPalette(1);

  // Configure the Argonne Board
  int err(0);
  uint nSamples = 600; //1024; // max = 2046
  uint preTrigSamples = 50;
  err = ConfigArgoBoard_ExtTrig( nSamples, preTrigSamples );
  if ( err != 0 ) {
    std::cout << "Failed to configure ANL Digitizer." << std::endl;
    return err;
  }

  // Get ANL Digitizer configuration info
  uint ANL_nSamples(0); DeviceRead(lbneReg.readout_window[0],&ANL_nSamples);
  uint ANL_preTrigSize(0); DeviceRead(lbneReg.readout_pretrigger[0],&ANL_preTrigSize);
  uint ANL_eventSize = sizeof(Event_Header) + 2*nSamples; // in bytes
  uint m1Size(0); DeviceRead(lbneReg.m1_window[0],&m1Size);
  uint m2Size(0); DeviceRead(lbneReg.m2_window[0],&m2Size);
  uint pWindow(0); DeviceRead(lbneReg.p_window[0],&pWindow);
  uint kWindow(0); DeviceRead(lbneReg.k_window[0],&kWindow);
  uint iWindow(0); DeviceRead(lbneReg.i_window[0],&iWindow);
  uint dWindow(0); DeviceRead(lbneReg.d_window[0],&dWindow);

  // Output File
  TFile *outFile = new TFile(outFileName.c_str(),"RECREATE");
  TDirectory *waveDir = outFile->mkdir("waveforms");
  std::vector<TH1D*> waveformExamples;

  // Set up output ROOT tree for ANL Digitizer
  // Each branch points into ArEvent, so every ANLTree->Fill() snapshots the event last unpacked.
  TTree *ANLTree = new TTree("ANLTree","ANLTree");
  Event_Packet ArPacket; Event ArEvent;
  ANLTree->Branch("channelID",&(ArEvent.channelID),"channelID/s");
  ANLTree->Branch("syncDelay",&(ArEvent.syncDelay),"syncDelay/i");
  ANLTree->Branch("syncCount",&(ArEvent.syncCount),"syncCount/i");
  ANLTree->Branch("timestamp",&(ArEvent.intTimestamp),"timestamp/l");
  ANLTree->Branch("peakSum",&(ArEvent.peakSum),"peakSum/I");
  ANLTree->Branch("peakTime",&(ArEvent.peakTime),"peakTime/C");
  ANLTree->Branch("prerise",&(ArEvent.prerise),"prerise/i");
  ANLTree->Branch("integratedSum",&(ArEvent.integratedSum),"integratedSum/i");
  ANLTree->Branch("baseline",&(ArEvent.baseline),"baseline/s");
  ANLTree->Branch("cfdPoint",&(ArEvent.cfdPoint),"cfdPoint[4]/S");
  ANLTree->Branch("nSamples",&(ArEvent.waveformWords),"nSamples/s");
  std::stringstream waveformDescr; waveformDescr << "waveform[" << nSamples << "]/s";
  ANLTree->Branch("waveform",&(ArEvent.waveform),waveformDescr.str().c_str());

  // Set up configuration ROOT tree (a single entry describing this run)
  TTree *configTree = new TTree("configTree","configTree");
  configTree->Branch("runTime",&runTime,"runTime/I");
  configTree->Branch("ANL_nSamples",&ANL_nSamples,"ANL_nSamples/i");
  configTree->Branch("ANL_preTrigSize",&ANL_preTrigSize,"ANL_preTrigSize/i");
  configTree->Branch("m1Size",&m1Size,"m1Size/i");
  configTree->Branch("m2Size",&m2Size,"m2Size/i");
  configTree->Branch("pWindow",&pWindow,"pWindow/i");
  configTree->Branch("kWindow",&kWindow,"kWindow/i");
  configTree->Branch("iWindow",&iWindow,"iWindow/i");
  configTree->Branch("dWindow",&dWindow,"dWindow/i");
  configTree->Fill();
  configTree->Write();

  // Monitoring histograms
  // Only the first nChannels slots are ever created; the rest stay null.
  const uint nChannels = 12;
  TH1D *ANL_pulseAmp[32] = {};
  for ( uint chan = 0; chan < nChannels; ++chan ) {
    std::stringstream histName; histName << "ANL_pulseAmp_CH" << chan;
    std::stringstream histTitle; histTitle << "Pulse Amplitudes, ANL Channel " << chan << ";ADC Counts";
    ANL_pulseAmp[chan] = new TH1D(histName.str().c_str(),histTitle.str().c_str(),3000,0,3000);
  }

  std::cout << "Starting acquisition." << std::endl;
  // Enable boards
  // NOTE(review): the return codes of these two calls are not checked — confirm that is intended.
  err = DeviceTimeout(500);
  err = DeviceStart();

  // Loop for specified time
  int totArEvts(0);
  time_t tStart(0), tEnd(0);
  time(&tStart); time(&tEnd);
  while ( tEnd - tStart < runTime ) {

    /*** ANL Digitizer Readout ***/
    uint dataSize; err = DeviceQueueStatus(&dataSize); // How many full events have been collected?
    if ( err != 0 ) { std::cout << "Failed device queue status check." << std::endl; break; }
    uint ANL_nEvts = uint(dataSize) / ANL_eventSize;
    //if ( ANL_nEvts > 0 ) std::cout << "Received " << ANL_nEvts << " events on the Argonne Digitizer (" << dataSize << " bytes)." << std::endl;
    for ( uint evt = 0; evt < ANL_nEvts; ++evt ) {
      totArEvts++;
      uint dataReceived = 0;
      // The breaks below leave only this per-event loop; the timed outer loop keeps polling.
      err = DeviceReceive(&ArPacket,&dataReceived);
      if ( err != 0 ) { std::cout << "Failed device receive." << std::endl; break; }
      err = LBNE_EventUnpack(&ArPacket,&ArEvent);
      if ( err != 0 ) { std::cout << "Failed to unpack data." << std::endl; break; }
      ANLTree->Fill();
      // Guard the histogram lookup: a channel ID outside the booked range would
      // otherwise dereference a pointer that was never set.
      if ( ArEvent.channelID < nChannels ) {
        ANL_pulseAmp[ArEvent.channelID]->Fill(ArEvent.prerise/double(kWindow)-ArEvent.peakSum/double(m1Size));
      }
      /*
      std::stringstream waveHistName; waveHistName << "waveform_" << totArEvts;
      TH1D *hist = new TH1D(waveHistName.str().c_str(),waveHistName.str().c_str(),nSamples,0,nSamples);
      for ( uint samp = 0; samp < nSamples; ++samp ) {
        hist->SetBinContent(samp+1,ArEvent.waveform[samp] & 0x3FFF); // 0x3FFF drops time marks on waveform, not necessary in next firmware upgrade
      }
      waveformExamples.push_back(hist);
      */
    }

    //std::cout << tEnd-tStart << std::endl;
    time(&tEnd);
  }

  //// Finalize

  // Finalize Argonne Digitizer
  // A failed stop/reset is remembered instead of returning immediately, so the
  // events already collected are still written to the output file below.
  err = DeviceStopReset();
  const bool stopFailed = ( err != 0 );
  if ( stopFailed ) {
    std::cout << "Failed to stop and reset board 0. ErrorCode " << err << std::endl;
  } else {
    err = DeviceDisconnect(commUSB);
  }

  // Write and close output file
  ANLTree->Write();

  for ( uint chan = 0; chan < nChannels; ++chan ) ANL_pulseAmp[chan]->Write();

  waveDir->cd();
  for ( uint i = 0; i < waveformExamples.size(); ++i ) {
    waveformExamples[i]->Write();
  }
  outFile->cd();

  outFile->Close();

  if ( stopFailed ) return 1;

  // Done!
  std::cout << "Done!  Collected " << totArEvts/int(nChannels) << " events" << std::endl;

  return 0;
}