// Run dindel on a pair of samples DindelReturnCode DindelUtil::runDindelPairMatePair(const std::string& id, const StringVector& base_haplotypes, const StringVector& variant_haplotypes, const GraphCompareParameters& parameters, std::ostream& baseOut, std::ostream& variantOut, std::ostream& callsOut, DindelReadReferenceAlignmentVector* pReadAlignments) { PROFILE_FUNC("runDindelPairMatePair") StringVector inHaplotypes; inHaplotypes.insert(inHaplotypes.end(), base_haplotypes.begin(), base_haplotypes.end()); inHaplotypes.insert(inHaplotypes.end(), variant_haplotypes.begin(), variant_haplotypes.end()); // // First, extract the reads from the normal and variant data sets that match each haplotype // assert(inHaplotypes.size() > 0); // Get canidate alignments for the input haplotypes HapgenAlignmentVector candidateAlignments; // Choose the kmer size for alignment size_t align_kmer = 31; for(size_t i = 0; i < inHaplotypes.size(); ++i) { HapgenAlignmentVector thisCandidateAlignments; HapgenUtil::alignHaplotypeToReferenceKmer(align_kmer, inHaplotypes[i], parameters.referenceIndex, parameters.pRefTable, thisCandidateAlignments); candidateAlignments.insert(candidateAlignments.end(), thisCandidateAlignments.begin(), thisCandidateAlignments.end()); } // Remove duplicate or bad alignment pairs HapgenUtil::coalesceAlignments(candidateAlignments); if(Verbosity::Instance().getPrintLevel() > 3) printf("runDindel -- %zu candidate alignments found\n", candidateAlignments.size()); size_t MAX_ALIGNMENTS = 10; if(candidateAlignments.size() > MAX_ALIGNMENTS) return DRC_AMBIGUOUS_ALIGNMENT; // Join each haplotype with flanking sequence from the reference genome for each alignment // This function also adds a haplotype (with flanking sequence) for the piece of the reference int FLANKING_SIZE = 0; if (parameters.dindelRealignParameters.realignMatePairs) FLANKING_SIZE = 1000; StringVector flankingHaplotypes; // This vector contains the internal portion of the haplotypes, without the flanking sequence // It is used to extract reads StringVector candidateHaplotypes; for(size_t i = 0; i < candidateAlignments.size(); ++i) { HapgenUtil::makeFlankingHaplotypes(candidateAlignments[i], parameters.pRefTable, FLANKING_SIZE, inHaplotypes, flankingHaplotypes, candidateHaplotypes); } if(Verbosity::Instance().getPrintLevel() > 3) printf("runDindel -- made %zu flanking haplotypes\n", candidateHaplotypes.size()); // Normal reads SeqRecordVector normalReads; SeqRecordVector normalRCReads; // Remove non-unique candidate haplotypes std::sort(candidateHaplotypes.begin(), candidateHaplotypes.end()); StringVector::iterator haplotype_iterator = std::unique(candidateHaplotypes.begin(), candidateHaplotypes.end()); candidateHaplotypes.resize(haplotype_iterator - candidateHaplotypes.begin()); // Set the value to use for extracting reads that potentially match the haplotype // Do not use a kmer for extraction greater than this value size_t KMER_CEILING = 31; size_t extractionKmer = parameters.kmer < KMER_CEILING ? parameters.kmer : KMER_CEILING; bool extractOK = true; if(!parameters.bReferenceMode) { // Reads on the same strand as the haplotype extractOK = HapgenUtil::extractHaplotypeReads(candidateHaplotypes, parameters.baseIndex, extractionKmer, false, parameters.maxReads, parameters.maxExtractionIntervalSize, &normalReads, NULL); if(!extractOK) return DRC_OVER_DEPTH; // Reads on the reverse strand extractOK = HapgenUtil::extractHaplotypeReads(candidateHaplotypes, parameters.baseIndex, extractionKmer, true, parameters.maxReads, parameters.maxExtractionIntervalSize, &normalRCReads, NULL); if(!extractOK) return DRC_OVER_DEPTH; } // Variant reads SeqRecordVector variantReads; SeqRecordVector variantRCReads; extractOK = HapgenUtil::extractHaplotypeReads(candidateHaplotypes, parameters.variantIndex, extractionKmer, false, parameters.maxReads, parameters.maxExtractionIntervalSize, &variantReads, NULL); if(!extractOK) return DRC_OVER_DEPTH; extractOK = HapgenUtil::extractHaplotypeReads(candidateHaplotypes, parameters.variantIndex, extractionKmer, true, parameters.maxReads, parameters.maxExtractionIntervalSize, &variantRCReads, NULL); if(!extractOK) return DRC_OVER_DEPTH; size_t normal_reads = normalReads.size() + normalRCReads.size(); size_t variant_reads = variantReads.size() + variantRCReads.size(); size_t total_reads = normal_reads + variant_reads; if(Verbosity::Instance().getPrintLevel() > 3) printf("Extracted %zu normal reads, %zu variant reads\n", normal_reads, variant_reads); if(total_reads > parameters.maxReads) return DRC_OVER_DEPTH; if (total_reads == 0) return DRC_UNDER_DEPTH; // Generate the input haplotypes for dindel // We need at least 2 haplotypes (one is the reference) size_t totFlankingHaplotypes = flankingHaplotypes.size(); if(totFlankingHaplotypes < 2) return DRC_NO_ALIGNMENT; // Ensure the reference haplotype is a non-empty string if(flankingHaplotypes[0].size() == 0) return DRC_NO_ALIGNMENT; // Make Dindel referenceMappings StringVector dindelHaplotypes; std::set<DindelReferenceMapping> refMappings; // for(size_t i = 0; i < candidateAlignments.size(); ++i) { std::string upstream, defined, downstream; std::string refName = parameters.pRefTable->getRead(candidateAlignments[i].referenceID).id; HapgenUtil::extractReferenceSubstrings(candidateAlignments[i],parameters.pRefTable, FLANKING_SIZE, upstream, defined, downstream); std::string refSeq = upstream + defined + downstream; int refStart = candidateAlignments[i].position - int(upstream.size()) + 1; // Here the score is used as an estimate of how unique "defined" is in the reference sequence. // "defined" is not the reference sequence but a candidate haplotype. // It is conservative because the flanking sequence is not used in this estimation. DindelReferenceMapping rm(refName, refSeq, refStart, candidateAlignments[i].score+2*FLANKING_SIZE, candidateAlignments[i].isRC); std::set<DindelReferenceMapping>::iterator rmit = refMappings.find(rm); if(rmit == refMappings.end()) { refMappings.insert(rm); } else { if(rm.referenceAlignmentScore > rmit->referenceAlignmentScore) rmit->referenceAlignmentScore = rm.referenceAlignmentScore; } } // RESET MAPPING SCORES for(std::set<DindelReferenceMapping>::iterator it = refMappings.begin(); it != refMappings.end(); it++) it->referenceAlignmentScore = 1000; // make flankingHaplotypes unique std::set< std::string > setFlanking(flankingHaplotypes.begin(), flankingHaplotypes.end()); for(std::set< std::string >::const_iterator it = setFlanking.begin(); it != setFlanking.end(); it++) { dindelHaplotypes.push_back(*it); //dindelRefMappings[i] = std::vector<DindelReferenceMapping>(refMappings.begin(),refMappings.end()); } std::vector<DindelReferenceMapping> dRefMappings(refMappings.begin(),refMappings.end()); DindelWindow dWindow(dindelHaplotypes, dRefMappings); // // Run Dindel // // Initialize VCF collections VCFCollection vcfCollections[2]; // If in multisample mode, load the sample names into the VCFCollection if(parameters.variantIndex.pPopIdx != NULL) { for(size_t i = 0; i <= 1; ++i) vcfCollections[i].samples = parameters.variantIndex.pPopIdx->getSamples(); } size_t start_i = parameters.bReferenceMode ? 1 : 0; DindelRealignWindowResult *pThisResult = NULL; DindelRealignWindowResult *pPreviousResult = NULL; for(size_t i = start_i; i <= 1; ++i) { SeqRecordVector& fwdReads = (i == 0) ? normalReads : variantReads; SeqRecordVector& rcReads = (i == 0) ? normalRCReads : variantRCReads; const BWTIndexSet* indices = ¶meters.variantIndex; // Create dindel reads // Mates must be at the end of the array. std::vector<DindelRead> dReads; for(size_t j = 0; j < fwdReads.size(); ++j) dReads.push_back(convertToDindelRead(indices, fwdReads[j], true)); for(size_t j = 0; j < rcReads.size(); ++j) { rcReads[j].seq.reverseComplement(); std::reverse(rcReads[j].qual.begin(), rcReads[j].qual.end()); dReads.push_back(convertToDindelRead(indices, rcReads[j], false)); } pThisResult = new DindelRealignWindowResult(); std::stringstream out_ss; try { DindelRealignWindow dRealignWindow(&dWindow, dReads, parameters.dindelRealignParameters); dRealignWindow.run("hmm", vcfCollections[i], pReadAlignments, id, pThisResult, pPreviousResult, parameters.pRefTable); } catch(std::string e) { std::cerr << "Dindel Exception: " << e << "\n"; exit(DRC_EXCEPTION); } if(i == 0) pPreviousResult = pThisResult; } // Copy raw VCFRecords to output for(size_t i = 0; i <= 1; ++i) { std::ostream& curr_out = i == 0 ? baseOut : variantOut; for(size_t j = 0; j < vcfCollections[i].records.size(); ++j) curr_out << vcfCollections[i].records[j] << "\n"; } // Make comparative calls size_t VARIANT_IDX = 1; size_t BASE_IDX = 0; bool has_base_calls = !vcfCollections[BASE_IDX].records.empty(); for(size_t i = 0; i < vcfCollections[1].records.size(); ++i) { bool not_called_in_base = true; if(has_base_calls) not_called_in_base = vcfCollections[BASE_IDX].records[i].passStr == "NoCall" || vcfCollections[BASE_IDX].records[i].passStr == "NoSupp"; bool called_in_variant = vcfCollections[VARIANT_IDX].records[i].passStr == "PASS"; if(called_in_variant && not_called_in_base) callsOut << vcfCollections[VARIANT_IDX].records[i] << "\n"; } baseOut.flush(); variantOut.flush(); delete pThisResult; delete pPreviousResult; return DRC_OK; }
// Main Program int main( int nArgs, char **args ) { // Read command line parameters if ( nArgs < 3 ) { std::cout << "Please specify output filename and run time (seconds)." << std::endl; return 1; } std::string outFileName(args[1]); int runTime(30); { std::stringstream tmp; tmp << std::string(args[2]); tmp >> runTime; } std::cout << "LED triggered run for " << runTime << " seconds: " << outFileName << std::endl; // Create the ROOT Application (to draw canvases) TApplication *theApp = new TApplication("LEDRun",&nArgs,args); //gStyle->SetPalette(1); // Configure the Argonne Board int err(0); uint nSamples = 600; //1024; // max = 2046 uint preTrigSamples = 50; err = ConfigArgoBoard_ExtTrig( nSamples, preTrigSamples ); if ( err != 0 ) { std::cout << "Failed to configure ANL Digitizer." << std::endl; return err; } // Get ANL Digitizer configuration info uint ANL_nSamples(0); DeviceRead(lbneReg.readout_window[0],&ANL_nSamples); uint ANL_preTrigSize(0); DeviceRead(lbneReg.readout_pretrigger[0],&ANL_preTrigSize); uint ANL_eventSize = sizeof(Event_Header) + 2*nSamples; // in bytes uint m1Size(0); DeviceRead(lbneReg.m1_window[0],&m1Size); uint m2Size(0); DeviceRead(lbneReg.m2_window[0],&m2Size); uint pWindow(0); DeviceRead(lbneReg.p_window[0],&pWindow); uint kWindow(0); DeviceRead(lbneReg.k_window[0],&kWindow); uint iWindow(0); DeviceRead(lbneReg.i_window[0],&iWindow); uint dWindow(0); DeviceRead(lbneReg.d_window[0],&dWindow); // Output File TFile *outFile = new TFile(outFileName.c_str(),"RECREATE"); TDirectory *waveDir = outFile->mkdir("waveforms"); std::vector<TH1D*> waveformExamples; // Set up output ROOT tree for ANL Digitizer TTree *ANLTree = new TTree("ANLTree","ANLTree"); Event_Packet ArPacket; Event ArEvent; ANLTree->Branch("channelID",&(ArEvent.channelID),"channelID/s"); ANLTree->Branch("syncDelay",&(ArEvent.syncDelay),"syncDelay/i"); ANLTree->Branch("syncCount",&(ArEvent.syncCount),"syncCount/i"); ANLTree->Branch("timestamp",&(ArEvent.intTimestamp),"timestamp/l"); ANLTree->Branch("peakSum",&(ArEvent.peakSum),"peakSum/I"); ANLTree->Branch("peakTime",&(ArEvent.peakTime),"peakTime/C"); ANLTree->Branch("prerise",&(ArEvent.prerise),"prerise/i"); ANLTree->Branch("integratedSum",&(ArEvent.integratedSum),"integratedSum/i"); ANLTree->Branch("baseline",&(ArEvent.baseline),"baseline/s"); ANLTree->Branch("cfdPoint",&(ArEvent.cfdPoint),"cfdPoint[4]/S"); ANLTree->Branch("nSamples",&(ArEvent.waveformWords),"nSamples/s"); std::stringstream waveformDescr; waveformDescr << "waveform[" << nSamples << "]/s"; ANLTree->Branch("waveform",&(ArEvent.waveform),waveformDescr.str().c_str()); // Set up configuration ROOT tree TTree *configTree = new TTree("configTree","configTree"); configTree->Branch("runTime",&runTime,"runTime/I"); configTree->Branch("ANL_nSamples",&ANL_nSamples,"ANL_nSamples/i"); configTree->Branch("ANL_preTrigSize",&ANL_preTrigSize,"ANL_preTrigSize/i"); configTree->Branch("m1Size",&m1Size,"m1Size/i"); configTree->Branch("m2Size",&m2Size,"m2Size/i"); configTree->Branch("pWindow",&pWindow,"pWindow/i"); configTree->Branch("kWindow",&kWindow,"kWindow/i"); configTree->Branch("iWindow",&iWindow,"iWindow/i"); configTree->Branch("dWindow",&dWindow,"dWindow/i"); configTree->Fill(); configTree->Write(); // Monitoring histograms TH1D *ANL_pulseAmp[32]; for ( int chan = 0; chan < 12; ++chan ) { std::stringstream histName; histName << "ANL_pulseAmp_CH" << chan; std::stringstream histTitle; histTitle << "Pulse Amplitudes, ANL Channel " << chan << ";ADC Counts"; ANL_pulseAmp[chan] = new TH1D(histName.str().c_str(),histTitle.str().c_str(),3000,0,3000); } std::cout << "Starting acquisition." << std::endl; // Enable boards err = DeviceTimeout(500); err = DeviceStart(); // Loop for specified time int totArEvts(0); time_t tStart(0), tEnd(0); time(&tStart); time(&tEnd); while ( tEnd - tStart < runTime ) { /*** ANL Digitizer Readout ***/ uint dataSize; err = DeviceQueueStatus(&dataSize); // How many full events have been collected? if ( err != 0 ) { std::cout << "Failed device queue status check." << std::endl; break; } uint ANL_nEvts = uint(dataSize) / ANL_eventSize; //if ( ANL_nEvts > 0 ) std::cout << "Received " << ANL_nEvts << " events on the Argonne Digitizer (" << dataSize << " bytes)." << std::endl; for ( uint evt = 0; evt < ANL_nEvts; ++evt ) { totArEvts++; uint dataReceived = 0; err = DeviceReceive(&ArPacket,&dataReceived); if ( err != 0 ) { std::cout << "Failed device receive." << std::endl; break; } err = LBNE_EventUnpack(&ArPacket,&ArEvent); if ( err != 0 ) { std::cout << "Failed to unpack data." << std::endl; break; } ANLTree->Fill(); ANL_pulseAmp[ArEvent.channelID]->Fill(ArEvent.prerise/double(kWindow)-ArEvent.peakSum/double(m1Size)); /* std::stringstream waveHistName; waveHistName << "waveform_" << totArEvts; TH1D *hist = new TH1D(waveHistName.str().c_str(),waveHistName.str().c_str(),nSamples,0,nSamples); for ( uint samp = 0; samp < nSamples; ++samp ) { hist->SetBinContent(samp+1,ArEvent.waveform[samp] & 0x3FFF); // 0x3FFF drops time marks on waveform, not necessary in next firmware upgrade } waveformExamples.push_back(hist); */ } //std::cout << tEnd-tStart << std::endl; time(&tEnd); } //// Finalize // Finalize Argonne Digitizer err = DeviceStopReset(); if ( err != 0 ) { std::cout << "Failed to stop and reset board 0. ErrorCode " << err << std::endl; return 1; } err = DeviceDisconnect(commUSB); // Write and close output file ANLTree->Write(); for ( int chan = 0; chan < 12; ++chan ) ANL_pulseAmp[chan]->Write(); waveDir->cd(); for ( uint i = 0; i < waveformExamples.size(); ++i ) { waveformExamples[i]->Write(); } outFile->cd(); outFile->Close(); // Done! std::cout << "Done! Collected " << totArEvts/12 << " events" << std::endl; return 0; }