// Returns the data file paths belonging to the weather database at filePath.
//
// Hourly, daily and extended normals databases produce one path per entry of
// zop (built from each entry's data file name). A non-extended normals
// database produces exactly one path. Any other kind of path produces an
// empty list.
StringVector GetWeatherDBDataFileList(const std::string& filePath, const CWeatherDatabaseOptimization& zop)
{
    StringVector filesList;

    if (IsHourlyDB(filePath))
    {
        const size_t nbEntries = zop.size();
        filesList.reserve(nbEntries);
        for (size_t idx = 0; idx != nbEntries; ++idx)
            filesList.push_back(CHourlyDatabase().GetDataFilePath(filePath, zop[idx].GetDataFileName()));

        return filesList;
    }

    if (IsDailyDB(filePath))
    {
        const size_t nbEntries = zop.size();
        filesList.reserve(nbEntries);
        for (size_t idx = 0; idx != nbEntries; ++idx)
            filesList.push_back(CDailyDatabase().GetDataFilePath(filePath, zop[idx].GetDataFileName()));

        return filesList;
    }

    if (!IsNormalsDB(filePath))
        return filesList; // unknown database type: nothing to list

    if (!CNormalsDatabase::IsExtendedDatabase(filePath))
    {
        // A plain normals database keeps everything in a single data file.
        filesList.push_back(CNormalsDatabase::GetNormalsDataFilePath(filePath));
        return filesList;
    }

    // Extended normals database: one data file per entry, like hourly/daily.
    const size_t nbEntries = zop.size();
    filesList.reserve(nbEntries);
    for (size_t idx = 0; idx != nbEntries; ++idx)
        filesList.push_back(CNormalsDatabase().GetDataFilePath(filePath, zop[idx].GetDataFileName()));

    return filesList;
}
// Run dindel on a pair of samples DindelReturnCode DindelUtil::runDindelPairMatePair(const std::string& id, const StringVector& base_haplotypes, const StringVector& variant_haplotypes, const GraphCompareParameters& parameters, std::ostream& baseOut, std::ostream& variantOut, std::ostream& callsOut, DindelReadReferenceAlignmentVector* pReadAlignments) { PROFILE_FUNC("runDindelPairMatePair") StringVector inHaplotypes; inHaplotypes.insert(inHaplotypes.end(), base_haplotypes.begin(), base_haplotypes.end()); inHaplotypes.insert(inHaplotypes.end(), variant_haplotypes.begin(), variant_haplotypes.end()); // // First, extract the reads from the normal and variant data sets that match each haplotype // assert(inHaplotypes.size() > 0); // Get canidate alignments for the input haplotypes HapgenAlignmentVector candidateAlignments; // Choose the kmer size for alignment size_t align_kmer = 31; for(size_t i = 0; i < inHaplotypes.size(); ++i) { HapgenAlignmentVector thisCandidateAlignments; HapgenUtil::alignHaplotypeToReferenceKmer(align_kmer, inHaplotypes[i], parameters.referenceIndex, parameters.pRefTable, thisCandidateAlignments); candidateAlignments.insert(candidateAlignments.end(), thisCandidateAlignments.begin(), thisCandidateAlignments.end()); } // Remove duplicate or bad alignment pairs HapgenUtil::coalesceAlignments(candidateAlignments); if(Verbosity::Instance().getPrintLevel() > 3) printf("runDindel -- %zu candidate alignments found\n", candidateAlignments.size()); size_t MAX_ALIGNMENTS = 10; if(candidateAlignments.size() > MAX_ALIGNMENTS) return DRC_AMBIGUOUS_ALIGNMENT; // Join each haplotype with flanking sequence from the reference genome for each alignment // This function also adds a haplotype (with flanking sequence) for the piece of the reference int FLANKING_SIZE = 0; if (parameters.dindelRealignParameters.realignMatePairs) FLANKING_SIZE = 1000; StringVector flankingHaplotypes; // This vector contains the internal portion of the haplotypes, without the flanking 
sequence // It is used to extract reads StringVector candidateHaplotypes; for(size_t i = 0; i < candidateAlignments.size(); ++i) { HapgenUtil::makeFlankingHaplotypes(candidateAlignments[i], parameters.pRefTable, FLANKING_SIZE, inHaplotypes, flankingHaplotypes, candidateHaplotypes); } if(Verbosity::Instance().getPrintLevel() > 3) printf("runDindel -- made %zu flanking haplotypes\n", candidateHaplotypes.size()); // Normal reads SeqRecordVector normalReads; SeqRecordVector normalRCReads; // Remove non-unique candidate haplotypes std::sort(candidateHaplotypes.begin(), candidateHaplotypes.end()); StringVector::iterator haplotype_iterator = std::unique(candidateHaplotypes.begin(), candidateHaplotypes.end()); candidateHaplotypes.resize(haplotype_iterator - candidateHaplotypes.begin()); // Set the value to use for extracting reads that potentially match the haplotype // Do not use a kmer for extraction greater than this value size_t KMER_CEILING = 31; size_t extractionKmer = parameters.kmer < KMER_CEILING ? 
parameters.kmer : KMER_CEILING; bool extractOK = true; if(!parameters.bReferenceMode) { // Reads on the same strand as the haplotype extractOK = HapgenUtil::extractHaplotypeReads(candidateHaplotypes, parameters.baseIndex, extractionKmer, false, parameters.maxReads, parameters.maxExtractionIntervalSize, &normalReads, NULL); if(!extractOK) return DRC_OVER_DEPTH; // Reads on the reverse strand extractOK = HapgenUtil::extractHaplotypeReads(candidateHaplotypes, parameters.baseIndex, extractionKmer, true, parameters.maxReads, parameters.maxExtractionIntervalSize, &normalRCReads, NULL); if(!extractOK) return DRC_OVER_DEPTH; } // Variant reads SeqRecordVector variantReads; SeqRecordVector variantRCReads; extractOK = HapgenUtil::extractHaplotypeReads(candidateHaplotypes, parameters.variantIndex, extractionKmer, false, parameters.maxReads, parameters.maxExtractionIntervalSize, &variantReads, NULL); if(!extractOK) return DRC_OVER_DEPTH; extractOK = HapgenUtil::extractHaplotypeReads(candidateHaplotypes, parameters.variantIndex, extractionKmer, true, parameters.maxReads, parameters.maxExtractionIntervalSize, &variantRCReads, NULL); if(!extractOK) return DRC_OVER_DEPTH; size_t normal_reads = normalReads.size() + normalRCReads.size(); size_t variant_reads = variantReads.size() + variantRCReads.size(); size_t total_reads = normal_reads + variant_reads; if(Verbosity::Instance().getPrintLevel() > 3) printf("Extracted %zu normal reads, %zu variant reads\n", normal_reads, variant_reads); if(total_reads > parameters.maxReads) return DRC_OVER_DEPTH; if (total_reads == 0) return DRC_UNDER_DEPTH; // Generate the input haplotypes for dindel // We need at least 2 haplotypes (one is the reference) size_t totFlankingHaplotypes = flankingHaplotypes.size(); if(totFlankingHaplotypes < 2) return DRC_NO_ALIGNMENT; // Ensure the reference haplotype is a non-empty string if(flankingHaplotypes[0].size() == 0) return DRC_NO_ALIGNMENT; // Make Dindel referenceMappings StringVector dindelHaplotypes; 
std::set<DindelReferenceMapping> refMappings; // for(size_t i = 0; i < candidateAlignments.size(); ++i) { std::string upstream, defined, downstream; std::string refName = parameters.pRefTable->getRead(candidateAlignments[i].referenceID).id; HapgenUtil::extractReferenceSubstrings(candidateAlignments[i],parameters.pRefTable, FLANKING_SIZE, upstream, defined, downstream); std::string refSeq = upstream + defined + downstream; int refStart = candidateAlignments[i].position - int(upstream.size()) + 1; // Here the score is used as an estimate of how unique "defined" is in the reference sequence. // "defined" is not the reference sequence but a candidate haplotype. // It is conservative because the flanking sequence is not used in this estimation. DindelReferenceMapping rm(refName, refSeq, refStart, candidateAlignments[i].score+2*FLANKING_SIZE, candidateAlignments[i].isRC); std::set<DindelReferenceMapping>::iterator rmit = refMappings.find(rm); if(rmit == refMappings.end()) { refMappings.insert(rm); } else { if(rm.referenceAlignmentScore > rmit->referenceAlignmentScore) rmit->referenceAlignmentScore = rm.referenceAlignmentScore; } } // RESET MAPPING SCORES for(std::set<DindelReferenceMapping>::iterator it = refMappings.begin(); it != refMappings.end(); it++) it->referenceAlignmentScore = 1000; // make flankingHaplotypes unique std::set< std::string > setFlanking(flankingHaplotypes.begin(), flankingHaplotypes.end()); for(std::set< std::string >::const_iterator it = setFlanking.begin(); it != setFlanking.end(); it++) { dindelHaplotypes.push_back(*it); //dindelRefMappings[i] = std::vector<DindelReferenceMapping>(refMappings.begin(),refMappings.end()); } std::vector<DindelReferenceMapping> dRefMappings(refMappings.begin(),refMappings.end()); DindelWindow dWindow(dindelHaplotypes, dRefMappings); // // Run Dindel // // Initialize VCF collections VCFCollection vcfCollections[2]; // If in multisample mode, load the sample names into the VCFCollection 
if(parameters.variantIndex.pPopIdx != NULL) { for(size_t i = 0; i <= 1; ++i) vcfCollections[i].samples = parameters.variantIndex.pPopIdx->getSamples(); } size_t start_i = parameters.bReferenceMode ? 1 : 0; DindelRealignWindowResult *pThisResult = NULL; DindelRealignWindowResult *pPreviousResult = NULL; for(size_t i = start_i; i <= 1; ++i) { SeqRecordVector& fwdReads = (i == 0) ? normalReads : variantReads; SeqRecordVector& rcReads = (i == 0) ? normalRCReads : variantRCReads; const BWTIndexSet* indices = ¶meters.variantIndex; // Create dindel reads // Mates must be at the end of the array. std::vector<DindelRead> dReads; for(size_t j = 0; j < fwdReads.size(); ++j) dReads.push_back(convertToDindelRead(indices, fwdReads[j], true)); for(size_t j = 0; j < rcReads.size(); ++j) { rcReads[j].seq.reverseComplement(); std::reverse(rcReads[j].qual.begin(), rcReads[j].qual.end()); dReads.push_back(convertToDindelRead(indices, rcReads[j], false)); } pThisResult = new DindelRealignWindowResult(); std::stringstream out_ss; try { DindelRealignWindow dRealignWindow(&dWindow, dReads, parameters.dindelRealignParameters); dRealignWindow.run("hmm", vcfCollections[i], pReadAlignments, id, pThisResult, pPreviousResult, parameters.pRefTable); } catch(std::string e) { std::cerr << "Dindel Exception: " << e << "\n"; exit(DRC_EXCEPTION); } if(i == 0) pPreviousResult = pThisResult; } // Copy raw VCFRecords to output for(size_t i = 0; i <= 1; ++i) { std::ostream& curr_out = i == 0 ? 
baseOut : variantOut; for(size_t j = 0; j < vcfCollections[i].records.size(); ++j) curr_out << vcfCollections[i].records[j] << "\n"; } // Make comparative calls size_t VARIANT_IDX = 1; size_t BASE_IDX = 0; bool has_base_calls = !vcfCollections[BASE_IDX].records.empty(); for(size_t i = 0; i < vcfCollections[1].records.size(); ++i) { bool not_called_in_base = true; if(has_base_calls) not_called_in_base = vcfCollections[BASE_IDX].records[i].passStr == "NoCall" || vcfCollections[BASE_IDX].records[i].passStr == "NoSupp"; bool called_in_variant = vcfCollections[VARIANT_IDX].records[i].passStr == "PASS"; if(called_in_variant && not_called_in_base) callsOut << vcfCollections[VARIANT_IDX].records[i] << "\n"; } baseOut.flush(); variantOut.flush(); delete pThisResult; delete pPreviousResult; return DRC_OK; }
/**
 * Compare the forces computed on two platforms for a subset of a system's forces.
 *
 * A copy of the context's system is built that contains only the forces listed
 * in compareForces plus any dependencies reported by getForceDependencies()
 * that are present in the original system. Two contexts, one per platform, are
 * created on that copy, synchronized with the input context, and handed to a
 * new ForceValidationResult.
 *
 * @param context        context supplying the system and the state to validate
 * @param compareForces  indices (into the context's system) of the forces to compare
 * @param platform1      first platform
 * @param platform2      second platform
 *
 * @return newly allocated ForceValidationResult; the caller owns it
 *
 * @throws OpenMMException if the name of a force in the system is not recognized
 *         ("NA"); exceptions raised while building the validation system or
 *         contexts are propagated after the temporaries have been released
 */
ForceValidationResult* ValidateOpenMMForces::compareForce(Context& context, std::vector<int>& compareForces,
                                                          Platform& platform1, Platform& platform2 ) const {

// ---------------------------------------------------------------------------------------

  //static const std::string methodName      = "ValidateOpenMMForces::compareForce";

// ---------------------------------------------------------------------------------------

    // note if platforms are identical

    if( getLog() && platform1.getName().compare( platform2.getName() ) == 0 ){
        (void) fprintf( getLog(), "Note: Platforms to compares %s are identical.\n", platform1.getName().c_str() );
        (void) fflush( getLog() );
    }

    const System& system = context.getSystem();

    // collect systemForceNameMap[forceName] = index in system
    //         systemForceNameIndex[index]   = force name

    StringIntMap systemForceNameMap;
    StringVector systemForceNameIndex;
    systemForceNameIndex.resize( system.getNumForces() );
    for( int ii = 0; ii < system.getNumForces(); ii++ ){
        std::string forceName = getForceName( system.getForce( ii ) );
        if( forceName.compare( "NA" ) == 0 ){
            std::stringstream message;
            message << "Force at index=" << ii << " not found -- aborting!";
            std::cerr << message.str() << std::endl;
            throw OpenMM::OpenMMException(message.str());
        }
        systemForceNameMap[forceName] = ii;
        systemForceNameIndex[ii]      = forceName;
    }

    // diagnostics (disabled)

    if( 0 && getLog() ){
        for( StringIntMapI ii = systemForceNameMap.begin(); ii != systemForceNameMap.end(); ii++ ){
            int index = (*ii).second;
            (void) fprintf( getLog(), "  System force map %s index=%d reverse map=%s\n", (*ii).first.c_str(), index, systemForceNameIndex[index].c_str() );
        }
        for( unsigned int ii = 0; ii < compareForces.size(); ii++ ){
            (void) fprintf( getLog(), "   ValidateOpenMMForces %u %s\n", ii, systemForceNameIndex[compareForces[ii]].c_str() );
        }
        (void) fflush( getLog() );
    }

    // get system copy; forces are added to it below

    System* validationSystem = copySystemExcludingForces( system );
    StringUIntMap forceNamesMap;

    // Each context gets its own integrator: an integrator is bound to the
    // context it is created with. The integrators are declared before the
    // contexts so that they outlive them.

    VerletIntegrator verletIntegrator1( 0.001 );
    VerletIntegrator verletIntegrator2( 0.001 );

    Context* validationContext1                  = NULL;
    Context* validationContext2                  = NULL;
    ForceValidationResult* forceValidationResult = NULL;

    try {

        // add requested forces to the validation system

        for( unsigned int ii = 0; ii < compareForces.size(); ii++ ){
            const Force& forceToCopy = system.getForce( compareForces[ii] );
            Force* force             = copyForce( forceToCopy );
            validationSystem->addForce( force );
            forceNamesMap[systemForceNameIndex[compareForces[ii]]] = ii;
        }

        // include any missing dependencies (e.g, OBC force requires NB force for Cuda platform)

        for( StringUIntMapI ii = forceNamesMap.begin(); ii != forceNamesMap.end(); ii++ ){
            std::string forceName = (*ii).first;
            StringVector dependencyVector;
            getForceDependencies( forceName, dependencyVector );
            for( unsigned int jj = 0; jj < dependencyVector.size(); jj++ ){
                std::string dependentForceName = dependencyVector[jj];
                if( forceNamesMap.find( dependentForceName ) != forceNamesMap.end() ){
                    continue; // already part of the validation system
                }

                // Look the dependency up without operator[]: operator[] would insert
                // index 0 for a force the system does not contain and silently copy
                // the wrong force.

                StringIntMapI systemEntry = systemForceNameMap.find( dependentForceName );
                if( systemEntry == systemForceNameMap.end() ){
                    if( getLog() ){
                        (void) fprintf( getLog(), "Warning: dependency %s of force %s is not present in the system; skipping.\n",
                                        dependentForceName.c_str(), forceName.c_str() );
                        (void) fflush( getLog() );
                    }
                    continue;
                }

                forceNamesMap[dependentForceName] = 1;
                const Force& forceToCopy          = system.getForce( (*systemEntry).second );
                validationSystem->addForce( copyForce( forceToCopy ) );
            }
        }

        // create contexts

        validationContext1 = new Context( *validationSystem, verletIntegrator1, platform1);
        validationContext2 = new Context( *validationSystem, verletIntegrator2, platform2);

        // set positions

        synchContexts( context, *validationContext1 );
        synchContexts( context, *validationContext2 );

        // diagnostics (disabled)

        if( 0 && getLog() ){
            std::stringstream forceNames;
            (void) fprintf( getLog(), "    Validating system forces=%d\n", validationSystem->getNumForces() );
            for( int ii = 0; ii < validationSystem->getNumForces(); ii++ ){
                std::string forceName = getForceName( validationSystem->getForce( ii ) );
                forceNames << forceName;
                if( ii < (validationSystem->getNumForces()-1) ){
                    forceNames << "_";
                } else {
                    forceNames << "Parameters.txt";
                }
                (void) fprintf( getLog(), "       force %d %s\n", ii, forceName.c_str() );
            }
            writeParameterFile( *validationContext1, forceNames.str() );
            (void) fflush( getLog() );
        }

        // calculate forces & build return result

        forceValidationResult = new ForceValidationResult( *validationContext1, *validationContext2, forceNamesMap );

    } catch( ... ){

        // release everything allocated so far (contexts before the system they
        // reference), then let the caller see the original exception

        delete validationContext1;
        delete validationContext2;
        delete validationSystem;
        throw;
    }

    delete validationContext1;
    delete validationContext2;
    delete validationSystem;

    return forceValidationResult;
}
bool SGSmoothingVisitor::visit(StringGraph* pGraph, Vertex* pVertex) { (void)pGraph; if(pVertex->getColor() == GC_RED) return false; bool found = false; for(size_t idx = 0; idx < ED_COUNT; idx++) { EdgeDir dir = EDGE_DIRECTIONS[idx]; EdgePtrVec edges = pVertex->getEdges(dir); if(edges.size() <= 1) continue; for(size_t i = 0; i < edges.size(); ++i) { if(edges[i]->getEnd()->getColor() == GC_RED) return false; } //std::cout << "Smoothing " << pVertex->getID() << "\n"; const int MAX_WALKS = 10; const int MAX_DISTANCE = 5000; bool bIsDegenerate = false; bool bFailGapCheck = false; bool bFailDivergenceCheck = false; bool bFailIndelSizeCheck = false; SGWalkVector variantWalks; SGSearch::findVariantWalks(pVertex, dir, MAX_DISTANCE, MAX_WALKS, variantWalks); if(variantWalks.size() > 0) { found = true; size_t selectedIdx = -1; size_t selectedCoverage = 0; // Calculate the minimum amount overlapped on the start/end vertex. // This is used to properly extract the sequences from walks that represent the variation. int minOverlapX = std::numeric_limits<int>::max(); int minOverlapY = std::numeric_limits<int>::max(); for(size_t i = 0; i < variantWalks.size(); ++i) { if(variantWalks[i].getNumEdges() <= 1) bIsDegenerate = true; // Calculate the walk coverage using the internal vertices of the walk. 
// The walk with the highest coverage will be retained size_t walkCoverage = 0; for(size_t j = 1; j < variantWalks[i].getNumVertices() - 1; ++j) walkCoverage += variantWalks[i].getVertex(j)->getCoverage(); if(walkCoverage > selectedCoverage || selectedCoverage == 0) { selectedIdx = i; selectedCoverage = walkCoverage; } Edge* pFirstEdge = variantWalks[i].getFirstEdge(); Edge* pLastEdge = variantWalks[i].getLastEdge(); if((int)pFirstEdge->getMatchLength() < minOverlapX) minOverlapX = pFirstEdge->getMatchLength(); if((int)pLastEdge->getTwin()->getMatchLength() < minOverlapY) minOverlapY = pLastEdge->getTwin()->getMatchLength(); } // Calculate the strings for each walk that represent the region of variation StringVector walkStrings; for(size_t i = 0; i < variantWalks.size(); ++i) { Vertex* pStartVertex = variantWalks[i].getStartVertex(); Vertex* pLastVertex = variantWalks[i].getLastVertex(); assert(pStartVertex != NULL && pLastVertex != NULL); std::string full = variantWalks[i].getString(SGWT_START_TO_END); int posStart = 0; int posEnd = 0; if(dir == ED_ANTISENSE) { // pLast ----------- // pStart ------------ // full -------------------- // out ---- posStart = pLastVertex->getSeqLen() - minOverlapY; posEnd = full.size() - (pStartVertex->getSeqLen() - minOverlapX); } else { // pStart -------------- // pLast ----------- // full --------------------- // out ---- posStart = pStartVertex->getSeqLen() - minOverlapX; // match start position posEnd = full.size() - (pLastVertex->getSeqLen() - minOverlapY); // match end position } std::string out; if(posEnd > posStart) out = full.substr(posStart, posEnd - posStart); walkStrings.push_back(out); } assert(selectedIdx != (size_t)-1); SGWalk& selectedWalk = variantWalks[selectedIdx]; assert(selectedWalk.isIndexed()); // Check the divergence of the other walks to this walk StringVector cigarStrings; std::vector<int> maxIndel; std::vector<double> gapPercent; // percentage of matching that is gaps std::vector<double> totalPercent; // 
percent of total alignment that is mismatch or gap cigarStrings.resize(variantWalks.size()); gapPercent.resize(variantWalks.size()); totalPercent.resize(variantWalks.size()); maxIndel.resize(variantWalks.size()); for(size_t i = 0; i < variantWalks.size(); ++i) { if(i == selectedIdx) continue; // We want to compute the total gap length, total mismatches and percent // divergence between the two paths. int matchLen = 0; int totalDiff = 0; int gapLength = 0; int maxGapLength = 0; // We have to handle the degenerate case where one internal string has zero length // this can happen when there is an isolated insertion/deletion and the walks are like: // x -> y -> z // x -> z if(walkStrings[selectedIdx].empty() || walkStrings[i].empty()) { matchLen = std::max(walkStrings[selectedIdx].size(), walkStrings[i].size()); totalDiff = matchLen; gapLength = matchLen; } else { AlnAln *aln_global; aln_global = aln_stdaln(walkStrings[selectedIdx].c_str(), walkStrings[i].c_str(), &aln_param_blast, 1, 1); // Calculate the alignment parameters while(aln_global->outm[matchLen] != '\0') { if(aln_global->outm[matchLen] == ' ') totalDiff += 1; matchLen += 1; } std::stringstream cigarSS; for (int j = 0; j != aln_global->n_cigar; ++j) { char cigarOp = "MID"[aln_global->cigar32[j]&0xf]; int cigarLen = aln_global->cigar32[j]>>4; if(cigarOp == 'I' || cigarOp == 'D') { gapLength += cigarLen; if(gapLength > maxGapLength) maxGapLength = gapLength; } cigarSS << cigarLen; cigarSS << cigarOp; } cigarStrings[i] = cigarSS.str(); aln_free_AlnAln(aln_global); } double percentDiff = (double)totalDiff / matchLen; double percentGap = (double)gapLength / matchLen; if(percentDiff > m_maxTotalDivergence) bFailDivergenceCheck = true; if(percentGap > m_maxGapDivergence) bFailGapCheck = true; if(maxGapLength > m_maxIndelLength) bFailIndelSizeCheck = true; gapPercent[i] = percentGap; totalPercent[i] = percentDiff; maxIndel[i] = maxGapLength; } if(bIsDegenerate || bFailGapCheck || bFailDivergenceCheck || 
bFailIndelSizeCheck) continue; // Write the selected path to the variants file as variant 0 int variantIdx = 0; std::string selectedSequence = selectedWalk.getString(SGWT_START_TO_END); std::stringstream ss; ss << "variant-" << m_numRemovedTotal << "/" << variantIdx++; writeFastaRecord(&m_outFile, ss.str(), selectedSequence); // The vertex set for each walk is not necessarily disjoint, // the selected walk may contain vertices that are part // of other paths. We handle this be initially marking all // vertices of the for(size_t i = 0; i < variantWalks.size(); ++i) { if(i == selectedIdx) continue; SGWalk& currWalk = variantWalks[i]; for(size_t j = 0; j < currWalk.getNumEdges() - 1; ++j) { Edge* currEdge = currWalk.getEdge(j); // If the vertex is also on the selected path, do not mark it Vertex* currVertex = currEdge->getEnd(); if(!selectedWalk.containsVertex(currVertex->getID())) { currEdge->getEnd()->setColor(GC_RED); } } // Write the variant to a file std::string variantSequence = currWalk.getString(SGWT_START_TO_END); std::stringstream variantID; std::stringstream ss; ss << "variant-" << m_numRemovedTotal << "/" << variantIdx++; ss << " IGD:" << (double)gapPercent[i] << " ITD:" << totalPercent[i] << " MID: " << maxIndel[i] << " InternalCigar:" << cigarStrings[i]; writeFastaRecord(&m_outFile, ss.str(), variantSequence); } if(variantWalks.size() == 2) m_simpleBubblesRemoved += 1; else m_complexBubblesRemoved += 1; ++m_numRemovedTotal; } }