// Load a graph (with no edges) from a fasta file StringGraph* SGUtil::loadFASTA(const std::string& filename) { StringGraph* pGraph = new StringGraph; SeqReader reader(filename); SeqRecord record; while(reader.get(record)) { Vertex* pVertex = new(pGraph->getVertexAllocator()) Vertex(record.id, record.seq.toString()); pGraph->addVertex(pVertex); } return pGraph; }
int main() { // Create a graph of string nodes // StringGraph derives from the Graph<string> class, // and includes the extra function "LoadGraph" which allows the loading of text files into a graph StringGraph graph; cout << "Enter the path of the graph text file: "; string path; cin >> path; //Load the graph from a text file graph.LoadGraph(path); cout << "Enter the index of a source node to start from: "; int index; cin >> index; //Run Dijkstra's algorithm starting at a source index graph.Dijkstra(index); cin.get();cin.get(); }
// Load a string graph from an ASQG file.
//
// Records must appear in the order header -> vertex -> edge. A record found
// out of order prints an error naming the offending line (1-based) and
// terminates the program with EXIT_FAILURE.
//
// filename          - passed to createReader() to open the input stream
// minOverlap        - edge records whose overlap reports a minimum overlap
//                     length below this value are skipped
// allowContainments - forwarded to SGAlgorithms::createEdgesFromOverlap
//
// Returns the newly allocated graph, after the duplicate-edge and graph-stats
// visitors have been run over it.
StringGraph* SGUtil::loadASQG(const std::string& filename, const unsigned int minOverlap, bool allowContainments)
{
    // Initialize graph
    StringGraph* pGraph = new StringGraph;

    std::istream* pReader = createReader(filename);

    // Parsing stage: 0 = header section, 1 = vertex section, 2 = edge section
    int stage = 0;

    // Number of the record line currently being processed. It is incremented
    // before the record is handled so that diagnostics are 1-based.
    int line = 0;

    std::string recordLine;
    while(getline(*pReader, recordLine))
    {
        ++line;

        ASQG::RecordType rt = ASQG::getRecordType(recordLine);
        switch(rt)
        {
            case ASQG::RT_HEADER:
            {
                // A header is only valid before any vertex or edge record
                if(stage != 0)
                {
                    std::cerr << "Error: Unexpected header record found at line " << line << "\n";
                    exit(EXIT_FAILURE);
                }

                ASQG::HeaderRecord headerRecord(recordLine);

                const SQG::IntTag& overlapTag = headerRecord.getOverlapTag();
                if(overlapTag.isInitialized())
                    pGraph->setMinOverlap(overlapTag.get());
                else
                    pGraph->setMinOverlap(0);

                const SQG::FloatTag& errorRateTag = headerRecord.getErrorRateTag();
                if(errorRateTag.isInitialized())
                    pGraph->setErrorRate(errorRateTag.get());

                const SQG::IntTag& containmentTag = headerRecord.getContainmentTag();
                if(containmentTag.isInitialized())
                    pGraph->setContainmentFlag(containmentTag.get());
                else
                    pGraph->setContainmentFlag(true); // conservatively assume containments are present

                const SQG::IntTag& transitiveTag = headerRecord.getTransitiveTag();
                if(!transitiveTag.isInitialized())
                {
                    std::cerr << "Warning: ASQG does not have transitive tag\n";
                    pGraph->setTransitiveFlag(true);
                }
                else
                {
                    pGraph->setTransitiveFlag(transitiveTag.get());
                }
                break;
            }
            case ASQG::RT_VERTEX:
            {
                // progress the stage if we are done the header
                if(stage == 0)
                    stage = 1;

                if(stage != 1)
                {
                    std::cerr << "Error: Unexpected vertex record found at line " << line << "\n";
                    exit(EXIT_FAILURE);
                }

                ASQG::VertexRecord vertexRecord(recordLine);
                const SQG::IntTag& ssTag = vertexRecord.getSubstringTag();

                Vertex* pVertex = new(pGraph->getVertexAllocator()) Vertex(vertexRecord.getID(), vertexRecord.getSeq());
                if(ssTag.isInitialized() && ssTag.get() == 1)
                {
                    // Vertex is a substring of some other vertex, mark it as contained
                    pVertex->setContained(true);
                    pGraph->setContainmentFlag(true);
                }
                pGraph->addVertex(pVertex);
                break;
            }
            case ASQG::RT_EDGE:
            {
                // Edges may only follow at least one vertex record: the stage
                // advances from 1 to 2 here, never directly from 0.
                if(stage == 1)
                    stage = 2;

                if(stage != 2)
                {
                    std::cerr << "Error: Unexpected edge record found at line " << line << "\n";
                    exit(EXIT_FAILURE);
                }

                ASQG::EdgeRecord edgeRecord(recordLine);
                const Overlap& ovr = edgeRecord.getOverlap();

                // Add the edge to the graph
                if(ovr.match.getMinOverlapLength() >= static_cast<int>(minOverlap))
                    SGAlgorithms::createEdgesFromOverlap(pGraph, ovr, allowContainments);
                break;
            }
        }
    }

    // Remove any duplicate edges
    SGDuplicateVisitor dupVisit;
    pGraph->visit(dupVisit);

    SGGraphStatsVisitor statsVisit;
    pGraph->visit(statsVisit);

    // Remove identical vertices
    // This is much cheaper to do than remove via
    // SGContainRemove as no remodelling needs to occur
    /*
    SGIdenticalRemoveVisitor irv;
    pGraph->visit(irv);

    // Remove substring vertices
    while(pGraph->hasContainment())
    {
        SGContainRemoveVisitor crv;
        pGraph->visit(crv);
    }
    */

    delete pReader;
    return pGraph;
}
void subgraph() { StringGraph* pGraph = SGUtil::loadASQG(opt::asqgFile, 0, true, opt::maxEdges); pGraph->printMemSize(); // Remove containments from the graph SGContainRemoveVisitor containVisit; std::cout << "Removing contained vertices\n"; while(pGraph->hasContainment()) { pGraph->visit(containVisit); } if (opt::maxOverlap) { SGMaximalOverlapVisitor moVisit(-1); // Remove non-maximal overlap edges std::cout << "Removing non-maximal overlap edges from graph\n"; pGraph->visit(moVisit); } StringGraph* pSubgraph = new StringGraph; // Set the graph parameters to match the main graph pSubgraph->setContainmentFlag(pGraph->hasContainment()); pSubgraph->setTransitiveFlag(pGraph->hasTransitive()); pSubgraph->setMinOverlap(pGraph->getMinOverlap()); pSubgraph->setErrorRate(pGraph->getErrorRate()); // Get the root vertex Vertex* pRootVertex = pGraph->getVertex(opt::rootID); if(pRootVertex == NULL) { std::cout << "Vertex " << opt::rootID << " not found in the graph.\n"; } else { copyVertexToSubgraph(pSubgraph, pRootVertex); pRootVertex->setColor(GC_BLACK); // Recursively add neighbors addNeighborsToSubgraph(pRootVertex, pSubgraph, opt::span); // Write the subgraph pSubgraph->writeASQG(opt::outFile); } delete pSubgraph; delete pGraph; }
int assemble() { StringGraph* pGraph; #pragma omp parallel { #pragma omp single nowait { std::cout << "\n[ Loading string graph: " << opt::asqgFile << " ]\n"; pGraph=SGUtil::loadASQGVertex(opt::asqgFile, opt::minOverlap, true, opt::maxEdges); } #pragma omp single nowait { std::cout << "[ Loading BWT ]\n"; opt::pBWT = new BWT(opt::prefix + BWT_EXT, BWT::DEFAULT_SAMPLE_RATE_SMALL); } #pragma omp single nowait { std::cout << "[ Loading RBWT ]\n"; opt::pRBWT = new BWT(opt::prefix + RBWT_EXT, BWT::DEFAULT_SAMPLE_RATE_SMALL); } #pragma omp single nowait { std::cout << "[ Loading SAI ]\n"; opt::pSSA = new SampledSuffixArray(opt::prefix + SAI_EXT, SSA_FT_SAI); } } opt::indices.pBWT = opt::pBWT; opt::indices.pRBWT = opt::pRBWT; opt::indices.pSSA = opt::pSSA; pGraph=SGUtil::loadASQGEdge(opt::asqgFile, opt::minOverlap, true, opt::maxEdges, pGraph); // if(opt::bExact) pGraph->setExactMode(opt::bExact); //pGraph->printMemSize(); // // Pre-assembly graph stats SGGraphStatsVisitor statsVisit; std::cout << "[Stats] Input graph:\n"; pGraph->visitP(statsVisit); int phase = 0 ; // Remove containments from the graph std::cout << "Removing contained vertices from graph\n"; SGContainRemoveVisitor containVisit; while(pGraph->hasContainment()) pGraph->visit(containVisit); /*---Remove Transitive Edges---*/ std::cout << "Removing transitive edges\n"; SGTransitiveReductionVisitor trVisit; pGraph->visit(trVisit); /*---Remove Transitive Edges---*/ // getchar(); // Compact together unbranched chains of vertices std::cout << "Start to simplify unipaths ...\n"; pGraph->simplify(); std::cout << "[Stats] Simplified graph:\n"; pGraph->visitP(statsVisit); /**********************Compute overlap raio and diff (for debug)************** std::ofstream ssol ("simpleOverlapLength.histo", std::ofstream::out); std::map<size_t,int> simpleStats = pGraph->getCountMap() ; //Stats simple overlap length std::cout << "< Compute Stats of Overlap Length and Ratios>" << std::endl; //ssol << "Length\tCount" << 
std::endl; for (std::map<size_t,int> ::iterator iter=simpleStats.begin(); iter!=simpleStats.end(); ++iter) ssol << iter->first << "\t" << iter->second << std::endl; ssol.close(); std::ofstream orStats ("overlapRaito.stats", std::ofstream::out); std::vector < overlapRatioInfo > overlapRatioStats = pGraph->getOverlapRatioStats(); for (std::vector < overlapRatioInfo >::iterator iter=overlapRatioStats.begin(); iter!=overlapRatioStats.end(); ++iter) orStats << (*iter).overlapLength << "\t" << (*iter).originalLength << "\t" << std::setiosflags(std::ios::fixed) << std::setprecision(3) << (double)(*iter).overlapLength/(*iter).originalLength << "\t" << (*iter).ratioDiff << std::endl; orStats.close(); **********************Compute overlap raio and diff***************************/ // /*** Remove Illegal Kmer Edges ***/ SGRemoveIllegalKmerEdgeVisitor ikeVisit(opt::pBWT,opt::kmerLength,opt::kmerThreshold, opt::credibleOverlapLength); std::cout << "\n[ Remove Illegal Kmer Edges due to kmerization ]" << std::endl; pGraph->visitP(ikeVisit); pGraph->simplify(); /*** trim dead end vertices from small to large***/ size_t trimLen = opt::kmerLength+1 , stepsize=opt::insertSize/5; while (trimLen < opt::insertSize *3/2) { SGTrimVisitor shortTrimVisit("",trimLen); std::cout << "[ Trimming short vertices (<" << trimLen << ") ]\n"; if(pGraph->visitP(shortTrimVisit)) pGraph->simplify(); trimLen=trimLen+stepsize; } /*** Pop Bubbles ***/ std::cout << "\n[ Remove bubbles and tips ]\n"; graphTrimAndSmooth (pGraph, opt::maxChimeraLength); // outputGraphAndFasta(pGraph,"popBubbles",++phase); /*** Remove small chimeric vertices ***/ std::cout << "\n[ Remove small chimera vertices ]\n"; for (size_t threshold=2; threshold<=opt::kmerThreshold; threshold++) RemoveVertexWithBothShortEdges (pGraph, opt::readLength, opt::credibleOverlapLength, opt::pBWT, opt::kmerLength, threshold); //remove edges with overlap <= 1st overlap length peak (from unmerged original reads) RemoveVertexWithBothShortEdges 
(pGraph, opt::readLength, pGraph->getMinOverlap()); RemoveVertexWithBothShortEdges (pGraph, opt::readLength, opt::credibleOverlapLength); RemoveVertexWithBothShortEdges (pGraph, opt::insertSize, opt::credibleOverlapLength); RemoveVertexWithBothShortEdges (pGraph, opt::maxChimeraLength, opt::credibleOverlapLength); pGraph->contigStats(); pGraph->visit(statsVisit); outputGraphAndFasta(pGraph,"",++phase); /*** remove edges from 1st peak to 2nd peak in the overlap length distribution 1st peak: opt::credibleOverlapLength , 2nd peak: opt::insertSize*opt::minOverlapRatio Define large vertex as opt::maxChimeraLength*4 be very careful and little by little for avoiding misassembly ***/ std::cout << "\n[ Remove chimeric edges with insufficient overlap or large-overlap difference from large vertices]\n"; size_t stepsize2 = (opt::insertSize*opt::minOverlapRatio-opt::credibleOverlapLength)/4; //iterate 4 times for (size_t len = opt::credibleOverlapLength ; len <= opt::insertSize*opt::minOverlapRatio ; len+=stepsize2 ) { SGRemoveByOverlapLenDiffVisitor srv1(1600, len, (opt::insertSize*opt::minOverlapRatio)+opt::credibleOverlapLength-len); if(pGraph->visitP(srv1)) graphTrimAndSmooth(pGraph, opt::maxChimeraLength); } // fix min overlap length= insertsize*opt::minOverlapRatio, remove more edges by overlap diff from large to small diff // iterate 2 times for (size_t stepsize3 = opt::credibleOverlapLength/4; stepsize3 <= opt::credibleOverlapLength/2 ; stepsize3+=stepsize3 ) { SGRemoveByOverlapLenDiffVisitor srv1(1600, /*opt::insertSize*opt::minOverlapRatio*/ 0, opt::credibleOverlapLength-stepsize3); if(pGraph->visitP(srv1)) graphTrimAndSmooth(pGraph, opt::maxChimeraLength); } // SGRemoveByOverlapLenDiffVisitor srv1(opt::maxChimeraLength*3, /*opt::insertSize*opt::minOverlapRatio*/ 0, opt::credibleOverlapLength/3); // if(pGraph->visitP(srv1)) // graphTrimAndSmooth(pGraph, opt::maxChimeraLength); RemoveVertexWithBothShortEdges (pGraph, opt::readLength+100, opt::readLength*0.9); 
pGraph->contigStats(); pGraph->visit(statsVisit); outputGraphAndFasta(pGraph,"",++phase); /*** Remove edges according to PE links, which remove small repeat vertices most of the time ***/ for (size_t minPELink = 1; minPELink <= 1; minPELink++ ) { // 51 is more accurate while 31 produces longer contigs SGRemoveEdgeByPEVisitor REPEVistit(opt::indices, opt::insertSize, 51, minPELink); if(pGraph->visitP(REPEVistit)) graphTrimAndSmooth(pGraph, opt::maxChimeraLength); } pGraph->contigStats(); pGraph->visit(statsVisit); outputGraphAndFasta(pGraph,"",++phase); /** Incrementally remove chimera edges from small to large chimeric vertices using overlap ratio Be very careful and little by little for avoiding misassembly This visitor resolves chimeric with size roughly equal to read length+100 (first peak) Resolve larger vertices lead to misassembly ***/ std::cout << "\n[ Remove chimeric edges from small vertices using overlap ratios ]\n"; size_t stepsize4 = 15; for (size_t len = opt::readLength ; len <= opt::readLength+100 ; len+=stepsize4 ) RemoveSmallOverlapRatioEdges ( pGraph, len); // Rename requires extra memory which should // only be done after the string graph has been greatly simplified. pGraph->renameVertices(""); /******* Re-join broken islands/tips due to high-GC errors ********/ size_t min_size_of_islandtip=opt::maxChimeraLength; /***************** 1. Trim bad ends of island/tip *****************/ SGFastaErosionVisitor eFAVisit (opt::pBWT, opt::kmerLength, opt::kmerThreshold, min_size_of_islandtip); pGraph->visitP(eFAVisit); pGraph->contigStats(); pGraph->visitP(statsVisit); outputGraphAndFasta(pGraph,"", ++phase); /*** 2. Collect read IDs mapped to large island/tip with size > min_size_of_islandtip ***/ ThreadSafeListVector tslv; tslv.resize(opt::pSSA->getNumberOfReads()); SGIslandCollectVisitor sgicv(&tslv, opt::indices, opt::insertSize, 51, min_size_of_islandtip); pGraph->visitP(sgicv); /*** 3. 
Join islands/tips with PE support using FM-index walk (depth,leaves,minoverlap)=(150, 2000, 19) ***/ SGJoinIslandVisitor sgjiv(100, 4000, opt::kmerLength/2+4, min_size_of_islandtip, &tslv, opt::indices, 3); pGraph->visitProgress(sgjiv); graphTrimAndSmooth (pGraph, opt::maxChimeraLength); std::cout << "\n[Stats] Final graph statistic:\n"; pGraph->contigStats(); pGraph->visitP(statsVisit); // Write the results std::cout << "\n<Printing the contig file> : " << opt::outContigsFile << " \n" << std::endl; SGFastaVisitor av(opt::outContigsFile); pGraph->visit(av); //std::cout << "<Printing the ASQG and dot files>\n" << std::endl; pGraph->writeASQG(opt::outGraphFile); pGraph->writeDot("StriDe-graph.dot",0); delete pGraph; return 0; }
void assemble() { Timer t("sga assemble"); StringGraph* pGraph = SGUtil::loadASQG(opt::asqgFile, opt::minOverlap, true); if(opt::bExact) pGraph->setExactMode(true); pGraph->printMemSize(); // Visitor functors SGTransitiveReductionVisitor trVisit; SGGraphStatsVisitor statsVisit; SGRemodelVisitor remodelVisit; SGEdgeStatsVisitor edgeStatsVisit; SGTrimVisitor trimVisit(opt::trimLengthThreshold); SGBubbleVisitor bubbleVisit; SGBubbleEdgeVisitor bubbleEdgeVisit; SGContainRemoveVisitor containVisit; SGValidateStructureVisitor validationVisit; // Pre-assembly graph stats std::cout << "[Stats] Input graph:\n"; pGraph->visit(statsVisit); // Remove containments from the graph std::cout << "Removing contained vertices from graph\n"; while(pGraph->hasContainment()) pGraph->visit(containVisit); // Pre-assembly graph stats std::cout << "[Stats] After removing contained vertices:\n"; pGraph->visit(statsVisit); // Remove any extraneous transitive edges that may remain in the graph if(opt::bPerformTR) { std::cout << "Removing transitive edges\n"; pGraph->visit(trVisit); } // Compact together unbranched chains of vertices pGraph->simplify(); if(opt::bValidate) { std::cout << "Validating graph structure\n"; pGraph->visit(validationVisit); } // Remove dead-end branches from the graph if(opt::numTrimRounds > 0) { std::cout << "Trimming bad vertices\n"; int numTrims = opt::numTrimRounds; while(numTrims-- > 0) pGraph->visit(trimVisit); std::cout << "\n[Stats] Graph after trimming:\n"; pGraph->visit(statsVisit); } // Resolve small repeats if(opt::resolveSmallRepeatLen > 0) { SGSmallRepeatResolveVisitor smallRepeatVisit(opt::resolveSmallRepeatLen); std::cout << "Resolving small repeats\n"; int totalSmallRepeatRounds = 0; while(pGraph->visit(smallRepeatVisit)) std::cout << "Finished small repeat resolve round " << totalSmallRepeatRounds++ << "\n"; std::cout << "\n[Stats] After small repeat resolution:\n"; pGraph->visit(statsVisit); } // if(opt::coverageCutoff > 0) { std::cout << "Coverage 
visit\n"; SGCoverageVisitor coverageVisit(opt::coverageCutoff); pGraph->visit(coverageVisit); pGraph->visit(trimVisit); pGraph->visit(trimVisit); pGraph->visit(trimVisit); } // Peform another round of simplification pGraph->simplify(); if(opt::numBubbleRounds > 0) { std::cout << "\nPerforming variation smoothing\n"; SGSmoothingVisitor smoothingVisit(opt::outVariantsFile, opt::maxBubbleGapDivergence, opt::maxBubbleDivergence, opt::maxIndelLength); int numSmooth = opt::numBubbleRounds; while(numSmooth-- > 0) pGraph->visit(smoothingVisit); pGraph->simplify(); } pGraph->renameVertices("contig-"); std::cout << "\n[Stats] Final graph:\n"; pGraph->visit(statsVisit); // Rename the vertices to have contig IDs instead of read IDs //pGraph->renameVertices("contig-"); // Write the results SGFastaVisitor av(opt::outContigsFile); pGraph->visit(av); pGraph->writeASQG(opt::outGraphFile); delete pGraph; }
// Run the bubble construction process HaplotypeBuilderReturnCode DeBruijnHaplotypeBuilder::run(StringVector& out_haplotypes) { PROFILE_FUNC("GraphCompare::buildVariantStringGraph") assert(!m_startingKmer.empty()); std::map<std::string, int> kmerCountMap; // We search until we find the first common vertex in each direction size_t MIN_TARGET_COUNT = m_parameters.bReferenceMode ? 1 : 2; size_t MAX_ITERATIONS = 2000; size_t MAX_SIMULTANEOUS_BRANCHES = 40; size_t MAX_TOTAL_BRANCHES = 50; // Tracking stats size_t max_simul_branches_used = 0; size_t total_branches = 0; size_t iterations = 0; // Initialize the graph StringGraph* pGraph = new StringGraph; BuilderExtensionQueue queue; Vertex* pVertex = new(pGraph->getVertexAllocator()) Vertex(m_startingKmer, m_startingKmer); pVertex->setColor(GC_BLACK); pGraph->addVertex(pVertex); // Add the vertex to the extension queue queue.push(BuilderExtensionNode(pVertex, ED_SENSE)); queue.push(BuilderExtensionNode(pVertex, ED_ANTISENSE)); std::vector<Vertex*> sense_join_vector; std::vector<Vertex*> antisense_join_vector; // Perform the extension. 
The while conditions are heuristics to avoid searching // the graph too much while(!queue.empty() && iterations++ < MAX_ITERATIONS && queue.size() < MAX_SIMULTANEOUS_BRANCHES && total_branches < MAX_TOTAL_BRANCHES) { if(queue.size() > max_simul_branches_used) max_simul_branches_used = queue.size(); BuilderExtensionNode curr = queue.front(); queue.pop(); // Calculate de Bruijn extensions for this node std::string vertStr = curr.pVertex->getSeq().toString(); AlphaCount64 extensionCounts = BWTAlgorithms::calculateDeBruijnExtensionsSingleIndex(vertStr, m_parameters.variantIndex.pBWT, curr.direction); std::string extensionsUsed; for(size_t i = 0; i < DNA_ALPHABET::size; ++i) { char b = DNA_ALPHABET::getBase(i); size_t count = extensionCounts.get(b); bool acceptExt = count >= m_parameters.minDBGCount; if(!acceptExt) continue; extensionsUsed.push_back(b); std::string newStr = VariationBuilderCommon::makeDeBruijnVertex(vertStr, b, curr.direction); kmerCountMap[newStr] = count; // Create the new vertex and edge in the graph // Skip if the vertex already exists if(pGraph->getVertex(newStr) != NULL) continue; // Allocate the new vertex and add it to the graph Vertex* pVertex = new(pGraph->getVertexAllocator()) Vertex(newStr, newStr); pVertex->setColor(GC_BLACK); pGraph->addVertex(pVertex); // Add edges VariationBuilderCommon::addSameStrandDeBruijnEdges(pGraph, curr.pVertex, pVertex, curr.direction); // Check if this sequence is present in the FM-index of the target // If so, it is the join point of the de Bruijn graph and we extend no further. 
size_t targetCount = BWTAlgorithms::countSequenceOccurrences(newStr, m_parameters.baseIndex); if(targetCount >= MIN_TARGET_COUNT) { if(curr.direction == ED_SENSE) sense_join_vector.push_back(pVertex); else antisense_join_vector.push_back(pVertex); } else { // Add the vertex to the extension queue queue.push(BuilderExtensionNode(pVertex, curr.direction)); } } // Update the total number of times we branches the search if(!extensionsUsed.empty()) total_branches += extensionsUsed.size() - 1; } // If the graph construction was successful, walk the graph // between the endpoints to make a string // Generate haplotypes between every pair of antisense/sense join vertices for(size_t i = 0; i < antisense_join_vector.size(); ++i) { for(size_t j = 0; j < sense_join_vector.size(); ++j) { SGWalkVector outWalks; SGSearch::findWalks(antisense_join_vector[i], sense_join_vector[j], ED_SENSE, 100000, // max distance to search 10000, // max nodes to search true, // exhaustive search outWalks); for(size_t k = 0; k < outWalks.size(); ++k) out_haplotypes.push_back(outWalks[k].getString(SGWT_START_TO_END)); } } delete pGraph; return HBRC_OK; }