int main (int argc, char* argv[]) { /** We create a command line parser. */ OptionsParser parser; parser.push_back (new OptionOneParam (STR_URI_INPUT, "graph file", true)); IProperties* params = 0; try { /** We parse the user options. */ params = parser.parse (argc, argv); } catch (OptionFailure& e) { e.getParser().displayErrors (stdout); e.getParser().displayHelp (stdout); return EXIT_FAILURE; } // We create the graph with the bank and other options Graph graph = Graph::load (params->getStr(STR_URI_INPUT)); // We create a graph marker. GraphMarker<BranchingNode> marker (graph); // We create an object for Breadth First Search for the de Bruijn graph. BFS<BranchingNode> bfs (graph); // We want to compute the distribution of connected components of the branching nodes. // - key is a connected component class (for a given number of branching nodes for this component) // - value is the number of times this component class occurs in the branching sub graph map<size_t,Entry> distrib; // We get an iterator for all nodes of the graph. We use a progress iterator to get some progress feedback ProgressGraphIterator<BranchingNode,ProgressTimer> itBranching (graph.iterator<BranchingNode>(), "statistics"); // We want to know the number of connected components size_t nbConnectedComponents = 0; // We define some kind of unique identifier for a couple (indegree,outdegree) map <InOut_t, size_t> topology; size_t simplePathSizeMin = ~0; size_t simplePathSizeMax = 0; // We want time duration of the iteration TimeInfo ti; ti.start ("compute"); // We loop the branching nodes for (itBranching.first(); !itBranching.isDone(); itBranching.next()) { // We get branching nodes neighbors for the current branching node. Graph::Vector<BranchingEdge> successors = graph.successors <BranchingEdge> (*itBranching); Graph::Vector<BranchingEdge> predecessors = graph.predecessors<BranchingEdge> (*itBranching); // We increase the occurrences number for the current couple (in/out) neighbors topology [make_pair(predecessors.size(), successors.size())] ++; // We loop the in/out neighbors and update min/max simple path size for (size_t i=0; i<successors.size(); i++) { simplePathSizeMax = std::max (simplePathSizeMax, successors[i].distance); simplePathSizeMin = std::min (simplePathSizeMin, successors[i].distance); } for (size_t i=0; i<predecessors.size(); i++) { simplePathSizeMax = std::max (simplePathSizeMax, predecessors[i].distance); simplePathSizeMin = std::min (simplePathSizeMin, predecessors[i].distance); } // We skip already visited nodes. if (marker.isMarked (*itBranching)) { continue; } // We launch the breadth first search; we get as a result the set of branching nodes in this component const set<BranchingNode>& component = bfs.run (*itBranching); // We mark the nodes for this connected component marker.mark (component); // We update our distribution distrib[component.size()].nbOccurs += 1; // We update the number of connected components. nbConnectedComponents++; } ti.stop ("compute"); // We compute the total number of branching nodes in all connected components. size_t sumOccurs = 0; size_t sumKmers = 0; for (map<size_t,Entry>::iterator it = distrib.begin(); it != distrib.end(); it++) { sumOccurs += it->first*it->second.nbOccurs; sumKmers += it->second.nbKmers; } // We sort the statistics by decreasing occurrence numbers. Since map have its own ordering, we need to put all // the data into a vector and sort it with our own sorting criteria. vector < pair<InOut_t,size_t> > stats; for (map <InOut_t, size_t>::iterator it = topology.begin(); it != topology.end(); it++) { stats.push_back (*it); } sort (stats.begin(), stats.end(), CompareFct); // Note: it must be equal to the number of branching nodes of the graph assert (sumOccurs == itBranching.size()); // We aggregate the computed information Properties props ("topology"); props.add (1, "graph"); props.add (2, "name", "%s", graph.getName().c_str()); props.add (2, "db_input", "%s", graph.getInfo().getStr("input").c_str()); props.add (2, "db_nb_seq", "%d", graph.getInfo().getInt("sequences_number")); props.add (2, "db_size", "%d", graph.getInfo().getInt("sequences_size")); props.add (2, "kmer_size", "%d", graph.getInfo().getInt("kmer_size")); props.add (2, "kmer_nks", "%d", graph.getInfo().getInt("nks")); props.add (2, "nb_nodes", "%d", graph.getInfo().getInt("kmers_nb_solid")); props.add (2, "nb_branching_nodes", "%d", graph.getInfo().getInt("nb_branching")); props.add (2, "percent_branching_nodes", "%.1f", graph.getInfo().getInt("kmers_nb_solid") > 0 ? 100.0 * (float)graph.getInfo().getInt("nb_branching") / (float) graph.getInfo().getInt("kmers_nb_solid") : 0 ); props.add (1, "branching_nodes"); props.add (2, "simple_path"); props.add (3, "size_min", "%d", simplePathSizeMin); props.add (3, "size_max", "%d", simplePathSizeMax); props.add (2, "neighborhoods"); for (size_t i=0; i<stats.size(); i++) { props.add (3, "neighborhood", "in=%d out=%d", stats[i].first.first, stats[i].first.second); props.add (4, "nb_bnodes", "%d", stats[i].second); props.add (4, "percentage", "%5.2f", itBranching.size() > 0 ? 100.0*(float)stats[i].second / (float)itBranching.size() : 0 ); } props.add (2, "connected_components"); props.add (3, "nb_classes", "%d", distrib.size()); props.add (3, "nb_components", "%d", nbConnectedComponents); for (map<size_t,Entry>::iterator it = distrib.begin(); it!=distrib.end(); it++) { props.add (3, "component_class"); props.add (4, "nb_occurs", "%d", it->second.nbOccurs); props.add (4, "nb_bnodes", "%d", it->first); props.add (4, "freq_bnodes", "%f", sumOccurs > 0 ? 100.0*(float)(it->first*it->second.nbOccurs) / (float)sumOccurs : 0 ); } props.add (1, ti.getProperties("time")); // We dump the results in a XML file in the current directory XmlDumpPropertiesVisitor v (graph.getName() + ".xml", false); props.accept (&v); return EXIT_SUCCESS; }
int main (int argc, char* argv[]) { /** We create a command line parser. */ OptionsParser parser ("GraphStats"); parser.push_back (new OptionOneParam (STR_URI_GRAPH, "graph input", true)); try { /** We parse the user options. */ IProperties* options = parser.parse (argc, argv); // We load the graph Graph graph = Graph::load (options->getStr(STR_URI_GRAPH)); // We create a graph marker. GraphMarker marker (graph); // We create an object for Breadth First Search for the de Bruijn graph. BFS bfs (graph); // We want to compute the distribution of connected components of the branching nodes. // - key is a connected component class (for a given number of branching nodes for this component) // - value is the number of times this component class occurs in the branching sub graph map<size_t,size_t> distrib; // We get an iterator for all nodes of the graph. We use a progress iterator to get some progress feedback ProgressGraphIterator<BranchingNode,ProgressTimer> itBranching (graph.iteratorBranching(), "statistics"); // We want time duration of the iteration TimeInfo ti; ti.start ("compute"); // We need to keep each connected component. list<set<BranchingNode> > components; // We loop the branching nodes for (itBranching.first(); !itBranching.isDone(); itBranching.next()) { // We skip already visited nodes. if (marker.isMarked (*itBranching)) { continue; } // We launch the breadth first search; we get as a result the set of branching nodes in this component const set<BranchingNode>& component = bfs.run (*itBranching); // We memorize the component components.push_back (component); // We mark the nodes for this connected component marker.mark (component); // We update our distribution distrib[component.size()] ++; } ti.stop ("compute"); // We compute the total number of branching nodes in all connected components. size_t sum = 0; for (map<size_t,size_t>::iterator it = distrib.begin(); it != distrib.end(); it++) { sum += it->first*it->second; } // Note: it must be equal to the number of branching nodes of the graph assert (sum == itBranching.size()); size_t idx1=0; size_t cc=0; // We check that each component has no intersection with all other components. // Note: this check may take a long time since we have N^2 intersections to compute. for (list<set<BranchingNode> >::iterator it1 = components.begin(); it1 != components.end(); it1++, idx1++) { size_t idx2=0; for (list<set<BranchingNode> >::iterator it2 = components.begin(); it2 != components.end(); it2++, idx2++) { if (it1 != it2) { set<BranchingNode> inter; set_intersection (it1->begin(),it1->end(),it2->begin(),it2->end(), std::inserter(inter,inter.begin())); if (inter.size()!=0) { printf ("ERROR, intersection should be empty...\n"); exit(EXIT_FAILURE); } } if (++cc % 50 == 0) { cc = 0; printf ("[check] %.1f %.1f\r", 100.0*(float)idx1/(float)components.size(), 100.0*(float)idx2/(float)components.size()); fflush (stdout); } } } printf ("\n"); // We aggregate the computed information Properties props ("connected_components"); props.add (1, "graph_name", "%s", graph.getName().c_str()); props.add (1, "nb_branching_nodes", "%d", sum); props.add (1, "nb_connected_components", "%d", distrib.size()); for (map<size_t,size_t>::iterator it = distrib.begin(); it!=distrib.end(); it++) { props.add (2, "component"); props.add (3, "nb_nodes", "%d", it->first); props.add (3, "nb_occurs", "%d", it->second); props.add (3, "freq_nodes", "%f", 100.0*(float)(it->first*it->second) / (float)sum); props.add (3, "freq_occurs", "%f", 100.0*(float)it->second / (float)sum); } props.add (1, ti.getProperties("time")); // We dump the results in a XML file in the current directory XmlDumpPropertiesVisitor v (graph.getName() + ".xml", false); props.accept (&v); } catch (OptionFailure& e) { return e.displayErrors (std::cout); } catch (Exception& e) { std::cerr << "EXCEPTION: " << e.getMessage() << std::endl; } return EXIT_SUCCESS; }