Exemplo n.º 1
0
/**
 * Example: list the branching neighbors of a node.
 *
 * Builds a graph from three hard-coded sequences (identical except for their 8th
 * nucleotide), builds a node from the first sequence and prints every branching edge
 * returned by Graph::successors<BranchingEdge> for that node.
 *
 * @param argc unused
 * @param argv unused
 * @return EXIT_SUCCESS on normal completion, EXIT_FAILURE if an Exception was caught.
 */
int main(int argc, char* argv[])
{
    const size_t kmerSize = 7;

    const char* sequences[] = {
        "AGGCGCTAGGGAGAGGATGATGAAA",
        "AGGCGCTCGGGAGAGGATGATGAAA",
        "AGGCGCTTGGGAGAGGATGATGAAA"
    };

    try
    {
        // Create the graph.
        // The k-mer size goes through a printf-style variadic call, so the argument type
        // must match the "%d" conversion exactly: pass an int, not a size_t.
        Graph graph = Graph::create (
            new BankStrings (sequences, ARRAY_SIZE(sequences)),
            "-kmer-size %d -abundance-min 1 -verbose 0", static_cast<int>(kmerSize)
        );

        // Take the first node, which should be a branching node
        Node node = graph.buildNode (sequences[0]);

        // Retrieve the branching neighbors for the node (as BRANCHING EDGES!)
        Graph::Vector<BranchingEdge> branchingNeighbors =
            graph.successors<BranchingEdge> (node);

        std::cout
            << "We found "
            << branchingNeighbors.size()
            << " branching neighbors from node "
            << graph.toString(node)
            << std::endl;

        // Iterate over the branching neighbors to print them
        for (size_t i = 0; i < branchingNeighbors.size(); i++)
        {
            // Note: we don't display all the transition nucleotides, only the
            // first transition nucleotide. We also display the number of
            // transitions needed to link the two branching nodes.
            std::cout << graph.toString (branchingNeighbors[i]) << std::endl;
        }
    }
    catch (Exception& e)
    {
        std::cerr << "EXCEPTION: " << e.getMessage() << std::endl;

        // A failure must be visible to the caller through the exit status.
        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}
Exemplo n.º 2
0
/**
 * Example: topology statistics on the branching nodes of a graph.
 *
 * Loads the graph whose URI is given on the command line, iterates over its branching
 * nodes and gathers:
 *   - the number of branching nodes per (in, out) branching-neighbor count,
 *   - the min/max 'distance' found on the branching edges,
 *   - the distribution of connected component sizes (computed with a BFS).
 * The result is dumped as XML into "<graph name>.xml".
 *
 * @param argc number of command line arguments
 * @param argv command line arguments, forwarded to the options parser
 * @return EXIT_SUCCESS, or EXIT_FAILURE when the command line could not be parsed.
 */
int main (int argc, char* argv[])
{
    /** We create a command line parser. */
    OptionsParser parser;
    parser.push_back (new OptionOneParam (STR_URI_INPUT,  "graph file", true));

    IProperties* params = 0;

    try  {
        /** We parse the user options. */
        params = parser.parse (argc, argv);
    }
    catch (OptionFailure& e)
    {
        e.getParser().displayErrors (stdout);
        e.getParser().displayHelp   (stdout);
        return EXIT_FAILURE;
    }

    // We load the graph from the URI given by the user
    Graph graph = Graph::load (params->getStr(STR_URI_INPUT));

    // We create a graph marker.
    GraphMarker<BranchingNode> marker (graph);

    // We create an object for Breadth First Search for the de Bruijn graph.
    BFS<BranchingNode> bfs (graph);

    // We want to compute the distribution of connected components of the branching nodes.
    //    - key is a connected component class (for a given number of branching nodes for this component)
    //    - value is the number of times this component class occurs in the branching sub graph
    map<size_t,Entry> distrib;

    // We get an iterator for all branching nodes of the graph. We use a progress iterator to get some progress feedback
    ProgressGraphIterator<BranchingNode,ProgressTimer>  itBranching (graph.iterator<BranchingNode>(), "statistics");

    // We want to know the number of connected components
    size_t nbConnectedComponents = 0;

    // We count the branching nodes for each couple (indegree,outdegree)
    map <InOut_t, size_t> topology;

    // Min starts at the largest size_t value so that any real distance replaces it.
    size_t simplePathSizeMin = ~size_t(0);
    size_t simplePathSizeMax = 0;

    // We want time duration of the iteration
    TimeInfo ti;
    ti.start ("compute");

    // We loop the branching nodes
    for (itBranching.first(); !itBranching.isDone(); itBranching.next())
    {
        // We get branching nodes neighbors for the current branching node.
        Graph::Vector<BranchingEdge> successors   = graph.successors  <BranchingEdge> (*itBranching);
        Graph::Vector<BranchingEdge> predecessors = graph.predecessors<BranchingEdge> (*itBranching);

        // We increase the occurrences number for the current couple (in/out) neighbors
        topology [make_pair(predecessors.size(), successors.size())] ++;

        // We loop the in/out neighbors and update min/max simple path size
        for (size_t i=0; i<successors.size(); i++)
        {
            simplePathSizeMax = std::max (simplePathSizeMax, successors[i].distance);
            simplePathSizeMin = std::min (simplePathSizeMin, successors[i].distance);
        }
        for (size_t i=0; i<predecessors.size(); i++)
        {
            simplePathSizeMax = std::max (simplePathSizeMax, predecessors[i].distance);
            simplePathSizeMin = std::min (simplePathSizeMin, predecessors[i].distance);
        }

        // We skip already visited nodes.
        if (marker.isMarked (*itBranching))  {
            continue;
        }

        // We launch the breadth first search; we get as a result the set of branching nodes in this component
        const set<BranchingNode>& component = bfs.run (*itBranching);

        // We mark the nodes for this connected component
        marker.mark (component);

        // We update our distribution
        distrib[component.size()].nbOccurs += 1;

        // We update the number of connected components.
        nbConnectedComponents++;
    }

    ti.stop ("compute");

    // No branching edge seen at all: the minimum still holds its initial sentinel.
    // Report 0 rather than a meaningless huge value.
    if (simplePathSizeMin > simplePathSizeMax)  {
        simplePathSizeMin = 0;
    }

    // We compute the total number of branching nodes in all connected components.
    size_t sumOccurs = 0;
    for (map<size_t,Entry>::iterator it = distrib.begin(); it != distrib.end(); ++it)
    {
        sumOccurs += it->first*it->second.nbOccurs;
    }

    // We sort the statistics by decreasing occurrence numbers. Since map have its own ordering, we need to put all
    // the data into a vector and sort it with our own sorting criteria.
    vector < pair<InOut_t,size_t> >  stats;
    for (map <InOut_t, size_t>::iterator it = topology.begin(); it != topology.end(); ++it)  {
        stats.push_back (*it);
    }

    sort (stats.begin(), stats.end(), CompareFct);

    // Note: it must be equal to the number of branching nodes of the graph
    assert (sumOccurs == itBranching.size());

    // Integer information read once from the graph; converted to long long so that the
    // values match the "%lld" conversions used below whatever integer type getInt returns.
    const long long nbSolidKmers    = graph.getInfo().getInt("kmers_nb_solid");
    const long long nbBranchingInfo = graph.getInfo().getInt("nb_branching");

    // We aggregate the computed information.
    // Every value below goes through a printf-style variadic call, so each argument is
    // given exactly the type its conversion specifier expects (%zu <-> size_t,
    // %lld <-> long long, %f <-> double).
    Properties props ("topology");

    props.add (1, "graph");
    props.add (2, "name",                    "%s",   graph.getName().c_str());
    props.add (2, "db_input",                "%s",   graph.getInfo().getStr("input").c_str());
    props.add (2, "db_nb_seq",               "%lld", static_cast<long long>(graph.getInfo().getInt("sequences_number")));
    props.add (2, "db_size",                 "%lld", static_cast<long long>(graph.getInfo().getInt("sequences_size")));
    props.add (2, "kmer_size",               "%lld", static_cast<long long>(graph.getInfo().getInt("kmer_size")));
    props.add (2, "kmer_nks",                "%lld", static_cast<long long>(graph.getInfo().getInt("nks")));
    props.add (2, "nb_nodes",                "%lld", nbSolidKmers);
    props.add (2, "nb_branching_nodes",      "%lld", nbBranchingInfo);
    props.add (2, "percent_branching_nodes", "%.1f",
               nbSolidKmers > 0 ?
               100.0 * static_cast<double>(nbBranchingInfo) / static_cast<double>(nbSolidKmers) : 0.0
              );

    props.add (1, "branching_nodes");

    props.add (2, "simple_path");
    props.add (3, "size_min", "%zu", simplePathSizeMin);
    props.add (3, "size_max", "%zu", simplePathSizeMax);

    props.add (2, "neighborhoods");
    for (size_t i=0; i<stats.size(); i++)
    {
        props.add (3, "neighborhood", "in=%zu out=%zu",
                   static_cast<size_t>(stats[i].first.first),
                   static_cast<size_t>(stats[i].first.second)
                  );
        props.add (4, "nb_bnodes",     "%zu",   stats[i].second);
        props.add (4, "percentage",   "%5.2f", itBranching.size() > 0 ?
                   100.0 * static_cast<double>(stats[i].second) / static_cast<double>(itBranching.size()) : 0.0
                  );
    }

    props.add (2, "connected_components");
    props.add (3, "nb_classes",    "%zu", distrib.size());
    props.add (3, "nb_components", "%zu", nbConnectedComponents);
    for (map<size_t,Entry>::iterator it = distrib.begin(); it!=distrib.end(); ++it)
    {
        props.add (3, "component_class");
        props.add (4, "nb_occurs",    "%zu", static_cast<size_t>(it->second.nbOccurs));
        props.add (4, "nb_bnodes",    "%zu", it->first);
        props.add (4, "freq_bnodes",  "%f", sumOccurs > 0 ?
                   100.0 * static_cast<double>(it->first*it->second.nbOccurs) / static_cast<double>(sumOccurs) : 0.0
                  );
    }
    props.add (1, ti.getProperties("time"));

    // We dump the results in a XML file in the current directory
    XmlDumpPropertiesVisitor v (graph.getName() + ".xml", false);
    props.accept (&v);

    return EXIT_SUCCESS;
}
Exemplo n.º 3
0
/**
 * Example: multi-threaded (in, out) neighborhood statistics on branching nodes.
 *
 * Loads the graph given on the command line, dispatches the iteration over its
 * branching nodes through a Dispatcher, counts the branching nodes for each couple
 * (number of branching predecessors, number of branching successors) and prints the
 * resulting distribution sorted by decreasing count.
 *
 * @param argc number of command line arguments
 * @param argv command line arguments, forwarded to the options parser
 * @return EXIT_SUCCESS on normal completion, the value returned by
 *         OptionFailure::displayErrors when the command line is invalid,
 *         EXIT_FAILURE if an Exception was caught.
 */
int main (int argc, char* argv[])
{
    /** We create a command line parser. */
    OptionsParser parser ("GraphStats");
    parser.push_back (new OptionOneParam (STR_URI_GRAPH, "graph input",  true));
    parser.push_back (new OptionOneParam (STR_NB_CORES,  "nb cores",     false, "0"));

    try
    {
        /** We parse the user options. */
        IProperties* options = parser.parse (argc, argv);

        // We load the graph
        Graph graph = Graph::load (options->getStr(STR_URI_GRAPH));

        // We set the number of cores to be used. Use all available cores if set to 0.
        const size_t nbCores = options->getInt(STR_NB_CORES);

        // We get an iterator for branching nodes of the graph.
        // We use a progress iterator to get some progress feedback
        ProgressGraphIterator<BranchingNode,ProgressTimer>  itBranching (graph.iterator<BranchingNode>(), "statistics");

        // We define some kind of unique identifier for a couple (indegree,outdegree)
        using InOut_t = pair<size_t,size_t>;

        // We want to gather some statistics during the iteration.
        // Note the use of ThreadObject: this object will be cloned N times (one object per thread) and each clone will
        // be reachable within the iteration block through ThreadObject::operator()
        ThreadObject <map <InOut_t, size_t> > topology;

        // We dispatch the iteration on several cores. Note the usage of lambda expression here.
        IDispatcher::Status status = Dispatcher(nbCores).iterate (itBranching, [&] (const BranchingNode& node)
        {
            // We retrieve the current instance of map <InOut_t,size_t> for the current running thread.
            map <InOut_t,size_t>& localTopology = topology();

            // We get branching nodes neighbors for the current branching node.
            Graph::Vector<BranchingEdge> successors   = graph.successors  <BranchingEdge> (node);
            Graph::Vector<BranchingEdge> predecessors = graph.predecessors<BranchingEdge> (node);

            // We increase the occurrences number for the current couple (in/out) neighbors
            localTopology [make_pair(predecessors.size(), successors.size())] ++;
        });

        // Now, the parallel processing is done. We want now to aggregate the information retrieved
        // in each thread in a single map.

        // We get each map<InOut_t,size_t> object filled in each thread, and we add its data into the "global" map.
        // The global map is reachable through the ThreadObject::operator*. The "topology.foreach" will loop over
        // all cloned object used in the threads.
        topology.foreach ([&] (const map <InOut_t, size_t>& t)
        {
            // We update the occurrence of the current couple (in/out).
            // 'const auto&' binds directly to the map's pair<const InOut_t,size_t> elements (no temporary copy).
            for (const auto& entry : t)  { (*topology)[entry.first] += entry.second; }
        });

        // We sort the statistics by decreasing occurrence numbers. Since map have its own ordering, we need to put all
        // the data into a vector and sort it with our own sorting criteria.
        vector < pair<InOut_t,size_t> >  stats (topology->begin(), topology->end());
        sort (stats.begin(), stats.end(), [] (const pair<InOut_t,size_t>& a, const pair<InOut_t,size_t>& b)
        {
            return a.second > b.second;
        });

        // printf is variadic: every argument must have exactly the type announced by its
        // conversion specifier, hence the explicit types/casts below.
        const unsigned long long nbBranchingNodes = itBranching.size();

        printf ("\nThere are %llu branching nodes with the following distribution: \n", nbBranchingNodes);

        size_t sum=0;
        for (size_t i=0; i<stats.size(); i++)
        {
            sum += stats[i].second;

            // 'percent' is the share of this couple, 'distrib' the cumulated share so far.
            printf ("    [in=%zu out=%zu]  nb=%7zu  percent=%5.2f  distrib=%5.2f\n",
                stats[i].first.first,
                stats[i].first.second,
                stats[i].second,
                100.0 * static_cast<double>(stats[i].second) / static_cast<double>(nbBranchingNodes),
                100.0 * static_cast<double>(sum)             / static_cast<double>(nbBranchingNodes)
            );
        }

        // status.time is divided by 1000 to be displayed in seconds (as in the original message).
        printf ("\nDone on %llu cores in %.2f sec\n\n",
            static_cast<unsigned long long>(status.nbCores),
            static_cast<double>(status.time) / 1000.0
        );
    }
    catch (OptionFailure& e)
    {
        return e.displayErrors (std::cout);
    }
    catch (Exception& e)
    {
        std::cerr << "EXCEPTION: " << e.getMessage() << std::endl;

        // A failure must be visible to the caller through the exit status.
        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}