//! runs one iteration. This call blocks the main thread void FMEThreadPool::runThreads() { for (__uint32 i=1; i < numThreads(); i++) { thread(i)->start(); } thread(0)->doWork(); for (__uint32 i=1; i < numThreads(); i++) { thread(i)->join(); } }
void FMEThreadPool::deallocate() { for (__uint32 i=0; i < numThreads(); i++) { delete m_pThreads[i]; } delete[] m_pThreads; delete m_pSyncBarrier; }
void FMEThreadPool::allocate() { typedef FMEThread* FMEThreadPtr; m_pSyncBarrier = new Barrier(m_numThreads); m_pThreads = new FMEThreadPtr[m_numThreads]; for (__uint32 i=0; i < numThreads(); i++) { m_pThreads[i] = new FMEThread(this, i); #ifdef OGDF_SYSTEM_WINDOWS m_pThreads[i]->priority(Thread::tpCritical); m_pThreads[i]->cpuAffinity(1 << i); #endif } }
//! Rebuilds the linear quadtree from the current point data.
/**
 * Called by every thread with its own slice of the points. The steps are
 * separated by sync() calls, so all threads have to run through this function
 * together:
 *   1. per-thread bounding box, merged into the global box by the main thread
 *   2. morton numbers for all points
 *   3. sorting the points by morton number
 *   4. preparing and linking the tree, followed by the partitioning
 *   5. refreshing the point copies and the node coordinates / sizes
 *
 * \param pointPartition the range of points handled by the calling thread
 */
void FMEMultipoleKernel::quadtreeConstruction(ArrayPartition& pointPartition)
{
    FMELocalContext* localContext = m_pLocalContext;
    FMEGlobalContext* globalContext = m_pGlobalContext;
    LinearQuadtree& tree = *globalContext->pQuadtree;

    // precompute the bounding box for the quadtree points from the graph nodes
    for_loop(pointPartition, min_max_x_function(localContext));
    for_loop(pointPartition, min_max_y_function(localContext));

    // wait until every thread's bounding box is computed
    sync();

    // let the main thread compute the bounding box of the bounding boxes
    if (isMainThread())
    {
        FMELocalContext* first = globalContext->pLocalContext[0];
        globalContext->min_x = first->min_x;
        globalContext->min_y = first->min_y;
        globalContext->max_x = first->max_x;
        globalContext->max_y = first->max_y;

        for (__uint32 j = 1; j < numThreads(); j++)
        {
            FMELocalContext* other = globalContext->pLocalContext[j];
            globalContext->min_x = min(globalContext->min_x, other->min_x);
            globalContext->min_y = min(globalContext->min_y, other->min_y);
            globalContext->max_x = max(globalContext->max_x, other->max_x);
            globalContext->max_y = max(globalContext->max_y, other->max_y);
        }

        tree.init(globalContext->min_x, globalContext->min_y, globalContext->max_x, globalContext->max_y);
        globalContext->coolDown *= 0.999f;
        tree.clear();
    }

    // wait because the morton number computation needs the bounding box
    sync();

    // update the morton numbers to prepare the points for sorting
    for_loop(pointPartition, LQMortonFunctor(localContext));

    // wait so we can sort them by morton number
    sync();

#ifdef OGDF_FME_PARALLEL_QUADTREE_SORT
    // use a simple parallel sorting algorithm
    LinearQuadtree::LQPoint* points = tree.pointArray();
    sort_parallel(points, tree.numberOfPoints(), LQPointComparer);
#else
    // only the main thread sorts
    if (isMainThread())
    {
        LinearQuadtree::LQPoint* points = tree.pointArray();
        sort_single(points, tree.numberOfPoints(), LQPointComparer);
    }
#endif

    // wait because the quadtree builder needs the sorted order
    sync();

    if (isSingleThreaded())
    {
        // not a parallel run: one builder prepares and links the whole tree
        LinearQuadtreeBuilder builder(tree);
        builder.prepareTree();
        builder.build();

        LQPartitioner partitioner(localContext);
        partitioner.partition();
    }
    else
    {
        // the more difficult part: every thread prepares its own interval

        // snap the left end of the thread's interval to the first point in the cell
        LinearQuadtree::PointID beginPoint = tree.findFirstPointInCell(pointPartition.begin);

        LinearQuadtree::PointID endPoint_plus_one;
        if (threadNr() != numThreads() - 1)
        {
            // the right end is the (snapped) left end of the next thread
            endPoint_plus_one = tree.findFirstPointInCell(pointPartition.end + 1);
        }
        else
        {
            // the last thread needs no snapping on the right side
            endPoint_plus_one = tree.numberOfPoints();
        }

        // prepare the snapped interval [beginPoint, endPoint_plus_one),
        // i.e. EXCLUDING endPoint_plus_one
        LinearQuadtreeBuilder builder(tree);
        builder.prepareTree(beginPoint, endPoint_plus_one);

        // save the start, end and count of the inner node chain in the context
        localContext->firstInnerNode = builder.firstInner;
        localContext->lastInnerNode = builder.lastInner;
        localContext->numInnerNodes = builder.numInnerNodes;

        // save the start, end and count of the leaf node chain in the context
        localContext->firstLeaf = builder.firstLeaf;
        localContext->lastLeaf = builder.lastLeaf;
        localContext->numLeaves = builder.numLeaves;

        // wait until all threads have prepared their interval
        sync();

        // now the main thread has to link the tree with its own builder
        if (isMainThread())
        {
            LinearQuadtreeBuilder sbuilder(tree);

            // the complete chain starts with the chains of thread 0 ...
            FMELocalContext* head = globalContext->pLocalContext[0];
            sbuilder.firstInner = head->firstInnerNode;
            sbuilder.firstLeaf = head->firstLeaf;
            sbuilder.numInnerNodes = head->numInnerNodes;
            sbuilder.numLeaves = head->numLeaves;

            // ... the counts of all other threads are added ...
            for (__uint32 j = 1; j < numThreads(); j++)
            {
                sbuilder.numLeaves += globalContext->pLocalContext[j]->numLeaves;
                sbuilder.numInnerNodes += globalContext->pLocalContext[j]->numInnerNodes;
            }

            // ... and it ends with the chains of the last thread
            FMELocalContext* tail = globalContext->pLocalContext[numThreads() - 1];
            sbuilder.lastInner = tail->lastInnerNode;
            sbuilder.lastLeaf = tail->lastLeaf;

            // link the tree
            sbuilder.build();

            // and run the partitions
            LQPartitioner partitioner(localContext);
            partitioner.partition();
        }
    }

    // wait for the tree to be finished
    sync();

    // now update the copy of the point data
    for_loop(pointPartition, LQPointUpdateFunctor(localContext));

    // compute the coordinates and sizes of this thread's inner nodes and leaves
    tree.forall_tree_nodes(LQCoordsFunctor(localContext), localContext->innerNodePartition.begin, localContext->innerNodePartition.numNodes)();
    tree.forall_tree_nodes(LQCoordsFunctor(localContext), localContext->leafPartition.begin, localContext->leafPartition.numNodes)();
}
//! Runs the layout iterations for the calling thread.
/**
 * Every thread executes this function with the same global context and works
 * on its own edge / node partition. The function
 *   - zeroes the global and the thread-local force arrays,
 *   - runs preProcMaxNumIterations edge-force-only iterations,
 *   - then runs up to maxNumIterations full iterations (quadtree construction,
 *     multipole approximation, edge forces, node movement) and stops early once
 *     minNumIterations have been done and the largest squared force over all
 *     threads is below stopCritForce.
 *
 * \param globalContext the context shared by all threads
 */
void FMEMultipoleKernel::operator()(FMEGlobalContext* globalContext)
{
    FMEGlobalOptions* options = globalContext->pOptions;
    __uint32 maxNumIterations = options->maxNumIterations;
    __uint32 minNumIterations = options->minNumIterations;

    ArrayGraph& graph = *globalContext->pGraph;
    LinearQuadtree& tree = *globalContext->pQuadtree;
    LinearQuadtreeExpansion& treeExp = *globalContext->pExpansion;
    FMELocalContext* localContext = globalContext->pLocalContext[threadNr()];

    float* threadsForceArrayX = localContext->forceX;
    float* threadsForceArrayY = localContext->forceY;
    float* globalForceArrayX = globalContext->globalForceX;
    float* globalForceArrayY = globalContext->globalForceY;

    ArrayPartition edgePartition = arrayPartition(graph.numEdges());
    ArrayPartition nodePointPartition = arrayPartition(graph.numNodes());

    // the helper methods (e.g. quadtreeConstruction) read the contexts from the members
    m_pLocalContext = localContext;
    m_pGlobalContext = globalContext;

    /****************************/
    /*           INIT           */
    /****************************/

    // reset the global force array
    for_loop_array_set(threadNr(), numThreads(), globalForceArrayX, tree.numberOfPoints(), 0.0f);
    for_loop_array_set(threadNr(), numThreads(), globalForceArrayY, tree.numberOfPoints(), 0.0f);

    // reset the thread's force array
    for (__uint32 i = 0; i < tree.numberOfPoints(); i++)
    {
        threadsForceArrayX[i] = 0.0f;
        threadsForceArrayY[i] = 0.0f;
    }

    /****************************/
    /*      PREPROCESSING       */
    /****************************/

    __uint32 maxNumIt = options->preProcMaxNumIterations;
    for (__uint32 currNumIteration = 0; currNumIteration < maxNumIt; currNumIteration++)
    {
        // iterate over all edges and store the resulting forces in the thread's array;
        // the forces are divided by the degree of the node to avoid oscillation
        for_loop(edgePartition, edge_force_function< EDGE_FORCE_DIV_DEGREE >(localContext));

        // wait until all edges are done
        sync();

        // now collect the forces in parallel, put the sum into the global array
        // and move the nodes accordingly
        // NOTE(review): unlike the main loop below there is no sync() after this
        // step, so a thread may start the next edge pass while others are still
        // collecting - confirm that this is intended.
        for_loop(nodePointPartition,
            func_comp(
                collect_force_function<COLLECT_EDGE_FACTOR_PREP | COLLECT_ZERO_THREAD_ARRAY >(localContext),
                node_move_function<TIME_STEP_PREP | ZERO_GLOBAL_ARRAY>(localContext)
            )
        );
    }

    if (isMainThread())
    {
        globalContext->coolDown = 1.0f;
    }
    sync();

    /****************************/
    /*        MAIN LOOP         */
    /****************************/

    for (__uint32 currNumIteration = 0; (currNumIteration < maxNumIterations) && !globalContext->earlyExit; currNumIteration++)
    {
        // reset the coefficients
        for_loop_array_set(threadNr(), numThreads(), treeExp.m_multiExp, treeExp.m_numExp*(treeExp.m_numCoeff << 1), 0.0);
        for_loop_array_set(threadNr(), numThreads(), treeExp.m_localExp, treeExp.m_numExp*(treeExp.m_numCoeff << 1), 0.0);

        localContext->maxForceSq = 0.0;
        localContext->avgForce = 0.0;

        // construct the quadtree
        quadtreeConstruction(nodePointPartition);

        // wait for all threads to finish
        sync();

        if (isSingleThreaded())
        {
            // single threaded: run the simple approximation
            multipoleApproxSingleThreaded(nodePointPartition);
        }
        else
        {
            // otherwise use the partitioning
            multipoleApproxFinal(nodePointPartition);
        }

        // now wait until all forces are summed up in the global array and
        // mapped to graph node order
        sync();

        // run the edge forces: iterate over all edges and sum up the forces in the
        // thread's array, divided by the degree of the node to avoid oscillation
        for_loop(edgePartition, edge_force_function< EDGE_FORCE_DIV_DEGREE >(localContext));

        // wait until edges are finished
        sync();

        // collect the edge forces and move nodes without waiting in between
        for_loop(nodePointPartition,
            func_comp(
                collect_force_function<COLLECT_EDGE_FACTOR | COLLECT_ZERO_THREAD_ARRAY>(localContext),
                node_move_function<TIME_STEP_NORMAL | ZERO_GLOBAL_ARRAY>(localContext)
            )
        );

        // wait so we can decide if we need another iteration
        sync();

        // check the max force square for all threads
        if (isMainThread())
        {
            double maxForceSq = 0.0;
            for (__uint32 j = 0; j < numThreads(); j++)
                maxForceSq = max(globalContext->pLocalContext[j]->maxForceSq, maxForceSq);

            // if we are allowed to quit and the max force sq falls under the
            // threshold, tell all threads we are done
            if ((currNumIteration >= minNumIterations) && (maxForceSq < options->stopCritForce))
            {
                globalContext->earlyExit = true;
            }
        }

        // this is required to wait for the earlyExit result
        sync();
    }
}
int rapMapMap(int argc, char* argv[]) { std::cerr << "RapMap Mapper\n"; std::string versionString = rapmap::version; TCLAP::CmdLine cmd( "RapMap Mapper", ' ', versionString); cmd.getProgramName() = "rapmap"; TCLAP::ValueArg<std::string> index("i", "index", "The location of the pseudoindex", true, "", "path"); TCLAP::ValueArg<std::string> read1("1", "leftMates", "The location of the left paired-end reads", false, "", "path"); TCLAP::ValueArg<std::string> read2("2", "rightMates", "The location of the right paired-end reads", false, "", "path"); TCLAP::ValueArg<std::string> unmatedReads("r", "unmatedReads", "The location of single-end reads", false, "", "path"); TCLAP::ValueArg<uint32_t> numThreads("t", "numThreads", "Number of threads to use", false, 1, "positive integer"); TCLAP::ValueArg<uint32_t> maxNumHits("m", "maxNumHits", "Reads mapping to more than this many loci are discarded", false, 200, "positive integer"); TCLAP::ValueArg<std::string> outname("o", "output", "The output file (default: stdout)", false, "", "path"); TCLAP::SwitchArg endCollectorSwitch("e", "endCollector", "Use the simpler (and faster) \"end\" collector as opposed to the more sophisticated \"skipping\" collector", false); TCLAP::SwitchArg noout("n", "noOutput", "Don't write out any alignments (for speed testing purposes)", false); cmd.add(index); cmd.add(noout); cmd.add(read1); cmd.add(read2); cmd.add(unmatedReads); cmd.add(outname); cmd.add(numThreads); cmd.add(maxNumHits); cmd.add(endCollectorSwitch); auto consoleSink = std::make_shared<spdlog::sinks::stderr_sink_mt>(); auto consoleLog = spdlog::create("stderrLog", {consoleSink}); try { cmd.parse(argc, argv); bool pairedEnd = (read1.isSet() or read2.isSet()); if (pairedEnd and (read1.isSet() != read2.isSet())) { consoleLog->error("You must set both the -1 and -2 arguments to align " "paired end reads!"); std::exit(1); } if (pairedEnd and unmatedReads.isSet()) { consoleLog->error("You cannot specify both paired-end and unmated " "reads in 
the input!"); std::exit(1); } if (!pairedEnd and !unmatedReads.isSet()) { consoleLog->error("You must specify input; either both paired-end " "or unmated reads!"); std::exit(1); } std::string indexPrefix(index.getValue()); if (indexPrefix.back() != '/') { indexPrefix += "/"; } if (!rapmap::fs::DirExists(indexPrefix.c_str())) { consoleLog->error("It looks like the index you provided [{}] " "doesn't exist", indexPrefix); std::exit(1); } IndexHeader h; std::ifstream indexStream(indexPrefix + "header.json"); { cereal::JSONInputArchive ar(indexStream); ar(h); } indexStream.close(); if (h.indexType() != IndexType::PSEUDO) { consoleLog->error("The index {} does not appear to be of the " "appropriate type (pseudo)", indexPrefix); std::exit(1); } RapMapIndex rmi; rmi.load(indexPrefix); std::cerr << "\n\n\n\n"; // from: http://stackoverflow.com/questions/366955/obtain-a-stdostream-either-from-stdcout-or-stdofstreamfile // set either a file or cout as the output stream std::streambuf* outBuf; std::ofstream outFile; bool haveOutputFile{false}; if (outname.getValue() == "") { outBuf = std::cout.rdbuf(); } else { outFile.open(outname.getValue()); outBuf = outFile.rdbuf(); haveOutputFile = true; } // Now set the output stream to the buffer, which is // either std::cout, or a file. std::ostream outStream(outBuf); // Must be a power of 2 size_t queueSize{268435456}; spdlog::set_async_mode(queueSize); auto outputSink = std::make_shared<spdlog::sinks::ostream_sink_mt>(outStream); auto outLog = std::make_shared<spdlog::logger>("outLog", outputSink); outLog->set_pattern("%v"); uint32_t nthread = numThreads.getValue(); std::unique_ptr<paired_parser> pairParserPtr{nullptr}; std::unique_ptr<single_parser> singleParserPtr{nullptr}; if (!noout.getValue()) { rapmap::utils::writeSAMHeader(rmi, outLog); } SpinLockT iomutex; { ScopedTimer timer; HitCounters hctrs; consoleLog->info("mapping reads . . . 
\n\n\n"); if (pairedEnd) { std::vector<std::thread> threads; std::vector<std::string> read1Vec = rapmap::utils::tokenize(read1.getValue(), ','); std::vector<std::string> read2Vec = rapmap::utils::tokenize(read2.getValue(), ','); if (read1Vec.size() != read2Vec.size()) { consoleLog->error("The number of provided files for " "-1 and -2 must be the same!"); std::exit(1); } size_t numFiles = read1Vec.size() + read2Vec.size(); char** pairFileList = new char*[numFiles]; for (size_t i = 0; i < read1Vec.size(); ++i) { pairFileList[2*i] = const_cast<char*>(read1Vec[i].c_str()); pairFileList[2*i+1] = const_cast<char*>(read2Vec[i].c_str()); } size_t maxReadGroup{1000}; // Number of reads in each "job" size_t concurrentFile{2}; // Number of files to read simultaneously pairParserPtr.reset(new paired_parser(4 * nthread, maxReadGroup, concurrentFile, pairFileList, pairFileList+numFiles)); /** Create the threads depending on the collector type **/ if (endCollectorSwitch.getValue()) { EndCollector endCollector(&rmi); for (size_t i = 0; i < nthread; ++i) { threads.emplace_back(processReadsPair<EndCollector, SpinLockT>, pairParserPtr.get(), std::ref(rmi), std::ref(endCollector), &iomutex, outLog, std::ref(hctrs), maxNumHits.getValue(), noout.getValue()); } } else { SkippingCollector skippingCollector(&rmi); for (size_t i = 0; i < nthread; ++i) { threads.emplace_back(processReadsPair<SkippingCollector, SpinLockT>, pairParserPtr.get(), std::ref(rmi), std::ref(skippingCollector), &iomutex, outLog, std::ref(hctrs), maxNumHits.getValue(), noout.getValue()); } } for (auto& t : threads) { t.join(); } delete [] pairFileList; } else { std::vector<std::thread> threads; std::vector<std::string> unmatedReadVec = rapmap::utils::tokenize(unmatedReads.getValue(), ','); size_t maxReadGroup{1000}; // Number of reads in each "job" size_t concurrentFile{1}; stream_manager streams( unmatedReadVec.begin(), unmatedReadVec.end(), concurrentFile); singleParserPtr.reset(new single_parser(4 * nthread, 
maxReadGroup, concurrentFile, streams)); /** Create the threads depending on the collector type **/ if (endCollectorSwitch.getValue()) { EndCollector endCollector(&rmi); for (size_t i = 0; i < nthread; ++i) { threads.emplace_back(processReadsSingle<EndCollector, SpinLockT>, singleParserPtr.get(), std::ref(rmi), std::ref(endCollector), &iomutex, outLog, std::ref(hctrs), maxNumHits.getValue(), noout.getValue()); } } else { SkippingCollector skippingCollector(&rmi); for (size_t i = 0; i < nthread; ++i) { threads.emplace_back(processReadsSingle<SkippingCollector, SpinLockT>, singleParserPtr.get(), std::ref(rmi), std::ref(skippingCollector), &iomutex, outLog, std::ref(hctrs), maxNumHits.getValue(), noout.getValue()); } } for (auto& t : threads) { t.join(); } } consoleLog->info("Done mapping reads."); consoleLog->info("In total saw {} reads.", hctrs.numReads); consoleLog->info("Final # hits per read = {}", hctrs.totHits / static_cast<float>(hctrs.numReads)); consoleLog->info("Discarded {} reads because they had > {} alignments", hctrs.tooManyHits, maxNumHits.getValue()); consoleLog->info("flushing output"); outLog->flush(); } if (haveOutputFile) { outFile.close(); } return 0; } catch (TCLAP::ArgException& e) { consoleLog->error("Exception [{}] when parsing argument {}", e.error(), e.argId()); return 1; } }