// Benchmark driver: runs the same 1000 x 1000 matrix multiply-accumulate
// (c += a * b) three times -- through parallel_for_each with the `par`
// policy, through parallel_for_each with the `seq` policy, and as a plain
// triple loop -- and prints the elapsed wall-clock seconds of each run.
// c is not reset between runs, so every pass accumulates onto the last one.
int main() {
  using Clock = std::chrono::high_resolution_clock;
  using Seconds = std::chrono::duration<double>;

  std::cout << "Parallel Framework: " << PARALLEL_FRAMEWORK << std::endl;

  std::size_t length = 1000;
  std::vector<std::vector<double>> a(length, std::vector<double>(length));
  std::vector<std::vector<double>> b(length, std::vector<double>(length));
  std::vector<std::vector<double>> c(length, std::vector<double>(length));
  fillMatrix(a);
  fillMatrix(b);

  // Prints the time between two clock readings, expressed in seconds.
  const auto report = [](Clock::time_point from, Clock::time_point to) {
    const Seconds elapsed = std::chrono::duration_cast<Seconds>(to - from);
    std::cout << "It took me " << elapsed.count() << " seconds." << std::endl;
  };

  // Run 1: one task per row, parallel policy.
  Clock::time_point started = Clock::now();
  parallel_for_each(par, 0, length, [&a, &b, &c, length](std::size_t row) {
    for (std::size_t col = 0; col < length; ++col) {
      for (std::size_t inner = 0; inner < length; ++inner) {
        c[row][col] += a[row][inner] * b[inner][col];
      }
    }
  });
  report(started, Clock::now());

  // Run 2: same row kernel, sequential policy.
  std::cout << "serial fallback: " << std::endl;
  started = Clock::now();
  parallel_for_each(seq, 0, length, [&a, &b, &c, length](std::size_t row) {
    for (std::size_t col = 0; col < length; ++col) {
      for (std::size_t inner = 0; inner < length; ++inner) {
        c[row][col] += a[row][inner] * b[inner][col];
      }
    }
  });
  report(started, Clock::now());

  // Run 3: hand-written triple loop without the framework.
  std::cout << "serial old: " << std::endl;
  started = Clock::now();
  for (std::size_t row = 0; row < length; ++row) {
    for (std::size_t col = 0; col < length; ++col) {
      for (std::size_t inner = 0; inner < length; ++inner) {
        c[row][col] += a[row][inner] * b[inner][col];
      }
    }
  }
  report(started, Clock::now());

  return 0;
}
// Validates the arguments and launches the mysgemmNT kernel over a tiled
// 2-D extent of m x n threads (TILE_SZ x TILE_SZ threads per tile).
//
// Only transa == 'N'/'n' and transb == 'T'/'t' are accepted, and both m and n
// must be multiples of TILE_SZ. On any violation a message is written to
// std::cerr and the function returns without launching the kernel.
//
// A, B, C, lda, ldb, ldc, k, alpha and beta are passed through to mysgemmNT
// unchanged; their exact meaning is defined there.
void basicSgemm( char transa, char transb, int m, int n, int k, float alpha,
        array_view<const float>& A, int lda,
        array_view<const float>& B, int ldb, float beta,
        array_view<float>& C, int ldc )
{
  if ((transa != 'N') && (transa != 'n')) {
    std::cerr << "unsupported value of 'transa' in basicSgemm()" << std::endl;
    return;
  }

  if ((transb != 'T') && (transb != 't')) {
    std::cerr << "unsupported value of 'transb' in basicSgemm()" << std::endl;
    return;
  }

  // In this code we assume the matrix sizes are multiple of tile size.
  // Bail out otherwise: m/TILE_SZ and n/TILE_SZ truncate, so launching anyway
  // would cover only part of the m x n extent.
  if ((m%TILE_SZ) || (n%TILE_SZ)) {
    std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_SZ
      << "; n should be multiple of " << TILE_SZ << std::endl;
    return;
  }

  // One thread per (row, column) index pair, grouped into TILE_SZ x TILE_SZ tiles.
  int threads[2] = {TILE_SZ, TILE_SZ};
  int grid[2] = {m/TILE_SZ*threads[0], n/TILE_SZ*threads[1]};
  parallel_for_each(extent<2>(grid).tile(threads[0], threads[1]),
          [=] (tiled_index<2> tidx) [[hc]]
          {
          mysgemmNT(tidx, A, lda, B, ldb, C, ldc, k, alpha, beta);
          });
}
// Example #3
// 0
void kmeans::update_clusters(const dataset & p_centers, cluster_sequence & p_clusters) {
    const dataset & data = *m_ptr_data;

    p_clusters.clear();
    p_clusters.resize(p_centers.size());

    /* fill clusters again in line with centers. */
    if (m_ptr_indexes->empty()) {
        std::vector<std::size_t> winners(data.size(), 0);
        parallel_for(std::size_t(0), data.size(), [this, &p_centers, &winners](std::size_t p_index) {
            assign_point_to_cluster(p_index, p_centers, winners);
        });

        for (std::size_t index_point = 0; index_point < winners.size(); index_point++) {
            const std::size_t suitable_index_cluster = winners[index_point];
            p_clusters[suitable_index_cluster].push_back(index_point);
        }
    }
    else {
        /* This part of code is used by X-Means and in case of parallel implementation of this part in scope of X-Means
           performance is slightly reduced. Experiments has been performed our implementation and Intel TBB library. 
           But in K-Means case only - it works perfectly and increase performance. */
        std::vector<std::size_t> winners(data.size(), 0);
        parallel_for_each(*m_ptr_indexes, [this, &p_centers, &winners](std::size_t p_index) {
            assign_point_to_cluster(p_index, p_centers, winners);
        });

        for (std::size_t index_point : *m_ptr_indexes) {
            const std::size_t suitable_index_cluster = winners[index_point];
            p_clusters[suitable_index_cluster].push_back(index_point);
        }
    }

    erase_empty_clusters(p_clusters);
}
// Example #4
// 0
/// Applies `f` to every element of [first, last) by recursive halving.
///
/// The left half of the range is handed to std::async with
/// std::launch::async, while the right half is processed by the calling
/// thread. Recursion stops at single-element ranges, so a range of N elements
/// launches on the order of N asynchronous tasks.
///
/// @tparam Iterator random-access iterator (`last - first` and `first + n`
///                  are used).
/// @tparam Func     callable invocable with `*first`; it is passed by value
///                  to every recursive call, so it must be copyable.
/// @param first,last range to process; an empty or reversed range is a no-op.
/// @param f          callable applied to each element.
///
/// If the right-half processing throws, the function first waits for the
/// left-half task and then rethrows. An exception from the left-half task is
/// rethrown by `get()`.
template <typename Iterator, typename Func>
void parallel_for_each(Iterator first, Iterator last, Func f)
{
  std::ptrdiff_t const range_length = last - first;
  // <= 0 rather than == 0: a reversed range would otherwise recurse forever.
  if (range_length <= 0)
    return;
  if (range_length == 1)
  {
    f(*first);
    return;
  }

  Iterator const mid = first + (range_length / 2);

  std::future<void> bgtask = std::async(std::launch::async,
    &parallel_for_each<Iterator, Func>, first, mid, f);
  try
  {
    parallel_for_each(mid, last, f);
  }
  catch (...)
  {
    // Let the background half finish before propagating the error.
    bgtask.wait();
    throw;
  }
  bgtask.get();
}
// Example #5
// 0
// File: ICF.cpp  Project: sas/lld
// Identical COMDAT folding.
// Sections count as identical when their section headers, contents and
// relocations all match. The pass hashes the eligible sections, sorts them by
// group ID, splits the groups until nothing changes any more, and finally
// folds every remaining group into its first member.
void ICF::run(const std::vector<Chunk *> &Vec) {
  // Step 1 (parallel): give every mergeable section a hash-derived group ID.
  // OR-ing in bit 63 guarantees that such an ID is never zero.
  parallel_for_each(Vec.begin(), Vec.end(), [&](Chunk *C) {
    auto *Sec = dyn_cast<SectionChunk>(C);
    if (!Sec)
      return;
    bool IsGlobal = Sec->Sym && Sec->Sym->isExternal();
    bool IsWritable = Sec->getPermissions() & llvm::COFF::IMAGE_SCN_MEM_WRITE;
    if (Sec->isCOMDAT() && Sec->isLive() && IsGlobal && !IsWritable)
      Sec->GroupID = getHash(Sec) | (uint64_t(1) << 63);
  });

  // Step 2: sections with a non-zero group ID become merge candidates; all
  // other sections get a fresh ID from NextID.
  std::vector<SectionChunk *> Chunks;
  for (Chunk *C : Vec) {
    auto *Sec = dyn_cast<SectionChunk>(C);
    if (!Sec)
      continue;
    if (Sec->GroupID)
      Chunks.push_back(Sec);
    else
      Sec->GroupID = NextID++;
  }

  // Step 3: order the candidates so that members of the same group sit next
  // to each other in the vector.
  std::sort(Chunks.begin(), Chunks.end(),
            [](SectionChunk *L, SectionChunk *R) {
              return L->GroupID < R->GroupID;
            });

  // Step 4: split the groups, once with equalsConstant and then with
  // equalsVariable for as long as forEachGroup keeps reporting a change.
  int Cnt = 1;
  forEachGroup(Chunks, equalsConstant);
  while (forEachGroup(Chunks, equalsVariable))
    ++Cnt;

  if (Config->Verbose)
    llvm::outs() << "\nICF needed " << Cnt << " iterations.\n";

  // Step 5: within each run of equal group IDs, the first section absorbs
  // the others. Runs of length one are left alone.
  auto Cur = Chunks.begin();
  auto End = Chunks.end();
  while (Cur != End) {
    SectionChunk *Head = *Cur;
    ++Cur;
    auto RunEnd = std::find_if(Cur, End, [&](SectionChunk *Other) {
      return Other->GroupID != Head->GroupID;
    });
    if (Cur == RunEnd)
      continue;

    if (Config->Verbose)
      llvm::outs() << "Selected " << Head->getDebugName() << "\n";
    for (; Cur != RunEnd; ++Cur) {
      SectionChunk *Dup = *Cur;
      if (Config->Verbose)
        llvm::outs() << "  Removed " << Dup->getDebugName() << "\n";
      Head->replace(Dup);
    }
  }
}
// Runs ten greeting tasks through parallel_for_each with the `par` policy
// (bounds 0 and 10). Each task holds a shared mutex while it writes, so the
// output lines of different tasks cannot interleave.
int main() {
  std::mutex coutMutex;

  parallel_for_each(par, 0, 10, [&coutMutex](std::size_t taskId) {
    // Serialise access to std::cout for the duration of one line.
    const std::lock_guard<std::mutex> hold(coutMutex);
    std::cout << "Hello from task " << taskId << std::endl;
  });

  return 0;
}
   // Applies `action` to every node of the subtree rooted at this node:
   // first to all descendants (children are visited via parallel_for_each),
   // then to this node itself.
   void tree::for_all(Function& action)
   {
      // Recurse into each child subtree.
      parallel_for_each(begin(_children), end(_children), [&action](tree& subtree) {
         subtree.for_all(action);
      });

      // The children are done; now handle this node.
      action(*this);
   }
// Example #8
// 0
// Prints every value produced by Range(1000), one per line, by running the
// printing lambda through parallel_for_each. The trailing argument 10 is
// forwarded to parallel_for_each as-is (presumably the number of segments the
// range is split into -- confirm against that overload).
int main ()
{
  parallel_for_each(Range(1000), [](int a) {
      printf("%d\n",a);
  }, 10);

  return 0;
}
/// Applies `f` to every element of [first, last), splitting large ranges.
///
/// Ranges shorter than 2 * min_per_thread (i.e. fewer than 50 elements) are
/// processed in place with std::for_each. Longer ranges are cut in half: the
/// first half is submitted through std::async (default launch policy), the
/// second half is processed recursively by the calling thread, and the
/// function then waits for the first half via `get()`, which also rethrows
/// any exception raised there.
///
/// @tparam Iterator forward iterator or better (only std::distance and
///                  std::next are used to navigate the range).
/// @tparam Func     callable invocable with `*first`; it is passed by value
///                  to every recursive call, so it must be copyable.
/// @param first,last range to process; an empty range is a no-op, and so is
///                   a reversed range of random-access iterators.
/// @param f          callable applied to each element.
template <typename Iterator, typename Func>
void parallel_for_each(Iterator first, Iterator last, Func f) {
    using diff_t = typename std::iterator_traits<Iterator>::difference_type;

    // Kept signed so that a reversed range is seen as "nothing to do" instead
    // of wrapping around to a huge length.
    const diff_t length = std::distance(first, last);
    if (length <= 0)
        return;

    const diff_t min_per_thread = 25;
    if (length < 2 * min_per_thread) {
        std::for_each(first, last, f);
    } else {
        const Iterator mid_point = std::next(first, length / 2);
        std::future<void> first_half =
            std::async(&parallel_for_each<Iterator, Func>, first, mid_point, f);
        parallel_for_each(mid_point, last, f);
        first_half.get();
    }
}
// Finds all prime factors of the given value among the candidates in `primes`.
//
// Every element of `primes` that divides n evenly is appended to the result.
// The candidates are tested through parallel_for_each, so the order of the
// returned factors is not necessarily the order of `primes`.
// Values of n below 2 yield an empty result.
//
// No square-root cut-off is applied: a number can have a prime factor larger
// than its square root (10 = 2 * 5, and a prime is its own factor), and such
// factors must be reported as well.
concurrent_vector<int> prime_factors_of(int n, 
   const concurrent_vector<int>& primes)
{
   // Holds the prime factors of n.
   concurrent_vector<int> prime_factors;

   // 0, 1 and negative values have no prime factorisation to report.
   if (n < 2)
      return prime_factors;

   // Use trial division to find the prime factors of n.
   // Every prime number that divides evenly into n is a prime factor of n.
   parallel_for_each(begin(primes), end(primes), [&](int prime)
   {
      // A divisor of n cannot exceed n, so larger candidates skip the modulo.
      if (prime <= n)
      {
         if ((n % prime) == 0)
            prime_factors.push_back(prime);
      }
   });

   return prime_factors;
}
// Example #11
// 0
// Range convenience overload: passes std::begin/std::end of `range`, together
// with `callback` and `numSegments`, to a four-argument parallel_for_each
// overload (not visible in this snippet).
// `range` is declared as a by-value parameter, so the iterators handed on
// refer to this function's own parameter object.
void parallel_for_each(T range, F callback, int numSegments)
{
    parallel_for_each(std::begin(range), std::end(range), callback, numSegments);
}
// Example #12
// 0
/*
 * Whole-program optimization driver.
 *
 * Repeatedly analyzes the functions on a work list (in parallel), feeds the
 * inferred return types back into `index` (single-threaded), and re-queues
 * every Context reported by Index::refine_return_type, until the work list
 * is empty.  Afterwards every function is re-analyzed and passed to
 * optimize_func in a final parallel pass.
 */
void optimize(Index& index, php::Program& program) {
    assert(check(program));
    trace_time tracer("optimize");
    SCOPE_EXIT { state_after("optimize", program); };

    // Counters, just for debug printing.
    std::atomic<uint32_t> total_funcs{0};
    auto round = uint32_t{0};

    /*
     * Algorithm:
     *
     * Start by running an analyze pass on every function.  During
     * analysis, information about functions or classes will be
     * requested from the Index, which initially won't really know much,
     * but will record a dependency.  This part is done in parallel: no
     * passes are mutating anything, just reading from the Index.
     *
     * After a pass, we do a single-threaded "update" step to prepare
     * for the next pass: for each function that was analyzed, note the
     * facts we learned that may aid analyzing other functions in the
     * program, and register them in the index.  At this point, if any
     * of these facts are more useful than they used to be, add all the
     * Contexts that had a dependency on the new information to the work
     * list again, in case they can do better based on the new fact.
     *
     * Repeat until the work list is empty.
     */
    auto work = initial_work(program);
    while (!work.empty()) {
        auto const results = [&] {
            trace_time trace(
                "analyzing",
                folly::format("round {} -- {} work items", round, work.size()).str()
            );
            return parallel_map(
                work,
            [&] (const Context& ctx) -> folly::Optional<FuncAnalysis> {
                total_funcs.fetch_add(1, std::memory_order_relaxed);
                return analyze_func(index, ctx);
            }
            );
        }();

        ++round;
        trace_time update_time("updating");

        std::set<Context> revisit;
        for (auto i = size_t{0}; i < results.size(); ++i) {
            auto& result = *results[i];

            // results[i] must describe the same Context as work[i]; `work`
            // therefore has to stay intact until this loop has finished.
            assert(result.ctx.func == work[i].func);
            assert(result.ctx.cls == work[i].cls);
            assert(result.ctx.unit == work[i].unit);

            auto deps = index.refine_return_type(
                            result.ctx.func, result.inferredReturn
                        );
            for (auto& d : deps) revisit.insert(d);
        }

        // Replace the finished work list with the contexts to revisit.  The
        // list is cleared only now, after the assertions above have read it.
        work.clear();
        std::copy(begin(revisit), end(revisit), std::back_inserter(work));
    }

    if (Trace::moduleEnabledRelease(Trace::hhbbc_time, 1)) {
        Trace::traceRelease("total function visits %u\n", total_funcs.load());
    }

    /*
     * Finally, use the results of all these iterations to perform
     * optimization.  This reanalyzes every function using our
     * now-very-updated Index, and then runs optimize_func with the
     * results.
     *
     * We do this in parallel: all the shared information is queried out
     * of the index, and each thread is allowed to modify the bytecode
     * for the function it is looking at.
     *
     * NOTE: currently they can't modify anything other than the
     * bytecode/Blocks, because other threads may be doing unlocked
     * queries to php::Func and php::Class structures.
     */
    trace_time final_pass("final pass");
    // Build the full work list once and reuse it for the final pass.
    work = initial_work(program);
    parallel_for_each(
        work,
    [&] (Context ctx) {
        optimize_func(index, analyze_func(index, ctx));
    }
    );
}
// Example #13
// 0
/* Container convenience overload: forwards std::begin/std::end of p_container
   and p_task to a three-argument parallel_for_each overload (not visible in
   this snippet). The container is taken by const reference and is not copied. */
void parallel_for_each(const TypeContainer & p_container, const TypeAction & p_task) {
    parallel_for_each(std::begin(p_container), std::end(p_container), p_task);
}