Пример #1
0
int main (int narg, const char** argv) {

	typedef Range<false> R;
	typedef Range<true> CR;

	Matrix<double> M1(10, 1), M2, M3(M1), M4(3, 4), M6(4, 3), M5(3, 4, 2), M7(3, 4, 2), M8, M9, M10, M11;
    M1 = 0.0;
    M2 = M1;

    for (size_t i = 1; i < M1.Size(); ++i)
        M1[i] = i;
    for (size_t i = 1; i < M4.Size(); ++i)
        M4[i] = i;
    for (size_t i = 1; i < M5.Size(); ++i)
        M5[i] = i;

    M3(R(7,-2,2)) = M1(CR(2,4));
    M6(R(3,-2,0),R(0,1)) = M4(CR(0,1),CR(1,2));
    M7(R(2,-2,0),R(0,1),R(0,0)) =  M5(CR(0,1),CR(1,2),CR(0,0));
    M8 = M6(CR(1,-1,0),CR(1,2));
    M6(R(0,1),R(1,2)) = M8;
    M9 = M6(CR("1:-1:0"),CR("1:2"));
    M10 = M6(CR("1:-1:0,1"),CR("1:2"));
    M11 = M6(CR(),CR()) * M6(CR(),CR());
    M11 = M6 * M6(CR(),CR());
    std::cout << M3 << std::endl<< std::endl;
    std::cout << M4 << std::endl<< std::endl;
    std::cout << M6 << std::endl<< std::endl;
    std::cout << M5 << std::endl<< std::endl;
    std::cout << M7 << std::endl<< std::endl;
    std::cout << M8 << std::endl<< std::endl;
    std::cout << M9 << std::endl<< std::endl;
    std::cout << M10 << std::endl<< std::endl;
    std::cout << M11 << std::endl<< std::endl;
    return 0;
}
Пример #2
0
void FastMatmulRecursive(LockAndCounter& locker, MemoryManager<Scalar>& mem_mngr, Matrix<Scalar>& A, Matrix<Scalar>& B, Matrix<Scalar>& C, int total_steps, int steps_left, int start_index, double x, int num_threads, Scalar beta) {
    // Update multipliers
    C.UpdateMultiplier(A.multiplier());
    C.UpdateMultiplier(B.multiplier());
    A.set_multiplier(Scalar(1.0));
    B.set_multiplier(Scalar(1.0));
    // Base case for recursion
    if (steps_left == 0) {
        MatMul(A, B, C);
        return;
    }

    Matrix<Scalar> A11 = A.Subblock(2, 2, 1, 1);
    Matrix<Scalar> A12 = A.Subblock(2, 2, 1, 2);
    Matrix<Scalar> A21 = A.Subblock(2, 2, 2, 1);
    Matrix<Scalar> A22 = A.Subblock(2, 2, 2, 2);
    Matrix<Scalar> B11 = B.Subblock(2, 2, 1, 1);
    Matrix<Scalar> B12 = B.Subblock(2, 2, 1, 2);
    Matrix<Scalar> B21 = B.Subblock(2, 2, 2, 1);
    Matrix<Scalar> B22 = B.Subblock(2, 2, 2, 2);
    Matrix<Scalar> C11 = C.Subblock(2, 2, 1, 1);
    Matrix<Scalar> C12 = C.Subblock(2, 2, 1, 2);
    Matrix<Scalar> C21 = C.Subblock(2, 2, 2, 1);
    Matrix<Scalar> C22 = C.Subblock(2, 2, 2, 2);


    // Matrices to store the results of multiplications.
#ifdef _PARALLEL_
    Matrix<Scalar> M1(mem_mngr.GetMem(start_index, 1, total_steps - steps_left, M), C11.m(), C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M2(mem_mngr.GetMem(start_index, 2, total_steps - steps_left, M), C11.m(), C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M3(mem_mngr.GetMem(start_index, 3, total_steps - steps_left, M), C11.m(), C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M4(mem_mngr.GetMem(start_index, 4, total_steps - steps_left, M), C11.m(), C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M5(mem_mngr.GetMem(start_index, 5, total_steps - steps_left, M), C11.m(), C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M6(mem_mngr.GetMem(start_index, 6, total_steps - steps_left, M), C11.m(), C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M7(mem_mngr.GetMem(start_index, 7, total_steps - steps_left, M), C11.m(), C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M8(mem_mngr.GetMem(start_index, 8, total_steps - steps_left, M), C11.m(), C11.m(), C11.n(), C.multiplier());
#else
    Matrix<Scalar> M1(C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M2(C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M3(C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M4(C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M5(C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M6(C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M7(C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M8(C11.m(), C11.n(), C.multiplier());
#endif
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
    bool sequential1 = should_launch_task(8, total_steps, steps_left, start_index, 1, num_threads);
    bool sequential2 = should_launch_task(8, total_steps, steps_left, start_index, 2, num_threads);
    bool sequential3 = should_launch_task(8, total_steps, steps_left, start_index, 3, num_threads);
    bool sequential4 = should_launch_task(8, total_steps, steps_left, start_index, 4, num_threads);
    bool sequential5 = should_launch_task(8, total_steps, steps_left, start_index, 5, num_threads);
    bool sequential6 = should_launch_task(8, total_steps, steps_left, start_index, 6, num_threads);
    bool sequential7 = should_launch_task(8, total_steps, steps_left, start_index, 7, num_threads);
    bool sequential8 = should_launch_task(8, total_steps, steps_left, start_index, 8, num_threads);
#else
    bool sequential1 = false;
    bool sequential2 = false;
    bool sequential3 = false;
    bool sequential4 = false;
    bool sequential5 = false;
    bool sequential6 = false;
    bool sequential7 = false;
    bool sequential8 = false;
#endif



    // M1 = (1 * A11) * (1 * B11)
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
    # pragma omp task if(sequential1) shared(mem_mngr, locker) untied
    {
#endif
        M1.UpdateMultiplier(Scalar(1));
        M1.UpdateMultiplier(Scalar(1));
        FastMatmulRecursive(locker, mem_mngr, A11, B11, M1, total_steps, steps_left - 1, (start_index + 1 - 1) * 8, x, num_threads, Scalar(0.0));
#ifndef _PARALLEL_
#endif
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
        locker.Decrement();
    }
    if (should_task_wait(8, total_steps, steps_left, start_index, 1, num_threads)) {
        # pragma omp taskwait
# if defined(_PARALLEL_) && (_PARALLEL_ == _HYBRID_PAR_)
        SwitchToDFS(locker, num_threads);
# endif
    }
#endif

    // M2 = (1 * A12) * (1 * B21)
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
    # pragma omp task if(sequential2) shared(mem_mngr, locker) untied
    {
#endif
        M2.UpdateMultiplier(Scalar(1));
        M2.UpdateMultiplier(Scalar(1));
        FastMatmulRecursive(locker, mem_mngr, A12, B21, M2, total_steps, steps_left - 1, (start_index + 2 - 1) * 8, x, num_threads, Scalar(0.0));
#ifndef _PARALLEL_
#endif
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
        locker.Decrement();
    }
    if (should_task_wait(8, total_steps, steps_left, start_index, 2, num_threads)) {
        # pragma omp taskwait
# if defined(_PARALLEL_) && (_PARALLEL_ == _HYBRID_PAR_)
        SwitchToDFS(locker, num_threads);
# endif
    }
#endif

    // M3 = (1 * A11) * (1 * B12)
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
    # pragma omp task if(sequential3) shared(mem_mngr, locker) untied
    {
#endif
        M3.UpdateMultiplier(Scalar(1));
        M3.UpdateMultiplier(Scalar(1));
        FastMatmulRecursive(locker, mem_mngr, A11, B12, M3, total_steps, steps_left - 1, (start_index + 3 - 1) * 8, x, num_threads, Scalar(0.0));
#ifndef _PARALLEL_
#endif
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
        locker.Decrement();
    }
    if (should_task_wait(8, total_steps, steps_left, start_index, 3, num_threads)) {
        # pragma omp taskwait
# if defined(_PARALLEL_) && (_PARALLEL_ == _HYBRID_PAR_)
        SwitchToDFS(locker, num_threads);
# endif
    }
#endif

    // M4 = (1 * A12) * (1 * B22)
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
    # pragma omp task if(sequential4) shared(mem_mngr, locker) untied
    {
#endif
        M4.UpdateMultiplier(Scalar(1));
        M4.UpdateMultiplier(Scalar(1));
        FastMatmulRecursive(locker, mem_mngr, A12, B22, M4, total_steps, steps_left - 1, (start_index + 4 - 1) * 8, x, num_threads, Scalar(0.0));
#ifndef _PARALLEL_
#endif
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
        locker.Decrement();
    }
    if (should_task_wait(8, total_steps, steps_left, start_index, 4, num_threads)) {
        # pragma omp taskwait
# if defined(_PARALLEL_) && (_PARALLEL_ == _HYBRID_PAR_)
        SwitchToDFS(locker, num_threads);
# endif
    }
#endif

    // M5 = (1 * A21) * (1 * B11)
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
    # pragma omp task if(sequential5) shared(mem_mngr, locker) untied
    {
#endif
        M5.UpdateMultiplier(Scalar(1));
        M5.UpdateMultiplier(Scalar(1));
        FastMatmulRecursive(locker, mem_mngr, A21, B11, M5, total_steps, steps_left - 1, (start_index + 5 - 1) * 8, x, num_threads, Scalar(0.0));
#ifndef _PARALLEL_
#endif
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
        locker.Decrement();
    }
    if (should_task_wait(8, total_steps, steps_left, start_index, 5, num_threads)) {
        # pragma omp taskwait
# if defined(_PARALLEL_) && (_PARALLEL_ == _HYBRID_PAR_)
        SwitchToDFS(locker, num_threads);
# endif
    }
#endif

    // M6 = (1 * A22) * (1 * B21)
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
    # pragma omp task if(sequential6) shared(mem_mngr, locker) untied
    {
#endif
        M6.UpdateMultiplier(Scalar(1));
        M6.UpdateMultiplier(Scalar(1));
        FastMatmulRecursive(locker, mem_mngr, A22, B21, M6, total_steps, steps_left - 1, (start_index + 6 - 1) * 8, x, num_threads, Scalar(0.0));
#ifndef _PARALLEL_
#endif
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
        locker.Decrement();
    }
    if (should_task_wait(8, total_steps, steps_left, start_index, 6, num_threads)) {
        # pragma omp taskwait
# if defined(_PARALLEL_) && (_PARALLEL_ == _HYBRID_PAR_)
        SwitchToDFS(locker, num_threads);
# endif
    }
#endif

    // M7 = (1 * A21) * (1 * B12)
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
    # pragma omp task if(sequential7) shared(mem_mngr, locker) untied
    {
#endif
        M7.UpdateMultiplier(Scalar(1));
        M7.UpdateMultiplier(Scalar(1));
        FastMatmulRecursive(locker, mem_mngr, A21, B12, M7, total_steps, steps_left - 1, (start_index + 7 - 1) * 8, x, num_threads, Scalar(0.0));
#ifndef _PARALLEL_
#endif
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
        locker.Decrement();
    }
    if (should_task_wait(8, total_steps, steps_left, start_index, 7, num_threads)) {
        # pragma omp taskwait
# if defined(_PARALLEL_) && (_PARALLEL_ == _HYBRID_PAR_)
        SwitchToDFS(locker, num_threads);
# endif
    }
#endif

    // M8 = (1 * A22) * (1 * B22)
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
    # pragma omp task if(sequential8) shared(mem_mngr, locker) untied
    {
#endif
        M8.UpdateMultiplier(Scalar(1));
        M8.UpdateMultiplier(Scalar(1));
        FastMatmulRecursive(locker, mem_mngr, A22, B22, M8, total_steps, steps_left - 1, (start_index + 8 - 1) * 8, x, num_threads, Scalar(0.0));
#ifndef _PARALLEL_
#endif
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
        locker.Decrement();
    }
    if (should_task_wait(8, total_steps, steps_left, start_index, 8, num_threads)) {
        # pragma omp taskwait
    }
#endif

    M_Add1(M1, M2, C11, x, false, beta);
    M_Add2(M3, M4, C12, x, false, beta);
    M_Add3(M5, M6, C21, x, false, beta);
    M_Add4(M7, M8, C22, x, false, beta);

    // Handle edge cases with dynamic peeling
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
    if (total_steps == steps_left) {
        mkl_set_num_threads_local(num_threads);
        mkl_set_dynamic(0);
    }
#endif
    DynamicPeeling(A, B, C, 2, 2, 2, beta);
}