int main (int argc, char *argv[]) {
	
	MKLVersion ver;
	
	// Print information on CPU optimization in effect
	
	mkl_get_version(&ver);
	
	printf("Processor optimization: %s\n",ver.Processor);
	omp_set_num_threads(8);
	mkl_set_dynamic(true);
	
	if (argc < 6) {
		std::cerr << "Usage: A K_min K_max P orbital_parity [cluster_mode]" << std::endl;
		return 1;
	}
	unsigned short A = ((unsigned short) std::strtoul(argv[1], nullptr, 10));
	unsigned short K_min = ((unsigned short) std::strtoul(argv[2], nullptr, 10));
	unsigned short K_max = ((unsigned short) std::strtoul(argv[3], nullptr, 10));
	unsigned short P = ((unsigned short) std::strtoul(argv[4], nullptr, 10));
	unsigned short Parita_Orbitale = ((unsigned short) std::strtoul(argv[5], nullptr, 10)); // 0 = even; 1 = odd; 2 = all
	unsigned short modo_cluster = 0;
	if (A==4) {
		if (argc == 7) {
			modo_cluster = ((unsigned short) std::strtoul(argv[6], nullptr, 10)); // set to 1 to enable the H mode for the 4-body case
		}
	}
	init_main_0(A,K_min,K_max,P,modo_cluster,Parita_Orbitale);
	return 0;
}
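
Note on the threading setup above: omp_set_num_threads caps the OpenMP thread team, while mkl_set_dynamic(true) lets MKL use fewer threads than that cap when it judges the problem too small. A minimal standalone sketch for inspecting the resulting settings (all calls are real MKL/OpenMP APIs):

#include <cstdio>
#include <mkl.h>
#include <omp.h>

int main() {
	omp_set_num_threads(8); // upper bound for OpenMP thread teams
	mkl_set_dynamic(1);     // MKL may use fewer threads than the bound
	printf("MKL max threads: %d, dynamic adjustment: %d\n",
	       mkl_get_max_threads(), mkl_get_dynamic());
	return 0;
}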
Example #2
double* ttm (double *X, const int Dim, long int *XDimSize,
            double *U, long int *UDimSize, char tflag,
            double *Y, long int *YDimSize,
            int ModeComn, int *ModeCom, int ModeComLen)
{
	// Choose transa and transb from tflag and the relationship between ModeComn and ModeCom.
	CBLAS_TRANSPOSE transa = CblasNoTrans;
	CBLAS_TRANSPOSE transb = CblasNoTrans;
	long int m,n,k;
	double alpha, beta;
	long int rsu,csu,rsx,csx,rsy,csy;
	long int XLoopStride = 0, YLoopStride = 0;
	alpha = 1.0;
	beta = 1.0;

	/**** Generated TTM algorithm, specialized for these modes and dimensions ****/
	m = 10;
	n = 1000;
	k = 1000;
	rsu = 1000;
	csu = 1;
	rsx = 1000;
	csx = 1;
	rsy = 1000;
	csy = 1;

#pragma omp parallel for default(shared) schedule(static) num_threads(1)
	for (long int i0=0;i0<1000;i0++)
	{
		mkl_set_dynamic(0);
		mkl_set_num_threads_local(8);
		XLoopStride = i0*1000000;
		YLoopStride = i0*10000;
		cblas_dgemm(CblasRowMajor, transa, transb, 
              			m, n, k, 
              			alpha, 
              			U, rsu, 
              			X+XLoopStride, rsx, 
              			beta, 
              			Y+YLoopStride, rsy);
	}
	return Y;
}
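
The loop above applies one fixed 10x1000 GEMM to 1000 independent slices, so the same computation can also be phrased through MKL's grouped batch interface. A hedged sketch using the strides from the generated code (cblas_dgemm_batch is a real MKL entry point; the wrapper function itself is illustrative):

#include <mkl.h>

// Sketch: the slice loop from ttm as a single grouped batch call,
// i.e. one group of 1000 identical GEMMs Y_i = U * X_i + Y_i.
void ttm_batched(double *X, double *U, double *Y)
{
	CBLAS_TRANSPOSE ta[1] = {CblasNoTrans}, tb[1] = {CblasNoTrans};
	MKL_INT m_arr[1] = {10}, n_arr[1] = {1000}, k_arr[1] = {1000};
	MKL_INT lda[1] = {1000}, ldb[1] = {1000}, ldc[1] = {1000};
	double alpha_arr[1] = {1.0}, beta_arr[1] = {1.0};
	MKL_INT group_count = 1, group_sizes[1] = {1000};
	const double *a_arr[1000], *b_arr[1000];
	double *c_arr[1000];
	for (int i = 0; i < 1000; ++i) {
		a_arr[i] = U;                 // the same U for every slice
		b_arr[i] = X + i * 1000000L;  // i-th input slice (1000 x 1000)
		c_arr[i] = Y + i * 10000L;    // i-th output slice (10 x 1000)
	}
	cblas_dgemm_batch(CblasRowMajor, ta, tb, m_arr, n_arr, k_arr,
	                  alpha_arr, a_arr, lda, b_arr, ldb,
	                  beta_arr, c_arr, ldc, group_count, group_sizes);
}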
int main (int argc, char *argv[]) {
	if (argc == 1){
		std::cerr << "Sintassi: A m_max K_min K_max nucleoni S_tot masse_uguali [modo_H]" << std::endl;
		return true;
	}

	MKLVersion ver;
	
	// Print information on CPU optimization in effect
	
	mkl_get_version(&ver);
	
	printf("Processor optimization: %s\n",ver.Processor);
	
	omp_set_num_threads(8);
	mkl_set_dynamic(true);
	unsigned short A = ((unsigned short) std::strtoul(argv[1], nullptr, 10));
	unsigned short m_max = ((unsigned short) std::strtoul(argv[2], nullptr, 10));
	unsigned short K_min = ((unsigned short) std::strtoul(argv[3], nullptr, 10));
	unsigned short K_max = ((unsigned short) std::strtoul(argv[4], nullptr, 10));
	std::vector <char> nucleoni;
	for (short i = 5; i<5+A; ++i) {
		nucleoni.push_back(*argv[i]);
	}
	unsigned short S_tot = ((unsigned short) std::strtoul(argv[5+A], nullptr, 10));
	bool controllo = ((bool) std::strtoul(argv[6+A], nullptr, 10));
	bool modo_cluster = false;
	if (A==4) {
		beta_riferimento = 2.;
		if (argc == 8+A) {
			modo_cluster = ((bool) std::strtoul(argv[7+A], nullptr, 10)); // set to 1 to enable the H mode for the 4-body case
		}
	}

	Init_Corpi_LS Corp(nucleoni,controllo);
	std::string Nome_dir_vec = "lista_vettori"; 
	if (mkdir(Nome_dir_vec.c_str(), 0777) != 0 && errno != EEXIST) {
		std::cerr << "Warning: could not create directory " << Nome_dir_vec << std::endl;
	}
	init_main_0(A,m_max,K_min,K_max,S_tot,Corp,modo_cluster);
	return 0;
}
Example #4
void matmul_init() {
#ifdef WITH_MKL
	mkl_set_dynamic(1);
#endif
//	omp_set_dynamic(1);
}
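
Note that the commented-out omp_set_dynamic call is a separate switch from mkl_set_dynamic: the former governs OpenMP team sizes, the latter MKL's internal threading. A minimal sketch under the same WITH_MKL guard (both calls are real APIs):

#ifdef WITH_MKL
#include <mkl.h>
#endif
#include <omp.h>

// Sketch: enable dynamic thread adjustment in both runtimes.
void threading_init() {
#ifdef WITH_MKL
	mkl_set_dynamic(1); // MKL may shrink its internal thread count
#endif
	omp_set_dynamic(1); // the OpenMP runtime may shrink thread teams
}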
void StVKReducedStiffnessMatrix::Evaluate(double * q, double * Rq)
{
  // this is the same as EvaluateSubset with start=0, end=quadraticSize

  /*
  int i,j,k;
  int output;

  // reset to free terms
  int index = 0;
  int indexEntry = 0;
  for(output=0; output<r; output++)
  {
    for(i=output; i<r; i++)
    {
      Rq[indexEntry] = freeCoef_[index];
      index++;
      indexEntry++;
    }
    indexEntry += output + 1;
  }

  // add linear terms
  index = 0;
  indexEntry = 0;
  for(output=0; output<r; output++)
  {
    for(i=output; i<r; i++)
    {
      for(j=0; j<r; j++)
      {
        Rq[indexEntry] += linearCoef_[index] * q[j];
        index++;
      }
      indexEntry++;
    }
    indexEntry += output + 1;
  }

  // add quadratic terms
  index = 0;
  indexEntry = 0;
  for(output=0; output<r; output++)
  {
    for(i=output; i<r; i++)
    {
      for(j=0; j<r; j++)
        for(k=j; k<r; k++)
        {
          Rq[indexEntry] += quadraticCoef_[index] * q[j] * q[k];
          index++;
        }
        indexEntry++;
    }
    indexEntry += output + 1;
  }

  // make symmetric
  for(output=0; output<r; output++)
    for(i=0; i<output; i++)
      Rq[ELT(r,i,output)] = Rq[ELT(r,output,i)];
  */

  if (useSingleThread)
  {
    #if defined(_WIN32) || defined(WIN32) || defined(linux)
      mkl_max_threads = mkl_get_max_threads();
      mkl_dynamic = mkl_get_dynamic();
      mkl_set_num_threads(1);
      mkl_set_dynamic(0);
    #elif defined(__APPLE__)
      //setenv("VECLIB_MAXIMUM_THREADS", "1", true);
    #endif
  }

  // reset to free terms
  memcpy(buffer1,freeCoef_,sizeof(double)*quadraticSize);

  // add linear terms
  // multiply linearCoef_ and q
  // linearCoef_ is r x quadraticSize array
  cblas_dgemv(CblasColMajor, CblasTrans, 
        r, quadraticSize,
        1.0,
        linearCoef_, r,
        q, 1,
        1.0,
        buffer1, 1);

  // compute qiqj
  int index = 0;
  for(int output=0; output<r; output++)
    for(int i=output; i<r; i++)
    {
      qiqj[index] = q[output] * q[i];
      index++;
    }
 
  // update Rq
  // quadraticCoef_ is quadraticSize x quadraticSize matrix
  // each column gives quadratic coef for one matrix entry
  cblas_dgemv(CblasColMajor, CblasTrans, 
        quadraticSize, quadraticSize,
        1.0,
        quadraticCoef_, quadraticSize,
        qiqj, 1,
        1.0,
        buffer1, 1);

  // unpack into a symmetric matrix
  int i1=0,j1=0;
  for(int i=0; i< quadraticSize; i++)
  {
    Rq[ELT(r,i1,j1)] = buffer1[i];
    Rq[ELT(r,j1,i1)] = buffer1[i];
    j1++;
    if(j1 == r)
    {
      i1++;
      j1 = i1;
    }
  }

  if (useSingleThread)
  {
    #if defined(_WIN32) || defined(WIN32) || defined(linux)
      mkl_set_num_threads(mkl_max_threads);
      mkl_set_dynamic(mkl_dynamic);
    #elif defined(__APPLE__)
      //unsetenv("VECLIB_MAXIMUM_THREADS");
    #endif
  }
}
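
The save/set/restore pattern around the BLAS calls repeats in both Evaluate routines and could be factored into a scope guard. A sketch, assuming a hypothetical MKLSingleThreadGuard helper (the MKL service calls themselves are real):

#include <mkl.h>

// Hypothetical helper: pin MKL to a single thread for the current
// scope and restore the previous settings on destruction.
class MKLSingleThreadGuard
{
public:
  MKLSingleThreadGuard()
      : savedThreads(mkl_get_max_threads()), savedDynamic(mkl_get_dynamic())
  {
    mkl_set_num_threads(1);
    mkl_set_dynamic(0);
  }
  ~MKLSingleThreadGuard()
  {
    mkl_set_num_threads(savedThreads);
    mkl_set_dynamic(savedDynamic);
  }
private:
  int savedThreads;
  int savedDynamic;
};

With such a guard, each pair of platform-guarded blocks would collapse to a single declaration at the top of the function.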
void StVKReducedInternalForces::Evaluate(double * q, double * fq)
{
/* // unoptimized version
  // reset to zero
  int i,j,k,l;
  for(l=0; l<r; l++)
    fq[l] = 0;

  // add linear terms
  int index = 0;
  for(l=0; l<r; l++)
    for(i=0; i<r; i++)
    {
      fq[l] += linearCoef_[index] * q[i];
      index++;
    }

  // add quadratic terms
  index = 0;
  for(l=0; l<r; l++)
    for(i=0; i<r; i++)
      for(j=i; j<r; j++)
      {
        fq[l] += quadraticCoef_[index] * q[i] * q[j];
        index++;
      }

  // add cubic terms
  index = 0;
  for(l=0; l<r; l++)
    for(i=0; i<r; i++)
      for(j=i; j<r; j++)
        for(k=j; k<r; k++)
        {
          fq[l] += cubicCoef_[index] * q[i] * q[j] * q[k];
          index++;
        }
*/

  if (useSingleThread)
  {
    #if defined(_WIN32) || defined(WIN32) || defined(linux)
      mkl_max_threads = mkl_get_max_threads();
      mkl_dynamic = mkl_get_dynamic();
      mkl_set_num_threads(1);
      mkl_set_dynamic(0);
    #elif defined(__APPLE__)
      //setenv("VECLIB_MAXIMUM_THREADS", "1", true);
    #endif
  }

  // add linear terms
  // multiply linearCoef_ and q
  // linearCoef_ is r x r array
  cblas_dgemv(CblasColMajor, CblasTrans,
       r, r,
       1.0,
       linearCoef_, r,
       q, 1,
       0.0,
       fq, 1);

  // compute qiqj
  int index = 0;
  for(int output=0; output<r; output++)
    for(int i=output; i<r; i++)
    {
      qiqj[index] = q[output] * q[i];
      index++;
    }

  // add quadratic terms
  // quadraticCoef_ is quadraticSize x r matrix
  // each column gives quadratic coef for one force vector component
  cblas_dgemv(CblasColMajor, CblasTrans,
       quadraticSize, r,
       1.0,
       quadraticCoef_, quadraticSize,
       qiqj, 1,
       1.0,
       fq, 1);

  // add cubic terms
  // cubicCoef_ is cubicSize x r matrix
  // each column gives cubicSize coef for one force vector component
  int size = quadraticSize;
  double * qiqjPos = qiqj;
  double * cubicCoefPos = cubicCoef_;
  for(int i=0; i<r; i++)
  {
    cblas_dgemv(CblasColMajor, CblasTrans,
        size, r,
        q[i],
        cubicCoefPos, cubicSize,
        qiqjPos, 1,
        1.0,
        fq, 1);

    int param = r-i;
    size -= param;
    qiqjPos += param;
    cubicCoefPos += param * (param+1) / 2;
  }

  if (addGravity)
  {
    for(int i=0; i<r; i++)
      fq[i] -= reducedGravityForce[i];
  }

  if (useSingleThread)
  {
    #if defined(_WIN32) || defined(WIN32) || defined(linux)
      mkl_set_num_threads(mkl_max_threads);
      mkl_set_dynamic(mkl_dynamic);
    #elif defined(__APPLE__)
      //unsetenv("VECLIB_MAXIMUM_THREADS");
    #endif
  }
}
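
Both routines pack the upper triangle of the outer product q q^T into qiqj so that a single dgemv applies all quadratic coefficients at once. A standalone sketch of that packing, matching the loops above (quadraticSize = r(r+1)/2):

#include <vector>

// Sketch: pack the upper triangle of q*q^T row by row, in the
// qiqj layout consumed by the dgemv calls above.
std::vector<double> PackQiQj(const std::vector<double> & q)
{
  int r = (int)q.size();
  std::vector<double> qiqj;
  qiqj.reserve(r * (r + 1) / 2); // quadraticSize entries
  for(int output=0; output<r; output++)
    for(int i=output; i<r; i++)
      qiqj.push_back(q[output] * q[i]);
  return qiqj;
}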
Example #7
double FastMatmul(Matrix<Scalar>& A, Matrix<Scalar>& B, Matrix<Scalar>& C,
                  int num_steps, double x=1e-8, Scalar alpha=Scalar(1.0), Scalar beta=Scalar(0.0)) {
    MemoryManager<Scalar> mem_mngr;
#ifdef _PARALLEL_
    mem_mngr.Allocate(2, 2, 2, 8, num_steps, A.m(), A.n(), B.n());
#endif
    A.set_multiplier(alpha);
    int num_multiplies_per_step = 8;
    int total_multiplies = static_cast<int>(std::pow(num_multiplies_per_step, num_steps));

    // Set parameters needed for all types of parallelism.
    int num_threads = 0;
#ifdef _PARALLEL_
    # pragma omp parallel
    {
        if (omp_get_thread_num() == 0) {
            num_threads = omp_get_num_threads();
        }
    }
    omp_set_nested(1);
#endif

#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_)
    # pragma omp parallel
    {
        mkl_set_num_threads_local(1);
        mkl_set_dynamic(0);
    }
#endif

#if defined(_PARALLEL_) && (_PARALLEL_ == _DFS_PAR_)
    mkl_set_dynamic(0);
#endif

#if defined(_PARALLEL_) && (_PARALLEL_ == _HYBRID_PAR_)
    if (num_threads > total_multiplies) {
        mkl_set_dynamic(0);
    } else {
        # pragma omp parallel
        {
            mkl_set_num_threads_local(1);
            mkl_set_dynamic(0);
        }
    }
#endif

    // Avoid a modulo-by-zero when built without _PARALLEL_ (num_threads stays 0).
    if (num_threads == 0) num_threads = 1;
    LockAndCounter locker(total_multiplies - (total_multiplies % num_threads));
    using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
    auto t1 = std::chrono::high_resolution_clock::now();

#ifdef _PARALLEL_
    # pragma omp parallel
    {
        # pragma omp single
#endif
        FastMatmulRecursive(locker, mem_mngr, A, B, C, num_steps, num_steps, 0, x, num_threads, beta);
#ifdef _PARALLEL_
    }
#endif
    auto t2 = std::chrono::high_resolution_clock::now();
    return FpMilliseconds(t2 - t1).count();
}
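
The timing above uses a float-millisecond chrono duration instead of duration_cast. The same idiom as a reusable sketch (TimeMs is a hypothetical helper, not part of the original code):

#include <chrono>

// Hypothetical helper: time a callable in fractional milliseconds,
// with the same duration type FastMatmul uses.
template <typename F>
float TimeMs(F&& work) {
    using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
    auto t1 = std::chrono::high_resolution_clock::now();
    work();
    auto t2 = std::chrono::high_resolution_clock::now();
    return FpMilliseconds(t2 - t1).count();
}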
Example #8
void FastMatmulRecursive(LockAndCounter& locker, MemoryManager<Scalar>& mem_mngr, Matrix<Scalar>& A, Matrix<Scalar>& B, Matrix<Scalar>& C, int total_steps, int steps_left, int start_index, double x, int num_threads, Scalar beta) {
    // Update multipliers
    C.UpdateMultiplier(A.multiplier());
    C.UpdateMultiplier(B.multiplier());
    A.set_multiplier(Scalar(1.0));
    B.set_multiplier(Scalar(1.0));
    // Base case for recursion
    if (steps_left == 0) {
        MatMul(A, B, C);
        return;
    }

    Matrix<Scalar> A11 = A.Subblock(2, 2, 1, 1);
    Matrix<Scalar> A12 = A.Subblock(2, 2, 1, 2);
    Matrix<Scalar> A21 = A.Subblock(2, 2, 2, 1);
    Matrix<Scalar> A22 = A.Subblock(2, 2, 2, 2);
    Matrix<Scalar> B11 = B.Subblock(2, 2, 1, 1);
    Matrix<Scalar> B12 = B.Subblock(2, 2, 1, 2);
    Matrix<Scalar> B21 = B.Subblock(2, 2, 2, 1);
    Matrix<Scalar> B22 = B.Subblock(2, 2, 2, 2);
    Matrix<Scalar> C11 = C.Subblock(2, 2, 1, 1);
    Matrix<Scalar> C12 = C.Subblock(2, 2, 1, 2);
    Matrix<Scalar> C21 = C.Subblock(2, 2, 2, 1);
    Matrix<Scalar> C22 = C.Subblock(2, 2, 2, 2);


    // Matrices to store the results of multiplications.
#ifdef _PARALLEL_
    Matrix<Scalar> M1(mem_mngr.GetMem(start_index, 1, total_steps - steps_left, M), C11.m(), C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M2(mem_mngr.GetMem(start_index, 2, total_steps - steps_left, M), C11.m(), C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M3(mem_mngr.GetMem(start_index, 3, total_steps - steps_left, M), C11.m(), C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M4(mem_mngr.GetMem(start_index, 4, total_steps - steps_left, M), C11.m(), C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M5(mem_mngr.GetMem(start_index, 5, total_steps - steps_left, M), C11.m(), C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M6(mem_mngr.GetMem(start_index, 6, total_steps - steps_left, M), C11.m(), C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M7(mem_mngr.GetMem(start_index, 7, total_steps - steps_left, M), C11.m(), C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M8(mem_mngr.GetMem(start_index, 8, total_steps - steps_left, M), C11.m(), C11.m(), C11.n(), C.multiplier());
#else
    Matrix<Scalar> M1(C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M2(C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M3(C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M4(C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M5(C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M6(C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M7(C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M8(C11.m(), C11.n(), C.multiplier());
#endif
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
    bool sequential1 = should_launch_task(8, total_steps, steps_left, start_index, 1, num_threads);
    bool sequential2 = should_launch_task(8, total_steps, steps_left, start_index, 2, num_threads);
    bool sequential3 = should_launch_task(8, total_steps, steps_left, start_index, 3, num_threads);
    bool sequential4 = should_launch_task(8, total_steps, steps_left, start_index, 4, num_threads);
    bool sequential5 = should_launch_task(8, total_steps, steps_left, start_index, 5, num_threads);
    bool sequential6 = should_launch_task(8, total_steps, steps_left, start_index, 6, num_threads);
    bool sequential7 = should_launch_task(8, total_steps, steps_left, start_index, 7, num_threads);
    bool sequential8 = should_launch_task(8, total_steps, steps_left, start_index, 8, num_threads);
#else
    bool sequential1 = false;
    bool sequential2 = false;
    bool sequential3 = false;
    bool sequential4 = false;
    bool sequential5 = false;
    bool sequential6 = false;
    bool sequential7 = false;
    bool sequential8 = false;
#endif



    // M1 = (1 * A11) * (1 * B11)
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
    # pragma omp task if(sequential1) shared(mem_mngr, locker) untied
    {
#endif
        M1.UpdateMultiplier(Scalar(1));
        M1.UpdateMultiplier(Scalar(1));
        FastMatmulRecursive(locker, mem_mngr, A11, B11, M1, total_steps, steps_left - 1, (start_index + 1 - 1) * 8, x, num_threads, Scalar(0.0));
#ifndef _PARALLEL_
#endif
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
        locker.Decrement();
    }
    if (should_task_wait(8, total_steps, steps_left, start_index, 1, num_threads)) {
        # pragma omp taskwait
# if defined(_PARALLEL_) && (_PARALLEL_ == _HYBRID_PAR_)
        SwitchToDFS(locker, num_threads);
# endif
    }
#endif

    // M2 = (1 * A12) * (1 * B21)
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
    # pragma omp task if(sequential2) shared(mem_mngr, locker) untied
    {
#endif
        M2.UpdateMultiplier(Scalar(1));
        M2.UpdateMultiplier(Scalar(1));
        FastMatmulRecursive(locker, mem_mngr, A12, B21, M2, total_steps, steps_left - 1, (start_index + 2 - 1) * 8, x, num_threads, Scalar(0.0));
#ifndef _PARALLEL_
#endif
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
        locker.Decrement();
    }
    if (should_task_wait(8, total_steps, steps_left, start_index, 2, num_threads)) {
        # pragma omp taskwait
# if defined(_PARALLEL_) && (_PARALLEL_ == _HYBRID_PAR_)
        SwitchToDFS(locker, num_threads);
# endif
    }
#endif

    // M3 = (1 * A11) * (1 * B12)
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
    # pragma omp task if(sequential3) shared(mem_mngr, locker) untied
    {
#endif
        M3.UpdateMultiplier(Scalar(1));
        M3.UpdateMultiplier(Scalar(1));
        FastMatmulRecursive(locker, mem_mngr, A11, B12, M3, total_steps, steps_left - 1, (start_index + 3 - 1) * 8, x, num_threads, Scalar(0.0));
#ifndef _PARALLEL_
#endif
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
        locker.Decrement();
    }
    if (should_task_wait(8, total_steps, steps_left, start_index, 3, num_threads)) {
        # pragma omp taskwait
# if defined(_PARALLEL_) && (_PARALLEL_ == _HYBRID_PAR_)
        SwitchToDFS(locker, num_threads);
# endif
    }
#endif

    // M4 = (1 * A12) * (1 * B22)
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
    # pragma omp task if(sequential4) shared(mem_mngr, locker) untied
    {
#endif
        M4.UpdateMultiplier(Scalar(1));
        M4.UpdateMultiplier(Scalar(1));
        FastMatmulRecursive(locker, mem_mngr, A12, B22, M4, total_steps, steps_left - 1, (start_index + 4 - 1) * 8, x, num_threads, Scalar(0.0));
#ifndef _PARALLEL_
#endif
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
        locker.Decrement();
    }
    if (should_task_wait(8, total_steps, steps_left, start_index, 4, num_threads)) {
        # pragma omp taskwait
# if defined(_PARALLEL_) && (_PARALLEL_ == _HYBRID_PAR_)
        SwitchToDFS(locker, num_threads);
# endif
    }
#endif

    // M5 = (1 * A21) * (1 * B11)
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
    # pragma omp task if(sequential5) shared(mem_mngr, locker) untied
    {
#endif
        M5.UpdateMultiplier(Scalar(1));
        M5.UpdateMultiplier(Scalar(1));
        FastMatmulRecursive(locker, mem_mngr, A21, B11, M5, total_steps, steps_left - 1, (start_index + 5 - 1) * 8, x, num_threads, Scalar(0.0));
#ifndef _PARALLEL_
#endif
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
        locker.Decrement();
    }
    if (should_task_wait(8, total_steps, steps_left, start_index, 5, num_threads)) {
        # pragma omp taskwait
# if defined(_PARALLEL_) && (_PARALLEL_ == _HYBRID_PAR_)
        SwitchToDFS(locker, num_threads);
# endif
    }
#endif

    // M6 = (1 * A22) * (1 * B21)
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
    # pragma omp task if(sequential6) shared(mem_mngr, locker) untied
    {
#endif
        M6.UpdateMultiplier(Scalar(1));
        M6.UpdateMultiplier(Scalar(1));
        FastMatmulRecursive(locker, mem_mngr, A22, B21, M6, total_steps, steps_left - 1, (start_index + 6 - 1) * 8, x, num_threads, Scalar(0.0));
#ifndef _PARALLEL_
#endif
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
        locker.Decrement();
    }
    if (should_task_wait(8, total_steps, steps_left, start_index, 6, num_threads)) {
        # pragma omp taskwait
# if defined(_PARALLEL_) && (_PARALLEL_ == _HYBRID_PAR_)
        SwitchToDFS(locker, num_threads);
# endif
    }
#endif

    // M7 = (1 * A21) * (1 * B12)
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
    # pragma omp task if(sequential7) shared(mem_mngr, locker) untied
    {
#endif
        M7.UpdateMultiplier(Scalar(1));
        M7.UpdateMultiplier(Scalar(1));
        FastMatmulRecursive(locker, mem_mngr, A21, B12, M7, total_steps, steps_left - 1, (start_index + 7 - 1) * 8, x, num_threads, Scalar(0.0));
#ifndef _PARALLEL_
#endif
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
        locker.Decrement();
    }
    if (should_task_wait(8, total_steps, steps_left, start_index, 7, num_threads)) {
        # pragma omp taskwait
# if defined(_PARALLEL_) && (_PARALLEL_ == _HYBRID_PAR_)
        SwitchToDFS(locker, num_threads);
# endif
    }
#endif

    // M8 = (1 * A22) * (1 * B22)
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
    # pragma omp task if(sequential8) shared(mem_mngr, locker) untied
    {
#endif
        M8.UpdateMultiplier(Scalar(1));
        M8.UpdateMultiplier(Scalar(1));
        FastMatmulRecursive(locker, mem_mngr, A22, B22, M8, total_steps, steps_left - 1, (start_index + 8 - 1) * 8, x, num_threads, Scalar(0.0));
#ifndef _PARALLEL_
#endif
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
        locker.Decrement();
    }
    if (should_task_wait(8, total_steps, steps_left, start_index, 8, num_threads)) {
        # pragma omp taskwait
    }
#endif

    M_Add1(M1, M2, C11, x, false, beta);
    M_Add2(M3, M4, C12, x, false, beta);
    M_Add3(M5, M6, C21, x, false, beta);
    M_Add4(M7, M8, C22, x, false, beta);

    // Handle edge cases with dynamic peeling
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
    if (total_steps == steps_left) {
        mkl_set_num_threads_local(num_threads);
        mkl_set_dynamic(0);
    }
#endif
    DynamicPeeling(A, B, C, 2, 2, 2, beta);
}
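
For reference, the eight products and four additions above are exactly the classical 2x2 block decomposition of C = A*B. A minimal illustrative sketch of one recursion level with 1x1 (scalar) blocks:

#include <array>

// Sketch: the block identity computed by the eight tasks and four
// M_Add calls above, using the layout {X11, X12, X21, X22}.
std::array<double, 4> Block2x2Multiply(const std::array<double, 4>& A,
                                       const std::array<double, 4>& B) {
    double M1 = A[0] * B[0], M2 = A[1] * B[2]; // C11 = M1 + M2
    double M3 = A[0] * B[1], M4 = A[1] * B[3]; // C12 = M3 + M4
    double M5 = A[2] * B[0], M6 = A[3] * B[2]; // C21 = M5 + M6
    double M7 = A[2] * B[1], M8 = A[3] * B[3]; // C22 = M7 + M8
    return {M1 + M2, M3 + M4, M5 + M6, M7 + M8};
}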