/*
 * This routine handles the case where N <= maxNB && K <= maxKB, so B is
 * only one block.  It is particularly important for the panel factorizations
 * of both LU and QR.
 */
int Mjoin(PATL,tammm_tNK)
(
   enum ATLAS_TRANS TA,
   enum ATLAS_TRANS TB,
   ATL_CINT M,
   ATL_CINT N,
   ATL_CINT K,
   const SCALAR alpha,
   const TYPE *A,
   ATL_CINT lda,
   const TYPE *B,
   ATL_CINT ldb,
   const SCALAR beta,
   TYPE *C,
   ATL_CINT ldc
)
{
   ATL_SZT nmblks;
   amminfo_t mminfo;
   unsigned int i, mb, nb, kb, mu, nu, ku, P, mr;
   ATL_tamm_tNK_t pd;  /* problem definition structure */
   void *vp;

/*
 * This special case only applies for tiny N & K and large M; return 1
 * (unhandled) so the caller falls back to a general routine otherwise
 */
   if (N >= ATL_AMM_MAXNB || K >= ATL_AMM_MAXKB || M < ATL_AMM_MAXMB ||
       M < Mmin(8,ATL_NTHREADS)*ATL_AMM_MAXMB)
      return(1);
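/*
 * e.g. (hypothetical config): with ATL_AMM_MAXNB = ATL_AMM_MAXKB =
 * ATL_AMM_MAXMB = 256 on 12 threads, an M=10000, N=K=128 panel update
 * qualifies (128 < 256, 10000 >= 8*256), while M=1000 is rejected and left
 * to a general GEMM
 */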
   Mjoin(PATL,GetRankKInfo)(&mminfo, TA, TB, M, N, K, alpha, beta);
   pd.a2blk = mminfo.a2blk;
   pd.b2blk = mminfo.b2blk;
   pd.blk2c = mminfo.Cblk2cm;
   pd.amm_b0 = mminfo.amm_b0;
   pd.TA = (TA == AtlasTrans);
   pd.TB = (TB == AtlasTrans);
   pd.N = N;
   pd.K = K;
   pd.A = A;
   pd.B = B;
   pd.C = C;
   pd.lda = lda;
   pd.ldb = ldb;
   pd.ldc = ldc;
   pd.alpha = &alpha;
   pd.beta  = &beta;
   mu = mminfo.mu;
   nu = mminfo.nu;
   ku = mminfo.ku;
   pd.mb = mb = mminfo.mb;
   pd.nmu = mb / mu;
   pd.nnu = (N+nu-1)/nu;
   nb = pd.nnu * nu;
   kb = mminfo.kb;
   nmblks = M / mb;
   mr = M - nmblks*mb;
   if (!mr)
   {
      pd.mbL = mr = mb;
      pd.nmuL = pd.nmu;
   }
   else
   {
      nmblks++;
      pd.nmuL = (mr+mu-1)/mu;
      pd.mbL = pd.nmuL * mu;
   }
   pd.mr = mr;
   pd.nmblks = nmblks;
   pd.KB0 = K;
   #if ATL_MAXKMAJ_RKK > 1
      if (ATL_AMMFLG_KMAJOR(mminfo.flag))
         pd.KB0 = ((K+ku-1)/ku)*ku;
   #endif
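/*
 * e.g. (hypothetical kernel): for K=30 with a K-major kernel using ku=4,
 * KB0 = ((30+3)/4)*4 = 32, i.e. K is padded up to a multiple of ku; for
 * non-K-major kernels KB0 stays at K=30
 */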
/*
 * Maximum scale is limited by NTHREADS or max number of M-blocks
 */
   P = (ATL_NTHREADS <= nmblks) ? ATL_NTHREADS : nmblks;
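/*
 * e.g. (hypothetical sizes): M=10000 with mb=120 gives nmblks=84 (83 full
 * blocks plus the partial one), so with ATL_NTHREADS=12 we use P=12; if
 * nmblks were only 5, P would drop to 5 so no thread starts without an
 * M-block to own
 */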
/*
 * We have a common B workspace of size KB0*nb; then each node needs its own
 * workspace for its A & C blocks, sz(A)=mb*KB0 and sz(C)=mb*nb, laid out in
 * memory as A,C.  We add a safety margin of mu*nu*ku so advance loads don't
 * seg fault, plus space for aligning the pointers.
 */
   pd.bsz = pd.KB0*nb;
   pd.wsz = mb*(pd.nnu*nu + pd.KB0) + 2*ATL_DivBySize(ATL_Cachelen);
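/*
 * e.g. (hypothetical sizes): mb=120, nb=nnu*nu=12, KB0=16 gives a per-node
 * workspace of 120*(12+16) = 3360 elements for the A & C blocks plus the two
 * cacheline fudges; the single shared B block adds bsz = 16*12 = 192
 * elements, and the mu*nu*ku pad keeps prefetches past the last block inside
 * the allocation
 */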

   vp = malloc(ATL_MulBySize(pd.wsz*P + pd.bsz+mu*nu*ku) + ATL_Cachelen);
   if (!vp)
      return(2);
   pd.w = ATL_AlignPtr(vp);
   pd.MbCtr = ATL_SetGlobalAtomicCount(ATL_EstNctr(nmblks, P), nmblks, 0);
   pd.BassgCtr = ATL_SetAtomicCount(1);
   pd.BdoneCtr = ATL_SetAtomicCount(1);
   #ifdef DEBUG1
   {
      ATL_LAUNCHSTRUCT_t ls;
      ATL_thread_t ts;
      ts.rank = 0;
      ts.P = 1;
      ls.opstruct = &pd;
      Mjoin(PATL,DoWork_tamm_tNK)(&ls, &ts);
   }
   #else
      ATL_goparallel(P, Mjoin(PATL,DoWork_tamm_tNK), &pd, NULL);
   #endif

   ATL_FreeAtomicCount(pd.BdoneCtr);
   ATL_FreeAtomicCount(pd.BassgCtr);
   ATL_FreeGlobalAtomicCount(pd.MbCtr);

   free(vp);
   return(0);
}
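/*
 * Usage sketch (illustration, not ATLAS source): a caller only needs the
 * return code of the routine above: 0 means the product was computed here,
 * while nonzero (shape doesn't qualify, or workspace malloc failed) means a
 * general threaded GEMM must be run instead.  The helper name and the
 * gemm_fallback function pointer below are hypothetical.
 */
static void my_tNK_or_fallback
(
   enum ATLAS_TRANS TA, enum ATLAS_TRANS TB, ATL_CINT M, ATL_CINT N,
   ATL_CINT K, const SCALAR alpha, const TYPE *A, ATL_CINT lda,
   const TYPE *B, ATL_CINT ldb, const SCALAR beta, TYPE *C, ATL_CINT ldc,
   void (*gemm_fallback)(enum ATLAS_TRANS, enum ATLAS_TRANS, ATL_CINT,
      ATL_CINT, ATL_CINT, const SCALAR, const TYPE*, ATL_CINT, const TYPE*,
      ATL_CINT, const SCALAR, TYPE*, ATL_CINT)
)
{
   if (Mjoin(PATL,tammm_tNK)(TA, TB, M, N, K, alpha, A, lda, B, ldb,
                             beta, C, ldc))        /* nonzero ==> not done */
      gemm_fallback(TA, TB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
}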
int Mjoin(PATL,tsyrk_amm_K)
(
   const enum ATLAS_UPLO Uplo,
   const enum ATLAS_TRANS Trans,
   ATL_CINT N,
   ATL_CINT K,
   const SCALAR alpha,
   const TYPE *A,
   ATL_CINT lda,
   const SCALAR beta,
   TYPE *C,
   ATL_CINT ldc
)
{
   amminfo_t mminfo;
   ATL_tsyrk_ammK_t pd;
   ablk2cmat_t Mjoin(PATL,tGetSyammInfo_K)
      (amminfo_t *out, const int P, enum ATLAS_TRANS TA, ATL_CSZT N,ATL_CSZT K);
   int kb=ATL_AMM_MAXKB, nkb = K / ATL_AMM_MAXKB, P = ATL_NTHREADS;
   int ku, kr, mb, nb, mu, nu;
   size_t sz;
   void *vp=NULL;

   if (nkb < P)
   {
      kb = ATL_AMM_98KB;
      nkb = K / ATL_AMM_98KB;
      if (nkb < P)
      {
         nkb = K / ATL_AMM_66KB;
         kb = ATL_AMM_66KB;
      }
   }
   if (nkb < P)
   {
      if (nkb < 2)
      {
          Mjoin(PATL,syrk)(Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc);
          return(0);
      }
      P = nkb;
   }
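/*
 * e.g. (hypothetical sizes): K=1500 with ATL_AMM_MAXKB=256 gives only nkb=5
 * K-blocks, so for P=12 we retry with the smaller ATL_AMM_98KB and then
 * ATL_AMM_66KB blockings to raise nkb; if even then nkb < 2 there is nothing
 * to split along K and the problem goes to serial syrk, otherwise P is cut
 * down to nkb so every thread owns at least one K-block
 */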
   pd.blk2c_b0 = Mjoin(PATL,tGetSyammInfo_K)(&mminfo, P, Trans, N, kb);
   kb = mminfo.kb;
   nkb = K / kb;

   mu = mminfo.mu;
   nu = mminfo.nu;
   pd.nmu = (N+mu-1) / mu;
   pd.nnu = (N+nu-1) / nu;
   pd.mb = mb = pd.nmu*mu;
   pd.nb = nb = pd.nnu*nu;
   pd.kb = mminfo.kb;
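/*
 * Per-rank workspace layout (used again by the write-back block at the end):
 * kb*(mb+nb) elements for the copied A & B blocks, then two mb*nb buffers,
 * the block-major accumulator wC and its column-major image w
 */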
   sz = ((((size_t)mb)*nb)<<1) + (mb+nb)*kb;
   pd.wsz = sz;
   sz = ATL_MulBySize(sz)*P;
   vp = malloc(sz+ATL_Cachelen);
   if (!vp)
      return(1);
   pd.w = ATL_AlignPtr(vp);
   kr = K - nkb*kb;
   pd.kb0 = pd.KB0 = kr;
   ku = mminfo.ku;
   if (!kr)
   {
      pd.kb0 = pd.KB0 = kb;
      pd.ammK_b0 = mminfo.amm_b0;
      pd.ammK_b1 = mminfo.amm_b1;
   }
   else
   {
      #if ATL_AMM_MAXKMAJ > 1
         if (ATL_AMMFLG_KMAJOR(mminfo.flag))
         {
            pd.KB0 = ((kr+ku-1)/ku)*ku;
            if (ATL_AMMFLG_KRUNTIME(mminfo.flag))
            {
               pd.ammK_b0 = mminfo.amm_b0;
               pd.ammK_b1 = mminfo.amm_b1;
            }
            else
            {
               pd.ammK_b0 = mminfo.amm_k1_b0;
               pd.ammK_b1 = mminfo.amm_k1_b1;
            }
         }
         else
      #endif
      {
         if (ATL_AMMFLG_KRUNTIME(mminfo.flag) && kr == (kr/ku)*ku &&
             kr > mminfo.kbmin)
         {
               pd.ammK_b0 = mminfo.amm_b0;
               pd.ammK_b1 = mminfo.amm_b1;
         }
         else
         {
               pd.ammK_b0 = mminfo.amm_k1_b0;
               pd.ammK_b1 = mminfo.amm_k1_b1;
         }
      }
   }
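/*
 * e.g. (hypothetical values): kb=256, ku=4, K=1500 leaves kr=1500-5*256=220;
 * since 220 is a multiple of ku (and above kbmin) a K-runtime kernel handles
 * the remainder block directly, otherwise the k-cleanup (amm_k1_*) kernels
 * are selected
 */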
   pd.amm_b0 = mminfo.amm_b0;
   pd.amm_b1 = mminfo.amm_b1;
   pd.blk2c_b1 = mminfo.Cblk2cm;
   pd.a2blk = mminfo.a2blk;
   pd.b2blk = mminfo.b2blk;
   pd.A = A;
   pd.C = C;
   pd.alpha = &alpha;
   pd.beta = &beta;
   pd.nkblks = (kr) ? nkb+1 : nkb;
   pd.KbCtr = ATL_SetGlobalAtomicCount(ATL_EstNctr(pd.nkblks, P), pd.nkblks, 0);
   pd.Cmut = ATL_mutex_init();
   pd.BETA_APPLIED = SCALAR_IS_ONE(beta);
   pd.LOWER = (Uplo == AtlasLower);
   pd.TA = (Trans == AtlasTrans);
   pd.N = N;
   pd.lda = lda;
   pd.ldc = ldc;

//   #define DEBUG1 1
   #ifdef DEBUG1
   {
      ATL_LAUNCHSTRUCT_t ls;
      ATL_thread_t ts;
      ts.rank = 0;
      ts.P = 1;
      ls.opstruct = &pd;
      Mjoin(PATL,DoWork_syrk_amm_K)(&ls, &ts);
   }
   #else
      ATL_goparallel(P, Mjoin(PATL,DoWork_syrk_amm_K), &pd,
                     Mjoin(PATL,CombSyrk_ammK));
   #endif
/*
 * Answer is written back to rank0's workspace, extract it & write to C
 */
   {
      TYPE *wC = pd.w+kb*(mb+nb), *w = wC + mb*nb, *c = C;
/*
 *    blk2c writes the block-major result in wC out to w as a standard
 *    column-major NxN matrix (alpha=1, beta=0)
 */
      pd.blk2c_b0(N, N, ATL_rone, wC, ATL_rzero, w, N);
/*
 *    Now copy out only the upper or lower portion, applying alpha & beta
 */
      if (pd.LOWER)
      {
         int j;
         for (j=0; j < N; j++, c += ldc+1, w += N+1)
             Mjoin(PATL,axpby)(N-j, alpha, w, 1, beta, c, 1);
      }
      else
      {
         int j;
         for (j=0; j < N; j++, c += ldc, w += N)
             Mjoin(PATL,axpby)(j+1, alpha, w, 1, beta, c, 1);
      }
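/*
 *    e.g. (Lower, hypothetical N=4): j=0 copies 4 elements starting at
 *    C(0,0) & w(0,0); j=1 copies 3 starting at C(1,1) (c advanced by ldc+1,
 *    w by N+1), and so on, so only the lower triangle of C is touched, with
 *    axpby computing C(i,j) = alpha*w + beta*C(i,j)
 */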
   }
   free(vp);
   ATL_mutex_free(pd.Cmut);
   ATL_FreeGlobalAtomicCount(pd.KbCtr);
   return(0);
}
void ATL_goparallel
/*
 * This function is used when you pass a single opstruct to all threads.
 * In this case, we stash opstruct in the launch structure, and pass each
 * thread its own entry in the rank array (ATL_thread_t) during the spawn.
 * Therefore, these routines should expect to get their problem definition
 * from lp->opstruct, and their rank from the second argument.  The DoWork
 * function is the function that should be called from each thread to do
 * the parallel work, and should look like:
 * void DoWork_example(ATL_LAUNCHSTRUCT_t *lp, void *vp)
 * {
 *    ATL_thread_t *tp = vp;
 *    const int myrank = tp->rank;
 *    my_prob_def_t *pd = lp->opstruct;
 *    ... do work based on info in the struct pointed to by lp->opstruct ...
 * }
 * Your DoWork should perform any needed combine before finishing execution,
 * and any return values can be passed in the problem definition structure
 * that you define.  A complete usage sketch follows this routine.
 */
(
   const unsigned int P, /* # of cores to use */
   void *DoWork,         /* func ptr to work function */
   void *opstruct,       /* structure giving tasks to threads */
   void *DoComb          /* function to combine two opstructs */
)
{
   ATL_thread_t *tp;
   int *chkin;
   void *vp, *lc;
   int i;
   ATL_LAUNCHSTRUCT_t ls;

   ls.OpStructIsInit = NULL;
   ls.DoWork = DoWork;
   ls.DoComb = DoComb;
   ls.opstruct = opstruct;
#ifdef ATL_OMP_THREADS
   tp = malloc(sizeof(ATL_thread_t)*P);
   ATL_assert(tp);
   for (i=0; i < P; i++)
   {
      tp[i].vp = &ls;
      tp[i].rank = i;
      tp[i].P = P;
   }
   ls.rank2thr = tp;
   omp_set_num_threads(P);
   #pragma omp parallel private(i)
   {
/*
 *    Make sure we got the requested nodes, and set affinity if supported
 */
      ATL_assert(omp_get_num_threads() == P);
      #ifdef ATL_PAFF_SELF
         ATL_setmyaffinity();
      #endif
      i = omp_get_thread_num();
      ls.DoWork(&ls, tp+i);
   }
/*
 * Do combine (linear) if requested
 */
   if (DoComb)
      for (i=1; i < P; i++)
         ls.DoComb(ls.opstruct, 0, i);
#else
   #if ATL_USE_DYNAMIC
      ls.acounts = &lc;
      ls.acounts[0] = ATL_SetGlobalAtomicCount(P>>1, P-1, 0);
      vp = malloc(P*(sizeof(ATL_thread_t)+sizeof(int)) + ATL_Cachelen);
      ATL_assert(vp);
      chkin = vp;
      tp = (ATL_thread_t*)(chkin+P);
      tp = ATL_AlignPtr(tp);
   #else
      vp = malloc(P*(sizeof(ATL_thread_t)) + ATL_Cachelen);
      ATL_assert(vp);
      tp = ATL_AlignPtr(vp);
   #endif
   ls.rank2thr = tp;

   for (i=0; i < P; i++)
   {
      tp[i].vp = &ls;
      tp[i].rank = i;
      tp[i].P = P;
      #if ATL_USE_DYNAMIC
         chkin[i] = 0;
      #endif
   }
   #if ATL_USE_DYNAMIC
      ls.chkin = (volatile int*) chkin;
   #endif
   ATL_thread_start(tp, 0, 1, ATL_dyntlaunch, tp);
   ATL_thread_join(tp);
   #if ATL_USE_DYNAMIC
      ATL_FreeGlobalAtomicCount(ls.acounts[0]);
   #endif
   free(vp);
#endif
}
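/*
 * Minimal usage sketch (illustration, not ATLAS source): a parallel sum over
 * a vector of a real precision.  The struct, worker and combiner names below
 * are hypothetical; the worker pulls its problem definition from lp->opstruct
 * and its rank from the ATL_thread_t passed as the second argument, the same
 * convention the DEBUG1 harnesses above rely on, and the combiner follows the
 * (opstruct, dest, src) call made by the linear combine loop in the OpenMP
 * branch.
 */
typedef struct
{
   const TYPE *X;            /* vector to sum */
   size_t N;                 /* its length */
   TYPE sums[ATL_NTHREADS];  /* one partial result per rank */
} my_sum_t;

static void my_DoWork_sum(ATL_LAUNCHSTRUCT_t *lp, void *vp)
{
   ATL_thread_t *tp = vp;
   my_sum_t *pd = lp->opstruct;
   const unsigned int rank = tp->rank, P = tp->P;
   const size_t i0 = (pd->N*rank)/P, i1 = (pd->N*(rank+1))/P;
   size_t i;
   TYPE s = ATL_rzero;

   for (i=i0; i < i1; i++)   /* each rank sums its own contiguous chunk */
      s += pd->X[i];
   pd->sums[rank] = s;
}

static void my_DoComb_sum(void *vpd, const int dest, const int src)
{  /* fold rank src's partial sum into rank dest's slot */
   my_sum_t *pd = vpd;
   pd->sums[dest] += pd->sums[src];
}
/*
 * Caller:  my_sum_t pd;  pd.X = x;  pd.N = n;
 *          ATL_goparallel(ATL_NTHREADS, my_DoWork_sum, &pd, my_DoComb_sum);
 *          after the combine, pd.sums[0] holds the total
 */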