Example no. 1
0
/*
 * In the first phase, we work only on diagonal blocks, while copying both
 * A & A'.  For diag work, we parallelize both N & K dims so that the copy
 * is done as quickly as possible.  Threads coming in first choose differing
 * diag blks; diagonal blocks are dealt out cheaply using the dCtr global
 * counter (which starts at nnblks == ndiag).
 * Once all diagonal blocks are dealt out, new threads will start using
 * the atomic counter array KbegCtr to share K work for each diagonal.
 * Both KbegCtr & KdonCtr are nnblk-len arrays of atomic counters.  Each
 * counter starts at nkblks.  Once the block pointed to by KbegCtr is
 * completely copied, the copying core increments the KdonCtr.  Only one
 * core per diag will get KdonCtr == 0 after doing its copy, and this
 * core will seize cdmut mutex in order to set the appropriate bit in
 * cpydonBV, which is a nnblks-length bit vector.  If the kth bit is set,
 * that means the kth row of A & kth col of A' has been copied.
 * Once a thread gets KbegCtr for a particular diag of 0, it means there's
 * no more work for this block of C, and so it will seize the appropriate
 * Cdmuts mutex which protects each diagonal block of C, and write its
 * finished contribution out to C.  The first such thread to ever seize
 * the mutex will scope dbetaBV to find this diagonal block needs beta applied;
 * while later threads will use beta=1.
 * Eventually, all diagonal work is finished, and the first processor to
 * get 0 for all dCtr & KbegCtr requests will set NODWORK=1, so later
 * threads don't have to query all the counters to know they should proceed
 * to non-diagonal work.
 */
/*
 * Worker loop for the diagonal phase of the threaded SYRK (see the block
 * comment above for the overall counter protocol).
 *    rank : this thread's rank; used to stagger atomic-counter access
 *    pd   : shared problem-definition structure (counters, workspaces, flags)
 * Repeatedly claims a (diagonal block, K block) work unit via the global
 * atomic counters and hands it to DoBlksWtCpy; returns once no diagonal
 * work remains (publishing that fact in pd->NODWORK).
 */
static void DoDiag(const int rank, ATL_tsyrk_ammN_t *pd)
{
   int DIAG=1, k;  /* DIAG: still worth querying dCtr for a fresh diag blk */
   TYPE *myC = pd->wC + (rank+rank)*(pd->nbnb);  /* my private C workspace */
   while (!(pd->NODWORK))
   {
       int d=0;

/*
 *     Find which diagonal block to work on, and then which k blk to use
 */
       if (DIAG)
       {
          d = ATL_DecGlobalAtomicCount(pd->dCtr, rank);
          if (d)  /* d is count-style: diag blk index is ndiag-d */
          {
             k = ATL_DecGlobalAtomicCount((pd->KbegCtr)[pd->ndiag - d], rank);
             if (!k)     /* if no more K work to do */
                d = 0;   /* can't work on this diag after all */
          }
       }
/*
 *     If all diagonal blocks currently being worked on by threads, find
 *     one that I can help with.
 */
       if (!d)
       {
          unsigned int i, n=pd->ndiag;
          DIAG = 0;  /* dCtr is exhausted; don't query it on later trips */
          for (i=0; i < n; i++)
          {
             unsigned int j = (i+rank)%n;  /* stagger start by rank to cut contention */
             k = ATL_DecGlobalAtomicCount((pd->KbegCtr)[j], rank);
             d = n-j;  /* encode blk j in the same count-style form as dCtr */
             if (k)
                goto FOUNDDK;
          }
          pd->NODWORK = 1;    /* no work left to assign */
          return;             /* so done with copy of A/A' & this func */
       }
/*
 *     If I reach here, I've got a valid d & k;  and I'll call a routine
 *     that continues to grab blocks from this diag along K until all K
 *     is done; it will then write the answer back to the original C, and
 *     return to this loop to see if it can help with another diag.
 */
       FOUNDDK:
          DoBlksWtCpy(rank, pd, pd->ndiag-d, k, myC);
   }
}
Example no. 2
0
/*
 * Computes one diagonal block of C: consumes the caller's claimed K-block
 * plus all remaining K-blocks for diagonal dblk (accumulating in wC), then
 * scales by alpha and writes the triangular result back to the original C
 * under that block's mutex, applying the user beta exactly once.
 * Fix vs. original: removed unused locals TRANS, wA, wB; parenthesized the
 * assignment-in-condition in the while loop to silence -Wparentheses.
 */
static void DoBlksWtCpy
(
   unsigned const int rank,   /* my thread rank */
   ATL_tsyrk_ammN_t *pd,      /* problem def structure */
   unsigned const int dblk,   /* diagonal block of C to compute */
   unsigned int kctr,         /* non-zero Kbeg ctr */
   TYPE *wC                   /* my private workspace */
)
{
   int kblk = pd->nkblks - kctr;  /* counter counts down, so blk = nk - ctr */
/*
 * Last diagonal block may be partial, so it uses the 'f' (final) dimensions
 */
   const int b = (dblk == pd->ndiag-1) ? pd->nbf : pd->nb;
   const int nmu = (dblk == pd->ndiag-1) ? pd->nmuf : pd->nmu;
   const int nnu = (dblk == pd->ndiag-1) ? pd->nnuf : pd->nnu;
   TYPE *w, *c;
   TYPE beta = ATL_rone;
/*
 * First K-block initializes wC (beta=0 kernel); remaining claimed K-blocks
 * accumulate into it (beta=1 kernel) until this diag's KbegCtr hits zero
 */
   DoBlkWtCpy(rank, pd, dblk, kblk, b, nmu, nnu, pd->amm_b0, wC);
   while ((kctr = ATL_DecGlobalAtomicCount((pd->KbegCtr)[dblk], rank)))
   {
      kblk = pd->nkblks - kctr;
      DoBlkWtCpy(rank, pd, dblk, kblk, b, nmu, nnu, pd->amm_b1, wC);
   }
/*
 * Since I'm done with this blk of C, copy it to block-major storage, and
 * scale by alpha (blk2c won't access only triangle, so must cpy first)
 */
   w = wC + pd->nbnb;
   pd->blk2d(b, b, *(pd->alpha), wC, ATL_rzero, w, b);
/*
 * Now, seize mutex for diagonal block of original C, and copy back out
 * only above/below diagonal
 */
   c = pd->C + dblk*(pd->nb)*(pd->ldc+1);  /* start of diag blk in user's C */
   ATL_mutex_lock(pd->Cdmuts[dblk]);
   if (!ATL_IsBitSetBV(pd->dbetaBV, dblk)) /* if I apply beta */
   {
      ATL_SetBitBV(pd->dbetaBV, dblk); /* tell rest of thrs don't apply beta */
      beta = *(pd->beta);              /* because I'm going to do it */
   }
   if (pd->LOWER)
   {
      int j;
      for (j=0; j < b; j++, c += pd->ldc+1, w += b+1)  /* col j, from diag down */
          Mjoin(PATL,axpby)(b-j, ATL_rone, w, 1, beta, c, 1);
   }
   else
   {
      int j;
      for (j=0; j < b; j++, c += pd->ldc, w += b)      /* col j, top to diag */
          Mjoin(PATL,axpby)(j+1, ATL_rone, w, 1, beta, c, 1);
   }
   ATL_mutex_unlock(pd->Cdmuts[dblk]);
}
Example no. 3
0
/*
 * Worker for the K-split SYRK: each thread claims K-blocks off the shared
 * KbCtr counter, copies that panel of A into both A- and A'-format buffers,
 * and accumulates its partial product into its private wC, to be merged by
 * a later combine step.
 *    rank : my thread rank (indexes my slice of the shared workspace)
 *    pd   : problem-definition structure
 * Note: amm doubles as a did-I-do-any-work flag -- it starts as the beta=0
 * kernel and flips to the beta=1 kernel after the first claimed block.
 */
static void DoSyrkK(unsigned int rank, ATL_tsyrk_ammK_t *pd)
{
   TYPE *wA, *wB, *wC;
   ammkern_t amm = pd->amm_b0;  /* first block initializes wC (beta=0) */
   const cm2am_t a2blk=pd->a2blk, b2blk=pd->b2blk;
   const unsigned int mb=pd->mb, nb=pd->nb, kb=pd->kb, nkblks=pd->nkblks,
      N=pd->N, nmu=pd->nmu, nnu=pd->nnu;
   const size_t mulA = (pd->TA) ? 1 : pd->lda;  /* elt stride to next K-blk */
   int lda=pd->lda;
   int kctr;
   wA = pd->w + pd->wsz*rank;  /* my private slice of shared workspace */
   wB = wA + mb*kb;
   wC = wB + nb*kb;

   while ((kctr = ATL_DecGlobalAtomicCount(pd->KbCtr, rank)))
   {
      const int kblk = nkblks - kctr;  /* counter counts down from nkblks */
      const TYPE *a = pd->A + ((size_t)kblk)*mulA*kb;
      if (kblk != nkblks-1)  /* normal full-kb operation */
      {
         a2blk(kb, N, ATL_rone, a, lda, wA);
         b2blk(kb, N, ATL_rone, a, lda, wB);
         amm(nmu, nnu, kb, wA, wB, wC, wA, wB, wC);
      }
      else /* last block of size kb0 */
      {
         a2blk(pd->kb0, N, ATL_rone, a, lda, wA);
         b2blk(pd->kb0, N, ATL_rone, a, lda, wB);
         /* pick the K-cleanup kernel matching amm's current beta setting */
         if (amm == pd->amm_b0)
            pd->ammK_b0(nmu, nnu, pd->KB0, wA, wB, wC, wA, wB, wC);
         else
            pd->ammK_b1(nmu, nnu, pd->KB0, wA, wB, wC, wA, wB, wC);
      }
      amm = pd->amm_b1;  /* subsequent blocks accumulate into wC */
   }
/*
 * If I did no work, zero my wrkspace so I don't screw up combine!
 */
   if (amm != pd->amm_b1)  /* amm never flipped ==> loop body never ran */
      Mjoin(PATL,zero)(mb*nb, wC, 1);
}
Example no. 4
0
/*
 * Dynamic thread launch + combine.  Every launched thread runs this: it
 * first helps spawn the remaining threads (dealt out by an atomic counter,
 * so startup cost is spread across cores), then does its share of the work,
 * then participates in a minimum-spanning-tree (hypercube) combine.
 * NOTE(review): the matching #if for the #endif below is outside this view
 * -- presumably it selects between alternate function signatures; confirm
 * against the full file.
 */
void *ATL_dyntlaunch(void *vp)
#endif
{
   ATL_thread_t *tp = vp, *btp;
   ATL_LAUNCHSTRUCT_t *lp;
   const int iam = tp->rank, P = tp->P;
   int i, src, dest, nthrP2, mask, abit;
   void *acnt;

   lp = tp->vp;
   acnt = lp->acounts[0];  /* counter dealing out ranks still to be started */
   btp = tp - iam;         /* base of the thread array (rank 0's entry) */
/*
 * Set my affinity if I haven't already
 */
   #ifdef ATL_PAFF_SELF
      if (!tp->paff_set)
          ATL_setmyaffinity(tp);
   #endif
/*
 * Keep claiming unstarted ranks off the counter and launching them (each
 * new thread re-enters this function and helps launch the rest)
 */
   dest = ATL_DecGlobalAtomicCount(acnt, iam);
   while(dest)
   {
      dest = tp->P - dest;  /* counter value -> thread rank to start */
      ATL_thread_start(btp+dest, dest, 0, ATL_dyntlaunch, btp+dest);
      dest = ATL_DecGlobalAtomicCount(acnt, iam);
   }
/*
 * Do the operation
 */
   lp->DoWork(lp, tp);
/*
 * Do combine in minimum spanning tree, combining results as required
 */
   for (i=0; (1<<i) < P; i++);  /* nthrP2 = ceil(log2(P)) */
   nthrP2 = i;
   mask = 0;
   for (i=0; i < nthrP2; i++)
   {
      if (!(iam & mask))  /* still an active node at tree level i */
      {
         abit = (1<<i);
         if (!(iam & abit))  /* I'm the receiver: wait on partner, combine */
         {
            src = iam ^ abit;
            if (src < P)  /* partner may not exist if P not a power of 2 */
            {
               while (lp->chkin[src] != ATL_CHK_DONE_OP)
                  ATL_POLL;
               if (lp->DoComb)
                  lp->DoComb(lp->opstruct, iam, src);
            }
         }
         else  /* I'm the sender: post my results and retire */
         {
            lp->chkin[iam] = ATL_CHK_DONE_OP;
            ATL_thread_exit(NULL);
         }
      }
      /*
       * If the guard above failed, abit still holds the bit from the last
       * level where it held; that bit is already in mask, so this is a no-op
       * (and the guard held at level 0, so abit is never read uninitialized
       * when this loop runs).
       */
      mask |= abit;
   }
   return(NULL);  /* only rank 0 survives every level and returns */
}
Example no. 5
0
/*
 * Threaded GEMM worker for the tall-N/K case: B is copied to block storage
 * exactly once (by whichever thread wins the BassgCtr race), while all
 * threads claim M-blocks of A off the shared MbCtr counter, copy them, and
 * multiply against the shared copied B, writing each C block directly.
 *    lp : launch structure (provides the opstruct)
 *    vp : my ATL_thread_t (provides my rank)
 */
void Mjoin(PATL,DoWork_tamm_tNK)(ATL_LAUNCHSTRUCT_t *lp, void *vp)
{
   ATL_thread_t *tp = vp;
   ATL_tamm_tNK_t *pd = lp->opstruct;  /* problem definition structure */
   const unsigned int rank = tp->rank, mb=pd->mb, kb=pd->KB0, K=pd->K,
      N=pd->N, nmblks = pd->nmblks, nmbm1=nmblks-1, nmu=pd->nmu, nnu=pd->nnu,
      lda=pd->lda, ldc=pd->ldc;
   TYPE *pB, *pA, *pC, *C = pd->C;
   const TYPE *A = pd->A;
   int BCOPIED=0;  /* nonzero if I was the thread that copied B */
   int imtr;
   const TYPE beta = *pd->beta;
   const size_t incA = (pd->TA) ? lda : 1;  /* elt stride between A rows */
   cm2am_t a2blk = pd->a2blk;
   ablk2cmat_t blk2c = pd->blk2c;
   ammkern_t amm = pd->amm_b0;

   pB = pd->w;  /* shared B workspace at base of allocation */
/*
 * First guy here starts to copy B
 */
   if (ATL_DecAtomicCount(pd->BassgCtr))
   {
      pd->b2blk(K, N, *pd->alpha, pd->B, pd->ldb, pB);
/*
 *    Let waiting threads know B is ready for use
 */
      BCOPIED = ATL_DecAtomicCount(pd->BdoneCtr);
      ATL_assert(BCOPIED);
   }
   pA = pB + pd->bsz + rank*(pd->wsz);  /* my private A/C workspace slice */
   pA = ATL_AlignPtr(pA);
   pC = pA + mb*kb;
   pC = ATL_AlignPtr(pC);
/*
 * For first block I work on, I must await B to be copied
 */
   if (!BCOPIED)
   {
      int iblk;
      size_t ii;

      imtr = ATL_DecGlobalAtomicCount(pd->MbCtr, rank);
      if (imtr)
      {
         iblk = nmblks - imtr;  /* counter counts down from nmblks */
         ii = mb*iblk;
         /* copy my A block first, overlapping with the B copy in flight;
          * the last M-block may be partial (mr rows, nmuL micro-tiles) */
         if (iblk != nmbm1)
            a2blk(K, mb, ATL_rone, A+incA*ii, lda, pA);
         else
            a2blk(K, pd->mr, ATL_rone, A+incA*ii, lda, pA);
         while (ATL_GetAtomicCount(pd->BdoneCtr))  /* await B cpy finish */
            ATL_thread_yield();
         if (iblk != nmbm1)
         {
            amm(nmu, nnu, kb, pA, pB, pC, pA, pB, pC);
            blk2c(mb, N, ATL_rone, pC, beta, C+ii, ldc);
         }
         else
         {
            amm(pd->nmuL, nnu, kb, pA, pB, pC, pA, pB, pC);
            blk2c(pd->mr, N, ATL_rone, pC, beta, C+ii, ldc);
         }
      }
   }
/*
 * Now, B is ready, so just go to town on remaining blocks
 */
   while ((imtr = ATL_DecGlobalAtomicCount(pd->MbCtr, rank)))
   {
      const int iblk = nmblks - imtr;
      const size_t ii = mb*iblk;

      if (iblk != nmbm1)  /* full-size M block */
      {
         a2blk(K, mb, ATL_rone, A+incA*ii, lda, pA);
         amm(nmu, nnu, kb, pA, pB, pC, pA, pB, pC);
         blk2c(mb, N, ATL_rone, pC, beta, C+ii, ldc);
      }
      else                /* partial final M block */
      {
         a2blk(K, pd->mr, ATL_rone, A+incA*ii, lda, pA);
         amm(pd->nmuL, nnu, kb, pA, pB, pC, pA, pB, pC);
         blk2c(pd->mr, N, ATL_rone, pC, beta, C+ii, ldc);
      }
   }
}