/*
 * In the first phase, we work only on diagonal blocks, while copying both
 * A & A'.  For diag work, we parallelize both N & K dims so that the copy
 * is done as quickly as possible.  Threads coming in first choose differing
 * diag blks; diagonal blocks are dealt out cheaply using the dCtr global
 * counter (which starts at nnblks == ndiag).
 * Once all diagonal blocks are dealt out, new threads will start using
 * the atomic counter array KbegCtr to share the K work for each diagonal.
 * Both KbegCtr & KdonCtr are nnblks-length arrays of atomic counters; each
 * counter starts at nkblks.  Once the block pointed to by KbegCtr is
 * completely copied, the copying core decrements KdonCtr.  Only one
 * core per diag will get KdonCtr == 0 after doing its copy, and this
 * core will seize the cdmut mutex in order to set the appropriate bit in
 * cpydonBV, which is an nnblks-length bit vector.  If the ith bit is set,
 * that means the ith block row of A & ith block col of A' have been copied.
 * Once a thread gets 0 from KbegCtr for a particular diag, there is no more
 * work for this block of C, so it will seize the appropriate Cdmuts mutex,
 * which protects each diagonal block of C, and write its finished
 * contribution out to C.  The first such thread to seize the mutex will
 * check dbetaBV, find that this diagonal block still needs beta applied,
 * and apply it; later threads will use beta=1.
 * Eventually, all diagonal work is finished, and the first processor to
 * get 0 for all dCtr & KbegCtr requests will set NODWORK=1, so later
 * threads don't have to query all the counters to know they should proceed
 * to non-diagonal work.
 */
static void DoDiag(const int rank, ATL_tsyrk_ammN_t *pd)
{
   int DIAG=1, k;
   TYPE *myC = pd->wC + (rank+rank)*(pd->nbnb);

   while (!(pd->NODWORK))
   {
      int d=0;
/*
 *    Find which diagonal block to work on, and then which k blk to use
 */
      if (DIAG)
      {
         d = ATL_DecGlobalAtomicCount(pd->dCtr, rank);
         if (d)
         {
            k = ATL_DecGlobalAtomicCount((pd->KbegCtr)[pd->ndiag - d], rank);
            if (!k)      /* if no more K work to do */
               d = 0;    /* can't work on this diag after all */
         }
      }
/*
 *    If all diagonal blocks are currently being worked on by other threads,
 *    find one that I can help with.
 */
      if (!d)
      {
         unsigned int i, n=pd->ndiag;

         DIAG = 0;
         for (i=0; i < n; i++)
         {
            unsigned int j = (i+rank)%n;

            k = ATL_DecGlobalAtomicCount((pd->KbegCtr)[j], rank);
            d = n-j;
            if (k)
               goto FOUNDDK;
         }
         pd->NODWORK = 1;   /* no work left to assign */
         return;            /* so done with copy of A/A' & this func */
      }
/*
 *    If I reach here, I've got a valid d & k, and I'll call a routine
 *    that continues to grab blocks from this diag along K until all K
 *    is done; it will then write the answer back to the original C, and
 *    return to this loop to see if it can help with another diag.
 */
FOUNDDK:
      DoBlksWtCpy(rank, pd, pd->ndiag-d, k, myC);
   }
}
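/*
 * The routines in this file all deal out work with the same idiom: a global
 * atomic counter is initialized to the number of blocks, each thread
 * atomically decrements it, a nonzero return value r selects block
 * (nblks - r), and a return of 0 means the pool is exhausted.  The block
 * below is a minimal sketch of that idiom (guarded by #if 0, not part of
 * the build), assuming plain C11 atomics; next_block(), deal_blocks() and
 * the single atomic_int are hypothetical stand-ins for ATLAS's distributed
 * ATL_DecGlobalAtomicCount counters.
 */
#if 0
#include <stdatomic.h>
#include <stdio.h>

/* Returns a count r in [1,nblks] while work remains, 0 once pool is empty */
static int next_block(atomic_int *ctr)
{
   int r = atomic_fetch_sub(ctr, 1);   /* returns value before decrement */
   return (r > 0) ? r : 0;
}

static void deal_blocks(atomic_int *ctr, int nblks, int rank)
{
   int r;
   while ((r = next_block(ctr)))       /* 0 ==> no work left to assign */
   {
      int blk = nblks - r;             /* same mapping as kblk/iblk above */
      printf("rank %d got block %d\n", rank, blk);
   }
}

int main(void)
{
   atomic_int ctr = 8;                 /* pool of 8 blocks */
   deal_blocks(&ctr, 8, 0);            /* single-threaded demo of the idiom */
   return 0;
}
#endif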
static void DoBlksWtCpy
(
   unsigned const int rank,   /* my thread rank */
   ATL_tsyrk_ammN_t *pd,      /* problem def structure */
   unsigned const int dblk,   /* diagonal block of C to compute */
   unsigned int kctr,         /* non-zero Kbeg ctr */
   TYPE *wC                   /* my private workspace */
)
{
   const int TRANS = (pd->TA == AtlasTrans);
   int kblk = pd->nkblks - kctr;
   const int b = (dblk == pd->ndiag-1) ? pd->nbf : pd->nb;
   const int nmu = (dblk == pd->ndiag-1) ? pd->nmuf : pd->nmu;
   const int nnu = (dblk == pd->ndiag-1) ? pd->nnuf : pd->nnu;
   TYPE *wA, *wB, *w, *c;
   TYPE beta = ATL_rone;

   DoBlkWtCpy(rank, pd, dblk, kblk, b, nmu, nnu, pd->amm_b0, wC);
   while ((kctr = ATL_DecGlobalAtomicCount((pd->KbegCtr)[dblk], rank)))
   {
      kblk = pd->nkblks - kctr;
      DoBlkWtCpy(rank, pd, dblk, kblk, b, nmu, nnu, pd->amm_b1, wC);
   }
/*
 * Since I'm done with this blk of C, copy it to block-major storage, and
 * scale by alpha (blk2c won't access only triangle, so must cpy first)
 */
   w = wC + pd->nbnb;
   pd->blk2d(b, b, *(pd->alpha), wC, ATL_rzero, w, b);
/*
 * Now, seize mutex for diagonal block of original C, and copy back out
 * only above/below diagonal
 */
   c = pd->C + dblk*(pd->nb)*(pd->ldc+1);
   ATL_mutex_lock(pd->Cdmuts[dblk]);
   if (!ATL_IsBitSetBV(pd->dbetaBV, dblk))  /* if I apply beta */
   {
      ATL_SetBitBV(pd->dbetaBV, dblk); /* tell rest of thrs don't apply beta */
      beta = *(pd->beta);              /* because I'm going to do it */
   }
   if (pd->LOWER)
   {
      int j;
      for (j=0; j < b; j++, c += pd->ldc+1, w += b+1)
         Mjoin(PATL,axpby)(b-j, ATL_rone, w, 1, beta, c, 1);
   }
   else
   {
      int j;
      for (j=0; j < b; j++, c += pd->ldc, w += b)
         Mjoin(PATL,axpby)(j+1, ATL_rone, w, 1, beta, c, 1);
   }
   ATL_mutex_unlock(pd->Cdmuts[dblk]);
}
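/*
 * The dbetaBV/Cdmuts protocol above guarantees the user's beta is applied to
 * each diagonal block of C exactly once: the first thread to flush its
 * contribution under the block's mutex finds the bit clear, sets it, and
 * scales by the user's beta; every later thread uses beta=1 so earlier
 * contributions are not rescaled.  Below is a minimal sketch of that
 * protocol (guarded by #if 0, not part of the build), assuming a pthread
 * mutex and a plain flag in place of ATLAS's ATL_mutex and bit vector;
 * diag_sync_t and flush_contrib() are hypothetical.
 */
#if 0
#include <pthread.h>

typedef struct
{
   pthread_mutex_t lock;   /* plays the role of Cdmuts[dblk] */
   int beta_done;          /* plays the role of bit dblk of dbetaBV */
} diag_sync_t;

/* Add this thread's private contribution w (n elements) into global c */
static void flush_contrib(diag_sync_t *ds, double *c, const double *w,
                          int n, double user_beta)
{
   double beta = 1.0;               /* later threads must not rescale C */
   int i;

   pthread_mutex_lock(&ds->lock);
   if (!ds->beta_done)              /* first thread to flush this block */
   {
      ds->beta_done = 1;            /* tell rest of thrs don't apply beta */
      beta = user_beta;             /* because I'm going to do it */
   }
   for (i=0; i < n; i++)
      c[i] = w[i] + beta*c[i];
   pthread_mutex_unlock(&ds->lock);
}
#endif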
static void DoSyrkK(unsigned int rank, ATL_tsyrk_ammK_t *pd)
{
   TYPE *wA, *wB, *wC;
   ammkern_t amm = pd->amm_b0;
   const cm2am_t a2blk=pd->a2blk, b2blk=pd->b2blk;
   const unsigned int mb=pd->mb, nb=pd->nb, kb=pd->kb, nkblks=pd->nkblks,
                      N=pd->N, nmu=pd->nmu, nnu=pd->nnu;
   const size_t mulA = (pd->TA) ? 1 : pd->lda;
   int lda=pd->lda;
   int kctr;

   wA = pd->w + pd->wsz*rank;
   wB = wA + mb*kb;
   wC = wB + nb*kb;
   while ((kctr = ATL_DecGlobalAtomicCount(pd->KbCtr, rank)))
   {
      const int kblk = nkblks - kctr;
      const TYPE *a = pd->A + ((size_t)kblk)*mulA*kb;

      if (kblk != nkblks-1)   /* normal full-kb operation */
      {
         a2blk(kb, N, ATL_rone, a, lda, wA);
         b2blk(kb, N, ATL_rone, a, lda, wB);
         amm(nmu, nnu, kb, wA, wB, wC, wA, wB, wC);
      }
      else                    /* last block of size kb0 */
      {
         a2blk(pd->kb0, N, ATL_rone, a, lda, wA);
         b2blk(pd->kb0, N, ATL_rone, a, lda, wB);
         if (amm == pd->amm_b0)
            pd->ammK_b0(nmu, nnu, pd->KB0, wA, wB, wC, wA, wB, wC);
         else
            pd->ammK_b1(nmu, nnu, pd->KB0, wA, wB, wC, wA, wB, wC);
      }
      amm = pd->amm_b1;
   }
/*
 * If I did no work, zero my wrkspace so I don't screw up combine!
 */
   if (amm != pd->amm_b1)
      Mjoin(PATL,zero)(mb*nb, wC, 1);
}
void *ATL_dyntlaunch(void *vp)
#endif
{
   ATL_thread_t *tp = vp, *btp;
   ATL_LAUNCHSTRUCT_t *lp;
   const int iam = tp->rank, P = tp->P;
   int i, src, dest, nthrP2, mask, abit;
   void *acnt;

   lp = tp->vp;
   acnt = lp->acounts[0];
   btp = tp - iam;
/*
 * Set my affinity if I haven't already
 */
#ifdef ATL_PAFF_SELF
   if (!tp->paff_set)
      ATL_setmyaffinity(tp);
#endif
   dest = ATL_DecGlobalAtomicCount(acnt, iam);
   while (dest)
   {
      dest = tp->P - dest;
      ATL_thread_start(btp+dest, dest, 0, ATL_dyntlaunch, btp+dest);
      dest = ATL_DecGlobalAtomicCount(acnt, iam);
   }
/*
 * Do the operation
 */
   lp->DoWork(lp, tp);
/*
 * Do combine in minimum spanning tree, combining results as required
 */
   for (i=0; (1<<i) < P; i++);
   nthrP2 = i;
   mask = 0;
   for (i=0; i < nthrP2; i++)
   {
      if (!(iam & mask))
      {
         abit = (1<<i);
         if (!(iam & abit))
         {
            src = iam ^ abit;
            if (src < P)
            {
               while (lp->chkin[src] != ATL_CHK_DONE_OP)
                  ATL_POLL;
               if (lp->DoComb)
                  lp->DoComb(lp->opstruct, iam, src);
            }
         }
         else
         {
            lp->chkin[iam] = ATL_CHK_DONE_OP;
            ATL_thread_exit(NULL);
         }
      }
      mask |= abit;
   }
   return(NULL);
}
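/*
 * The second loop in ATL_dyntlaunch above folds results up a binary tree:
 * on round i, each surviving rank with bit i clear waits for and absorbs
 * rank (iam ^ (1<<i)) if that rank exists, while each rank with bit i set
 * checks in as done and exits.  The sketch below (guarded by #if 0, not
 * part of the build) just prints that combine schedule for a given P;
 * print_combine_schedule() is hypothetical.
 */
#if 0
#include <stdio.h>

static void print_combine_schedule(int P)
{
   int i, iam, nthrP2;

   for (nthrP2=0; (1<<nthrP2) < P; nthrP2++);   /* ceil(log2(P)) rounds */
   for (i=0; i < nthrP2; i++)
   {
      const int abit = (1<<i), mask = abit - 1; /* bits of earlier rounds */

      printf("round %d:\n", i);
      for (iam=0; iam < P; iam++)
      {
         if (iam & mask)                 /* rank exited in an earlier round */
            continue;
         if (iam & abit)
            printf("   rank %d: mark done & exit\n", iam);
         else if ((iam ^ abit) < P)
            printf("   rank %d: wait for %d, then combine\n", iam, iam^abit);
      }
   }
}

int main(void)
{
   print_combine_schedule(6);   /* prints: 1->0, 3->2, 5->4; 2->0; 4->0 */
   return 0;
}
#endif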
void Mjoin(PATL,DoWork_tamm_tNK)(ATL_LAUNCHSTRUCT_t *lp, void *vp)
{
   ATL_thread_t *tp = vp;
   ATL_tamm_tNK_t *pd = lp->opstruct;   /* problem definition structure */
   const unsigned int rank = tp->rank, mb=pd->mb, kb=pd->KB0, K=pd->K, N=pd->N,
      nmblks = pd->nmblks, nmbm1=nmblks-1, nmu=pd->nmu, nnu=pd->nnu,
      lda=pd->lda, ldc=pd->ldc;
   TYPE *pB, *pA, *pC, *C = pd->C;
   const TYPE *A = pd->A;
   int BCOPIED=0;
   int imtr;
   const TYPE beta = *pd->beta;
   const size_t incA = (pd->TA) ? lda : 1;
   cm2am_t a2blk = pd->a2blk;
   ablk2cmat_t blk2c = pd->blk2c;
   ammkern_t amm = pd->amm_b0;

   pB = pd->w;
/*
 * First guy here starts to copy B
 */
   if (ATL_DecAtomicCount(pd->BassgCtr))
   {
      pd->b2blk(K, N, *pd->alpha, pd->B, pd->ldb, pB);
/*
 *    Let waiting threads know B is ready for use
 */
      BCOPIED = ATL_DecAtomicCount(pd->BdoneCtr);
      ATL_assert(BCOPIED);
   }
   pA = pB + pd->bsz + rank*(pd->wsz);
   pA = ATL_AlignPtr(pA);
   pC = pA + mb*kb;
   pC = ATL_AlignPtr(pC);
/*
 * For first block I work on, I must await B to be copied
 */
   if (!BCOPIED)
   {
      int iblk;
      size_t ii;

      imtr = ATL_DecGlobalAtomicCount(pd->MbCtr, rank);
      if (imtr)
      {
         iblk = nmblks - imtr;
         ii = mb*iblk;
         if (iblk != nmbm1)
            a2blk(K, mb, ATL_rone, A+incA*ii, lda, pA);
         else
            a2blk(K, pd->mr, ATL_rone, A+incA*ii, lda, pA);
         while (ATL_GetAtomicCount(pd->BdoneCtr))  /* await B cpy finish */
            ATL_thread_yield();
         if (iblk != nmbm1)
         {
            amm(nmu, nnu, kb, pA, pB, pC, pA, pB, pC);
            blk2c(mb, N, ATL_rone, pC, beta, C+ii, ldc);
         }
         else
         {
            amm(pd->nmuL, nnu, kb, pA, pB, pC, pA, pB, pC);
            blk2c(pd->mr, N, ATL_rone, pC, beta, C+ii, ldc);
         }
      }
   }
/*
 * Now, B is ready, so just go to town on remaining blocks
 */
   while ((imtr = ATL_DecGlobalAtomicCount(pd->MbCtr, rank)))
   {
      const int iblk = nmblks - imtr;
      const size_t ii = mb*iblk;

      if (iblk != nmbm1)
      {
         a2blk(K, mb, ATL_rone, A+incA*ii, lda, pA);
         amm(nmu, nnu, kb, pA, pB, pC, pA, pB, pC);
         blk2c(mb, N, ATL_rone, pC, beta, C+ii, ldc);
      }
      else
      {
         a2blk(K, pd->mr, ATL_rone, A+incA*ii, lda, pA);
         amm(pd->nmuL, nnu, kb, pA, pB, pC, pA, pB, pC);
         blk2c(pd->mr, N, ATL_rone, pC, beta, C+ii, ldc);
      }
   }
}
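/*
 * DoWork_tamm_tNK above uses two one-shot atomic counters to hand off the
 * copy of B: the first thread to decrement BassgCtr claims the copy and
 * decrements BdoneCtr when finished; everyone else spins on BdoneCtr (after
 * copying its first block of A, to overlap the wait with useful work).
 * Below is a minimal sketch of that claim/publish handoff (guarded by
 * #if 0, not part of the build), assuming plain C11 atomics; Bassg, Bdone,
 * copy_B() and shared_init() are hypothetical stand-ins for ATLAS's
 * ATL_DecAtomicCount/ATL_GetAtomicCount counters and the b2blk copy.
 */
#if 0
#include <stdatomic.h>
#include <sched.h>

static atomic_int Bassg = 1;    /* 1 ==> copy of B still unclaimed */
static atomic_int Bdone = 1;    /* 1 ==> copy of B not yet finished */

static void copy_B(void)
{
   /* stand-in for pd->b2blk(K, N, alpha, B, ldb, pB) */
}

static void shared_init(void)   /* called by every thread before using B */
{
   if (atomic_fetch_sub(&Bassg, 1) > 0)   /* I claimed the copy job */
   {
      copy_B();
      atomic_store(&Bdone, 0);            /* publish: B is ready for use */
   }
   else                                   /* another thread is copying B */
   {
      while (atomic_load(&Bdone))         /* spin until B is published */
         sched_yield();
   }
}
#endif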