/*
 * This routine handles the case where N <= maxNB && K <= maxKB, so B is
 * only one block.  It is particularly important for the panel factorizations
 * of both LU and QR.
 */
int Mjoin(PATL,tammm_tNK)
(
   enum ATLAS_TRANS TA,
   enum ATLAS_TRANS TB,
   ATL_CINT M,
   ATL_CINT N,
   ATL_CINT K,
   const SCALAR alpha,
   const TYPE *A,
   ATL_CINT lda,
   const TYPE *B,
   ATL_CINT ldb,
   const SCALAR beta,
   TYPE *C,
   ATL_CINT ldc
)
{
   ATL_SZT nmblks;
   amminfo_t mminfo;
   unsigned int i, mb, nb, kb, mu, nu, ku, P, mr;
   ATL_tamm_tNK_t pd;   /* problem definition structure */
   void *vp;
/*
 * Special case for tiny N&K, and large M
 */
   if (N >= ATL_AMM_MAXNB || K >= ATL_AMM_MAXKB || M < ATL_AMM_MAXMB ||
       M < Mmin(8,ATL_NTHREADS)*ATL_AMM_MAXMB)
      return(1);

   Mjoin(PATL,GetRankKInfo)(&mminfo, TA, TB, M, N, K, alpha, beta);
   pd.a2blk = mminfo.a2blk;
   pd.b2blk = mminfo.b2blk;
   pd.blk2c = mminfo.Cblk2cm;
   pd.amm_b0 = mminfo.amm_b0;
   pd.TA = (TA == AtlasTrans);
   pd.TB = (TB == AtlasTrans);
   pd.N = N;
   pd.K = K;
   pd.A = A;
   pd.B = B;
   pd.C = C;
   pd.lda = lda;
   pd.ldb = ldb;
   pd.ldc = ldc;
   pd.alpha = &alpha;
   pd.beta = &beta;
   mu = mminfo.mu;
   nu = mminfo.nu;
   ku = mminfo.ku;
   pd.mb = mb = mminfo.mb;
   pd.nmu = mb / mu;
   pd.nnu = (N+nu-1)/nu;
   nb = pd.nnu * nu;
   kb = mminfo.kb;
   nmblks = M / mb;
   mr = M - nmblks*mb;
   if (!mr)
   {
      pd.mbL = mr = mb;
      pd.nmuL = pd.nmu;
   }
   else
   {
      nmblks++;
      pd.nmuL = (mr+mu-1)/mu;
      pd.mbL = pd.nmuL * mu;
   }
   pd.mr = mr;
   pd.nmblks = nmblks;
   pd.KB0 = K;
#if ATL_MAXKMAJ_RKK > 1
   if (ATL_AMMFLG_KMAJOR(mminfo.flag))
      pd.KB0 = ((K+ku-1)/ku)*ku;
#endif
/*
 * Maximum scale is limited by NTHREADS or max number of M-blocks
 */
   P = (ATL_NTHREADS <= nmblks) ? ATL_NTHREADS : nmblks;
/*
 * We have a common B wrk of size KB0*nb, then for each node, we need
 * workspace: sz(A,C) = mb*K, K*nb, mb*N, laid out in memory as A,C.
 * We then add a safety margin of mu*nu*ku so advance loads don't seg fault,
 * and space for aligning the ptrs.
 */
   pd.bsz = pd.KB0*nb;
   pd.wsz = mb*(pd.nnu*nu + pd.bsz) + 2*ATL_DivBySize(ATL_Cachelen);
   vp = malloc(ATL_MulBySize(pd.wsz*P + pd.bsz+mu*nu*ku) + ATL_Cachelen);
   if (!vp)
      return(2);
   pd.w = ATL_AlignPtr(vp);
   pd.MbCtr = ATL_SetGlobalAtomicCount(ATL_EstNctr(nmblks, P), nmblks, 0);
   pd.BassgCtr = ATL_SetAtomicCount(1);
   pd.BdoneCtr = ATL_SetAtomicCount(1);
#ifdef DEBUG1
   {
      ATL_LAUNCHSTRUCT_t ls;
      ATL_thread_t ts;
      ts.rank = 0;
      ts.P = 1;
      ls.opstruct = &pd;
      Mjoin(PATL,DoWork_tamm_tNK)(&ls, &ts);
   }
#else
   ATL_goparallel(P, Mjoin(PATL,DoWork_tamm_tNK), &pd, NULL);
#endif
   ATL_FreeAtomicCount(pd.BdoneCtr);
   ATL_FreeAtomicCount(pd.BassgCtr);
   ATL_FreeGlobalAtomicCount(pd.MbCtr);
   free(vp);
   return(0);
}
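/*
 * Illustrative sketch only (not ATLAS code): how a caller might use the
 * return codes of the routine above, where 0 means the special case handled
 * the whole problem and any nonzero value (1: shape not suitable,
 * 2: workspace allocation failed) means the caller must fall back to the
 * general threaded GEMM path.  The name tgemm_general is a hypothetical
 * placeholder for that general path.
 */
void tgemm_general              /* hypothetical general-path GEMM prototype */
   (enum ATLAS_TRANS, enum ATLAS_TRANS, ATL_CINT, ATL_CINT, ATL_CINT,
    const SCALAR, const TYPE*, ATL_CINT, const TYPE*, ATL_CINT,
    const SCALAR, TYPE*, ATL_CINT);

static void tgemm_tNK_or_fallback
   (enum ATLAS_TRANS TA, enum ATLAS_TRANS TB, ATL_CINT M, ATL_CINT N,
    ATL_CINT K, const SCALAR alpha, const TYPE *A, ATL_CINT lda,
    const TYPE *B, ATL_CINT ldb, const SCALAR beta, TYPE *C, ATL_CINT ldc)
{
   if (!Mjoin(PATL,tammm_tNK)(TA, TB, M, N, K, alpha, A, lda, B, ldb,
                              beta, C, ldc))
      return;                   /* special case handled the whole problem */
   tgemm_general(TA, TB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
}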
int Mjoin(PATL,tsyrk_amm_K)
(
   const enum ATLAS_UPLO Uplo,
   const enum ATLAS_TRANS Trans,
   ATL_CINT N,
   ATL_CINT K,
   const SCALAR alpha,
   const TYPE *A,
   ATL_CINT lda,
   const SCALAR beta,
   TYPE *C,
   ATL_CINT ldc
)
{
   amminfo_t mminfo;
   ATL_tsyrk_ammK_t pd;
   ablk2cmat_t Mjoin(PATL,tGetSyammInfo_K)
      (amminfo_t *out, const int P, enum ATLAS_TRANS TA, ATL_CSZT N,
       ATL_CSZT K);
   int kb=ATL_AMM_MAXKB, nkb = K / ATL_AMM_MAXKB, P = ATL_NTHREADS;
   int ku, kr, mb, nb, mu, nu;
   size_t sz;
   void *vp=NULL;

   if (nkb < P)
   {
      kb = ATL_AMM_98KB;
      nkb = K / ATL_AMM_98KB;
      if (nkb < P)
      {
         nkb = K / ATL_AMM_66KB;
         kb = ATL_AMM_66KB;
      }
   }
   if (nkb < P)
   {
      if (nkb < 2)
      {
         Mjoin(PATL,syrk)(Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc);
         return(0);
      }
      P = nkb;
   }
   pd.blk2c_b0 = Mjoin(PATL,tGetSyammInfo_K)(&mminfo, P, Trans, N, kb);
   kb = mminfo.kb;
   nkb = K / kb;
   mu = mminfo.mu;
   nu = mminfo.nu;
   pd.nmu = (N+mu-1) / mu;
   pd.nnu = (N+nu-1) / nu;
   pd.mb = mb = pd.nmu*mu;
   pd.nb = nb = pd.nnu*nu;
   pd.kb = mminfo.kb;
   sz = ((((size_t)mb)*nb)<<1) + (mb+nb)*kb;
   pd.wsz = sz;
   sz = ATL_MulBySize(sz)*P;
   vp = malloc(sz+ATL_Cachelen);
   if (!vp)
      return(1);
   pd.w = ATL_AlignPtr(vp);
   kr = K - nkb*kb;
   pd.kb0 = pd.KB0 = kr;
   ku = mminfo.ku;
   if (!kr)
   {
      pd.kb0 = pd.KB0 = kb;
      pd.ammK_b0 = mminfo.amm_b0;
      pd.ammK_b1 = mminfo.amm_b1;
   }
   else
   {
#if ATL_AMM_MAXKMAJ > 1
      if (ATL_AMMFLG_KMAJOR(mminfo.flag))
      {
         pd.KB0 = ((kr+ku-1)/ku)*ku;
         if (ATL_AMMFLG_KRUNTIME(mminfo.flag))
         {
            pd.ammK_b0 = mminfo.amm_b0;
            pd.ammK_b1 = mminfo.amm_b1;
         }
         else
         {
            pd.ammK_b0 = mminfo.amm_k1_b0;
            pd.ammK_b1 = mminfo.amm_k1_b1;
         }
      }
      else
#endif
      {
         if (ATL_AMMFLG_KRUNTIME(mminfo.flag) && kr == (kr/ku)*ku &&
             kr > mminfo.kbmin)
         {
            pd.ammK_b0 = mminfo.amm_b0;
            pd.ammK_b1 = mminfo.amm_b1;
         }
         else
         {
            pd.ammK_b0 = mminfo.amm_k1_b0;
            pd.ammK_b1 = mminfo.amm_k1_b1;
         }
      }
   }
   pd.amm_b0 = mminfo.amm_b0;
   pd.amm_b1 = mminfo.amm_b1;
   pd.blk2c_b1 = mminfo.Cblk2cm;
   pd.a2blk = mminfo.a2blk;
   pd.b2blk = mminfo.b2blk;
   pd.A = A;
   pd.C = C;
   pd.alpha = &alpha;
   pd.beta = &beta;
   pd.nkblks = (kr) ? nkb+1 : nkb;
   pd.KbCtr = ATL_SetGlobalAtomicCount(ATL_EstNctr(pd.nkblks, P),
                                       pd.nkblks, 0);
   pd.Cmut = ATL_mutex_init();
   pd.BETA_APPLIED = SCALAR_IS_ONE(beta);
   pd.LOWER = (Uplo == AtlasLower);
   pd.TA = (Trans == AtlasTrans);
   pd.N = N;
   pd.lda = lda;
   pd.ldc = ldc;
// #define DEBUG1 1
#ifdef DEBUG1
   {
      ATL_LAUNCHSTRUCT_t ls;
      ATL_thread_t ts;
      ts.rank = 0;
      ts.P = 1;
      ls.opstruct = &pd;
      Mjoin(PATL,DoWork_syrk_amm_K)(&ls, &ts);
   }
#else
   ATL_goparallel(P, Mjoin(PATL,DoWork_syrk_amm_K), &pd,
                  Mjoin(PATL,CombSyrk_ammK));
#endif
/*
 * Answer is written back to rank0's workspace, extract it & write to C
 */
   {
      TYPE *wC = pd.w+kb*(mb+nb), *w = wC + mb*nb, *c = C;
/*
 *    Put it into block-major storage in w
 */
      pd.blk2c_b0(N, N, ATL_rone, wC, ATL_rzero, w, N);
/*
 *    Now copy out only upper or lower portion
 */
      if (pd.LOWER)
      {
         int j;
         for (j=0; j < N; j++, c += ldc+1, w += N+1)
            Mjoin(PATL,axpby)(N-j, alpha, w, 1, beta, c, 1);
      }
      else
      {
         int j;
         for (j=0; j < N; j++, c += ldc, w += N)
            Mjoin(PATL,axpby)(j+1, alpha, w, 1, beta, c, 1);
      }
   }
   free(vp);
   ATL_mutex_free(pd.Cmut);
   ATL_FreeGlobalAtomicCount(pd.KbCtr);
   return(0);
}
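/*
 * Minimal sketch (not ATLAS code) of the K-blocking policy used at the top of
 * tsyrk_amm_K: shrink the kernel's K-block from its maximum toward smaller
 * tuned sizes until each of the P threads can own at least one full kb-block
 * of the K dimension, and signal a serial fallback when even the smallest
 * choice yields fewer than two blocks.  The three candidate sizes mirror
 * ATL_AMM_MAXKB, ATL_AMM_98KB and ATL_AMM_66KB; here they are passed in so
 * the sketch is self-contained.
 */
static int pick_syrk_kb(int K, int P, const int kbs[3], int *KB, int *NKB)
{
   int i, kb=kbs[0], nkb=K/kbs[0];
   for (i=1; i < 3 && nkb < P; i++)   /* try progressively smaller kb */
   {
      kb = kbs[i];
      nkb = K / kb;
   }
   if (nkb < 2)
      return(0);                      /* caller should use serial syrk */
   *KB = kb;
   *NKB = nkb;
   return((nkb < P) ? nkb : P);       /* # of threads actually usable */
}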
void ATL_goparallel
/*
 * This function is used when you pass a single opstruct to all threads;
 * in this case, we stash opstruct in launchstruct's vp, and then use the
 * rank array as opstruct during the spawn.  Therefore, these routines
 * should expect to get their problem def from ls.vp, and their rank from
 * the second argument.  The DoWork function is the function that should
 * be called from each thread to do the parallel work.  This function should
 * look like:
 *    void DoWork_example(ATL_LAUNCHSTRUCT_t *lp, void *vp)
 *    {
 *       ATL_thread_t *tp = vp;
 *       const int myrank = tp->rank;
 *       my_prob_def_t *pd = lp->vp;
 *       ... do work based on info in struct pointed to by lp->vp ...
 *    }
 * Your DoWork should perform any needed combine before finishing execution,
 * and any return values can be passed in the problem definition structure
 * that you define.
 */
(
   const unsigned int P,   /* # of cores to use */
   void *DoWork,           /* func ptr to work function */
   void *opstruct,         /* structure giving tasks to threads */
   void *DoComb            /* function to combine two opstructs */
)
{
   ATL_thread_t *tp;
   int *chkin;
   void *vp, *lc;
   int i;
   ATL_LAUNCHSTRUCT_t ls;

   ls.OpStructIsInit = NULL;
   ls.DoWork = DoWork;
   ls.DoComb = DoComb;
   ls.opstruct = opstruct;
#ifdef ATL_OMP_THREADS
   tp = malloc(sizeof(ATL_thread_t)*P);
   ATL_assert(tp);
   for (i=0; i < P; i++)
   {
      tp[i].vp = &ls;
      tp[i].rank = i;
      tp[i].P = P;
   }
   ls.rank2thr = tp;
   omp_set_num_threads(P);
   #pragma omp parallel
   {
/*
 *    Make sure we got the requested nodes, and set affinity if supported
 */
      ATL_assert(omp_get_num_threads() == P);
   #ifdef ATL_PAFF_SELF
      ATL_setmyaffinity();
   #endif
      i = omp_get_thread_num();
      ls.DoWork(&ls, tp+i);
   }
/*
 * Do combine (linear) if requested
 */
   if (DoComb)
      for (i=1; i < P; i++)
         ls.DoComb(ls.opstruct, 0, i);
#else
   #if ATL_USE_DYNAMIC
      ls.acounts = &lc;
      ls.acounts[0] = ATL_SetGlobalAtomicCount(P>>1, P-1, 0);
      vp = malloc(P*(sizeof(ATL_thread_t)+sizeof(int)) + ATL_Cachelen);
      ATL_assert(vp);
      chkin = vp;
      tp = (ATL_thread_t*)(chkin+P);
      tp = ATL_AlignPtr(tp);
   #else
      vp = malloc(P*(sizeof(ATL_thread_t)) + ATL_Cachelen);
      tp = ATL_AlignPtr(vp);
   #endif
   ls.rank2thr = tp;
   for (i=0; i < P; i++)
   {
      tp[i].vp = &ls;
      tp[i].rank = i;
      tp[i].P = P;
   #if ATL_USE_DYNAMIC
      chkin[i] = 0;
   #endif
   }
   #if ATL_USE_DYNAMIC
      ls.chkin = (volatile int*) chkin;
   #endif
   ATL_thread_start(tp, 0, 1, ATL_dyntlaunch, tp);
   ATL_thread_join(tp);
   #if ATL_USE_DYNAMIC
      ATL_FreeGlobalAtomicCount(ls.acounts[0]);
   #endif
   free(vp);
#endif
}
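/*
 * Hypothetical usage sketch (not ATLAS code): a trivial DoWork that scales an
 * N-length vector in parallel, with each rank handling a contiguous chunk.
 * Following the DEBUG1 paths in the routines above, the worker is assumed to
 * take its problem definition from lp->opstruct and its rank/P from the
 * ATL_thread_t passed as the second argument; no combine is needed, so
 * DoComb is NULL.  The struct and function names are illustrative only.
 */
typedef struct
{
   TYPE *X;       /* vector to scale */
   size_t N;      /* its length */
   TYPE alpha;    /* scale factor */
} my_scal_t;

static void DoWork_scal(ATL_LAUNCHSTRUCT_t *lp, void *vp)
{
   ATL_thread_t *tp = vp;
   my_scal_t *pd = lp->opstruct;
   const unsigned int rank = tp->rank, P = tp->P;
   const size_t nloc = pd->N / P, i0 = rank*nloc;
   const size_t n = (rank != P-1) ? nloc : pd->N - i0; /* last rank gets rest */
   size_t i;
   for (i=i0; i < i0+n; i++)
      pd->X[i] *= pd->alpha;
}

/* Example launch: scale X by alpha using all available threads */
static void scal_in_parallel(size_t N, TYPE alpha, TYPE *X)
{
   my_scal_t pd;
   pd.X = X;
   pd.N = N;
   pd.alpha = alpha;
   ATL_goparallel(ATL_NTHREADS, DoWork_scal, &pd, NULL);
}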