void test_pairs(int th_idx) { int rem_th, rem_proc; int i, j; void *src, *dst; rem_th = pairs[TH_ME]; rem_proc = TH2PROC(rem_th); prndbg(th_idx, "test_pair: %d<->%d(%d)\n", TH_ME, rem_th, rem_proc); MT_BARRIER(); #if 0 print_array(th_idx, "before", &AELEM(ptrs2[TH_ME],rem_th,0,0), ASIZExITERS); #endif for (i = 0; i < iters; i++) { /* src - addr of my thread block on remote proc/thread */ src = &AELEM(ptrs1[rem_th],TH_ME,i,0); /* src - addr of remote thread block on my proc/thread */ dst = &AELEM(ptrs2[TH_ME],rem_th,i,0); /* get from my pair */ assert(!ARMCI_Get(src, dst, ASIZE_BYTES, rem_proc)); } MT_BARRIER(); #if 0 print_array(th_idx, "rcvd", &AELEM(ptrs2[TH_ME],rem_th,0,0), ASIZExITERS); #endif /* check results */ check_result(&AELEM(ptrs2[TH_ME],rem_th,0,0), rem_th); }
/* test Put/Get/Acc sequence regardless of communication pattern * tgt -- remote target for put/get/acc (none if -1) * rmt -- list of remote thread that put/acc to here (correctness is cheked here) * rmt_cnt -- # of threads in rmt */ void test_PutGetAcc(int th_idx, int tgt, int *rmt, int rmt_cnt) { /* a - local thread, b - remote thread */ int a, b, b_proc, stride[2], count[2]; int i, j; void *src, *dst; #ifdef DEBUG for (i = 0, cbufl = 0; i < rmt_cnt; i++) cbufl += sprintf(cbuf+cbufl, " %d", rmt[i]); prndbg(th_idx, "test_PutGetAcc: put/acc to %d, get from %d, check put/acc from %s\n", tgt, tgt, rmt_cnt ? cbuf : "none"); #endif a = TH_ME; stride[0] = ASIZE_BYTES; count[0] = ASIZE_BYTES; count[1] = 1; /* init arrays */ init_array(th_idx, ptrs1[TH_ME]); init_array(th_idx, ptrs2[TH_ME]); MT_BARRIER(); /* put - put a.ptrs1[b] into b.ptrs2[a] */ if (tgt != -1) { b = tgt; b_proc = TH2PROC(b); for (i = 0; i < iters; i++) { src = &AELEM(ptrs1[a], b, i, 0); /* a.ptrs1[b] */ dst = &AELEM(ptrs2[b], a, i, 0); /* b.ptrs2[a] */ // assert(!ARMCI_Put(src, dst, ASIZE_BYTES, b_proc)); assert(!ARMCI_PutS(src, stride, dst, stride, count, 1, b_proc)); } ARMCI_Fence(b_proc); } MT_BARRIER(); print_array(th_idx, "PUT:ptrs1[TH_ME]", ptrs1[TH_ME]); print_array(th_idx, "PUT:ptrs2[TH_ME]", ptrs2[TH_ME]); MT_BARRIER(); /* chk put(s) from b(s): a.ptrs2[b] */ for (j = 0; j < rmt_cnt; j++) { b = rmt[j]; b_proc = TH2PROC(b); check_PutGetAcc(th_idx, b, PUT, &AELEM(ptrs2[a], b, 0, 0)); } //return; // REMOVE WHEN DONE /* init arrays */ init_array(th_idx, ptrs1[TH_ME]); init_array(th_idx, ptrs2[TH_ME]); MT_BARRIER(); /* get - get b.ptrs1[a] into a.ptrs2[b] */ if (tgt != -1) { b = tgt; b_proc = TH2PROC(b); for (i = 0; i < iters; i++) { src = &AELEM(ptrs1[b], a, i, 0); /* b.ptrs1[a] */ dst = &AELEM(ptrs2[a], b, i, 0); /* a.ptrs2[b] */ assert(!ARMCI_GetS(src, stride, dst, stride, count, 1, b_proc)); } } print_array(th_idx, "GET:ptrs1[TH_ME]", ptrs1[TH_ME]); print_array(th_idx, "GET:ptrs2[TH_ME]", ptrs2[TH_ME]); MT_BARRIER(); /* chk get from b: a.ptrs2[b] */ if (tgt != -1) { check_PutGetAcc(th_idx, b, GET, &AELEM(ptrs2[a], b, 0, 0)); } #if 1 /* init arrays */ init_array(th_idx, ptrs1[TH_ME]); init_array(th_idx, ptrs2[TH_ME]); MT_BARRIER(); /* acc - acc a.ptrs1[b] * scale + b.ptrs2[a] into b.ptrs2[a] */ if (tgt != -1) { b = tgt; b_proc = TH2PROC(b); for (i = 0; i < iters; i++) { src = &AELEM(ptrs1[a], b, i, 0); /* a.ptrs1[b] */ dst = &AELEM(ptrs2[b], a, i, 0); /* b.ptrs2[a] */ assert(!ARMCI_AccS(ARMCI_ACC_DBL,&scale,src,stride,dst,stride,count,1,b_proc)); } ARMCI_Fence(b_proc); } MT_BARRIER(); print_array(th_idx, "ACC:ptrs1[TH_ME]", ptrs1[TH_ME]); print_array(th_idx, "ACC:ptrs2[TH_ME]", ptrs2[TH_ME]); MT_BARRIER(); /* chk acc(s) from b(s): a.ptrs2[b] */ for (j = 0; j < rmt_cnt; j++) { b = rmt[j]; b_proc = TH2PROC(b); check_PutGetAcc(th_idx, b, ACC, &AELEM(ptrs2[a], b, 0, 0)); } #endif MT_BARRIER(); }
void *lu(void *lu_arg) { int n, bs, th_idx; int i, il, j, jl, k, kl; int I, J, K; double *A, *B, *C, *D; int dimI, dimJ, dimK; int strI, strJ, strK; unsigned int t1, t2, t3, t4, t11, t22; int diagowner; double *buf1, *buf2; n = ((int *)lu_arg)[0]; bs = ((int *)lu_arg)[1]; th_idx = ((int *)lu_arg)[2]; #ifdef DEBUG printf("DBG: starting thread %d(idx=%d) on node %d\n", me_th[th_idx], th_idx, me); fflush(stdout); #endif /* temporary memories */ buf1 = (double *)malloc(block_size*block_size*sizeof(double)); buf2 = (double *)malloc(block_size*block_size*sizeof(double)); for (k=0, K=0; k<n; k+=bs, K++) { kl = k + bs; if (kl > n) { kl = n; strK = kl - k; } else { strK = bs; } /* factor diagonal block */ diagowner = block_owner(K, K); if (diagowner == me_th[th_idx]) { A = a[K+K*nblocks]; print_block_dbg(A, "th=%d, idx=%d: before lu0 a[%d]:\n", me_th[th_idx], th_idx, K+K*nblocks); lu0(A, strK, strK); } MT_BARRIER(); /* divide column k by diagonal block */ if(block_owner(K, K) == me_th[th_idx]) D = a[K+K*nblocks]; else { D = buf1; get_remote(D, K, K); } for (i=kl, I=K+1; i<n; i+=bs, I++) { if (block_owner(I, K) == me_th[th_idx]) { /* parcel out blocks */ il = i + bs; if (il > n) { il = n; strI = il - i; } else { strI = bs; } A = a[I+K*nblocks]; bdiv(A, D, strI, strK, strI, strK); } } /* modify row k by diagonal block */ for (j=kl, J=K+1; j<n; j+=bs, J++) { if (block_owner(K, J) == me_th[th_idx]) { /* parcel out blocks */ jl = j+bs; if (jl > n) { jl = n; strJ = jl - j; } else { strJ = bs; } A = a[K+J*nblocks]; bmodd(D, A, strK, strJ, strK, strK); } } MT_BARRIER(); /* modify subsequent block columns */ for (i=kl, I=K+1; i<n; i+=bs, I++) { il = i+bs; if (il > n) { il = n; strI = il - i; } else { strI = bs; } if(block_owner(I,K) == me_th[th_idx]) A = a[I+K*nblocks]; else { A = buf1; get_remote(A, I, K); } for (j=kl, J=K+1; j<n; j+=bs, J++) { jl = j + bs; if (jl > n) { jl = n; strJ= jl - j; } else { strJ = bs; } if (block_owner(I, J) == me_th[th_idx]) { /* parcel out blocks */ if(block_owner(K,J) == me_th[th_idx]) B = a[K+J*nblocks]; else { B = buf2; get_remote(B, K, J); } C = a[I+J*nblocks]; bmod(A, B, C, strI, strJ, strK, strI, strK, strI); } } } } free(buf1); free(buf2); return lu_arg; }