/*
 * Run the n-body update on the Epiphany device and copy the resulting
 * particle array back into `particles`.
 *
 * particles  host array of n*s Particle records; uploaded before the run and
 *            overwritten with the device result afterwards
 * state      host array of n*s ParticleV records; uploaded only, it is not
 *            read back by this function
 * n, s       forwarded to the device thread; n*s is the element count of
 *            both arrays. s also selects which prebuilt binary is loaded
 * dt, es     forwarded unchanged to the device thread
 * iter       forwarded as the iteration count (args.cnt)
 * cores      passed to coprthr_mpiexec as the number of processes
 *
 * The framebuffer description in args.fbinfo is copied from `fix` and `var`,
 * which are defined outside this function.
 */
void update_particles_epiphany(Particle* particles, ParticleV* state, int n, int s, float dt, float es, int iter, int cores)
{
	int N = n*s;
	size_t p_bytes = (size_t)N*sizeof(Particle);
	size_t v_bytes = (size_t)N*sizeof(ParticleV);

	// open device for threads
	int dd = coprthr_dopen(COPRTHR_DEVICE_E32,COPRTHR_O_THREAD);
	printf("dd=%d\n",dd);
	if (dd<0) ERROR("device open failed\n");

	coprthr_program_t prg;
	if (s==1)
		prg = coprthr_cc_read_bin("./mpi_tfunc.cbin.3.e32", 0);
	else
		prg = coprthr_cc_read_bin("./mpi_tfunc2.cbin.3.e32", 0); // special off-chip thread function
	coprthr_sym_t thr = coprthr_getsym(prg,"nbody_thread");
	printf("prg=%p thr=%p\n",prg,thr);

	// write data to shared DRAM
	coprthr_mem_t p_mem = coprthr_dmalloc(dd,p_bytes, 0);
	coprthr_dwrite(dd,p_mem,0,particles,p_bytes,COPRTHR_E_WAIT);
	coprthr_mem_t pn_mem = coprthr_dmalloc(dd,p_bytes, 0); // special off-chip memory
	coprthr_mem_t v_mem = coprthr_dmalloc(dd,v_bytes, 0);
	coprthr_dwrite(dd,v_mem,0,state,v_bytes,COPRTHR_E_WAIT);

	my_args_t args;
	args.n = n;
	args.s = s;
	args.cnt = iter;
	args.dt = dt;
	args.es = es;
	args.p = coprthr_memptr(p_mem,0);
	args.pn = coprthr_memptr(pn_mem,0);
	args.v = coprthr_memptr(v_mem,0);
	args.fbinfo.smem_start = fix.smem_start;
	args.fbinfo.smem_len = fix.smem_len;
	args.fbinfo.line_length = fix.line_length;
	args.fbinfo.xres = var.xres;
	args.fbinfo.yres = var.yres;
	args.fbinfo.xres_virtual = var.xres_virtual;
	args.fbinfo.yres_virtual = var.yres_virtual;
	args.fbinfo.xoffset = var.xoffset;
	args.fbinfo.yoffset = var.yoffset;
	args.fbinfo.bits_per_pixel = var.bits_per_pixel;

	system("clear");

	coprthr_mpiexec(dd, cores, thr, &args, sizeof(args),0);

	// read back data from memory on device
	// With s>1 and an odd iter the result is taken from pn_mem, otherwise
	// from p_mem (presumably the device alternates between the two buffers
	// each iteration -- confirm against the thread function).
	coprthr_mem_t p_mem_switch = (s>1 && (iter+2)%2) ? pn_mem : p_mem;
	coprthr_dread(dd,p_mem_switch,0,particles,p_bytes,COPRTHR_E_WAIT);

	// release the device buffers before closing; they were previously
	// leaked on every call
	coprthr_dfree(dd,v_mem);
	coprthr_dfree(dd,pn_mem);
	coprthr_dfree(dd,p_mem);

	coprthr_dclose(dd);
}
int main(int argc, char* argv[]) { int i; int n = SIZE; /* open device for threads */ int dd = coprthr_dopen(COPRTHR_DEVICE_E32,COPRTHR_O_THREAD); /* compile thread function */ coprthr_program_t prg = coprthr_cc_read_bin("./memory_device.e32",0); coprthr_sym_t thr = coprthr_getsym(prg,"my_thread"); printf("dd=%d prg=%p krn=%p\n",dd,prg,thr); /* allocate memory shared with coprocessor device */ coprthr_mem_t aa_mem = coprthr_dmalloc(dd,n*sizeof(int),0); coprthr_mem_t bb_mem = coprthr_dmalloc(dd,n*sizeof(int),0); coprthr_mem_t cc_mem = coprthr_dmalloc(dd,n*sizeof(int),0); int* aa = (int*)coprthr_memptr(aa_mem,0); int* bb = (int*)coprthr_memptr(bb_mem,0); int* cc = (int*)coprthr_memptr(cc_mem,0); /* set args to pass to thread on coprocessor device */ coprthr_mem_t args_mem = coprthr_dmalloc(dd,sizeof(struct my_args),0); struct my_args* pargs = (struct my_args*)coprthr_memptr(args_mem,0); pargs->n = n; pargs->aa = aa, pargs->bb = bb, pargs->cc = cc; /* initialize A, B, and C arrays */ for (i=0; i<n; i++) { aa[i] = i; bb[i] = 2*i; cc[i] = 3; } // Execute kernel on coprocessor device coprthr_dexec(dd,16,thr,(void*)&args_mem, 0 ); coprthr_dwait(dd); for(i=0; i<n; i++) printf("%d: %d + %d = %d\n",i,aa[i],bb[i],cc[i]); /* clean up */ coprthr_dfree(dd,args_mem); coprthr_dfree(dd,aa_mem); coprthr_dfree(dd,bb_mem); coprthr_dfree(dd,cc_mem); coprthr_dclose(dd); }
void update_stencil_epiphany(float* A, float* B, int ni, int nj, int di, int dj, int niter, float w0, float w1, float w2, float w3, float w4) { int dd = coprthr_dopen(COPRTHR_DEVICE_E32,COPRTHR_O_THREAD); printf("dd=%d\n",dd); if (dd<0) ERROR("device open failed\n"); coprthr_mem_t A_mem = coprthr_dmalloc(dd,ni*nj*sizeof(float),0); coprthr_mem_t B_mem = coprthr_dmalloc(dd,ni*nj*sizeof(float),0); coprthr_program_t prg = coprthr_cc_read_bin("./mpi_tfunc.cbin.3.e32", 0); coprthr_sym_t thr = coprthr_getsym(prg,"stencil_thread"); printf("prg=%p thr=%p\n",prg,thr); coprthr_dwrite(dd,A_mem,0,A,ni*nj*sizeof(float),COPRTHR_E_WAIT); coprthr_dwrite(dd,B_mem,0,B,ni*nj*sizeof(float),COPRTHR_E_WAIT); // should really copy this on device my_args_t args = { .ni = ni, .nj = nj, .di = di, .dj = dj, .niter = niter, .A = coprthr_memptr(A_mem,0), .B = coprthr_memptr(B_mem,0), .w0 = w0, .w1 = w1, .w2 = w2, .w3 = w3, .w4 = w4 }; coprthr_mpiexec(dd, di*dj, thr, &args, sizeof(args),0); coprthr_dread(dd,B_mem,0,B,ni*nj*sizeof(float),COPRTHR_E_WAIT); print_stencil(B, ni, nj); } void update_stencil_cpu(float* A, float* B, int ni, int nj, int niter, float w0, float w1, float w2, float w3, float w4) { // this does not handle edges here, which must be initialized in both A and B int i, j, iter = niter; while(iter--) { for (j=1; j<nj-1; j++) { for (i=1; i<ni-1; i++) { int x = j*ni+i; B[x] = w0*A[x-1] + w1*A[x] + w2*A[x+1] + w3*A[x-ni] + w4*A[x+ni]; } } float* tmp = B; B = A; A = tmp; } if(niter%2 == 0) for (j=1; j<nj-1; j++) for (i=1; i<ni-1; i++) B[j*ni+i] = A[j*ni+i]; }
int main() { int i; int dd = coprthr_dopen(TEST_COPRTHR_DEVICE,COPRTHR_O_STREAM); printf("dd=%d\n",dd); coprthr_program_t prg = coprthr_dcompile(dd,src,sizeof(src),"",0); coprthr_kernel_t krn = coprthr_getsym(prg,"my_kern"); printf("prg=%p krn=%p\n",prg,krn); float* a = (float*)malloc(SIZE*sizeof(float)); float* b = (float*)malloc(SIZE*sizeof(float)); float* c = (float*)malloc(SIZE*sizeof(float)); for(i=0; i<SIZE; i++) { a[i] = 1.0f * i; b[i] = 2.0f * i; c[i] = 0.0f; } coprthr_mem_t mema = coprthr_dmalloc(dd,SIZE*sizeof(float),0); coprthr_mem_t memb = coprthr_dmalloc(dd,SIZE*sizeof(float),0); coprthr_mem_t memc = coprthr_dmalloc(dd,SIZE*sizeof(float),0); coprthr_dwrite(dd,mema,0,a,SIZE*sizeof(float),COPRTHR_E_NOWAIT); coprthr_dwrite(dd,memb,0,b,SIZE*sizeof(float),COPRTHR_E_NOWAIT); coprthr_dwrite(dd,memc,0,c,SIZE*sizeof(float),COPRTHR_E_NOWAIT); unsigned int nargs = 3; void* args[] = { &mema, &memb, &memc }; unsigned int nthr = SIZE; coprthr_dexec(dd,krn,nargs,args,nthr,0,COPRTHR_E_NOWAIT); coprthr_dcopy(dd,memc,0,memb,0,SIZE*sizeof(float),COPRTHR_E_NOWAIT); coprthr_kernel_t v_krn[] = { krn, krn }; unsigned int v_nargs[] = { nargs, nargs }; void** v_args[] = { args, args }; unsigned int v_nthr[] = { nthr, nthr }; coprthr_dnexec(dd,1,v_krn,v_nargs,v_args,v_nthr,0,COPRTHR_E_NOWAIT); coprthr_dread(dd,memc,0,c,SIZE*sizeof(float),COPRTHR_E_NOWAIT); coprthr_dwait(dd); for(i=0; i<SIZE; i++) printf("%f + %f = %f\n",a[i],b[i],c[i]); coprthr_dfree(dd,mema); coprthr_dfree(dd,memb); coprthr_dfree(dd,memc); free(a); free(b); free(c); coprthr_dclose(dd); }
int main() { int i; int dd = coprthr_dopen(COPRTHR_DEVICE_X86_64,COPRTHR_O_STREAM); printf("dd=%d\n",dd); coprthr_program_t prg = coprthr_dcompile(dd,src,sizeof(src),"",0); coprthr_kernel_t krn = coprthr_getsym(prg,"my_kern"); printf("prg=%p krn=%p\n",prg,krn); int* a = (int*)malloc(SIZE*sizeof(int)); int* b = (int*)malloc(SIZE*sizeof(int)); int* c = (int*)malloc(SIZE*sizeof(int)); int* d = (int*)malloc(SIZE*sizeof(int)); for(i=0; i<SIZE; i++) { a[i] = 1 * i; b[i] = 2 * i; c[i] = 0; d[i] = 0; } coprthr_mem_t mema = coprthr_dmalloc(dd,SIZE*sizeof(int),0); coprthr_mem_t memb = coprthr_dmalloc(dd,SIZE*sizeof(int),0); coprthr_mem_t memc = coprthr_dmalloc(dd,SIZE*sizeof(int),0); coprthr_mem_t memd = coprthr_dmalloc(dd,SIZE*sizeof(int),0); coprthr_event_t ev[10]; ev[0] = coprthr_dwrite(dd,mema,0,a,SIZE*sizeof(float),COPRTHR_E_NOWAIT); ev[1] = coprthr_dwrite(dd,memb,0,b,SIZE*sizeof(float),COPRTHR_E_NOWAIT); ev[2] = coprthr_dwrite(dd,memc,0,c,SIZE*sizeof(float),COPRTHR_E_NOWAIT); ev[3] = coprthr_dwrite(dd,memd,0,d,SIZE*sizeof(float),COPRTHR_E_NOWAIT); for(i=0;i<4;i++) coprthr_dwaitev(dd,ev[i]); unsigned int nargs = 4; void* args[] = { &mema, &memb, &memc, &memd }; unsigned int nthr = SIZE; ev[4] = coprthr_dexec(dd,krn,nargs,args,nthr,0,COPRTHR_E_NOWAIT); ev[5] = coprthr_dread(dd,memc,0,c,SIZE*sizeof(float),COPRTHR_E_NOWAIT); for(i=0; i<SIZE; i++) d[i] = 1; coprthr_dwrite(dd,memd,0,d,SIZE*sizeof(float),COPRTHR_E_NOW); for(i=4;i<6;i++) coprthr_dwaitev(dd,ev[i]); for(i=0; i<SIZE; i++) printf("%d + %d = %d\n",a[i],b[i],c[i]); coprthr_dfree(dd,mema); coprthr_dfree(dd,memb); coprthr_dfree(dd,memc); free(a); free(b); free(c); coprthr_dclose(dd); }