/*
 * Finish the t-matrix exchange on the calling PE.
 *
 * The active code is a single shmem_quiet() call (in OpenSHMEM this blocks
 * until the puts previously issued by the calling PE are complete).  `comm`
 * is not read by the active code.  The MPI_Wait-based implementation that
 * this replaced is kept below, commented out, for reference.
 */
void finalizeTmatCommunication(LSMSCommunication &comm)
{
  // TODO: make it non-blocking ... beware of the memcpy & temp_buff
  shmem_quiet();
/*
  MPI_Status status;
  for(int i=0; i<comm.numTmatFrom; i++)
  {
    int from=comm.tmatFrom[i].remoteNode;
    for(int j=0; j<comm.tmatFrom[i].numTmats; j++)
    {
      // printf("Finalize recieve request %d from node %d\n",j,from);
      MPI_Wait(&comm.tmatFrom[i].communicationRequest[j],&status);
    }
  }
  for(int i=0; i<comm.numTmatTo; i++)
  {
    int to=comm.tmatTo[i].remoteNode;
    for(int j=0; j<comm.tmatTo[i].numTmats; j++)
    {
      // printf("Finalize send request %d to node %d\n",j,to);
      MPI_Wait(&comm.tmatTo[i].communicationRequest[j],&status);
    }
  }
*/
}
/*
 * Test driver for shmem_long_get_nbi().
 *
 * PE 0 fetches the 10-element `source` array from every PE (itself included)
 * into `target`, completes the non-blocking get with shmem_quiet(), and
 * compares the fetched copy with its own local `source`.  Every mismatch is
 * reported on stderr; after a PE whose data mismatched, the whole job is
 * terminated with shmem_global_exit(1).  All other PEs only initialise and
 * finalise the library.
 */
int main(int argc, char* argv[])
{
    int mismatch = 0;
    int pe_count;
    int pe;
    int idx;

    shmem_init();

    /* Only PE 0 drives the test; everyone else goes straight to finalize. */
    if (shmem_my_pe() != 0) {
        shmem_finalize();
        return 0;
    }

    pe_count = shmem_n_pes();
    for (pe = 0; pe < pe_count; ++pe) {
        /* Clear the landing buffer so stale data cannot mask a failed get. */
        memset(target, 0, sizeof(long) * 10);

        shmem_long_get_nbi(target, source, 10, pe);
        shmem_quiet();   /* the non-blocking get is only complete after this */

        for (idx = 0; idx < 10; ++idx) {
            if (source[idx] == target[idx]) {
                continue;
            }
            fprintf(stderr,"[%d] get_nbi from PE %d: target[%d] = %ld, expected %ld\n",
                    shmem_my_pe(), pe, idx, target[idx], source[idx]);
            mismatch = 1;
        }

        if (mismatch) {
            shmem_global_exit(1);
        }
    }

    shmem_finalize();
    return 0;
}
static void test_prepost(void) { int i, j, k; tmp = 0; total = 0; shmem_barrier_all(); for (i = 0 ; i < niters - 1 ; ++i) { cache_invalidate(); shmem_barrier_all(); tmp = timer(); for (j = 0 ; j < npeers ; ++j) { for (k = 0 ; k < nmsgs ; ++k) { shmem_putmem(recv_buf + (nbytes * (k + j * nmsgs)), send_buf + (nbytes * (k + j * nmsgs)), nbytes, send_peers[npeers - j - 1]); } } shmem_quiet(); shmem_short_wait((short*) (recv_buf + (nbytes * ((nmsgs - 1) + (npeers - 1) * nmsgs))), 0); total += (timer() - tmp); memset(recv_buf, 0, npeers * nmsgs * nbytes); } shmem_double_sum_to_all(&tmp, &total, 1, 0, 0, world_size, reduce_pWrk, reduce_pSync); display_result("pre-post", (niters * npeers * nmsgs * 2) / (tmp / world_size)); }
/**
 * Lock-based fetch-and-op on a remote symmetric variable.
 *
 * Under the per-PE lock taken by atomic_basic_lock(), the current remote
 * value is fetched, copied to @p prev, combined with @p value through the
 * reduction callback in @p op, and written back.
 *
 * @param target  remote address of the variable to update
 * @param prev    receives the value read before the update (@p size bytes);
 *                left untouched when the fetch fails
 * @param value   operand, passed by value in the low-order bits
 * @param size    size of the variable in bytes; the staging buffers are
 *                8 bytes wide.  NOTE(review): sizes above 8 are not rejected
 *                here -- confirm callers only pass 4 or 8.
 * @param pe      PE that owns @p target
 * @param op      operation descriptor supplying c_fn and dt_size
 *
 * @return OSHMEM_SUCCESS, or the error code of the failing get/put.
 *
 * Changes relative to the previous version:
 *  - the result of the get is checked *before* the fetched value is used, so
 *    a failed fetch no longer reports a bogus @p prev nor runs the callback;
 *  - the callback now receives the *address* of the operand.  It is called
 *    with a pointer to the accumulator as its second argument and an element
 *    count, so casting the integer operand itself to a pointer made it read
 *    from an arbitrary address.
 */
static inline int mca_atomic_basic_fop(void *target, void *prev, uint64_t value,
                                       size_t size, int pe,
                                       struct oshmem_op_t *op)
{
    int rc = OSHMEM_SUCCESS;
    long long temp_value = 0;

    /*
     * Stage the operand in an object of matching width so the callback reads
     * the right bytes regardless of byte order (4- and 8-byte operands).
     */
    uint64_t operand64 = value;
    uint32_t operand32 = (uint32_t) value;
    void *operand = (size == sizeof(operand32)) ? (void*) &operand32
                                                : (void*) &operand64;

    atomic_basic_lock(pe);

    rc = MCA_SPML_CALL(get(target, size, (void*)&temp_value, pe));

    if (rc == OSHMEM_SUCCESS) {
        memcpy(prev, (void*) &temp_value, size);

        op->o_func.c_fn(operand, (void*) &temp_value, size / op->dt_size);

        rc = MCA_SPML_CALL(put(target, size, (void*)&temp_value, pe));
        /* Make sure the update is complete before the lock is released. */
        shmem_quiet();
    }

    atomic_basic_unlock(pe);

    return rc;
}
static void test_one_way(void) { int i, k; int pe_size = world_size; tmp = 0; total = 0; shmem_barrier_all(); if (world_size % 2 == 1) { pe_size = world_size - 1; } if (!(world_size % 2 == 1 && rank == (world_size - 1))) { if (rank < world_size / 2) { for (i = 0 ; i < niters ; ++i) { cache_invalidate(); shmem_barrier(0, 0, pe_size, barrier_pSync); tmp = timer(); for (k = 0 ; k < nmsgs ; ++k) { shmem_putmem(recv_buf + (nbytes * k), send_buf + (nbytes * k), nbytes, rank + (world_size / 2)); } shmem_quiet(); total += (timer() - tmp); } } else { for (i = 0 ; i < niters ; ++i) { cache_invalidate(); shmem_barrier(0, 0, pe_size, barrier_pSync); tmp = timer(); shmem_short_wait((short*) (recv_buf + (nbytes * (nmsgs - 1))), 0); total += (timer() - tmp); memset(recv_buf, 0, npeers * nmsgs * nbytes); } } shmem_double_sum_to_all(&tmp, &total, 1, 0, 0, pe_size, reduce_pWrk, reduce_pSync); display_result("single direction", (niters * nmsgs) / (tmp / world_size)); } shmem_barrier_all(); }
int main(int argc, char **argv) { int j; int my_pe,n_pes; int *flag,*one; size_t max_elements,max_elements_bytes; char *srce_char,*targ_char; short *srce_short,*targ_short; int *srce_int,*targ_int; long *srce_long,*targ_long; shmem_init(); my_pe = shmem_my_pe(); n_pes = shmem_n_pes(); flag = shmem_malloc((size_t) sizeof(int)); one = shmem_malloc((size_t) sizeof(int)); *one = 1; /* fail if trying to use odd number of processors */ if ( (n_pes % 2) != 0 ){ fprintf(stderr, "FAIL - test requires even number of PEs\n"); exit(1); } if(my_pe == 0) fprintf(stderr, "shmem_num_put_nb(%s)\n", argv[0]); /* shmem_putmem_nb test */ *flag = 0; max_elements = (size_t) (MAX_SIZE / sizeof(char)); max_elements_bytes = (size_t) (sizeof(char)*max_elements); if(my_pe == 0) fprintf(stderr,"shmem_putmem_nb max_elements = %d\n",max_elements); srce_char = shmem_malloc(max_elements_bytes); targ_char = shmem_malloc(max_elements_bytes); if((srce_char == NULL) || (targ_char == NULL)) shmalloc_error(); if ( (my_pe % 2) == 0 ) for(j = 0; j < max_elements; j++) srce_char[j] = (char)(my_pe+j); else for(j = 0; j < max_elements; j++) targ_char[j] = (char)(my_pe+j); shmem_barrier_all(); if ( (my_pe % 2) == 0 ) { shmem_putmem_nb(targ_char,srce_char,max_elements,my_pe+1,NULL); shmem_quiet(); shmem_int_put(flag,one,(size_t)1,my_pe+1); } else { shmem_int_wait(flag,0); for(j = 0; j < max_elements; j++) if ( targ_char[j] != (char)(my_pe+j-1) ) fprintf(stderr, "FAIL: PE [%d] targ_char[%d]=%d my_pe+j-1=%d\n", my_pe,j,targ_char[j],my_pe+j-1); } shmem_free(srce_char); shmem_free(targ_char); /* shmem_put16_nb test */ *flag = 0; max_elements = (size_t) (MAX_SIZE / sizeof(short)); if(max_elements > 20000) max_elements=20000; max_elements_bytes = (size_t) (sizeof(short)*max_elements); if(my_pe == 0) fprintf(stderr,"shmem_put16_nb max_elements = %d\n",max_elements); srce_short = shmem_malloc(max_elements_bytes); targ_short = shmem_malloc(max_elements_bytes); if((srce_short == NULL) || (targ_short == NULL)) 
shmalloc_error(); if ( (my_pe % 2) == 0 ) for(j = 0; j < max_elements; j++) srce_short[j] = (short)(my_pe+j); else for(j = 0; j < max_elements; j++) targ_short[j] = (short)(my_pe+j); shmem_barrier_all(); if ( (my_pe % 2) == 0 ) { shmem_put16_nb(targ_short,srce_short,max_elements,my_pe+1,NULL); shmem_quiet(); shmem_int_put(flag,one,(size_t)1,my_pe+1); } else { shmem_int_wait(flag,0); for(j = 0; j < max_elements; j++) if ( targ_short[j] != (short)(my_pe+j-1) ) fprintf(stderr, "FAIL: PE [%d] targ_short[%d]=%d my_pe+j-1=%d\n", my_pe,j,targ_short[j],my_pe+j-1); } shmem_free(srce_short); shmem_free(targ_short); /* shmem_put32_nb test */ *flag = 0; max_elements = (size_t) (MAX_SIZE / sizeof(int)); max_elements_bytes = (size_t) (sizeof(int)*max_elements); if(my_pe == 0) fprintf(stderr,"shmem_put32_nb max_elements = %d\n",max_elements); srce_int = shmem_malloc(max_elements_bytes); targ_int = shmem_malloc(max_elements_bytes); if((srce_int == NULL) || (targ_int == NULL)) shmalloc_error(); if ( (my_pe % 2) == 0 ) for(j = 0; j < max_elements; j++) srce_int[j] = (int)(my_pe+j); else for(j = 0; j < max_elements; j++) targ_int[j] = (int)(my_pe+j); shmem_barrier_all(); if ( (my_pe % 2) == 0 ) { shmem_put32_nb(targ_int,srce_int,max_elements,my_pe+1,NULL); shmem_quiet(); shmem_int_put(flag,one,(size_t)1,my_pe+1); } else { shmem_int_wait(flag,0); for(j = 0; j < max_elements; j++) if ( targ_int[j] != (int)(my_pe+j-1) ) fprintf(stderr, "FAIL: PE [%d] targ_int[%d]=%d my_pe+j-1=%d\n", my_pe,j,targ_int[j],my_pe+j-1); } shmem_free(srce_int); shmem_free(targ_int); /* shmem_put64_nb test */ *flag = 0; max_elements = (size_t) (MAX_SIZE / sizeof(long)); max_elements_bytes = (size_t) (sizeof(long)*max_elements); if(my_pe == 0) fprintf(stderr,"shmem_put64_nb max_elements = %d\n",max_elements); srce_long = shmem_malloc(max_elements_bytes); targ_long = shmem_malloc(max_elements_bytes); if((srce_long == NULL) || (targ_long == NULL)) shmalloc_error(); if ( (my_pe % 2) == 0 ) for(j = 0; j < 
max_elements; j++) srce_long[j] = (long)(my_pe+j); else for(j = 0; j < max_elements; j++) targ_long[j] = (long)(my_pe+j); shmem_barrier_all(); if ( (my_pe % 2) == 0 ) { shmem_put64_nb(targ_long,srce_long,max_elements,my_pe+1,NULL); shmem_quiet(); shmem_int_put(flag,one,(size_t)1,my_pe+1); } else { shmem_int_wait(flag,0); for(j = 0; j < max_elements; j++) if ( targ_long[j] != (long)(my_pe+j-1) ) fprintf(stderr, "FAIL: PE [%d] targ_long[%d]=%d my_pe+j-1=%d\n", my_pe,j,targ_long[j],my_pe+j-1); } shmem_free(srce_long); shmem_free(targ_long); /* shmem_put128_nb test */ *flag = 0; max_elements = (size_t) (MAX_SIZE / sizeof(long)); if ( (max_elements % 2) != 0) max_elements = max_elements-1; max_elements_bytes = (size_t) (sizeof(long)*max_elements); max_elements = max_elements/2; if(my_pe == 0) fprintf(stderr,"shmem_put128_nb max_elements = %d\n",max_elements); srce_long = shmem_malloc(max_elements_bytes); targ_long = shmem_malloc(max_elements_bytes); if((srce_long == NULL) || (targ_long == NULL)) shmalloc_error(); if ( (my_pe % 2) == 0 ) for(j = 0; j < 2*max_elements; j++) srce_long[j] = (long)(my_pe+j); else for(j = 0; j < 2*max_elements; j++) targ_long[j] = (long)(my_pe+j); shmem_barrier_all(); if ( (my_pe % 2) == 0 ) { shmem_put128_nb(targ_long,srce_long,max_elements,my_pe+1,NULL); shmem_quiet(); shmem_int_put(flag,one,(size_t)1,my_pe+1); } else { shmem_int_wait(flag,0); for(j = 0; j < 2*max_elements; j++) if ( targ_long[j] != (long)(my_pe+j-1) ) fprintf(stderr, "FAIL: PE [%d] targ_long[%d]=%d my_pe+j-1=%d\n", my_pe,j,targ_long[j],my_pe+j-1); } shmem_free(srce_long); shmem_free(targ_long); #ifdef SHMEM_C_GENERIC_32 /* shmem_put_nb (GENERIC 32) test */ *flag = 0; max_elements = (size_t) (MAX_SIZE / sizeof(int)); max_elements_bytes = (size_t) (sizeof(int)*max_elements); if(my_pe == 0) fprintf(stderr,"shmem_put_nb (GENERIC 32) max_elements = %d\n",max_elements); srce_int = shmem_malloc(max_elements_bytes); targ_int = shmem_malloc(max_elements_bytes); if((srce_int == 
NULL) || (targ_int == NULL)) shmalloc_error(); if ( (my_pe % 2) == 0 ) for(j = 0; j < max_elements; j++) srce_int[j] = (int)(my_pe+j); else for(j = 0; j < max_elements; j++) targ_int[j] = (int)(my_pe+j); shmem_barrier_all(); if ( (my_pe % 2) == 0 ) { shmem_put_nb(targ_int,srce_int,max_elements,my_pe+1,NULL); shmem_quiet(); shmem_int_put(flag,one,(size_t)1,my_pe+1); } else { shmem_int_wait(flag,0); for(j = 0; j < max_elements; j++) if ( targ_int[j] != (int)(my_pe+j-1) ) fprintf(stderr, "FAIL: PE [%d] targ_int[%d]=%d my_pe+j-1=%d\n", my_pe,j,targ_int[j],my_pe+j-1); } shmem_free(srce_int); shmem_free(targ_int); #else /* shmem_put_nb (GENERIC 64) test */ *flag = 0; max_elements = (size_t) (MAX_SIZE / sizeof(long)); max_elements_bytes = (size_t) (sizeof(long)*max_elements); if(my_pe == 0) fprintf(stderr,"shmem_put_nb (GENERIC 64) max_elements = %d\n",max_elements); srce_long = shmem_malloc(max_elements_bytes); targ_long = shmem_malloc(max_elements_bytes); if((srce_long == NULL) || (targ_long == NULL)) shmalloc_error(); if ( (my_pe % 2) == 0 ) for(j = 0; j < max_elements; j++) srce_long[j] = (long)(my_pe+j); else for(j = 0; j < max_elements; j++) targ_long[j] = (long)(my_pe+j); shmem_barrier_all(); if ( (my_pe % 2) == 0 ) { shmem_put_nb(targ_long,srce_long,max_elements,my_pe+1,NULL); shmem_quiet(); shmem_int_put(flag,one,(size_t)1,my_pe+1); } else { shmem_int_wait(flag,0); for(j = 0; j < max_elements; j++) if ( targ_long[j] != (long)(my_pe+j-1) ) fprintf(stderr, "FAIL: PE [%d] targ_long[%d]=%d my_pe+j-1=%d\n", my_pe,j,targ_long[j],my_pe+j-1); } shmem_free(srce_long); shmem_free(targ_long); #endif #ifdef NEEDS_FINALIZE shmem_finalize(); #endif return 0; }
int main(int argc, char **argv) { const int ITER_CNT = 100; const long int MAX_MSG_SIZE = 1048576; int* source_addr; int peer; long int i=0, buff_size; int j=0; long long int start_time, stop_time, res; double time; shmem_init(); int pe_id = shmem_my_pe(); source_addr = (int*) shmem_malloc(MAX_MSG_SIZE); if(pe_id == 1) { if(shmem_n_pes()!=4) fprintf(stderr,"Num PEs should be ==4"); printf("#Message Cnt;Time(s);MR(msgs/sec)\n"); } if (pe_id==1) peer = 3; else if(pe_id==3) peer = 1; get_rtc_res_(&res); for (i = 0; i < SHMEM_BARRIER_SYNC_SIZE; i += 1){ pSync[i] = SHMEM_SYNC_VALUE; } /* Collective operation: Implicit barrier on return from attach */ shmem_barrier_all(); if(pe_id == 1 || pe_id == 3) { for(buff_size=1; buff_size<=MAX_MSG_SIZE; buff_size*=2) { isdone=0; shmem_barrier(1,1,2,pSync); get_rtc_(&start_time); for(j=1;j<=ITER_CNT;j++) { shmem_putmem(source_addr, source_addr, buff_size, peer); shmem_quiet(); shmem_int_put(&isdone, &j, 1, peer); shmem_quiet(); shmem_int_wait(&isdone,j-1); shmem_putmem(source_addr, source_addr, buff_size, peer); shmem_quiet(); } shmem_barrier(1,1,2,pSync); get_rtc_(&stop_time); time = (stop_time - start_time)*1.0/(double)res/ITER_CNT; if(pe_id == 1) { printf("%20ld;%20.12f;%20.12f\n", buff_size, time, (double)buff_size/time); } fflush(stdout); } } shmem_barrier_all(); shmem_finalize(); }
/*
 * Test driver for the shmem_*_swap() atomics (int, float, double, long,
 * long long).
 *
 * Each of the `loops` rounds allocates one symmetric array per type holding
 * 2*num_procs elements; the first half is used as `src_*`, the second half
 * as `dst_*`.  src elements start as 4 / 4.0 / 8.0 / 8 / 16, dst as 0.
 *
 * PE 0 swaps 0 into element j of every other PE j, checks the value returned
 * by the swap, reads the remote element back with shmem_*_g(), swaps the old
 * value back in and verifies it again.  The other PEs wait until their own
 * src_int / src_long / src_llong element becomes 0.  The integer sections
 * are separated by shmem_barrier_all() calls that pair one-to-one between
 * the PE-0 branch and the branch taken by every other PE (three barriers on
 * each side), so their order must not be changed.
 *
 * Any failed check prints a message and calls exit(1).  Returns 1 when run
 * with fewer than two PEs, otherwise 0.
 */
int main(int argc, char* argv[])
{
    int me, num_procs, l, j;
    int Verbose = 0;

    start_pes(0);
    me = _my_pe();
    num_procs = _num_pes();

    if ( num_procs < 2 ) {
        if (me ==0)
            printf("PE[0] requires 2 or more PEs?\n");
        return 1;
    }

    for (l = 0 ; l < loops ; ++l) {

        /* Allocate and initialise the src/dst pair for every type. */
        if ((src_int = shmalloc(2*num_procs*sizeof(int))) == NULL) {
            printf("PE-%d int shmalloc() failed?\n", me);
            exit(1);
        }
        dst_int = &src_int[num_procs];
        for(j=0; j < num_procs; j++) {
            src_int[j] = 4;
            dst_int[j] = 0;
        }

        if ((src_float = shmalloc(2*num_procs*sizeof(float))) == NULL) {
            printf("PE-%d float shmalloc() failed?\n", me);
            exit(1);
        }
        dst_float = &src_float[num_procs];
        for(j=0; j < num_procs; j++) {
            src_float[j] = 4.0;
            dst_float[j] = 0.0;
        }

        if ((src_double = shmalloc(2*num_procs*sizeof(double))) == NULL) {
            printf("PE-%d double shmalloc() failed?\n", me);
            exit(1);
        }
        dst_double = &src_double[num_procs];
        for(j=0; j < num_procs; j++) {
            src_double[j] = 8.0;
            dst_double[j] = 0.0;
        }

        if ((src_long = shmalloc(2*num_procs*sizeof(long))) == NULL) {
            printf("PE-%d long shmalloc() failed?\n", me);
            exit(1);
        }
        dst_long = &src_long[num_procs];
        for(j=0; j < num_procs; j++) {
            src_long[j] = 8;
            dst_long[j] = 0;
        }

        /* NOTE(review): the failure message below says "long" although this
           is the long long allocation. */
        if ((src_llong = shmalloc(2*num_procs*sizeof(long long))) == NULL) {
            printf("PE-%d long shmalloc() failed?\n", me);
            exit(1);
        }
        dst_llong = &src_llong[num_procs];
        for(j=0; j < num_procs; j++) {
            src_llong[j] = 16;
            dst_llong[j] = 0;
        }

        shmem_barrier_all();

        if ( me != 0 ) {
            /* is 'src_*' accessible from PE0? should be. */
            if (!shmem_addr_accessible(src_int,0)) {
                printf("PE-%d local src_int %p not accessible from PE-%d?\n", me, (void*)src_int, 0);
                exit(1);
            }
            if (!shmem_addr_accessible(src_float,0)) {
                printf("PE-%d local src_float %p not accessible from PE-%d?\n", me, (void*)src_float, 0);
                exit(1);
            }
            if (!shmem_addr_accessible(src_double,0)) {
                printf("PE-%d local src_double %p not accessible from PE-%d?\n", me, (void*)src_double, 0);
                exit(1);
            }
            if (!shmem_addr_accessible(src_long,0)) {
                printf("PE-%d local src_long %p not accessible from PE-%d?\n", me, (void*)src_long, 0);
                exit(1);
            }
            if (!shmem_addr_accessible(src_llong,0)) {
                printf("PE-%d local src_llong %p not accessible from PE-%d?\n", me, (void*)src_llong, 0);
                exit(1);
            }
        }
        shmem_barrier_all();

        if ( me == 0 ) {
            shmem_quiet();

            /* int: swap 0 into every remote element, expect the initial 4 back */
            for(j=1; j < num_procs; j++) {
                dst_int[j] = shmem_int_swap(src_int+j,0,j);
                if (dst_int[j] != 4) {
                    printf("PE-%d dst_int[%d] %d != 4?\n",me,j,dst_int[j]);
                    exit(1);
                }
            }
            shmem_barrier_all();   /* pairs with the barrier after the int wait_until */

            /* verify remote data */
            for(j=1; j < num_procs; j++) {
                itmp = shmem_int_g(src_int+j,j);
                if (itmp != 0) {
                    printf("PE-0 int PE[%d] rem(%d) != 0?\n",j,itmp);
                    exit(1);
                }
                /* swap back */
                dst_int[j] = shmem_int_swap(src_int+j,dst_int[j],j);
                if (dst_int[j] != 0) {
                    printf("PE-0 dst_int[%d] %d != 0?\n",j,dst_int[j]);
                    exit(1);
                }
                itmp = shmem_int_g(src_int+j,j);
                if (itmp != 4) {
                    printf("PE-0 PE[%d] rem %d != 4?\n",j,itmp);
                    exit(1);
                }
            }

            /* float: swap, verify and swap back within one pass (no barrier) */
            for(j=1; j < num_procs; j++) {
                dst_float[j] = shmem_float_swap(src_float+j,0.0,j);
                if (dst_float[j] != 4.0) {
                    printf("PE-0 dst_float[%d] %f != 4.0?\n",j,dst_float[j]);
                    exit(1);
                }
                /* verify remote data */
                ftmp = shmem_float_g(src_float+j,j);
                if (ftmp != 0.0) {
                    printf("PE-0 float rem(%f) != 0.0?\n",ftmp);
                    exit(1);
                }
                /* swap back */
                dst_float[j] = shmem_float_swap(src_float+j,dst_float[j],j);
                if (dst_float[j] != 0.0) {
                    printf("PE-0 dst_float[%d] %f != 0.0?\n",j,dst_float[j]);
                    exit(1);
                }
                ftmp = shmem_float_g(src_float+j,j);
                if (ftmp != 4.0) {
                    printf("PE-%d float rem(%f) != 4.0?\n",me,ftmp);
                    exit(1);
                }
            }

            /* double: same single-pass scheme as float */
            for(j=1; j < num_procs; j++) {
                dst_double[j] = shmem_double_swap(src_double+j,0.0,j);
                if (dst_double[j] != 8.0) {
                    printf("PE-0 dst_double[%d] %f != 8.0?\n",j,dst_double[j]);
                    exit(1);
                }
                /* verify remote data */
                dtmp = shmem_double_g(src_double+j,j);
                if (dtmp != 0.0) {
                    printf("PE-0 float rem(%f) != 0.0?\n",dtmp);
                    exit(1);
                }
                dst_double[j] = shmem_double_swap(src_double+j,dst_double[j],j);
                if (dst_double[j] != 0.0) {
                    printf("PE-0 dst_double[%d] %f != 0.0?\n",j,dst_double[j]);
                    exit(1);
                }
                dtmp = shmem_double_g(src_double+j,j);
                if (dtmp != 8.0) {
                    printf("PE-0 double rem(%f) != 8.0?\n",dtmp);
                    exit(1);
                }
            }

            /* long: same two-phase scheme as int */
            for(j=1; j < num_procs; j++) {
                dst_long[j] = shmem_long_swap(src_long+j,0,j);
                if (dst_long[j] != 8) {
                    printf("PE-0 dst_long[%d] %ld != 8?\n",j,dst_long[j]);
                    exit(1);
                }
            }
            shmem_barrier_all();   /* pairs with the barrier after the long wait_until */

            /* verify remote data */
            for(j=1; j < num_procs; j++) {
                ltmp = shmem_long_g(src_long+j,j);
                if (ltmp != 0) {
                    printf("PE-0 PE[%d]long rem(%ld) != 0?\n",j,ltmp);
                    exit(1);
                }
                /* swap back */
                dst_long[j] = shmem_long_swap(src_long+j,dst_long[j],j);
                if (dst_long[j] != 0) {
                    printf("PE-%d dst_long[%d] %ld != 0?\n",me,j,dst_long[j]);
                    exit(1);
                }
                ltmp = shmem_long_g(src_long+j,j);
                if (ltmp != 8) {
                    printf("PE-%d long rem(%ld) != 8?\n",me,ltmp);
                    exit(1);
                }
            }

            /* long long: same two-phase scheme as int */
            for(j=1; j < num_procs; j++) {
                dst_llong[j] = shmem_longlong_swap(src_llong+j,0,j);
                if (dst_llong[j] != 16) {
                    printf("PE-%d dst_llong[%d] %lld != 16?\n",me,j,dst_llong[j]);
                    exit(1);
                }
            }
            shmem_barrier_all();   /* pairs with the barrier after the long long wait_until */

            /* verify remote data */
            for(j=1; j < num_procs; j++) {
                lltmp = shmem_longlong_g(src_llong+j,j);
                if (lltmp != 0) {
                    printf("PE-%d long long rem(%lld) != 0?\n",me,lltmp);
                    exit(1);
                }
                /* swap back */
                dst_llong[j] = shmem_longlong_swap(src_llong+j,dst_llong[j],j);
                if (dst_llong[j] != 0) {
                    printf("PE-%d dst_llong[%d] %lld != 0?\n", me,j,dst_llong[j]);
                    exit(1);
                }
                lltmp = shmem_longlong_g(src_llong+j,j);
                if (lltmp != 16) {
                    printf("PE-%d longlong rem(%lld) != 16?\n",me,lltmp);
                    exit(1);
                }
            }
        }
        else {
            /* Every other PE: wait for PE 0 to swap 0 into this PE's own
               element, then join the barrier that lets PE 0 start verifying. */
            shmem_int_wait_until(&src_int[me],SHMEM_CMP_EQ,0);
            shmem_barrier_all();

            shmem_long_wait_until(&src_long[me],SHMEM_CMP_EQ,0);
            shmem_barrier_all();

            shmem_longlong_wait_until(&src_llong[me],SHMEM_CMP_EQ,0);
            shmem_barrier_all();
        }

        shmem_barrier_all();

        /* dst_* point into the src_* allocations, so only src_* is freed. */
        shfree(src_int);
        shfree(src_float);
        shfree(src_double);
        shfree(src_long);
        shfree(src_llong);
    }

    if (Verbose)
        fprintf(stderr,"[%d] exit\n",_my_pe());

    return 0;
}
int hyperquick(int *A, int N, int npes){ int pivot; int i; //the step two of algo.....broadcast the new pivot //pivot = quicksort(A, 0, n-1); next_pivot = A[N/2]; //the median //shmem_barrier_all(); //printf("(%d) N= %d\n",me,N); shmem_broadcast32(&next_pivot,&next_pivot,1,0,0,0,npes,pSync); shmem_barrier_all(); /*printf("Process %d the pivot:%d",me, pivot); shmem_barrier_all(); //just for the sake of clear display...can be removed in the end printf("\nThe sorted list is of process %d: ",me); for(i=0;i<N/npes;i++){ printf("%d, ",A[i]); } printf("\n");*/ printf("the new pivot of process %d: %d\n",me,next_pivot); // to check the broadcast of new pivots int check,j; //to check the division of the sorted arrays according to the new pivot. shmem_barrier_all(); check = uplowPartition(A, next_pivot, N*npes, npes); shmem_barrier_all(); printf("(%d)",me); for(int j=0;j<N;j++){ printf("%d, ",A[j]); } printf("new partition: %d",check); shmem_barrier_all(); printf("\n"); if(me < npes/2){ printf("\n"); pe = me +npes/2; nelems[0] = N - check; printf (" process %d pe : %d nelems : %d\n",me,pe,nelems[0]);//to test the value printf("(%d) addr = %d , value = %d , pe = %d\n ",me, &nelems_import[0],nelems[0],pe);//to test the value shmem_int_p(nelems_import,nelems[0],pe); shmem_quiet(); shmem_int_put(temp_arr,&A[check],nelems[0],pe); } shmem_barrier_all();//check if the entire barrier is needed if(me >= npes/ 2){ pe = me-npes/2;//check if it is synced nelems[0]= check; printf (" process %d pe : %d nelems : %d\n",me,pe,nelems[0]);//to test the value shmem_int_p(nelems_import,nelems[0],pe); shmem_quiet(); shmem_int_put(temp_arr,A,nelems[0],pe); } shmem_barrier_all();//again sync is required...check it with profiling //this snippet is to check if the processors have got the high and low lists respectively ------------------- printf("(%d) nelems_import = %d\n",me,nelems_import[0]);//to test the value printf("(%d) new elements = ",me); for(i=0;i<nelems_import[0];i++){ printf("%d, 
",temp_arr[i]); } printf("\n"); //------------------------------------here this checking snippet ends---- //----------------------------------merging of arrays begin------------------------- if(me < npes/2){ i=0; for(j=nelems_import[0];j<(nelems_import[0]+check);j++){ temp_arr[j] = A[i]; i++; } } if(me >= npes/2){ i=check; for(j=nelems_import[0];j<(nelems_import[0]+N-check);j++){ temp_arr[j] = A[i]; i++; } } shmem_barrier_all(); //to test if the arrays are merged properly int size; if(me < npes/2){ size = (nelems_import[0]+check); printf("(%d) merged array:",me); for(j=0;j<size;j++){ printf("%d, ",temp_arr[j]); } printf("\n"); } if(me >= npes/2){ size = (nelems_import[0]+N-check); printf("(%d) merged array:",me); for(j=0;j<size;j++){ printf("%d, ",temp_arr[j]); } printf("\n"); } //-----------------------check of merging finishes-------- //--------------------------------------------------merging finishes------------------------------ //-----------------------sort again----------------------------------------------- if(me < npes/2){ quicksort(temp_arr,0,(nelems_import[0]+check-1)); } if(me >= npes/2){ quicksort(temp_arr,0,(nelems_import[0]+N-check-1)); } //sorting routine checked...once program is done we can remove this part------------- shmem_barrier_all();//test purpose only if(me < npes/2){ printf("(%d) sorted list: ",me); for(i=0;i<size;i++){ printf("%d, ",temp_arr[i]); A[i] = temp_arr[i]; } printf("\n"); } if(me >= npes/2){ printf("(%d) sorted list: ",me); for(i=0;i<size;i++){ printf("%d, ",temp_arr[i]); A[i] = temp_arr[i]; } printf("\n"); } //------------------------------------------------------------- //--------------------------------------------------------------------------------- //hyperquick(A,size,npes/2); }
/*
 * JNI entry point; by the JNI naming scheme this implements the native
 * method quiet() of Java class shmem.ShMem.  It forwards to shmem_quiet()
 * and uses neither `env` nor `clazz`.
 */
JNIEXPORT void JNICALL Java_shmem_ShMem_quiet(JNIEnv *env, jclass clazz)
{
    shmem_quiet();
}
void communicateSingleAtomData(LSMSCommunication &comm, int from, int to, int &local_id, AtomData &atom, int tag) { //The buffers used in this func are pre-allocated within initializeCommunication() of size 's' below //int s=sizeof(AtomData)+sizeof(Real)*(2*3*MAXPTS+2*MAXCORE)+sizeof(int)*3*2*MAXCORE+sizeof(int); // 304 bytes transferred in each of the ITER_MAX iterations const int maxPts=MAXPTS; const int maxCore=MAXCORE; int t,i; static int count=0; const int ITER_MAX=1; int sec_id; if(comm.comm.rank==from) { for (i=0;i<ITER_MAX;i++){ int pos=0; memcpy(&p2p_buf[pos],&local_id,int_size); pos+=int_size; memcpy(&p2p_buf[pos],&atom.jmt,int_size); pos+=int_size; memcpy(&p2p_buf[pos],&atom.jws,int_size); pos+=int_size; memcpy(&p2p_buf[pos],&atom.xstart,double_size); pos+=double_size; memcpy(&p2p_buf[pos],&atom.rmt,double_size); pos+=double_size; memcpy(&p2p_buf[pos],atom.header,80*char_size); pos+=80*char_size; memcpy(&p2p_buf[pos],&atom.alat,double_size); pos+=double_size; memcpy(&p2p_buf[pos],&atom.efermi,double_size); pos+=double_size; memcpy(&p2p_buf[pos],&atom.vdif,double_size); pos+=double_size; memcpy(&p2p_buf[pos],&atom.ztotss,double_size); pos+=double_size; memcpy(&p2p_buf[pos],&atom.zcorss,double_size); pos+=double_size; memcpy(&p2p_buf[pos],atom.evec,3*double_size); pos+=3*double_size; memcpy(&p2p_buf[pos],&atom.nspin,int_size); pos+=int_size; memcpy(&p2p_buf[pos],&atom.numc,int_size); pos+=int_size; t=atom.vr.n_row(); memcpy(&p2p_buf[pos],&t,int_size); pos+=int_size; memcpy(&p2p_buf[pos],&atom.vr(0,0),2*t*double_size); pos+=2*t*double_size; memcpy(&p2p_buf[pos],&atom.rhotot(0,0),2*t*double_size); pos+=2*t*double_size; memcpy(&p2p_buf[pos],&atom.corden(0,0),2*t*double_size); pos+=2*t*double_size; t=atom.ec.n_row(); memcpy(&p2p_buf[pos],&t,int_size); pos+=int_size; memcpy(&p2p_buf[pos],&atom.ec(0,0),2*t*double_size); pos+=2*t*double_size; memcpy(&p2p_buf[pos],&atom.nc(0,0),2*t*int_size); pos+=2*t*int_size; memcpy(&p2p_buf[pos],&atom.lc(0,0),2*t*int_size); 
pos+=2*t*int_size; memcpy(&p2p_buf[pos],&atom.kc(0,0),2*t*int_size); pos+=2*t*int_size; shmem_int_wait_until((sync_send_flag+to),_SHMEM_CMP_EQ,1); shmem_putmem(p2p_buf, p2p_buf, 1048576, to); shmem_int_add((sync_send_flag+to),-1,comm.comm.rank); shmem_int_add((sync_recv_flag+comm.comm.rank),1,to); shmem_quiet(); }// end of false for loop } if(comm.comm.rank==to) { for(i=0;i<ITER_MAX;i++) { int pos=0; sync_recv_flag[from]=0; shmem_int_add((sync_send_flag+comm.comm.rank),1,from); shmem_quiet(); shmem_int_wait_until((sync_recv_flag+from),_SHMEM_CMP_EQ,1); memcpy(&local_id,&p2p_buf[pos],int_size); pos+=int_size; memcpy(&atom.jmt,&p2p_buf[pos],int_size); pos+=int_size; memcpy(&atom.jws,&p2p_buf[pos],int_size); pos+=int_size; memcpy(&atom.xstart,&p2p_buf[pos],double_size); pos+=double_size; memcpy(&atom.rmt,&p2p_buf[pos],double_size); pos+=double_size; memcpy(atom.header,&p2p_buf[pos],80*char_size); pos+=80*char_size; memcpy(&atom.alat,&p2p_buf[pos],double_size); pos+=double_size; memcpy(&atom.efermi,&p2p_buf[pos],double_size); pos+=double_size; memcpy(&atom.vdif,&p2p_buf[pos],double_size); pos+=double_size; memcpy(&atom.ztotss,&p2p_buf[pos],double_size); pos+=double_size; memcpy(&atom.zcorss,&p2p_buf[pos],double_size); pos+=double_size; memcpy(atom.evec,&p2p_buf[pos],3*double_size); pos+=3*double_size; memcpy(&atom.nspin,&p2p_buf[pos],int_size); pos+=int_size; memcpy(&atom.numc,&p2p_buf[pos],int_size); pos+=int_size; memcpy(&t,&p2p_buf[pos],int_size); pos+=int_size; if(t!=atom.vr.n_row()) atom.resizePotential(t); memcpy(&atom.vr(0,0),&p2p_buf[pos],2*t*double_size); pos+=2*t*double_size; memcpy(&atom.rhotot(0,0),&p2p_buf[pos],2*t*double_size); pos+=2*t*double_size; memcpy(&atom.corden(0,0),&p2p_buf[pos],2*t*double_size); pos+=2*t*double_size; memcpy(&t,&p2p_buf[pos],int_size); pos+=int_size; if(t!=atom.nc.n_row()) atom.resizeCore(t); memcpy(&atom.ec(0,0),&p2p_buf[pos],2*t*double_size); pos+=2*t*double_size; memcpy(&atom.nc(0,0),&p2p_buf[pos],2*t*int_size); 
pos+=2*t*int_size; memcpy(&atom.lc(0,0),&p2p_buf[pos],2*t*int_size); pos+=2*t*int_size; memcpy(&atom.kc(0,0),&p2p_buf[pos],2*t*int_size); pos+=2*t*int_size; shmem_int_add((sync_recv_flag+from),-1,comm.comm.rank); shmem_quiet(); } } }
int main(int argc, char *argv[]){ int i,n,next_pivot, pivot; long pSync[_SHMEM_BCAST_SYNC_SIZE]; for (i=0; i < SHMEM_BCAST_SYNC_SIZE; i++) { pSync[i] = _SHMEM_SYNC_VALUE; } start_pes(0); me = shmem_my_pe(); npes = shmem_n_pes(); shmem_barrier_all(); srand (me+time(NULL)); N = atoi(argv[1]); //int *nelems = (int*) shmalloc(sizeof(int)); //int *nelems_import= (int*) shmalloc(sizeof(int));; printf("%d: Size = %d with np=%d\n",me,N,npes); A = (int *)shmalloc((N/npes)*sizeof(int)); temp_arr = (int *)shmalloc((N/npes)*sizeof(int)); if(A==NULL){ printf("\nOut of memory"); return 1; } n= N/npes; i=0; while(i<N/npes){ A[i] = rand()%(10000-0); i++; } printf("\nprocess %d elements:",me); for(i=0;i<(N/npes);i++){ printf("%d, ", A[i]); } next_pivot = A[0]; //the step two of algo.....broadcast the new pivot shmem_broadcast32(&next_pivot,A,1,0,0,0,npes,pSync); shmem_barrier_all(); pivot = quicksort(A, 0, n-1); printf("Process %d the pivot:%d",me, pivot); shmem_barrier_all(); //just for the sake of clear display...can be removed in the end printf("\nThe sorted list is of process %d: ",me); for(i=0;i<n;i++){ printf("%d, ",A[i]); } printf("\n"); printf("the new pivot of process %d: %d\n",me,next_pivot); // to check the broadcast of new pivots int check,j; //to check the division of the sorted arrays according to the new pivot. 
shmem_barrier_all(); check = uplowPartition(next_pivot); shmem_barrier_all(); printf("(%d)",me); for(int j=0;j<N/npes;j++){ printf("%d, ",A[j]); } printf("new partition: %d",check); shmem_barrier_all(); if(me < npes/2) { i=0; // printf("Hello from %d", me); printf("\n"); for(j=check;j<N/npes;j++){ temp_arr[i] = A[j]; i++; } i=0; printf("(%d)",me); for(j=check;j<N/npes;j++){ printf("%d, ",temp_arr[i]) ; i++; } // printf("\n"); } shmem_barrier_all(); if(me >= npes/2) { // printf("Hello from %d", me); printf("\n"); for(j=0;j<check;j++){ temp_arr[j] = A[j]; } printf("(%d)",me); for(j=0;j<check;j++){ printf("%d, ",temp_arr[j]) ; } // printf("\n"); } shmem_barrier_all(); printf("\n"); if(me < npes/2){ printf("\n"); pe = me +npes/2; nelems[0] = N/npes - check; printf (" process %d pe : %d nelems : %d\n",me,pe,nelems[0]);//to test the value printf("(%d) addr = %d , value = %d , pe = %d\n ",me, &nelems_import[0],nelems[0],pe);//to test the value shmem_int_p(nelems_import,nelems[0],pe); shmem_quiet(); shmem_int_put(temp_arr,&A[check],nelems[0],pe); } shmem_barrier_all();//check if the entire barrier is needed if(me >= npes/ 2){ pe = me-npes/2;//check if it is synced nelems[0]= check; printf (" process %d pe : %d nelems : %d\n",me,pe,nelems[0]);//to test the value shmem_int_p(nelems_import,nelems[0],pe); shmem_quiet(); shmem_int_put(temp_arr,A,nelems[0],pe); } shmem_barrier_all();//again sync is required...check it with profiling //this snippet is to check if the processors have got the high and low lists respectively ------------------- printf("(%d) nelems_import = %d\n",me,nelems_import[0]);//to test the value printf("(%d) new elements = ",me); for(i=0;i<nelems_import[0];i++){ printf("%d, ",temp_arr[i]); } printf("\n"); //------------------------------------here this checking snippet ends---- //----------------------------------merging of arrays begin------------------------- if(me < npes/2){ i=0; for(j=nelems_import[0];j<(nelems_import[0]+check);j++){ temp_arr[j] = A[i]; 
i++; } } if(me >= npes/2){ i=check; for(j=nelems_import[0];j<(nelems_import[0]+N/npes-check);j++){ temp_arr[j] = A[i]; i++; } } shmem_barrier_all(); //to test if the arrays are merged properly int size; if(me < npes/2){ size = (nelems_import[0]+check); printf("(%d) merged array:",me); for(j=0;j<size;j++){ printf("%d, ",temp_arr[j]); } printf("\n"); } if(me >= npes/2){ size = (nelems_import[0]+N/npes-check); printf("(%d) merged array:",me); for(j=0;j<size;j++){ printf("%d, ",temp_arr[j]); } printf("\n"); } //-----------------------check of merging finishes-------- //--------------------------------------------------merging finishes------------------------------ //-----------------------sort again----------------------------------------------- if(me < npes/2){ quicksort(temp_arr,0,(nelems_import[0]+check-1)); } if(me >= npes/2){ quicksort(temp_arr,0,(nelems_import[0]+N/npes-check-1)); } //sorting routine checked...once program is done we can remove this part------------- shmem_barrier_all();//test purpose only if(me < npes/2){ printf("(%d) sorted list: ",me); for(i=0;i<size;i++){ printf("%d, ",temp_arr[i]); } printf("\n"); } if(me >= npes/2){ printf("(%d) sorted list: ",me); for(i=0;i<size;i++){ printf("%d, ",temp_arr[i]); } printf("\n"); } //------------------------------------------------------------- //--------------------------------------------------------------------------------- shfree(temp_arr); shfree(A); shmem_finalize(); }
int main(int argc, char **argv) { int i,j,iter; int my_pe,n_pes; int *flag,*one; size_t max_elements,max_elements_bytes; size_t elements[16] = {1,2,4,8,12,16,24,32,64,128,256,512,1024,2048,4096,8192}; int num_elements = 16; short *srce_short,*targ_short; int *srce_int,*targ_int; long *srce_long,*targ_long; float *srce_float,*targ_float; double *srce_double,*targ_double; shmem_init(); my_pe = shmem_my_pe(); n_pes = shmem_n_pes(); flag = shmem_malloc((size_t) sizeof(int)); one = shmem_malloc((size_t) sizeof(int)); *one = 1; /* fail if trying to use odd number of processors */ if ( (n_pes % 2) != 0 ){ fprintf(stderr, "FAIL - test requires even number of PEs\n"); exit(1); } if(my_pe == 0) fprintf(stderr, "shmem_both_put_nb_size(%s)\n", argv[0]); /* alloc arrays */ max_elements = (size_t) (MAX_SIZE / sizeof(int)); max_elements_bytes = (size_t) (sizeof(int)*max_elements); if(my_pe == 0) fprintf(stderr,"shmem_int_put_nb max_elements = %d\n",max_elements); srce_int = shmem_malloc(max_elements_bytes); targ_int = shmem_malloc(max_elements_bytes); if((srce_int == NULL) || (targ_int == NULL)) shmalloc_error(); max_elements = (size_t) (MAX_SIZE / sizeof(short)); max_elements_bytes = (size_t) (sizeof(short)*max_elements); if(my_pe == 0) fprintf(stderr,"shmem_short_put max_elements = %d\n",max_elements); srce_short = shmem_malloc(max_elements_bytes); targ_short = shmem_malloc(max_elements_bytes); if((srce_short == NULL) || (targ_short == NULL)) shmalloc_error(); max_elements = (size_t) (MAX_SIZE / sizeof(long)); max_elements_bytes = (size_t) (sizeof(long)*max_elements); if(my_pe == 0) fprintf(stderr,"shmem_long_put_nb max_elements = %d\n",max_elements); srce_long = shmem_malloc(max_elements_bytes); targ_long = shmem_malloc(max_elements_bytes); if((srce_long == NULL) || (targ_long == NULL)) shmalloc_error(); max_elements = (size_t) (MAX_SIZE / sizeof(float)); max_elements_bytes = (size_t) (sizeof(float)*max_elements); if(my_pe == 0) fprintf(stderr,"shmem_float_put_nb max_elements 
= %d\n",max_elements); srce_float = shmem_malloc(max_elements_bytes); targ_float = shmem_malloc(max_elements_bytes); if((srce_float == NULL) || (targ_float == NULL)) shmalloc_error(); max_elements = (size_t) (MAX_SIZE / sizeof(double)); max_elements_bytes = (size_t) (sizeof(double)*max_elements); if(my_pe == 0) fprintf(stderr,"shmem_double_put_nb max_elements = %d\n",max_elements); srce_double = shmem_malloc(max_elements_bytes); targ_double = shmem_malloc(max_elements_bytes); if((srce_double == NULL) || (targ_double == NULL)) shmalloc_error(); if(my_pe == 0) fprintf(stderr,"Actual value used for max_elements = %d\n",max_elements); /* try the different sizes MAX_ITER times */ for (iter = 0; iter < MAX_ITER; iter++) { for (i = 0; i < num_elements; i++) { *flag = 0; if (elements[i] <= max_elements) { if ( (my_pe % 2) == 0 ) for(j = 0; j < elements[i]; j++) { srce_short[j] = (short)(my_pe+j); srce_int[j] = (int)(iter*10000+elements[i]*100+my_pe+j); srce_long[j] = (long)(iter*10000+elements[i]*100+my_pe+j); srce_float[j] = (float)(iter*10000+elements[i]*100+my_pe+j); srce_double[j] = (double)(iter*10000+elements[i]*100+my_pe+j); } else for(j = 0; j < elements[i]; j++) { targ_short[j] = (short)(my_pe+j); targ_int[j] = (int)(iter*10000+elements[i]*100+my_pe+j); targ_long[j] = (long)(iter*10000+elements[i]*100+my_pe+j); targ_float[j] = (float)(iter*10000+elements[i]*100+my_pe+j); targ_double[j] = (double)(iter*10000+elements[i]*100+my_pe+j); } shmem_barrier_all(); if ( (my_pe % 2) == 0 ) { #ifndef OPENSHMEM shmemx_int_put_nb(targ_int,srce_int,elements[i],my_pe+1,NULL); shmemx_long_put_nb(targ_long,srce_long,elements[i],my_pe+1,NULL); shmemx_float_put_nb(targ_float,srce_float,elements[i],my_pe+1,NULL); shmemx_double_put_nb(targ_double,srce_double,elements[i],my_pe+1,NULL); #else shmem_int_put_nbi(targ_int,srce_int,elements[i],my_pe+1); shmem_long_put_nbi(targ_long,srce_long,elements[i],my_pe+1); shmem_float_put_nbi(targ_float,srce_float,elements[i],my_pe+1); 
shmem_double_put_nbi(targ_double,srce_double,elements[i],my_pe+1); #endif /* this one is blocking */ shmem_short_put(targ_short,srce_short,elements[i],my_pe+1); shmem_quiet(); shmem_int_put(flag,one,(size_t)1,my_pe+1); } else { shmem_int_wait(flag,0); for(j = 0; j < elements[i]; j++) { if ( targ_short[j] != (short)(my_pe+j-1) ) fprintf(stderr, "FAIL: PE [%d] iter=%d i=%d targ_short[%d]=%d not equal %d\n", my_pe,iter,i,j,targ_short[j],my_pe+j-1); if ( targ_int[j] != (int)(iter*10000+elements[i]*100+my_pe+j-1) ) fprintf(stderr, "FAIL: PE [%d] iter=%d i=%d targ_int[%d]=%d not equal %d\n", my_pe,iter,i,j,targ_int[j],iter*10000+elements[i]*100+my_pe+j-1); if ( targ_long[j] != (long)(iter*10000+elements[i]*100+my_pe+j-1) ) fprintf(stderr, "FAIL: PE [%d] iter=%d i=%d targ_long[%d]=%d not equal %d\n", my_pe,iter,i,j,targ_long[j],iter*10000+elements[i]*100+my_pe+j-1); if ( targ_float[j] != (float)(iter*10000+elements[i]*100+my_pe+j-1) ) fprintf(stderr, "FAIL: PE [%d] iter=%d i=%d targ_long[%d]=%f not equal %d\n", my_pe,iter,i,j,targ_float[j],iter*10000+elements[i]*100+my_pe+j-1); if ( targ_double[j] != (double)(iter*10000+elements[i]*100+my_pe+j-1) ) fprintf(stderr, "FAIL: PE [%d] iter=%d i=%d targ_double[%d]=%f not equal %d\n", my_pe,iter,i,j,targ_double[j],iter*10000+elements[i]*100+my_pe+j-1); } } } } } shmem_free(srce_short); shmem_free(targ_short); shmem_free(srce_int); shmem_free(targ_int); shmem_free(srce_long); shmem_free(targ_long); shmem_free(srce_float); shmem_free(targ_float); shmem_free(srce_double); shmem_free(targ_double); #ifdef NEEDS_FINALIZE shmem_finalize(); #endif return 0; }