int main(void) { int i,j,k; int *target; int *source; int me, npes; struct timeval start, end; long time_taken,start_time,end_time; start_pes(0); me = _my_pe(); npes = _num_pes(); source = (int *) shmalloc( N_ELEMENTS * sizeof(*source) ); time_taken = 0; for (i = 0; i < N_ELEMENTS; i += 1) { source[i] = (i + 1)*10 + me; } target = (int *) shmalloc( N_ELEMENTS * sizeof(*target)*npes ); for (i = 0; i < N_ELEMENTS * npes ; i += 1) { target[i] = -90; } for (i = 0; i < _SHMEM_COLLECT_SYNC_SIZE ; i += 1) { pSyncA[i] = _SHMEM_SYNC_VALUE; pSyncB[i] = _SHMEM_SYNC_VALUE; } shmem_barrier_all(); for(i=0;i<10000;i++){ gettimeofday(&start, NULL); start_time = (start.tv_sec * 1000000.0) + start.tv_usec; /* alternate between 2 pSync arrays to synchronize * consequent collectives of even and odd iterations */ if(i % 2) shmem_collect32(target, source, N_ELEMENTS, 0, 0, npes, pSyncA); else shmem_collect32(target, source, N_ELEMENTS, 0, 0, npes, pSyncB); gettimeofday(&end, NULL); end_time = (end.tv_sec * 1000000.0) + end.tv_usec; if(me==0){ time_taken = time_taken + (end_time - start_time); } } if(me == 0) printf("Time required to collect %d bytes of data, with %d PEs is %ld microseconds\n",(4*N_ELEMENTS * npes),npes,time_taken/10000); shmem_barrier_all(); shfree(target); shfree(source); return 0; }
/*
 * Gathers the per-PE timing 'count' arrays into one buffer.
 *
 * When timer->count_iter is not greater than zero nothing is allocated and
 * NULL is returned.  Otherwise the local timer->count values (num_iters
 * entries) are staged in a shmem_malloc'ed buffer, concatenated across
 * NUM_PES PEs with shmem_collect32(), and the resulting buffer of
 * NUM_PES * num_iters entries is returned.  The staging buffer is released
 * here; the returned buffer is not.
 */
static unsigned int * gather_rank_counts(_timer_t * const timer)
{
    unsigned int *local_counts;
    unsigned int *gathered;

    if (!(timer->count_iter > 0)) {
        return NULL;
    }

    const unsigned int num_records = NUM_PES * timer->num_iters;

    /* Stage the local counts in a shmem_malloc'ed buffer for the collective. */
    local_counts = shmem_malloc(timer->num_iters * sizeof *local_counts);
    assert(local_counts);
    memcpy(local_counts, timer->count, timer->num_iters * sizeof *local_counts);

    gathered = shmem_malloc(num_records * sizeof *gathered);
    assert(gathered);

    /* Barriers bracket the collective on both sides. */
    shmem_barrier_all();
    shmem_collect32(gathered, local_counts, timer->num_iters, 0, 0, NUM_PES, pSync);
    shmem_barrier_all();

    shmem_free(local_counts);

    return gathered;
}
/*
 * shmem_collect4 binding (name produced by the FORTRANIFY macro).
 *
 * All scalar arguments arrive by reference and are dereferenced before
 * being forwarded to shmem_collect32(); pSync arrives as int * and is
 * reinterpreted as the long * that shmem_collect32() expects.
 */
void
FORTRANIFY (shmem_collect4) (void *target, const void *source, int *nelems,
                             int *PE_start, int *logPE_stride, int *PE_size,
                             int *pSync)
{
    long *sync_array = (long *) pSync;

    shmem_collect32 (target, source,
                     *nelems,
                     *PE_start, *logPE_stride, *PE_size,
                     sync_array);
}
/*
 * Times 'loops' back-to-back shmem_collect32() calls over all PEs.
 *
 * target   - ignored on entry: it is immediately replaced by a buffer
 *            allocated here (elements * npes ints) and freed before return.
 * src      - source buffer holding 'elements' ints on every PE.
 * elements - number of 32-bit elements each PE contributes.
 * me       - calling PE's number; only PE 0 prints.
 * npes     - number of PEs taking part.
 * loops    - number of collect calls to time.
 *
 * Output is produced only when me == 0 and the file-level 'Verbose' flag
 * is set.
 */
void collect(int *target, int *src, int elements, int me, int npes, int loops)
{
    int i;
    double start_time, elapsed_time;
    /* Widen before multiplying so the products are not computed in int. */
    long elem_bytes = (long) elements * (long) sizeof(*src);
    long total_bytes = (long) loops * elem_bytes;
    long *ps, *pSync, *pSync1;

    /* One allocation holds two pSync arrays, back to back. */
    pSync = (long*) shmem_malloc( 2 * sizeof(long) * _SHMEM_COLLECT_SYNC_SIZE );
    pSync1 = &pSync[_SHMEM_COLLECT_SYNC_SIZE];
    for (i = 0; i < _SHMEM_COLLECT_SYNC_SIZE; i++) {
        pSync[i] = pSync1[i] = _SHMEM_SYNC_VALUE;
    }

    target = (int *) shmem_malloc( elements * sizeof(*target) * npes );

    if (me==0 && Verbose) {
        /* elem_bytes is a long, matching the %ld conversion. */
        fprintf(stdout,"%s: %d loops of collect32(%ld bytes) over %d PEs: ",
                __FUNCTION__,loops,elem_bytes,npes);
        fflush(stdout);
    }

    shmem_barrier_all();

    start_time = shmemx_wtime();
    for (i = 0; i < loops; i++) {
        /* Alternate pSync arrays between consecutive collectives. */
        ps = (i & 1) ? pSync1 : pSync;
        shmem_collect32( target, src, elements, 0, 0, npes, ps );
    }
    elapsed_time = shmemx_wtime() - start_time;

    if (me==0 && Verbose) {
        printf("%7.3f secs\n", elapsed_time);
        printf(" %7.5f usecs / collect32(), %ld Kbytes @ %7.4f MB/sec\n\n",
               (elapsed_time/((double)loops*npes))*1000000.0,
               (total_bytes/1024),
               ((double)total_bytes/(1024.0*1024.0)) / elapsed_time );
    }

    shmem_barrier_all();
    shmem_free(target);
    shmem_free( pSync );
    shmem_barrier_all();
}
/*
 * Minimal shmem_collect32() example: every PE contributes two ints
 * (me * 2 and me * 2 + 1) and prints the concatenated result.
 *
 * 'source' and 'pSync' are defined outside this function.
 */
int main(void)
{
    int i, me, npes;
    int *target;

    start_pes(0);
    me = _my_pe();
    npes = _num_pes();

    source[0] = me * 2;
    source[1] = me * 2 + 1;

    /* Room for two ints from each PE. */
    target = (int *)shmalloc(sizeof(int) * npes * 2);

    for (i = 0; i < _SHMEM_COLLECT_SYNC_SIZE; i++) {
        pSync[i] = _SHMEM_SYNC_VALUE;
    }
    shmem_barrier_all(); /* Wait for all PEs to initialize pSync */

    shmem_collect32(target, source, 2, 0, 0, npes, pSync);

    printf("%d: %d", me, target[0]);
    for (i = 1; i < npes * 2; i++) {
        printf(", %d", target[i]);
    }
    printf("\n");

    /* Release the buffer obtained from shmalloc() above. */
    shfree(target);

    return 0;
}
int osh_coll_tc9(const TE_NODE *node, int argc, const char *argv[]) { /* General initialisations */ int rc = TC_PASS; int ii, numprocs, count, d, nlong; int32_t *source, *target, *displ; long *pSync; UNREFERENCED_PARAMETER(node); UNREFERENCED_PARAMETER(argc); UNREFERENCED_PARAMETER(argv); numprocs = _num_pes(); nlong = _my_pe() + 1; source = NULL; displ = NULL; target = NULL; pSync = NULL; if (numprocs == 1) { log_debug(OSH_TC, "Using more than 1 CPU makes the tests of this program more interesting\n"); return TC_SETUP_FAIL; } displ = malloc(sizeof(int) * numprocs); count = 0; for (ii = 0; ii < numprocs; ii++) { displ[ii] = count; count = count + ii + 1; } pSync = shmalloc(sizeof(long) *_SHMEM_COLLECT_SYNC_SIZE); for (ii=0; ii < _SHMEM_COLLECT_SYNC_SIZE; ii++) pSync[ii] = _SHMEM_SYNC_VALUE; target = shmalloc(sizeof(int) * count); for (ii = 0; ii < count; ii++) target[ii] = 0; source = shmalloc(sizeof(int) * numprocs); for (ii = 0; ii < nlong; ii++) source[ii] = ii; shmem_barrier_all(); /* Wait for all CPUs to initialize pSync */ /* Collect function */ shmem_collect32( target, source, nlong, 0, 0, numprocs, pSync ); ii = d = 0; while (ii < numprocs) { for(count = 0; count <= ii; count++) if (target[d + count] != count) rc = TC_FAIL; d = displ[count]; ii++; } /* Finalizes */ shfree(source); shfree(target); shfree(pSync); free(displ); return rc; }
static int test_item3(void) { int rc = TC_PASS; TYPE_VALUE* shmem_addr = NULL; TYPE_VALUE my_value = 0; TYPE_VALUE* check_arr = NULL; int num_proc = 0; int my_proc = 0; int peer_proc = 0; int i = 0; int j = 0; int k = 0; int flag = 0; int missed_values = 0; static long* pSync = NULL; num_proc = _num_pes(); my_proc = _my_pe(); shmem_addr = shmalloc(sizeof(*shmem_addr)); check_arr = shmalloc(sizeof(*check_arr) * num_proc); pSync = shmalloc(sizeof(*pSync) * _SHMEM_COLLECT_SYNC_SIZE); for (i = 0; i < _SHMEM_COLLECT_SYNC_SIZE; i++) { pSync[i] = _SHMEM_SYNC_VALUE; } if (shmem_addr && pSync && check_arr) { static TYPE_VALUE value = 0; /* Store my value */ my_value = (TYPE_VALUE)my_proc; *shmem_addr = DEFAULT_VALUE; shmem_barrier_all(); for (i = 0; (i < __cycle_count) && (rc == TC_PASS); i++) { missed_values = 0; my_value = (TYPE_VALUE)my_proc; value = FUNC_VALUE(shmem_addr, my_value, peer_proc); shmem_barrier_all(); shmem_collect32(check_arr, &value, (sizeof(value) + 3 ) / 4, 0, 0, num_proc, pSync); shmem_barrier_all(); for (j = 0; j < num_proc ; j++) { flag = 0; for (k = 0; k < num_proc; k++) { if (sys_fcompare(check_arr[k], j)) { flag = 1; break; } } if (flag == 0) { missed_values++; } if (missed_values > 1) { rc = TC_FAIL; break; } } } shmem_barrier_all(); log_debug(OSH_TC, "my(#%d:%lld) missed_values expected = 1 vs missed_values = %d\n", my_proc, (INT64_TYPE)my_value, missed_values); } else { rc = TC_SETUP_FAIL; } if (shmem_addr) { shfree(shmem_addr); } if (pSync) { shfree(pSync); } return rc; }