void mca_memheap_modex_recv_all(void) { int i; int j; int nprocs, my_pe; oshmem_proc_t *proc; mca_spml_mkey_t *mkey; void* dummy_rva; if (!mca_memheap_base_key_exchange) return; /* init rkey cache */ nprocs = oshmem_num_procs(); my_pe = oshmem_my_proc_id(); /* Note: * Doing exchange via rml till we figure out problem with grpcomm.modex and barrier */ for (i = 0; i < nprocs; i++) { if (i == my_pe) continue; proc = oshmem_proc_group_find(oshmem_group_all, i); for (j = 0; j < memheap_map->n_segments; j++) { mkey = mca_memheap_base_get_cached_mkey(i, memheap_map->mem_segs[j].start, proc->transport_ids[0], &dummy_rva); if (!mkey) { MEMHEAP_ERROR("Failed to receive mkeys"); oshmem_shmem_abort(-1); } } } /* * There is an issue with orte_grpcomm.barrier usage as * ess/pmi directs to use grpcomm/pmi in case slurm srun() call grpcomm/pmi calls PMI_Barrier() * that is a function of external library. * There is no opal_progress() in such way. As a result slow PEs send a request (MEMHEAP_RKEY_REQ) to * fast PEs waiting on barrier and do not get a respond (MEMHEAP_RKEY_RESP). * * there are following ways to solve one: * 1. calculate requests from remote PEs and do ORTE_PROGRESSED_WAIT waiting for expected value; * 2. use shmem_barrier_all(); * 3. rework pmi/barrier to use opal_progress(); * 4. use orte_grpcomm.barrier carefully; * * It seems there is no need to use orte_grpcomm.barrier here */ if (memheap_map->mem_segs[HEAP_SEG_INDEX].shmid != MEMHEAP_SHM_INVALID) { /* unfortunately we must do barrier here to assure that everyone are attached to our segment * good thing that this code path only invoked on older linuxes (-mca shmalloc_use_hugepages 3|4) * try to minimize damage here by waiting 5 seconds and doing progress */ shmem_barrier_all(); /* keys exchanged, segments attached, now we can safely cleanup */ if (memheap_map->mem_segs[HEAP_SEG_INDEX].type == MAP_SEGMENT_ALLOC_SHM) { shmctl(memheap_map->mem_segs[HEAP_SEG_INDEX].shmid, IPC_RMID, NULL ); } } }
static int test_item3(void) { int rc = TC_PASS; TYPE_VALUE* shmem_addr = NULL; TYPE_VALUE my_value = 0; TYPE_VALUE* check_arr = NULL; int num_proc = 0; int my_proc = 0; int peer_proc = 0; int i = 0; int j = 0; int k = 0; int flag = 0; int missed_values = 0; static long* pSync = NULL; num_proc = _num_pes(); my_proc = _my_pe(); shmem_addr = shmalloc(sizeof(*shmem_addr)); check_arr = shmalloc(sizeof(*check_arr) * num_proc); pSync = shmalloc(sizeof(*pSync) * _SHMEM_COLLECT_SYNC_SIZE); for (i = 0; i < _SHMEM_COLLECT_SYNC_SIZE; i++) { pSync[i] = _SHMEM_SYNC_VALUE; } if (shmem_addr && pSync && check_arr) { static TYPE_VALUE value = 0; /* Store my value */ my_value = (TYPE_VALUE)my_proc; *shmem_addr = DEFAULT_VALUE; shmem_barrier_all(); for (i = 0; (i < __cycle_count) && (rc == TC_PASS); i++) { missed_values = 0; my_value = (TYPE_VALUE)my_proc; value = FUNC_VALUE(shmem_addr, my_value, peer_proc); shmem_barrier_all(); shmem_collect32(check_arr, &value, (sizeof(value) + 3 ) / 4, 0, 0, num_proc, pSync); shmem_barrier_all(); for (j = 0; j < num_proc ; j++) { flag = 0; for (k = 0; k < num_proc; k++) { if (sys_fcompare(check_arr[k], j)) { flag = 1; break; } } if (flag == 0) { missed_values++; } if (missed_values > 1) { rc = TC_FAIL; break; } } } shmem_barrier_all(); log_debug(OSH_TC, "my(#%d:%lld) missed_values expected = 1 vs missed_values = %d\n", my_proc, (INT64_TYPE)my_value, missed_values); } else { rc = TC_SETUP_FAIL; } if (shmem_addr) { shfree(shmem_addr); } if (pSync) { shfree(pSync); } return rc; }
int main (int argc, char **argv) { int i; int nextpe; int me, npes; int success1, success2, success3, success4, success5, success6, success7, success8; short src1[N]; int src2[N]; long src3[N]; long double src4[N]; long long src5[N]; double src6[N]; float src7[N]; char *src8; short src9; int src10; long src11; double src12; float src13; int fail_count = 0; shmem_init (); me = shmem_my_pe (); npes = shmem_n_pes (); if (npes > 1) { success1 = 0; success2 = 0; success3 = 0; success4 = 0; success5 = 0; success6 = 0; success7 = 0; success8 = 0; src8 = (char *) malloc (N * sizeof (char)); for (i = 0; i < N; i += 1) { src1[i] = (short) me; src2[i] = me; src3[i] = (long) me; src4[i] = (long double) me; src5[i] = (long long) me; src6[i] = (double) me; src7[i] = (float) me; src8[i] = (char) me; } src9 = (short) me; src10 = me; src11 = (long) me; src12 = (double) me; src13 = (float) me; for (i = 0; i < N; i += 1) { dest1[i] = -9; dest2[i] = -9; dest3[i] = -9; dest4[i] = -9; dest5[i] = -9; dest6[i] = -9; dest7[i] = -9.0; dest8[i] = -9; } dest9 = -9; dest10 = -9; dest11 = -9; dest12 = -9; dest13 = -9.0; nextpe = (me + 1) % npes; /* Testing shmem_short_put, shmem_short_put, shmem_int_put, shmem_long_put, shmem_longdouble_put, shmem_longlong_put, shmem_double_put, shmem_float_put, shmem_putmem */ shmem_barrier_all (); shmem_short_put (dest1, src1, N, nextpe); shmem_int_put (dest2, src2, N, nextpe); shmem_long_put (dest3, src3, N, nextpe); shmem_longdouble_put (dest4, src4, N, nextpe); shmem_longlong_put (dest5, src5, N, nextpe); shmem_double_put (dest6, src6, N, nextpe); shmem_float_put (dest7, src7, N, nextpe); shmem_putmem (dest8, src8, N * sizeof (char), nextpe); shmem_barrier_all (); if (me == 0) { for (i = 0; i < N; i += 1) { if (dest1[i] != (npes - 1)) { success1 = 1; } if (dest2[i] != (npes - 1)) { success2 = 1; } if (dest3[i] != (npes - 1)) { success3 = 1; } if (dest4[i] != (npes - 1)) { success4 = 1; } if (dest5[i] != (npes - 1)) { success5 = 1; } if (dest6[i] != (npes - 
1)) { success6 = 1; } if (dest7[i] != (npes - 1)) { success7 = 1; } if (dest8[i] != (npes - 1)) { success8 = 1; } } if (success1 == 0) printf ("Test shmem_short_put: Passed\n"); else { printf ("Test shmem_short_put: Failed\n"); fail_count++; } if (success2 == 0) printf ("Test shmem_int_put: Passed\n"); else { printf ("Test shmem_int_put: Failed\n"); fail_count++; } if (success3 == 0) printf ("Test shmem_long_put: Passed\n"); else { printf ("Test shmem_long_put: Failed\n"); fail_count++; } if (success4 == 0) printf ("Test shmem_longdouble_put: Passed\n"); else { printf ("Test shmem_longdouble_put: Failed\n"); fail_count++; } if (success5 == 0) printf ("Test shmem_longlong_put: Passed\n"); else { printf ("Test shmem_longlong_put: Failed\n"); fail_count++; } if (success6 == 0) printf ("Test shmem_double_put: Passed\n"); else { printf ("Test shmem_double_put: Failed\n"); fail_count++; } if (success7 == 0) printf ("Test shmem_float_put: Passed\n"); else { printf ("Test shmem_float_put: Failed\n"); fail_count++; } if (success8 == 0) printf ("Test shmem_putmem: Passed\n"); else { printf ("Test shmem_putmem: Failed\n"); fail_count++; } } shmem_barrier_all (); /* Testing shmem_put32, shmem_put64, shmem_put128 */ if (sizeof (int) == 4) { for (i = 0; i < N; i += 1) { dest2[i] = -9; dest3[i] = -9; dest4[i] = -9; } success2 = 0; success3 = 0; success4 = 0; shmem_barrier_all (); shmem_put32 (dest2, src2, N, nextpe); shmem_put64 (dest3, src3, N, nextpe); shmem_put128 (dest4, src4, N, nextpe); shmem_barrier_all (); if (me == 0) { for (i = 0; i < N; i += 1) { if (dest2[i] != (npes - 1)) { success2 = 1; } if (dest3[i] != (npes - 1)) { success3 = 1; } if (dest4[i] != (npes - 1)) { success4 = 1; } } if (success2 == 0) printf ("Test shmem_put32: Passed\n"); else { printf ("Test shmem_put32: Failed\n"); fail_count++; } if (success3 == 0) printf ("Test shmem_put64: Passed\n"); else { printf ("Test shmem_put64: Failed\n"); fail_count++; } if (success4 == 0) printf ("Test shmem_put128: 
Passed\n"); else { printf ("Test shmem_put128: Failed\n"); fail_count++; } } } else if (sizeof (int) == 8) { for (i = 0; i < N; i += 1) { dest1[i] = -9; dest2[i] = -9; dest3[i] = -9; } success1 = 0; success2 = 0; success3 = 0; shmem_barrier_all (); shmem_put32 (dest1, src1, N, nextpe); shmem_put64 (dest2, src2, N, nextpe); shmem_put128 (dest3, src3, N, nextpe); shmem_barrier_all (); if (me == 0) { for (i = 0; i < N; i += 1) { if (dest1[i] != (npes - 1)) { success1 = 1; } if (dest2[i] != (npes - 1)) { success2 = 1; } if (dest3[i] != (npes - 1)) { success3 = 1; } } if (success1 == 0) printf ("Test shmem_put32: Passed\n"); else { printf ("Test shmem_put32: Failed\n"); fail_count++; } if (success2 == 0) printf ("Test shmem_put64: Passed\n"); else { printf ("Test shmem_put64: Failed\n"); fail_count++; } if (success3 == 0) printf ("Test shmem_put128: Passed\n"); else { printf ("Test shmem_put128: Failed\n"); fail_count++; } } } /* Testing shmem_iput32, shmem_iput64, shmem_iput128 */ shmem_barrier_all (); if (sizeof (int) == 4) { for (i = 0; i < N; i += 1) { dest2[i] = -9; dest3[i] = -9; dest4[i] = -9; } success2 = 0; success3 = 0; success4 = 0; shmem_barrier_all (); shmem_iput32 (dest2, src2, 1, 2, N / 2, nextpe); shmem_iput64 (dest3, src3, 1, 2, N / 2, nextpe); shmem_iput128 (dest4, src4, 1, 2, N / 2, nextpe); shmem_barrier_all (); if (me == 0) { for (i = 0; i < N / 2; i += 1) { if (dest2[i] != (npes - 1)) { success2 = 1; } if (dest3[i] != (npes - 1)) { success3 = 1; } if (dest4[i] != (npes - 1)) { success4 = 1; } } if (success2 == 0) printf ("Test shmem_iput32: Passed\n"); else { printf ("Test shmem_iput32: Failed\n"); fail_count++; } if (success3 == 0) printf ("Test shmem_iput64: Passed\n"); else { printf ("Test shmem_iput64: Failed\n"); fail_count++; } if (success4 == 0) printf ("Test shmem_iput128: Passed\n"); else { printf ("Test shmem_iput128: Failed\n"); fail_count++; } } } else if (sizeof (int) == 8) { for (i = 0; i < N; i += 1) { dest1[i] = -9; dest2[i] = -9; 
dest3[i] = -9; } success1 = 0; success2 = 0; success3 = 0; shmem_barrier_all (); shmem_iput32 (dest1, src1, 1, 2, N / 2, nextpe); shmem_iput64 (dest2, src2, 1, 2, N / 2, nextpe); shmem_iput128 (dest3, src3, 1, 2, N / 2, nextpe); shmem_barrier_all (); if (me == 0) { for (i = 0; i < N / 2; i += 1) { if (dest1[i] != (npes - 1)) { success1 = 1; } if (dest2[i] != (npes - 1)) { success2 = 1; } if (dest3[i] != (npes - 1)) { success3 = 1; } } if (success1 == 0) printf ("Test shmem_iput32: Passed\n"); else { printf ("Test shmem_iput32: Failed\n"); fail_count++; } if (success2 == 0) printf ("Test shmem_iput64: Passed\n"); else { printf ("Test shmem_iput64: Failed\n"); fail_count++; } if (success3 == 0) printf ("Test shmem_iput128: Passed\n"); else { printf ("Test shmem_iput128: Failed\n"); fail_count++; } } } /* Testing shmem_short_iput, shmem_int_iput, shmem_long_iput, shmem_double_iput, shmem_float_iput */ for (i = 0; i < N; i += 1) { dest1[i] = -9; dest2[i] = -9; dest3[i] = -9; dest6[i] = -9; dest7[i] = -9; } success1 = 0; success2 = 0; success3 = 0; success6 = 0; success7 = 0; shmem_barrier_all (); shmem_short_iput (dest1, src1, 1, 2, N / 2, nextpe); shmem_int_iput (dest2, src2, 1, 2, N / 2, nextpe); shmem_long_iput (dest3, src3, 1, 2, N / 2, nextpe); shmem_double_iput (dest6, src6, 1, 2, N / 2, nextpe); shmem_float_iput (dest7, src7, 1, 2, N / 2, nextpe); shmem_barrier_all (); if (me == 0) { for (i = 0; i < N / 2; i += 1) { if (dest1[i] != (npes - 1)) { success1 = 1; } if (dest2[i] != (npes - 1)) { success2 = 1; } if (dest3[i] != (npes - 1)) { success3 = 1; } if (dest6[i] != (npes - 1)) { success6 = 1; } if (dest7[i] != (npes - 1)) { success7 = 1; } } if (success1 == 0) printf ("Test shmem_short_iput: Passed\n"); else { printf ("Test shmem_short_iput: Failed\n"); fail_count++; } if (success2 == 0) printf ("Test shmem_int_iput: Passed\n"); else { printf ("Test shmem_int_iput: Failed\n"); fail_count++; } if (success3 == 0) printf ("Test shmem_long_iput: Passed\n"); else { 
printf ("Test shmem_long_iput: Failed\n"); fail_count++; } if (success6 == 0) printf ("Test shmem_double_iput: Passed\n"); else { printf ("Test shmem_double_iput: Failed\n"); fail_count++; } if (success7 == 0) printf ("Test shmem_float_iput: Passed\n"); else { printf ("Test shmem_float_iput: Failed\n"); fail_count++; } } /* Testing shmem_double_p, shmem_float_p, shmem_int_p, shmem_long_p, shmem_short_p */ shmem_barrier_all (); shmem_short_p (&dest9, src9, nextpe); shmem_int_p (&dest10, src10, nextpe); shmem_long_p (&dest11, src11, nextpe); shmem_double_p (&dest12, src12, nextpe); shmem_float_p (&dest13, src13, nextpe); shmem_barrier_all (); if (me == 0) { if (dest9 == (npes - 1)) printf ("Test shmem_short_p: Passed\n"); else { printf ("Test shmem_short_p: Failed\n"); fail_count++; } if (dest10 == (npes - 1)) printf ("Test shmem_int_p: Passed\n"); else { printf ("Test shmem_int_p: Failed\n"); fail_count++; } if (dest11 == (npes - 1)) printf ("Test shmem_long_p: Passed\n"); else { printf ("Test shmem_long_p: Failed\n"); fail_count++; } if (dest12 == (npes - 1)) printf ("Test shmem_double_p: Passed\n"); else { printf ("Test shmem_double_p: Failed\n"); fail_count++; } if (dest13 == (npes - 1)) printf ("Test shmem_float_p: Passed\n"); else { printf ("Test shmem_float_p: Failed\n"); fail_count++; } } shmem_barrier_all (); if (me == 0) { if (fail_count == 0) printf("All Tests Passed\n"); else printf("%d Tests Failed\n", fail_count); } } else { printf ("Number of PEs must be > 1 to test shmem put, test skipped\n"); } shmem_finalize (); return 0; }
int main (int argc, char *argv[]) { double t, tv[2]; int reps = 10000; int doprint = 0; char *progName; int minWords = 1; int maxWords = 1; int incWords; int nwords; int nproc; int proc; int peer; int c; int r; int i; long *rbuf; long *tbuf; start_pes (0); proc = _my_pe (); nproc = _num_pes (); for (progName = argv[0] + strlen (argv[0]); progName > argv[0] && *(progName - 1) != '/'; progName--) ; while ((c = getopt (argc, argv, "n:eh")) != -1) switch (c) { case 'n': if ((reps = getSize (optarg)) <= 0) usage (progName); break; case 'e': doprint++; break; case 'h': help (progName); default: usage (progName); } if (optind == argc) minWords = 1; else if ((minWords = getSize (argv[optind++])) <= 0) usage (progName); if (optind == argc) maxWords = minWords; else if ((maxWords = getSize (argv[optind++])) < minWords) usage (progName); if (optind == argc) incWords = 0; else if ((incWords = getSize (argv[optind++])) < 0) usage (progName); if (!(rbuf = (long *) shmalloc (maxWords * sizeof (long)))) { perror ("Failed memory allocation"); exit (1); } memset (rbuf, 0, maxWords * sizeof (long)); shmem_barrier_all (); if (!(tbuf = (long *) malloc (maxWords * sizeof (long)))) { perror ("Failed memory allocation"); exit (1); } if (nproc == 1) return 0; for (i = 0; i < maxWords; i++) tbuf[i] = 1000 + (i & 255); if (doprint) printf ("%d(%d): Shmem PING reps %d minWords %d maxWords %d incWords %d\n", proc, nproc, reps, minWords, maxWords, incWords); shmem_barrier_all (); peer = proc ^ 1; if (peer >= nproc) doprint = 0; for (nwords = minWords; nwords <= maxWords; nwords = incWords ? nwords + incWords : nwords ? 
2 * nwords : 1) { r = reps; shmem_barrier_all (); tv[0] = gettime (); if (peer < nproc) { if (proc & 1) { r--; shmem_wait (&rbuf[nwords - 1], 0); rbuf[nwords - 1] = 0; } while (r-- > 0) { shmem_long_put (rbuf, tbuf, nwords, peer); shmem_wait (&rbuf[nwords - 1], 0); rbuf[nwords - 1] = 0; } if (proc & 1) shmem_long_put (rbuf, tbuf, nwords, peer); } tv[1] = gettime (); t = dt (&tv[1], &tv[0]) / (2 * reps); shmem_barrier_all (); printStats (proc, peer, doprint, nwords, t); } shmem_barrier_all (); free (tbuf); shfree (rbuf); return 0; }
/*
 * JNI entry point that forwards to shmem_barrier_all().
 * By JNI naming this backs a native method barrierAll on class shmem.ShMem;
 * neither JNI argument is used.
 */
JNIEXPORT void JNICALL
Java_shmem_ShMem_barrierAll(JNIEnv *env, jclass clazz)
{
    (void)env;
    (void)clazz;

    shmem_barrier_all();
}
static int test_item7(void) { int rc = TC_PASS; TYPE_VALUE* target_addr = NULL; TYPE_VALUE* source_addr = NULL; TYPE_VALUE source_value = 0; TYPE_VALUE expect_value = 0; int num_proc = 0; int my_proc = 0; num_proc = _num_pes(); my_proc = _my_pe(); target_addr = (TYPE_VALUE*)shmalloc(sizeof(*target_addr) * __max_buffer_size); source_addr = (TYPE_VALUE*)shmalloc(sizeof(*source_addr) * __max_buffer_size); if (target_addr && source_addr) { TYPE_VALUE value = DEFAULT_VALUE; int i = 0; int j = 0; long cur_buf_size = 0; for (i = 0; (i < __cycle_count) && (rc == TC_PASS); i++) { cur_buf_size = sys_max(1, (i + 1) * __max_buffer_size / __cycle_count); pWrk = shmalloc(sizeof(*pWrk) * sys_max(cur_buf_size/2 + 1, _SHMEM_REDUCE_MIN_WRKDATA_SIZE)); if (pWrk) { /* Set initial target value */ value = DEFAULT_VALUE; fill_buffer((void *)target_addr, cur_buf_size, (void *)&value, sizeof(value)); /* Give some time to all PE for setting their values */ shmem_barrier_all(); /* Set my value */ source_value = (TYPE_VALUE)(BASE_VALUE + my_proc); fill_buffer((void *)source_addr, cur_buf_size, (void *)&source_value, sizeof(source_value)); /* Define expected value */ expect_value = ( my_proc % 2 ? DEFAULT_VALUE : BASE_VALUE ); /* This guarantees that PE set initial value before peer change one */ for ( j = 0; j < _SHMEM_REDUCE_SYNC_SIZE; j++ ) { pSync[j] = _SHMEM_SYNC_VALUE; } shmem_barrier_all(); /* Put value to peer */ FUNC_VALUE(target_addr, source_addr, cur_buf_size, 0, 1, ((num_proc / 2) + (num_proc % 2)), pWrk, pSync); /* Get value put by peer: * These routines start the remote transfer and may return before the data * is delivered to the remote PE */ shmem_barrier_all(); { int wait = WAIT_COUNT; while (wait--) { value = *target_addr; if (expect_value == value) break; sleep(1); } } rc = (!compare_buffer_with_const(target_addr, cur_buf_size, &expect_value, sizeof(expect_value)) ? 
TC_PASS : TC_FAIL); log_debug(OSH_TC, "my#%d source = %lld expected = %lld actual = %lld buffer size = %lld\n", my_proc, (INT64_TYPE)source_value, (INT64_TYPE)expect_value, (INT64_TYPE)value, (INT64_TYPE)cur_buf_size); if (rc) { TYPE_VALUE* check_addr = target_addr; int odd_index = compare_buffer_with_const(check_addr, cur_buf_size, &expect_value, sizeof(expect_value)); int show_index = (odd_index > 1 ? odd_index - 2 : 0); int show_size = sizeof(*check_addr) * sys_min(3, cur_buf_size - odd_index - 1); log_debug(OSH_TC, "index of incorrect value: 0x%08X (%d)\n", odd_index - 1, odd_index - 1); log_debug(OSH_TC, "buffer interval: 0x%08X - 0x%08X\n", show_index, show_index + show_size); show_buffer(check_addr + show_index, show_size); } shfree(pWrk); } else { rc = TC_SETUP_FAIL; } } } else { rc = TC_SETUP_FAIL; } if (source_addr) { shfree(source_addr); } if (target_addr) { shfree(target_addr); } return rc; }
static int test_item8(void) { int rc = TC_PASS; static TYPE_VALUE target_addr[MAX_BUFFER_SIZE * 2]; static TYPE_VALUE source_addr[MAX_BUFFER_SIZE * 2]; TYPE_VALUE source_value = 0; TYPE_VALUE expect_value = 0; int num_proc = 0; int my_proc = 0; long* pSyncMult = NULL; TYPE_VALUE* pWrkMult = NULL; int pSyncNum = 2; int pWrkNum = 2; num_proc = _num_pes(); my_proc = _my_pe(); pSyncMult = shmalloc(sizeof(*pSyncMult) * pSyncNum * _SHMEM_REDUCE_SYNC_SIZE); if (pSyncMult) { TYPE_VALUE value = DEFAULT_VALUE; int i = 0; int j = 0; long cur_buf_size = 0; for ( j = 0; j < pSyncNum * _SHMEM_REDUCE_SYNC_SIZE; j++ ) { pSyncMult[j] = _SHMEM_SYNC_VALUE; } /* Give some time to all PE for setting their values */ shmem_barrier_all(); pWrkMult = shmalloc(sizeof(*pWrkMult) * pWrkNum * sys_max(MAX_BUFFER_SIZE, _SHMEM_REDUCE_MIN_WRKDATA_SIZE)); if (pWrkMult) { value = DEFAULT_VALUE; source_value = (TYPE_VALUE)(BASE_VALUE + my_proc); fill_buffer((void *)source_addr, MAX_BUFFER_SIZE * 2, (void *)&source_value, sizeof(source_value)); fill_buffer((void *)target_addr, MAX_BUFFER_SIZE * 2, (void *)&value, sizeof(value)); shmem_barrier_all(); for (i = 0; (i < __cycle_count) && (rc == TC_PASS); i++) { cur_buf_size = sys_max(1, (i + 1) * MAX_BUFFER_SIZE / __cycle_count); /* Set initial target value */ value = DEFAULT_VALUE; /* Set my value */ source_value = (TYPE_VALUE)(BASE_VALUE + my_proc); /* Define expected value */ expect_value = ( my_proc % 2 ? DEFAULT_VALUE : BASE_VALUE ); /* Put value to peer */ FUNC_VALUE(target_addr + (i % 2) * MAX_BUFFER_SIZE, source_addr + (i % 2) * MAX_BUFFER_SIZE, cur_buf_size, 0, 1, ((num_proc / 2) + (num_proc % 2)), pWrkMult + (i % pWrkNum) * sys_max(MAX_BUFFER_SIZE, _SHMEM_REDUCE_MIN_WRKDATA_SIZE), pSyncMult + (i % pSyncNum) * _SHMEM_REDUCE_SYNC_SIZE); rc = (!compare_buffer_with_const(target_addr + (i % 2) * MAX_BUFFER_SIZE, cur_buf_size, &expect_value, sizeof(expect_value)) ? 
TC_PASS : TC_FAIL); log_debug(OSH_TC, "my#%d source = %lld expected = %lld actual = %lld buffer size = %lld\n", my_proc, (INT64_TYPE)source_value, (INT64_TYPE)expect_value, (INT64_TYPE)value, (INT64_TYPE)cur_buf_size); if (rc) { TYPE_VALUE* check_addr = target_addr + (i % 2) * MAX_BUFFER_SIZE; int odd_index = compare_buffer_with_const(check_addr, cur_buf_size, &expect_value, sizeof(expect_value)); int show_index = (odd_index > 1 ? odd_index - 2 : 0); int show_size = sizeof(*check_addr) * sys_min(3, cur_buf_size - odd_index - 1); log_debug(OSH_TC, "index of incorrect value: 0x%08X (%d)\n", odd_index - 1, odd_index - 1); log_debug(OSH_TC, "buffer interval: 0x%08X - 0x%08X\n", show_index, show_index + show_size); show_buffer(check_addr + show_index, show_size); } fill_buffer((void *)(source_addr + (i % 2) * MAX_BUFFER_SIZE), cur_buf_size, (void *)&source_value, sizeof(source_value)); fill_buffer((void *)(target_addr + (i % 2) * MAX_BUFFER_SIZE ), cur_buf_size, (void *)&value, sizeof(value)); } shfree(pWrkMult); } else { rc = TC_SETUP_FAIL; } shfree(pSyncMult); } else { rc = TC_SETUP_FAIL; } return rc; }
void sor (float **current_ptr, float **next_ptr) { int i, j, my_start, my_end, my_num_rows; float *U_Curr_Above = (float *) shmalloc ((sizeof (float)) * ((int) floor (WIDTH / H))); /* 1d array holding values from bottom row of PE above */ float *U_Curr_Below = (float *) shmalloc ((sizeof (float)) * ((int) floor (WIDTH / H))); /* 1d array holding values from top row of PE below */ float *U_Send_Buffer = (float *) shmalloc ((sizeof (float)) * ((int) floor (WIDTH / H))); /* 1d array holding values that are currently being sent */ //float U_Curr_Above[(int)floor(WIDTH/H)]; /* 1d array holding values from bottom row of PE above */ //float U_Curr_Below[(int)floor(WIDTH/H)]; /* 1d array holding values from top row of PE below */ //float U_Send_Buffer[(int)floor(WIDTH/H)]; /* 1d array holding values that are currently being sent */ float W = 1.5; //MPI_Request request; //MPI_Status status; //MPI_Comm_size(MPI_COMM_WORLD,&p); //MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); my_start = get_start (my_rank); my_end = get_end (my_rank); my_num_rows = get_num_rows (my_rank); /* * Communicating ghost rows - only bother if p > 1 */ if (p > 1) { /* send/receive bottom rows */ if (my_rank < (p - 1)) { /* populate send buffer with bottow row */ for (i = 0; i < (int) floor (WIDTH / H); i++) { U_Send_Buffer[i] = current_ptr[my_num_rows - 1][i]; } /* non blocking send */ //MPI_Isend(U_Send_Buffer,(int)floor(WIDTH/H),MPI_FLOAT,my_rank+1,0,MPI_COMM_WORLD,&request); shmem_float_put (U_Curr_Above, U_Send_Buffer, (int) floor (WIDTH / H), my_rank + 1); } //if (my_rank > ROOT) { /* blocking receive */ //MPI_Recv(U_Curr_Above,(int)floor(WIDTH/H),MPI_FLOAT,my_rank-1,0,MPI_COMM_WORLD,&status); //} //MPI_Barrier(MPI_COMM_WORLD); shmem_barrier_all (); /* send/receive top rows */ if (my_rank > ROOT) { /* populate send buffer with top row */ for (i = 0; i < (int) floor (WIDTH / H); i++) { U_Send_Buffer[i] = current_ptr[0][i]; } /* non blocking send */ 
//MPI_Isend(U_Send_Buffer,(int)floor(WIDTH/H),MPI_FLOAT,my_rank-1,0,MPI_COMM_WORLD,&request); shmem_float_put (U_Curr_Below, U_Send_Buffer, (int) floor (WIDTH / H), my_rank - 1); } //if (my_rank < (p-1)) { /* blocking receive */ //MPI_Recv(U_Curr_Below,(int)floor(WIDTH/H),MPI_FLOAT,my_rank+1,0,MPI_COMM_WORLD,&status); //} //MPI_Barrier(MPI_COMM_WORLD); shmem_barrier_all (); } /* solve next reds (i+j odd) */ for (j = my_start; j <= my_end; j++) { for (i = 0; i < (int) floor (WIDTH / H); i++) { if ((i + j) % 2 != 0) { next_ptr[j - my_start][i] = get_val_par (U_Curr_Above, current_ptr, U_Curr_Below, my_rank, i, j) + (W / 4) * (get_val_par (U_Curr_Above, current_ptr, U_Curr_Below, my_rank, i - 1, j) + get_val_par (U_Curr_Above, current_ptr, U_Curr_Below, my_rank, i + 1, j) + get_val_par (U_Curr_Above, current_ptr, U_Curr_Below, my_rank, i, j - 1) + get_val_par (U_Curr_Above, current_ptr, U_Curr_Below, my_rank, i, j + 1) - 4 * (get_val_par (U_Curr_Above, current_ptr, U_Curr_Below, my_rank, i, j)) - (pow (H, 2) * f (i, j))); enforce_bc_par (next_ptr, my_rank, i, j); } } } /* solve next blacks (i+j) even .... using next reds */ for (j = my_start; j <= my_end; j++) { for (i = 0; i < (int) floor (WIDTH / H); i++) { if ((i + j) % 2 == 0) { next_ptr[j - my_start][i] = get_val_par (U_Curr_Above, current_ptr, U_Curr_Below, my_rank, i, j) + (W / 4) * (get_val_par (U_Curr_Above, next_ptr, U_Curr_Below, my_rank, i - 1, j) + get_val_par (U_Curr_Above, next_ptr, U_Curr_Below, my_rank, i + 1, j) + get_val_par (U_Curr_Above, next_ptr, U_Curr_Below, my_rank, i, j - 1) + get_val_par (U_Curr_Above, next_ptr, U_Curr_Below, my_rank, i, j + 1) - 4 * (get_val_par (U_Curr_Above, next_ptr, U_Curr_Below, my_rank, i, j)) - (pow (H, 2) * f (i, j))); enforce_bc_par (next_ptr, my_rank, i, j); } } } shfree(U_Send_Buffer); shfree(U_Curr_Below); shfree(U_Curr_Above); }
int main() { int start,stride,rmlast,rstride,np_aset,inset,lpe; int my_pe,n_pes; int i,fail,n_err,asfail,nasfail; char Case[40]; static int sSource_int[NREDUCE]; static int sTarget_int[NREDUCE]; static int spWrk_int[PWRKELEM]; static long spSync[_SHMEM_REDUCE_SYNC_SIZE]; shmem_init(); my_pe = shmem_my_pe(); n_pes = shmem_n_pes(); lpe=my_pe; dpSync=shmem_malloc(_SHMEM_REDUCE_SYNC_SIZE*sizeof(long)); for(i=0;i<_SHMEM_REDUCE_SYNC_SIZE;i++) { gpSync[i]=_SHMEM_SYNC_VALUE; dpSync[i]=_SHMEM_SYNC_VALUE; spSync[i]=_SHMEM_SYNC_VALUE; } dSource_int=shmem_malloc(NREDUCE*sizeof(int)); dTarget_int=shmem_malloc(NREDUCE*sizeof(int)); dpWrk_int=shmem_malloc((NREDUCE/2+1 > _SHMEM_REDUCE_MIN_WRKDATA_SIZE ? NREDUCE/2+1 : _SHMEM_REDUCE_MIN_WRKDATA_SIZE)*sizeof(int)); for(start=0;start<=MAXSTART;start++) { rstride=1; for(stride=0;stride<=MAXSTRIDE;stride++) { for(rmlast=0;rmlast<=MAXRMLAST;rmlast++) { np_aset=(n_pes+rstride-1-start)/rstride-rmlast; /* number of processes in the active set */ if(np_aset > 0) /* if active set is not empty */ { if(my_pe==0) printf("\nActive set triplet: PE_start=%d,logPE_stride=%d,PE_size=%d \n",start,stride,np_aset); if((my_pe>=start) && ((my_pe-start)%rstride==0) && ((my_pe-start)/rstride<np_aset)) inset=1; else inset=0; /* Initialize Source and Target arrays */ for(i=0;i<NREDUCE;i++) { sSource_int[i]=SINIT; sTarget_int[i]=TINIT; gSource_int[i]=SINIT; gTarget_int[i]=TINIT; dSource_int[i]=SINIT; dTarget_int[i]=TINIT; } shmem_barrier_all(); /* CASE: static arrays, source is different from target */ sprintf(Case,"static, source!=target"); if(inset) asfail=or_int(sSource_int,sTarget_int,start,stride,np_aset,rstride,0,dpWrk_int,gpSync,Case); else { /* check that values of source and target have not been changed */ nasfail+=check_sval_notchanged(sSource_int,Case); nasfail+=check_tval_notchanged(sTarget_int,Case); } /* CASE: global arrays, source is different from target */ sprintf(Case,"global, source!=target"); if(inset) 
asfail=or_int(gSource_int,gTarget_int,start,stride,np_aset,rstride,0,spWrk_int,dpSync,Case); else { /* check that values of source and target have not been changed */ nasfail+=check_sval_notchanged(gSource_int,Case); nasfail+=check_tval_notchanged(gTarget_int,Case); } /* CASE: symmetric heap arrays, source is different from target */ sprintf(Case,"sym heap, source!=target"); if(inset) asfail=or_int(dSource_int,dTarget_int,start,stride,np_aset,rstride,0,gpWrk_int,spSync,Case); else { /* check that values of source and target have not been changed */ nasfail+=check_sval_notchanged(dSource_int,Case); nasfail+=check_tval_notchanged(dTarget_int,Case); } /* Reinitialize Source arrays for new tests */ for(i=0;i<NREDUCE;i++) { sSource_int[i]=SINIT; gSource_int[i]=SINIT; dSource_int[i]=SINIT; } shmem_barrier_all(); /* CASE: static arrays, source and target are the same array */ sprintf(Case,"static, source==target"); if(inset) asfail=or_int(sSource_int,sSource_int,start,stride,np_aset,rstride,1,gpWrk_int,dpSync,Case); else /* check that values of source have not been changed */ nasfail+=check_sval_notchanged(sSource_int,Case); /* CASE: global arrays, source and target are the same array */ sprintf(Case,"global, source==target"); if(inset) asfail=or_int(gSource_int,gSource_int,start,stride,np_aset,rstride,1,dpWrk_int,spSync,Case); else /* check that values of source have not been changed */ nasfail+=check_sval_notchanged(gSource_int,Case); /* CASE: symmetric heap arrays, source and target are the same array */ sprintf(Case,"sym heap, source==target"); if(inset) asfail=or_int(dSource_int,dSource_int,start,stride,np_aset,rstride,1,spWrk_int,gpSync,Case); else /* check that values of source have not been changed */ nasfail+=check_sval_notchanged(dSource_int,Case); } /* end of if active set is not empty */ } /* end of for loop on rmlast */ rstride*=2; } /* end of for loop on stride */ } /* end of for loop on start */ shmem_barrier_all(); #ifdef NEEDS_FINALIZE shmem_finalize(); 
#endif return(0); }
/*
 * Run the SHMEM RandomAccess (GUPS) benchmark and its verification pass.
 *
 * The per-PE result table is sized from a fixed per-PE memory budget, the
 * timed update kernel is run, the GUP/s figure is computed on PE 0 and
 * broadcast, and finally the table is checked and the error count reduced
 * across all PEs.  Results are stored into *params; PE 0 also prints a report.
 *
 * Only power-of-two PE counts are supported: otherwise PE 0 prints a message
 * and every PE returns immediately.
 *
 * Returns 0 in all cases; a verification failure is reported through
 * params->Failure (set on PE 0 only).
 */
int HPCC_SHMEMRandomAccess(HPCC_Params *params)
{
  /* Sizes of the symmetric scratch arrays.  The sync arrays are shared
   * between broadcast and reduction calls, so they must be large enough
   * for either; the work arrays hold a single-element reduction. */
  enum {
    RA_SYNC_LEN = (_SHMEM_REDUCE_SYNC_SIZE > _SHMEM_BCAST_SYNC_SIZE)
                    ? _SHMEM_REDUCE_SYNC_SIZE : _SHMEM_BCAST_SYNC_SIZE,
    RA_WRK_LEN = _SHMEM_REDUCE_MIN_WRKDATA_SIZE
  };

  s64Int i;
  static s64Int NumErrors, GlbNumErrors;
  int NumProcs, logNumProcs, MyProc;
  u64Int GlobalStartMyProc;
  int Remainder;              /* Number of processors with (LocalTableSize + 1) entries */
  u64Int Top;                 /* Number of table entries in top of Table */
  s64Int LocalTableSize;      /* Local table width */
  u64Int MinLocalTableSize;   /* Integer ratio TableSize/NumProcs */
  u64Int logTableSize, TableSize;
  double RealTime;            /* Real time to update table */
  double TotalMem;
  static int sAbort, rAbort;
  int PowerofTwo;
  u64Int NumUpdates_Default;  /* Number of updates to table (suggested: 4x number of table entries) */
  u64Int NumUpdates;          /* actual number of updates to table */
  s64Int ProcNumUpdates;      /* number of updates per processor */

  static long llpSync[RA_SYNC_LEN];
  static long long int llpWrk[RA_WRK_LEN];
  static long ipSync[RA_SYNC_LEN];
  static int ipWrk[RA_WRK_LEN];

  FILE *outFile = NULL;
  double *GUPs;
  double *temp_GUPs;

  for (i = 0; i < RA_SYNC_LEN; i += 1) {
    ipSync[i] = _SHMEM_SYNC_VALUE;
    llpSync[i] = _SHMEM_SYNC_VALUE;
  }

  params->SHMEMGUPs = -1;
  GUPs = &params->SHMEMGUPs;

  NumProcs = shmem_n_pes();
  MyProc = shmem_my_pe();

  if (0 == MyProc) {
    outFile = stdout;
    setbuf(outFile, NULL);
  }

  params->HPLMaxProcMem = 200000;
  TotalMem = params->HPLMaxProcMem; /* max single node memory */
  TotalMem *= NumProcs;             /* max memory in NumProcs nodes */
  TotalMem /= sizeof(u64Int);

  /* calculate TableSize --- the size of update array (must be a power of 2) */
  for (TotalMem *= 0.5, logTableSize = 0, TableSize = 1;
       TotalMem >= 1.0;
       TotalMem *= 0.5, logTableSize++, TableSize <<= 1)
    ; /* EMPTY */

  /* determine whether the number of processors is a power of 2 */
  if ((NumProcs & (NumProcs - 1)) != 0) {
    if (MyProc == 0) {
      printf("Number of processes must be power of 2\n");
    }
    return 0;
  }

  PowerofTwo = HPCC_TRUE;
  Remainder = 0;
  Top = 0;
  MinLocalTableSize = (TableSize / NumProcs);
  LocalTableSize = MinLocalTableSize;
  GlobalStartMyProc = (MinLocalTableSize * MyProc);

  /* logNumProcs = log2(NumProcs); it is printed below and handed to the
   * update and check kernels, so it must be computed before use. */
  logNumProcs = 0;
  for (i = NumProcs; i > 1; i >>= 1)
    logNumProcs++;

  sAbort = 0;
  HPCC_Table = HPCC_XMALLOC( s64Int, LocalTableSize );
  if (! HPCC_Table) sAbort = 1;

  /* Agree across all PEs on whether any allocation failed. */
  shmem_barrier_all();
  shmem_int_sum_to_all(&rAbort, &sAbort, 1, 0, 0, NumProcs, ipWrk, ipSync);
  shmem_barrier_all();

  if (rAbort > 0) {
    if (MyProc == 0) fprintf(outFile, "Failed to allocate memory for the main table.\n");
    /* check all allocations in case there are new added and their order changes */
    if (HPCC_Table) HPCC_free( HPCC_Table );
    goto failed_table;
  }

  params->SHMEMRandomAccess_N = (s64Int)TableSize;

  /* Default number of global updates to table: 4x number of table entries */
  NumUpdates_Default = 4 * TableSize;
  ProcNumUpdates = 4*LocalTableSize;
  NumUpdates = NumUpdates_Default;

  if (MyProc == 0) {
    fprintf( outFile, "Running on %d processors%s\n", NumProcs, PowerofTwo ? " (PowerofTwo)" : "");
    fprintf( outFile, "Total Main table size = 2^" FSTR64 " = " FSTR64 " words\n",
             logTableSize, TableSize );
    if (PowerofTwo)
      fprintf( outFile, "PE Main table size = 2^" FSTR64 " = " FSTR64 " words/PE\n",
               (logTableSize - logNumProcs), TableSize/NumProcs );
    else
      fprintf( outFile, "PE Main table size = (2^" FSTR64 ")/%d = " FSTR64 " words/PE MAX\n",
               logTableSize, NumProcs, LocalTableSize);

    fprintf( outFile, "Default number of updates (RECOMMENDED) = " FSTR64 "\n", NumUpdates_Default);
    params->SHMEMRandomAccess_ExeUpdates = NumUpdates;
  }

  /* Initialize main table */
  for (i=0; i<LocalTableSize; i++)
    HPCC_Table[i] = i + GlobalStartMyProc;

  shmem_barrier_all();

  /* Begin timed section */
  RealTime = -RTSEC();

  Power2NodesRandomAccessUpdate(logTableSize, TableSize, LocalTableSize,
                                MinLocalTableSize, GlobalStartMyProc, Top,
                                logNumProcs, NumProcs, Remainder,
                                MyProc, ProcNumUpdates);

  shmem_barrier_all();

  /* End timed section */
  RealTime += RTSEC();

  /* Print timing results */
  if (MyProc == 0){
    params->SHMEMRandomAccess_time = RealTime;
    *GUPs = 1e-9*NumUpdates / RealTime;
    fprintf( outFile, "Real time used = %.6f seconds\n", RealTime );
    fprintf( outFile, "%.9f Billion(10^9) Updates per second [GUP/s]\n", *GUPs );
    fprintf( outFile, "%.9f Billion(10^9) Updates/PE per second [GUP/s]\n", *GUPs / NumProcs );
    /* No longer reporting per CPU number */
    /* *GUPs /= NumProcs; */
  }

  /* distribute result to all nodes */
  /* NOTE(review): GUPs points into *params; the broadcast needs symmetric
   * source/target buffers -- confirm that params lives in symmetric memory. */
  temp_GUPs = GUPs;
  shmem_barrier_all();
  shmem_broadcast64(GUPs,temp_GUPs,1,0,0,0,NumProcs,llpSync);
  shmem_barrier_all();

  /* Verification phase */

  /* Begin timing here */
  RealTime = -RTSEC();

  HPCC_Power2NodesSHMEMRandomAccessCheck(logTableSize, TableSize, LocalTableSize,
                                         GlobalStartMyProc, logNumProcs, NumProcs,
                                         MyProc, ProcNumUpdates, &NumErrors);

  shmem_barrier_all();
  shmem_longlong_sum_to_all( &GlbNumErrors, &NumErrors, 1, 0,0, NumProcs,llpWrk, llpSync);
  shmem_barrier_all();

  /* End timed section */
  RealTime += RTSEC();

  if(MyProc == 0){
    params->SHMEMRandomAccess_CheckTime = RealTime;
    fprintf( outFile, "Verification: Real time used = %.6f seconds\n", RealTime);
    fprintf( outFile, "Found " FSTR64 " errors in " FSTR64 " locations (%s).\n",
             GlbNumErrors, TableSize, (GlbNumErrors <= 0.01*TableSize) ? "passed" : "failed");
    if (GlbNumErrors > 0.01*TableSize) params->Failure = 1;
    params->SHMEMRandomAccess_Errors = (s64Int)GlbNumErrors;
    params->SHMEMRandomAccess_ErrorsFraction = (double)GlbNumErrors / (double)TableSize;
    params->SHMEMRandomAccess_Algorithm = 1;
  }
  /* End verification phase */

  /* Deallocate memory (in reverse order of allocation which should help fragmentation) */
  HPCC_free( HPCC_Table );

  failed_table:

  /* NOTE(review): on PE 0 outFile is stdout, so this closes stdout. */
  if (0 == MyProc) if (outFile != stderr) fclose( outFile );

  shmem_barrier_all();

  return 0;
}
int main (int argc, char **argv) { /* arrays used to contain each PE's rows - specify cols, no need to spec rows */ float **U_Curr; float **U_Next; /* helper variables */ /* available iterator */ int i, j, k, m, n; int per_proc, remainder, my_start_row, my_end_row, my_num_rows; int verbose = 0; int show_time = 0; double time; double t, tv[2]; /*OpenSHMEM initilization*/ start_pes (0); p = _num_pes (); my_rank = _my_pe (); if (p > 8) { fprintf(stderr, "Ignoring test when run with more than 8 pes\n"); return 77; } /* argument processing done by everyone */ int c, errflg; extern char *optarg; extern int optind, optopt; while ((c = getopt (argc, argv, "e:h:m:tw:v")) != -1) { switch (c) { case 'e': EPSILON = atof (optarg); break; case 'h': HEIGHT = atoi (optarg); break; case 'm': /* selects the numerical methods */ switch (atoi (optarg)) { case 1: /* jacobi */ meth = 1; break; case 2: /* gauss-seidel */ meth = 2; break; case 3: /* sor */ meth = 3; break; } break; case 't': show_time++; /* overridden by -v (verbose) */ break; case 'w': WIDTH = atoi (optarg); break; case 'v': verbose++; break; /* handle bad arguments */ case ':': /* -h or -w without operand */ if (ROOT == my_rank) fprintf (stderr, "Option -%c requires an operand\n", optopt); errflg++; break; case '?': if (ROOT == my_rank) fprintf (stderr, "Unrecognized option: -%c\n", optopt); errflg++; break; } } if (ROOT == my_rank && argc < 2) { printf ("Using defaults: -h 20 -w 20 -m 2\n"); } // if (0 < errflg) // exit(EXIT_FAILURE); /* wait for user to input runtime params */ for (i = 0; i < _SHMEM_REDUCE_SYNC_SIZE; i += 1) pSync[i] = _SHMEM_SYNC_VALUE; shmem_barrier_all (); /* broadcast method to use */ shmem_broadcast32 (&meth, &meth, 1, 0, 0, 0, p, pSync); switch (meth) { case 1: method = &jacobi; break; case 2: method = &gauss_seidel; break; case 3: method = &sor; break; } /* let each processor decide what rows(s) it owns */ my_start_row = get_start (my_rank); my_end_row = get_end (my_rank); my_num_rows = 
get_num_rows (my_rank); if (0 < verbose) printf ("proc %d contains (%d) rows %d to %d\n", my_rank, my_num_rows, my_start_row, my_end_row); fflush (stdout); /* allocate 2d array */ U_Curr = (float **) malloc (sizeof (float *) * my_num_rows); U_Curr[0] = (float *) malloc (sizeof (float) * my_num_rows * (int) floor (WIDTH / H)); for (i = 1; i < my_num_rows; i++) { U_Curr[i] = U_Curr[i - 1] + (int) floor (WIDTH / H); } /* allocate 2d array */ U_Next = (float **) malloc (sizeof (float *) * my_num_rows); U_Next[0] = (float *) malloc (sizeof (float) * my_num_rows * (int) floor (WIDTH / H)); for (i = 1; i < my_num_rows; i++) { U_Next[i] = U_Next[i - 1] + (int) floor (WIDTH / H); } /* initialize global grid */ init_domain (U_Curr, my_rank); init_domain (U_Next, my_rank); /* iterate for solution */ if (my_rank == ROOT) { tv[0] = gettime (); } k = 1; while (1) { method (U_Curr, U_Next); local_convergence_sqd = get_convergence_sqd (U_Curr, U_Next, my_rank); shmem_barrier_all (); shmem_float_sum_to_all (&convergence_sqd, &local_convergence_sqd, 1, 0, 0, p, pWrk, pSync); if (my_rank == ROOT) { convergence = sqrt (convergence_sqd); if (verbose == 1) { printf ("L2 = %f\n", convergence); } } /* broadcast method to use */ shmem_barrier_all (); shmem_broadcast32 (&convergence, &convergence, 1, 0, 0, 0, p, pSync); if (convergence <= EPSILON) { break; } /* copy U_Next to U_Curr */ for (j = my_start_row; j <= my_end_row; j++) { for (i = 0; i < (int) floor (WIDTH / H); i++) { U_Curr[j - my_start_row][i] = U_Next[j - my_start_row][i]; } } k++; //MPI_Barrier(MPI_COMM_WORLD); shmem_barrier_all (); } /* say something at the end */ if (my_rank == ROOT) { //time = MPI_Wtime() - time; tv[1] = gettime (); t = dt (&tv[1], &tv[0]); printf ("Estimated time to convergence in %d iterations using %d processors on a %dx%d grid is %f seconds\n", k, p, (int) floor (WIDTH / H), (int) floor (HEIGHT / H), t / 1000000.0); } //MPI_Finalize(); exit (EXIT_SUCCESS); return 0; }
int main(int argc, char **argv) { int i,j; long modj,oldj,oldxmodj,newcount; int my_pe,n_pes; size_t max_elements_bytes; static long *x; shmem_init(); my_pe = shmem_my_pe(); n_pes = shmem_n_pes(); #ifdef HAVE_SET_CACHE_INV shmem_set_cache_inv(); #endif /* fail if trying to use only one processor */ if ( n_pes <= 1 ){ fprintf(stderr, "FAIL - test requires at least two PEs\n"); exit(1); } if(my_pe == 0) fprintf(stderr, "shmem_lock_set_clear(%s) n_pes=%d\n", argv[0],n_pes); /* shmalloc x on all pes (only use the one on PE 0) */ max_elements_bytes = (size_t) (sizeof(long) * n_pes); x = shmem_malloc( max_elements_bytes ); for(i=0; i<n_pes; i++) x[i] = 0; count = 0; shmem_barrier_all(); for(i=0; i<ITER; i++) { if (my_pe != 0) { /* emulate oldj = shmem_long_finc(&count, 0); */ shmem_set_lock(&lock); shmem_long_get(&oldj,&count,1,0); /* get oldj from PE 0's count */ newcount = oldj+1; shmem_long_put(&count,&newcount,1,0); /* update count on PE 0 */ shmem_quiet; /* insure that write completes */ shmem_clear_lock(&lock); /* end of emulation */ modj = (oldj % (n_pes-1)); /* PE 0 is just the counter/checker */ /* increment value in x[modj] */ oldxmodj = shmem_long_finc(&x[modj], 0); /* printf("PE=%d,oldj=%ld,modj=%ld,oldxmodj=%ld\n",my_pe,oldj,modj,oldxmodj); */ } } shmem_barrier_all(); if (my_pe == 0) { /* check x[j] array on PE 0 */ for(j=1 ; j<n_pes; j++) { if (x[j-1] != (long) ITER) fprintf(stderr, "FAIL PE %d of %d: x[%d] = %ld expected = %ld\n", my_pe, n_pes, j-1, x[j-1], (long) ITER); } } shmem_barrier_all(); #ifdef NEEDS_FINALIZE shmem_finalize(); #endif return 0; }
int main(int argc, char **argv) { int i,ps,ps_cnt=2; int *target; int *source; int me, npes, elements=N_ELEMENTS, loops=DFLT_LOOPS; char *pgm; double start_time, time_taken; shmem_init(); me = shmem_my_pe(); npes = shmem_n_pes(); if ((pgm=strrchr(argv[0],'/'))) pgm++; else pgm = argv[0]; while ((i = getopt (argc, argv, "hve:l:p:s")) != EOF) { switch (i) { case 'v': Verbose++; break; case 'e': if ((elements = atoi_scaled(optarg)) <= 0) { fprintf(stderr,"ERR: Bad elements count %d\n",elements); shmem_finalize(); return 1; } break; case 'l': if ((loops = atoi_scaled(optarg)) <= 0) { fprintf(stderr,"ERR: Bad loop count %d\n",loops); shmem_finalize(); return 1; } break; case 'p': if ((ps_cnt = atoi_scaled(optarg)) <= 0) { fprintf(stderr,"ERR: Bad pSync[] elements %d\n",loops); shmem_finalize(); return 1; } break; case 's': Serialize++; break; case 'h': if (me == 0) usage(pgm); return 0; default: if (me == 0) { fprintf(stderr,"%s: unknown switch '-%c'?\n",pgm,i); usage(pgm); } shmem_finalize(); return 1; } } ps_cnt *= _SHMEM_BCAST_SYNC_SIZE; pSync = shmem_malloc( ps_cnt * sizeof(long) ); for (i = 0; i < ps_cnt; i++) pSync[i] = _SHMEM_SYNC_VALUE; source = (int *) shmem_malloc( elements * sizeof(*source) ); target = (int *) shmem_malloc( elements * sizeof(*target) ); for (i = 0; i < elements; i += 1) { source[i] = i + 1; target[i] = -90; } if (me==0 && Verbose) fprintf(stderr,"ps_cnt %d loops %d nElems %d\n", ps_cnt,loops,elements); shmem_barrier_all(); for(time_taken = 0.0, ps = i = 0; i < loops; i++) { start_time = shmemx_wtime(); shmem_broadcast32(target, source, elements, 0, 0, 0, npes, &pSync[ps]); if (Serialize) shmem_barrier_all(); time_taken += (shmemx_wtime() - start_time); if (ps_cnt > 1 ) { ps += _SHMEM_BCAST_SYNC_SIZE; if ( ps >= ps_cnt ) ps = 0; } } if(me == 0 && Verbose) { printf("%d loops of Broadcast32(%ld bytes) over %d PEs: %7.3f secs\n", loops, (elements*sizeof(*source)), npes, time_taken); elements = (elements * loops * sizeof(*source)) / (1024*1024); 
printf(" %7.5f secs per broadcast() @ %7.4f MB/sec\n", (time_taken/(double)loops), ((double)elements / time_taken) ); } if (Verbose > 1) fprintf(stderr,"[%d] pre B1\n",me); shmem_barrier_all(); if (Verbose > 1) fprintf(stderr,"[%d] post B1\n",me); shmem_free(pSync); shmem_free(target); shmem_free(source); shmem_finalize(); return 0; }
int main () { int quantum = -1, checktick (); int BytesPerWord; int k; ssize_t j, i; STREAM_TYPE scalar; /* --- SETUP --- determine precision and check timing --- */ printf (HLINE); printf ("STREAM version $Revision: 5.10 $\n"); printf (HLINE); BytesPerWord = sizeof (STREAM_TYPE); printf ("This system uses %d bytes per array element.\n", BytesPerWord); /* SHMEM initialize */ start_pes (0); _world_size = _num_pes (); _world_rank = _my_pe (); STREAM_TYPE *a = (STREAM_TYPE *) shmalloc ((STREAM_ARRAY_SIZE + OFFSET) * sizeof (STREAM_TYPE)); STREAM_TYPE *b = (STREAM_TYPE *) shmalloc ((STREAM_ARRAY_SIZE + OFFSET) * sizeof (STREAM_TYPE)); STREAM_TYPE *c = (STREAM_TYPE *) shmalloc ((STREAM_ARRAY_SIZE + OFFSET) * sizeof (STREAM_TYPE)); /* wait for user to input runtime params */ for (int j = 0; j < _SHMEM_BARRIER_SYNC_SIZE; j++) { pSync0[j] = pSync1[j] = pSync2[j] = _SHMEM_SYNC_VALUE; } int size = _world_size; if (!(size == 0) && !(size & (size - 1))) ; else { printf ("Program only works for a PE size of power-of-2\n"); exit (-1); } if (_world_rank == 0) { printf (HLINE); #ifdef N printf ("***** WARNING: ******\n"); printf (" It appears that you set the preprocessor variable N when compiling this code.\n"); printf (" This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n"); printf (" Reverting to default value of STREAM_ARRAY_SIZE=%llu\n", (unsigned long long) STREAM_ARRAY_SIZE); printf ("***** WARNING: ******\n"); #endif printf ("Array size = %llu (elements), Offset = %d (elements)\n", (unsigned long long) STREAM_ARRAY_SIZE, OFFSET); printf ("Memory per array = %.1f MiB (= %.1f GiB).\n", BytesPerWord * ((double) STREAM_ARRAY_SIZE / 1024.0 / 1024.0), BytesPerWord * ((double) STREAM_ARRAY_SIZE / 1024.0 / 1024.0 / 1024.0)); printf ("Total memory required = %.1f MiB (= %.1f GiB).\n", (3.0 * BytesPerWord) * ((double) STREAM_ARRAY_SIZE / 1024.0 / 1024.), (3.0 * BytesPerWord) * ((double) STREAM_ARRAY_SIZE / 1024.0 / 1024. 
/ 1024.)); printf ("Each kernel will be executed %d times.\n", NTIMES); printf (" The *best* time for each kernel (excluding the first iteration)\n"); printf (" will be used to compute the reported bandwidth.\n"); printf ("Number of SHMEM PEs requested = %i\n", _world_size); } int blocksize = 10000; assert (STREAM_ARRAY_SIZE % blocksize == 0); // do something really minor /* Get initial value for system clock. */ for (j = 0; j < STREAM_ARRAY_SIZE; j++) { a[j] = 1.0; b[j] = 2.0; c[j] = 0.0; } printf (HLINE); if (_world_rank == 0) { if ((quantum = checktick ()) >= 1) printf ("Your clock granularity/precision appears to be " "%d microseconds.\n", quantum); else { printf ("Your clock granularity appears to be " "less than one microsecond.\n"); quantum = 1; } } shmem_barrier_all (); // assign fixed iterations per PE // since we know default STREAM array size // we are hardcoding this, but if the value // changes, then this blocking factor must // also change // basically, each PE works on this block // size at a time time_start = mysecond (); /* Initialize */ next_p = shmem_int_fadd (&gcounter, 1, ROOT); for (j = 0; j < STREAM_ARRAY_SIZE; j += blocksize) { if (next_p == count_p) { for (i = j; i < (j + blocksize); i++) { a[i] = 2.0E0 * a[i]; } next_p = shmem_int_fadd (&gcounter, 1, ROOT); } count_p++; } time_end = mysecond (); clock_time_PE = time_end - time_start; shmem_double_sum_to_all (&total_clock_time, &clock_time_PE, 1, 0, 0, _world_size, pWrk0, pSync0); if (_world_rank == 0) { printf ("Each test below will take on the order" " of %d microseconds.\n", (int) (total_clock_time * 1.0E6)); printf (" (= %d clock ticks)\n", (int) ((1.0E6 * total_clock_time) / quantum)); printf ("Increase the size of the arrays if this shows that\n"); printf ("you are not getting at least 20 clock ticks per test.\n"); printf (HLINE); printf ("WARNING -- The above is only a rough guideline.\n"); printf ("For best results, please be sure you know the\n"); printf ("precision of your system 
timer.\n"); printf (HLINE); } /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ // reduction required, as each PE only fills a,b,c partially scalar = 3.0; for (k = 0; k < NTIMES; k++) { for (j = 0; j < STREAM_ARRAY_SIZE; j += blocksize) { if (next_p == count_p) { for (i = j; i < (j + blocksize); i++) { a[i] = 1.0; b[i] = 2.0; c[i] = 0.0; a[i] = 2.0E0 * a[i]; } next_p = shmem_int_fadd (&gcounter, 1, ROOT); } count_p++; //shmem_double_max_to_all (a + j, a + j, blocksize, 0, // 0, _world_size, pWrk1, pSync1); shmem_barrier_all (); flat_tree (a + j, a + j, blocksize); } shmem_barrier_all (); time_start = mysecond (); for (j = 0; j < STREAM_ARRAY_SIZE; j += blocksize) { if (next_p == count_p) { for (i = j; i < (j + blocksize); i++) { c[i] = a[i]; } next_p = shmem_int_fadd (&gcounter, 1, ROOT); } count_p++; //shmem_double_max_to_all (c + j, c + j, blocksize, 0, // 0, _world_size, pWrk1, pSync1); shmem_barrier_all (); flat_tree (c + j, c + j, blocksize); } shmem_barrier_all (); time_end = mysecond () - time_start; shmem_double_max_to_all (×[0][k], &time_end, 1, 0, 0, _world_size, pWrk0, pSync0); time_start = mysecond (); for (j = 0; j < STREAM_ARRAY_SIZE; j += blocksize) { if (next_p == count_p) { for (i = j; i < (j + blocksize); i++) { b[i] = scalar * c[i]; } next_p = shmem_int_fadd (&gcounter, 1, ROOT); } count_p++; //shmem_double_max_to_all (b + j, b + j, blocksize, 0, // 0, _world_size, pWrk1, pSync1); shmem_barrier_all (); flat_tree (b + j, b + j, blocksize); } shmem_barrier_all (); time_end = mysecond () - time_start; shmem_double_sum_to_all (×[1][k], &time_end, 1, 0, 0, _world_size, pWrk0, pSync0); time_start = mysecond (); for (j = 0; j < STREAM_ARRAY_SIZE; j += blocksize) { if (next_p == count_p) { for (i = j; i < (j + blocksize); i++) { c[i] = a[i] + b[i]; } next_p = shmem_int_fadd (&gcounter, 1, ROOT); } count_p++; //shmem_double_max_to_all (c + j, c + j, blocksize, 0, // 0, _world_size, pWrk1, pSync1); shmem_barrier_all (); flat_tree (c + j, c + j, 
blocksize); } shmem_barrier_all (); time_end = mysecond () - time_start; shmem_double_sum_to_all (×[2][k], &time_end, 1, 0, 0, _world_size, pWrk0, pSync0); time_start = mysecond (); for (j = 0; j < STREAM_ARRAY_SIZE; j += blocksize) { if (next_p == count_p) { for (i = j; i < (j + blocksize); i++) { a[i] = b[i] + scalar * c[i]; } next_p = shmem_int_fadd (&gcounter, 1, ROOT); } count_p++; //shmem_double_max_to_all (a + j, a + j, blocksize, 0, // 0, _world_size, pWrk1, pSync1); shmem_barrier_all (); flat_tree (a + j, a + j, blocksize); } shmem_barrier_all (); time_end = mysecond () - time_start; shmem_double_sum_to_all (×[3][k], &time_end, 1, 0, 0, _world_size, pWrk0, pSync0); } shmem_barrier_all (); /* --- SUMMARY --- */ for (k = 1; k < NTIMES; k++) /* note -- skip first iteration */ { for (j = 0; j < 4; j++) { avgtime[j] = avgtime[j] + times[j][k]; mintime[j] = MIN (mintime[j], times[j][k]); maxtime[j] = MAX (maxtime[j], times[j][k]); } } if (_world_rank == 0) { printf ("Function Best Rate MB/s Avg time Min time Max time\n"); for (j = 0; j < 4; j++) { avgtime[j] = avgtime[j] / (double) (NTIMES - 1); printf ("%s%12.1f %11.6f %11.6f %11.6f\n", label[j], 1.0E-06 * bytes[j] / mintime[j], avgtime[j], mintime[j], maxtime[j]); } printf (HLINE); } /* --- Check Results --- */ if (_world_rank == 0) { checkSTREAMresults (a, b, c); printf (HLINE); } shfree (a); shfree (b); shfree (c); return 0; }
int main(int argc, char *argv[]){ int i,n,next_pivot, pivot; long pSync[_SHMEM_BCAST_SYNC_SIZE]; for (i=0; i < SHMEM_BCAST_SYNC_SIZE; i++) { pSync[i] = _SHMEM_SYNC_VALUE; } start_pes(0); me = shmem_my_pe(); npes = shmem_n_pes(); shmem_barrier_all(); srand (me+time(NULL)); N = atoi(argv[1]); //int *nelems = (int*) shmalloc(sizeof(int)); //int *nelems_import= (int*) shmalloc(sizeof(int));; printf("%d: Size = %d with np=%d\n",me,N,npes); A = (int *)shmalloc((N/npes)*sizeof(int)); temp_arr = (int *)shmalloc((N/npes)*sizeof(int)); if(A==NULL){ printf("\nOut of memory"); return 1; } n= N/npes; i=0; while(i<N/npes){ A[i] = rand()%(10000-0); i++; } printf("\nprocess %d elements:",me); for(i=0;i<(N/npes);i++){ printf("%d, ", A[i]); } next_pivot = A[0]; //the step two of algo.....broadcast the new pivot shmem_broadcast32(&next_pivot,A,1,0,0,0,npes,pSync); shmem_barrier_all(); pivot = quicksort(A, 0, n-1); printf("Process %d the pivot:%d",me, pivot); shmem_barrier_all(); //just for the sake of clear display...can be removed in the end printf("\nThe sorted list is of process %d: ",me); for(i=0;i<n;i++){ printf("%d, ",A[i]); } printf("\n"); printf("the new pivot of process %d: %d\n",me,next_pivot); // to check the broadcast of new pivots int check,j; //to check the division of the sorted arrays according to the new pivot. 
shmem_barrier_all(); check = uplowPartition(next_pivot); shmem_barrier_all(); printf("(%d)",me); for(int j=0;j<N/npes;j++){ printf("%d, ",A[j]); } printf("new partition: %d",check); shmem_barrier_all(); if(me < npes/2) { i=0; // printf("Hello from %d", me); printf("\n"); for(j=check;j<N/npes;j++){ temp_arr[i] = A[j]; i++; } i=0; printf("(%d)",me); for(j=check;j<N/npes;j++){ printf("%d, ",temp_arr[i]) ; i++; } // printf("\n"); } shmem_barrier_all(); if(me >= npes/2) { // printf("Hello from %d", me); printf("\n"); for(j=0;j<check;j++){ temp_arr[j] = A[j]; } printf("(%d)",me); for(j=0;j<check;j++){ printf("%d, ",temp_arr[j]) ; } // printf("\n"); } shmem_barrier_all(); printf("\n"); if(me < npes/2){ printf("\n"); pe = me +npes/2; nelems[0] = N/npes - check; printf (" process %d pe : %d nelems : %d\n",me,pe,nelems[0]);//to test the value printf("(%d) addr = %d , value = %d , pe = %d\n ",me, &nelems_import[0],nelems[0],pe);//to test the value shmem_int_p(nelems_import,nelems[0],pe); shmem_quiet(); shmem_int_put(temp_arr,&A[check],nelems[0],pe); } shmem_barrier_all();//check if the entire barrier is needed if(me >= npes/ 2){ pe = me-npes/2;//check if it is synced nelems[0]= check; printf (" process %d pe : %d nelems : %d\n",me,pe,nelems[0]);//to test the value shmem_int_p(nelems_import,nelems[0],pe); shmem_quiet(); shmem_int_put(temp_arr,A,nelems[0],pe); } shmem_barrier_all();//again sync is required...check it with profiling //this snippet is to check if the processors have got the high and low lists respectively ------------------- printf("(%d) nelems_import = %d\n",me,nelems_import[0]);//to test the value printf("(%d) new elements = ",me); for(i=0;i<nelems_import[0];i++){ printf("%d, ",temp_arr[i]); } printf("\n"); //------------------------------------here this checking snippet ends---- //----------------------------------merging of arrays begin------------------------- if(me < npes/2){ i=0; for(j=nelems_import[0];j<(nelems_import[0]+check);j++){ temp_arr[j] = A[i]; 
i++; } } if(me >= npes/2){ i=check; for(j=nelems_import[0];j<(nelems_import[0]+N/npes-check);j++){ temp_arr[j] = A[i]; i++; } } shmem_barrier_all(); //to test if the arrays are merged properly int size; if(me < npes/2){ size = (nelems_import[0]+check); printf("(%d) merged array:",me); for(j=0;j<size;j++){ printf("%d, ",temp_arr[j]); } printf("\n"); } if(me >= npes/2){ size = (nelems_import[0]+N/npes-check); printf("(%d) merged array:",me); for(j=0;j<size;j++){ printf("%d, ",temp_arr[j]); } printf("\n"); } //-----------------------check of merging finishes-------- //--------------------------------------------------merging finishes------------------------------ //-----------------------sort again----------------------------------------------- if(me < npes/2){ quicksort(temp_arr,0,(nelems_import[0]+check-1)); } if(me >= npes/2){ quicksort(temp_arr,0,(nelems_import[0]+N/npes-check-1)); } //sorting routine checked...once program is done we can remove this part------------- shmem_barrier_all();//test purpose only if(me < npes/2){ printf("(%d) sorted list: ",me); for(i=0;i<size;i++){ printf("%d, ",temp_arr[i]); } printf("\n"); } if(me >= npes/2){ printf("(%d) sorted list: ",me); for(i=0;i<size;i++){ printf("%d, ",temp_arr[i]); } printf("\n"); } //------------------------------------------------------------- //--------------------------------------------------------------------------------- shfree(temp_arr); shfree(A); shmem_finalize(); }
/* Performance test for shmem_XX_put (latency and bandwidth) */ #include <stdio.h> #include <stdlib.h> #include <time.h> #include <shmem.h> #include <sys/time.h> long double time_taken; long pSync[_SHMEM_REDUCE_SYNC_SIZE]; long double pWrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE]; //#define N_ELEMENTS 25600/*Data size chosen to be able to capture time required*/ int main(void) { int i,j,k; int *target; int *source; int me, npes; int nxtpe; struct timeval start, end; long double start_time,end_time; int N_ELEMENTS = (4194304*2)/sizeof(int); start_pes(0); me = _my_pe(); npes = _num_pes(); for (i = 0; i < _SHMEM_BCAST_SYNC_SIZE; i += 1) { pSync[i] = _SHMEM_SYNC_VALUE; } nxtpe = (me+1)%npes; source = (int *) shmalloc( N_ELEMENTS * sizeof(*source) ); target = (int *) shmalloc( N_ELEMENTS * sizeof(*target) ); if(me == 0) printf("Put performance test results:\nSize (Bytes)\t\tTime (Microseconds)\t\tBandwidth (Bytes/Second)\n"); for (i = 0; i < N_ELEMENTS; i += 1) { source[i] = i + 1; target[i] = -90; } shmem_barrier_all(); /*For int put we take average of all the times realized by a pair of PEs, thus * reducing effects of physical location of PEs*/ for (i=1;i<=N_ELEMENTS;i=i*2) { time_taken = 0; for(j=0;j<10000;j++){ gettimeofday(&start, NULL); start_time = (start.tv_sec * 1000000.0) + start.tv_usec; shmem_int_put(target, source, i,nxtpe); gettimeofday(&end, NULL); end_time = (end.tv_sec * 1000000.0) + end.tv_usec; time_taken = time_taken + (end_time - start_time); } shmem_longdouble_sum_to_all(&time_taken, &time_taken,1, 0, 0, npes, pWrk, pSync); if(me == 0){ time_taken = time_taken/(npes*10000); /*Average time across all PEs for one put*/ if (i*sizeof(i) < 1048576) printf("%ld \t\t\t\t %lf\t\t\t\t %lf\n",i*sizeof(i), (double)time_taken,(double)((i*sizeof(i))/(time_taken))); else printf("%ld \t\t\t %lf\t\t\t\t %lf\n",i*sizeof(i), (double)time_taken,(double)((i*sizeof(i))/(time_taken))); } } shmem_barrier_all(); shfree(target); shfree(source); return 0; }
static int test_item4(void) { int rc = TC_PASS; TYPE_VALUE* target_addr = NULL; TYPE_VALUE* source_addr = NULL; TYPE_VALUE source_value = 0; TYPE_VALUE expect_value = 0; int num_proc = 0; int my_proc = 0; num_proc = _num_pes(); my_proc = _my_pe(); pWrk = shmalloc(sizeof(*pWrk) * sys_max(1/2 + 1, _SHMEM_REDUCE_MIN_WRKDATA_SIZE)); if (pWrk) { source_addr = shmalloc(sizeof(*source_addr)); target_addr = source_addr; } if (target_addr && source_addr) { TYPE_VALUE value = DEFAULT_VALUE; int j = 0; /* Set my value */ source_value = (TYPE_VALUE)my_proc; *source_addr = source_value; /* Define expected value */ expect_value = 0; /* This guarantees that PE set initial value before peer change one */ for ( j = 0; j < _SHMEM_REDUCE_SYNC_SIZE; j++ ) { pSync[j] = _SHMEM_SYNC_VALUE; } shmem_barrier_all(); /* Put value to peer */ FUNC_VALUE(target_addr, source_addr, 1, 0, 0, num_proc, pWrk, pSync); /* Get value put by peer: * These routines start the remote transfer and may return before the data * is delivered to the remote PE */ shmem_barrier_all(); { int total_wait = 0; while (*target_addr == DEFAULT_VALUE && total_wait < 1000 * WAIT_COUNT) { total_wait++; usleep(1); } value = *target_addr; } rc = (expect_value == value ? TC_PASS : TC_FAIL); log_debug(OSH_TC, "my#%d source = %lld expected = %lld actual = %lld\n", my_proc, (INT64_TYPE)source_value, (INT64_TYPE)expect_value, (INT64_TYPE)value); } else { rc = TC_SETUP_FAIL; } if (source_addr) { shfree(source_addr); } if (pWrk) { shfree(pWrk); pWrk = NULL; } return rc; }
static int test_item3(void) { int rc = TC_PASS; TYPE_VALUE* shmem_addr = NULL; TYPE_VALUE my_value = 0; TYPE_VALUE peer_value = 0; TYPE_VALUE expect_value = 0; int num_proc = 0; int my_proc = 0; int peer_proc = 0; num_proc = _num_pes(); my_proc = _my_pe(); shmem_addr = shmalloc(sizeof(*shmem_addr)); if (shmem_addr) { TYPE_VALUE value = -1; INT64_TYPE i = 0; /* Set my value */ my_value = (-1); *shmem_addr = my_value; for (i = 0; i < COUNT_VALUE; i++) { /* Define peer and it value */ peer_proc = (my_proc + 1) % num_proc; peer_value = (peer_proc % 2 ? 1 : -1) * (i * STEP_VALUE); /* Define expected value */ expect_value = (my_proc % 2 ? 1 : -1) * (i * STEP_VALUE); /* This guarantees that PE set initial value before peer change one */ shmem_barrier_all(); /* Write value to peer */ FUNC_VALUE(shmem_addr, peer_value, peer_proc); /* Get value put by peer: * These routines start the remote transfer and may return before the data * is delivered to the remote PE */ wait_for_put_completion(peer_proc,10 /* wait for 10 secs */); value = *shmem_addr; rc = (expect_value == value ? TC_PASS : TC_FAIL); log_debug(OSH_TC, "my(#%d:%lld) peer(#%d:%lld) expected = %lld vs got = %lld\n", my_proc, (INT64_TYPE)my_value, peer_proc, (INT64_TYPE)peer_value, (INT64_TYPE)expect_value, (INT64_TYPE)value); } } else { rc = TC_SETUP_FAIL; } if (shmem_addr) { shfree(shmem_addr); } return rc; }
/**************************************************************************** * Test Case processing procedure ***************************************************************************/ int osh_reduce_tc23(const TE_NODE *node, int argc, const char *argv[]) { int rc = TC_PASS; rc = __parse_opt(node, argc, argv); if (rc == TC_PASS) { pSync = shmalloc(sizeof(*pSync) * _SHMEM_REDUCE_SYNC_SIZE); if (!pSync) { rc = TC_SETUP_FAIL; } } else { rc = TC_SETUP_FAIL; } /* Every PE does reduction of the single value as symmetric data object to itself */ if (rc == TC_PASS) { rc = test_item1(); log_item(node, 1, rc); shmem_barrier_all(); } /* All PEs reduce the single value */ if (rc == TC_PASS) { rc = test_item2(); log_item(node, 2, rc); shmem_barrier_all(); } /* Every PE does reduction of the single value as symmetric data object to itself * (target and source are the same array) */ if (rc == TC_PASS) { rc = test_item3(); log_item(node, 3, rc); shmem_barrier_all(); } /* All PEs reduce the single value * (target and source are the same array) */ if (rc == TC_PASS) { rc = test_item4(); log_item(node, 4, rc); shmem_barrier_all(); } /* Every PE does reduction of the buffer as symmetric data object to itself */ if (rc == TC_PASS) { rc = test_item5(); log_item(node, 5, rc); shmem_barrier_all(); } /* All PEs reduce the buffer */ if (rc == TC_PASS) { rc = test_item6(); log_item(node, 6, rc); shmem_barrier_all(); } /* Even PEs reduce the buffer */ if (rc == TC_PASS) { rc = test_item7(); log_item(node, 7, rc); shmem_barrier_all(); } /* reduce calls in loop with alternating multiple pSync and pWrk arrays (without barrrier synchronization between iterations) */ if (rc == TC_PASS) { rc = test_item8(); log_item(node, 8, rc); shmem_barrier_all(); } if (pSync) { shfree(pSync); } return rc; }
int main(int argc, char **argv) { int me, nProcs, c, l; int nWords, loops, incWords; int Verbose = 0, power2 = 0, modulo = 5; DataType *dp; pgm = strrchr(argv[0],'/'); if ( pgm ) pgm++; else pgm = argv[0]; shmem_init(); me = shmem_my_pe(); nProcs = shmem_n_pes(); while ((c = getopt (argc, argv, "hpv")) != -1) switch (c) { case 'p': power2++; break; case 'v': Verbose++; break; case 'h': default: usage(); break; } if (optind == argc) nWords = DFLT_NWORDS; else if ((nWords = getSize (argv[optind++])) <= 0) usage (); if (optind == argc) loops = DFLT_LOOPS; else if ((loops = getSize (argv[optind++])) < 0) usage (); if (optind == argc) incWords = DFLT_INCR; else if ((incWords = getSize (argv[optind++])) < 0) usage (); if (power2) { nWords = 1; modulo = 1; loops = 21; } if (Verbose && me == 0) { if (power2) { printf("%s: nWords(1) << 1 per loop.\n", pgm); } else printf("%s: nWords(%d) loops(%d) nWords-incr-per-loop(%d)\n", pgm, nWords, loops, incWords); } for(l=0; l < loops; l++) { result_sz = (nProcs-1) * (nWords * sizeof(DataType)); result = (DataType *)shmem_malloc(result_sz); if (! 
result) { perror ("Failed result memory allocation"); shmem_finalize(); exit (1); } for(dp=result; dp < &result[(result_sz/sizeof(DataType))];) *dp++ = 1; target_sz = nWords * sizeof(DataType); if (!(target = (DataType *)shmem_malloc(target_sz))) { perror ("Failed target memory allocation"); shmem_finalize(); exit (1); } for(dp=target; dp < &target[(target_sz / sizeof(DataType))];) *dp++ = 2; source_sz = 2 * nWords * sizeof(DataType); if (!(source = (DataType *)shmem_malloc(source_sz))) { perror ("Failed source memory allocation"); shmem_finalize(); exit (1); } for(dp=source; dp < &source[(source_sz / sizeof(DataType))];) *dp++ = 3; #if 0 printf("[%d] source %p target %p result %p\n", me, (void*)source,(void*)target,(void*)result); shmem_barrier_all(); #endif shmem_barrier_all(); /* sync sender and receiver */ for(dp=source; dp < &source[(source_sz / sizeof(DataType))]; dp++) if (*dp != 3 ) { printf("source not consistent @ 3?\n"); break; } shmem_free(source); for(dp=target; dp < &target[(target_sz / sizeof(DataType))]; dp++) if (*dp != 2 ) { printf("target not consistent @ 2?\n"); break; } shmem_free(target); for(dp=result; dp < &result[(result_sz / sizeof(DataType))]; dp++) if (*dp != 1 ) { printf("result not consistent @ 1?\n"); break; } shmem_free(result); if (loops > 1) { if (Verbose && me == 0) { if (l == 0 || (l % modulo == 0)) printf("End loop %3d nWords(%d)\n",(l+1),nWords); } if (power2) nWords <<= 1; else nWords += incWords; // watch for double inc. } } shmem_finalize(); return 0; }
int main(void) { int i, me, npes; int errors = 0; shmem_init(); me = shmem_my_pe(); npes = shmem_n_pes(); for (i = 0; i < NELEM; i++) { src[i] = me; dst_max[i] = -1; dst_min[i] = -1; } for (i = 0; i < SHMEM_REDUCE_SYNC_SIZE; i++) { max_psync[i] = SHMEM_SYNC_VALUE; max_psync[i] = SHMEM_SYNC_VALUE; } if (me == 0) printf("Shrinking active set test\n"); shmem_barrier_all(); /* A total of npes tests are performed, where the active set in each test * includes PEs i..npes-1 */ for (i = 0; i <= me; i++) { int j; if (me == i) printf(" + PE_start=%d, logPE_stride=0, PE_size=%d\n", i, npes-i); shmem_long_max_to_all(dst_max, src, NELEM, i, 0, npes-i, max_pwrk, max_psync); /* Validate reduced data */ for (j = 0; j < NELEM; j++) { long expected = npes-1; if (dst_max[j] != expected) { printf("%d: Max expected dst_max[%d] = %ld, got dst_max[%d] = %ld, iteration %d\n", me, j, expected, j, dst_max[j], i); errors++; } } shmem_long_min_to_all(dst_min, src, NELEM, i, 0, npes-i, min_pwrk, min_psync); /* Validate reduced data */ for (j = 0; j < NELEM; j++) { long expected = i; if (dst_min[j] != expected) { printf("%d: Min expected dst_min[%d] = %ld, got dst_min[%d] = %ld, iteration %d\n", me, j, expected, j, dst_min[j], i); errors++; } } } shmem_finalize(); return errors != 0; }
static int test_item4(void) { int rc = TC_PASS; TYPE_VALUE* shmem_addr = NULL; TYPE_VALUE* recv_addr = NULL; TYPE_VALUE my_value = 0; TYPE_VALUE peer_value = 0; TYPE_VALUE expect_value = 0; int num_proc = 0; int my_proc = 0; int peer_proc = 0; num_proc = _num_pes(); my_proc = _my_pe(); shmem_addr = (TYPE_VALUE*)shmalloc(sizeof(*shmem_addr) * __max_buffer_size); recv_addr = (TYPE_VALUE*)sys_malloc(sizeof(*recv_addr) * __max_buffer_size); if (shmem_addr && recv_addr) { INT64_TYPE i = 0; long cur_buf_size = 0; my_value = 0; for (i = 0; (i < __cycle_count) && (rc == TC_PASS); i++) { /* Set my value */ my_value = (my_proc % 2 ? 1 : -1) * (i * (MAX_VALUE / __cycle_count)); cur_buf_size = sys_max(1, (i + 1) * __max_buffer_size / __cycle_count); fill_buffer((void *)shmem_addr, cur_buf_size, (void *)&my_value, sizeof(my_value)); /* Give some time to all PE for setting their values */ shmem_barrier_all(); /* Define peer and it value */ peer_proc = (my_proc + 1) % num_proc; peer_value = (peer_proc % 2 ? 1 : -1) * (i * (MAX_VALUE / __cycle_count)); /* Define expected value */ expect_value = peer_value; /* Get value from peer */ FUNC_VALUE(recv_addr, shmem_addr, cur_buf_size, peer_proc); rc = (!compare_buffer_with_const(recv_addr, cur_buf_size, &expect_value, sizeof(expect_value)) ? TC_PASS : TC_FAIL); log_debug(OSH_TC, "my(#%d:%lld) peer(#%d:%lld) expected = %lld buffer size = %lld\n", my_proc, (INT64_TYPE)my_value, peer_proc, (INT64_TYPE)peer_value, (INT64_TYPE)expect_value, (INT64_TYPE)cur_buf_size); if (rc) { TYPE_VALUE* check_addr = recv_addr; int odd_index = compare_buffer_with_const(check_addr, cur_buf_size, &expect_value, sizeof(expect_value)); int show_index = (odd_index > 1 ? 
odd_index - 2 : 0); int show_size = sizeof(*check_addr) * sys_min(3, cur_buf_size - odd_index - 1); log_debug(OSH_TC, "index of incorrect value: 0x%08X (%d)\n", odd_index - 1, odd_index - 1); log_debug(OSH_TC, "buffer interval: 0x%08X - 0x%08X\n", show_index, show_index + show_size); show_buffer(check_addr + show_index, show_size); } shmem_barrier_all(); } } else { rc = TC_SETUP_FAIL; } if (recv_addr) { sys_free(recv_addr); } if (shmem_addr) { shfree(shmem_addr); } return rc; }
int main(int argc, char **argv) { int i; int *target; int *source; int me, npes, elements=N_ELEMENTS, loops=DFLT_LOOPS; char *pgm; shmem_init(); me = shmem_my_pe(); npes = shmem_n_pes(); if ((pgm=strrchr(argv[0],'/'))) pgm++; else pgm = argv[0]; /* lower-case switch enable only a specific test; otherwise run all tests */ while ((i = getopt (argc, argv, "hvqe:l:abcmn")) != EOF) { switch (i) { case 'a': All2++; break; case 'b': Bcast++; break; case 'c': Collect++; break; case 'm': Many++; break; case 'n': Neighbor++; break; case 'q': Verbose=0; break; case 'v': Verbose++; break; case 'e': if ((elements = atoi_scaled(optarg)) <= 0) { fprintf(stderr,"ERR: Bad elements count %d\n",elements); shmem_finalize(); return 1; } break; case 'l': if ((loops = atoi_scaled(optarg)) <= 0) { fprintf(stderr,"ERR: Bad loop count %d\n",loops); shmem_finalize(); return 1; } break; case 'h': if (me == 0) usage(pgm); shmem_finalize(); return 0; default: if (me == 0) { fprintf(stderr,"%s: unknown switch '-%c'?\n",pgm,i); usage(pgm); } shmem_finalize(); return 1; } } if (All2==0 && Bcast==0 && Collect==0 && Many==0 && Neighbor==0) All2 = Bcast = Collect = Many = Neighbor = 1; source = (int *) shmem_malloc( elements * sizeof(*source) ); target = (int *) shmem_malloc( elements * sizeof(*target) ); for (i = 0; i < elements; i += 1) { source[i] = i + 1; target[i] = -90; } shmem_barrier_all(); if (Neighbor) { neighbor_put( target, source, elements, me, npes, loops ); neighbor_get( target, source, elements, me, npes, loops ); } if (All2) { all2all_put( target, source, elements, me, npes, loops ); all2all_get( target, source, elements, me, npes, loops ); } if (Many) { one2many_put( target, source, elements, me, npes, loops ); many2one_get( target, source, elements, me, npes, loops ); } if (Bcast) bcast( target, source, elements, me, npes, loops ); if (Collect) { collect( NULL, source, elements, me, npes, loops ); fcollect( NULL, source, elements, me, npes, loops ); } shmem_barrier_all(); 
shmem_free(target); shmem_free(source); shmem_finalize(); return 0; }
/*
 * exchanges the field l
 *
 * l   : spinor field; blocks of it are sent with shmem_double_put() to the
 *       neighbouring PEs g_nb_{t,x,y,z}_{dn,up} at offsets inside the same
 *       field on the remote side.
 * ieo : selects the index table used for the z direction:
 *       1 -> g_field_z_ipt_even, anything else -> g_field_z_ipt_odd.
 *
 * The body is compiled only when MPI is defined; the x, y and z exchanges
 * additionally depend on the PARALLEL* macros.  All puts are bracketed by
 * shmem_barrier_all().
 */
void xchange_field(spinor * const l, const int ieo) {

#  ifdef MPI
  int x0, x1, k;

#ifdef _KOJAK_INST
#pragma pomp inst begin(xchangefield)
#endif

  shmem_barrier_all();

  /* t direction: first and last time slice */
  shmem_double_put((double*)(l+T*LX*LY*LZ/2), (double*)l,
                   (LX*LY*LZ*12), g_nb_t_dn);
  shmem_double_put((double*)(l+(T+1)*LX*LY*LZ/2), (double*)(l+(T-1)*LX*LY*LZ/2),
                   (LX*LY*LZ*12), g_nb_t_up);

#    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
  /* x direction: one put per time slice, x = 0 ... */
  k = (T+2)*LX*LY*LZ/2;
  for(x0 = 0; x0 < T; x0++) {
    shmem_double_put((double*)(l + k),
                     (double*)(l + g_lexic2eo[g_ipt[x0][0][0][0]]),
                     12*LZ*LY, g_nb_x_dn);
    k+=LZ*LY;
  }
  /* ... and x = LX-1 */
  k = ((T+2)*LX*LY*LZ + T*LY*LZ)/2;
  for(x0 = 0; x0 < T; x0++) {
    shmem_double_put((double*)(l + k),
                     (double*)(l + g_lexic2eo[g_ipt[x0][LX-1][0][0]]),
                     12*LZ*LY, g_nb_x_up);
    k+=LZ*LY;
  }
#    endif

#    if (defined PARALLELXYT || defined PARALLELXYZT)
  /* y direction: one put per (t, x) pair, y = 0 ... */
  k = ((T+2)*LX*LY*LZ + 2*T*LY*LZ)/2;
  for(x0 = 0; x0 < T; x0++) {
    for(x1 = 0; x1 < LX; x1++) {
      shmem_double_put((double*)(l + k),
                       (double*)(l + g_lexic2eo[g_ipt[x0][x1][0][0]]),
                       12*LZ, g_nb_y_dn);
      k+=LZ;
    }
  }
  /* ... and y = LY-1 */
  k = ((T+2)*LX*LY*LZ + 2*T*LY*LZ + T*LX*LZ)/2;
  for(x0 = 0; x0 < T; x0++) {
    for(x1 = 0; x1 < LX; x1++) {
      shmem_double_put((double*)(l + k),
                       (double*)(l + g_lexic2eo[g_ipt[x0][x1][LY-1][0]]),
                       12*LZ, g_nb_y_up);
      k+=LZ;
    }
  }
#    endif

#    if (defined PARALLELXYZT)
  /* z direction: one put per table entry; the first half of the table
   * goes to g_nb_z_dn, the second half to g_nb_z_up.  Here x0 is reused
   * as the running destination offset. */
  x0 = (VOLUME/2 + LX*LY*LZ + T*LY*LZ +T*LX*LZ);
  if(ieo == 1) {
    for(k = 0; k < T*LX*LY/2; k++) {
      shmem_double_put((double*)(l + x0),
                       (double*)(l + g_field_z_ipt_even[k]),
                       24, g_nb_z_dn);
      x0++;
    }
  }
  else {
    for(k = 0; k < T*LX*LY/2; k++) {
      shmem_double_put((double*)(l + x0),
                       (double*)(l + g_field_z_ipt_odd[k]),
                       24, g_nb_z_dn);
      x0++;
    }
  }

  x0 = (VOLUME/2 + LX*LY*LZ + T*LY*LZ + T*LX*LZ + T*LX*LY/2);
  if(ieo == 1) {
    for(k = T*LX*LY/2; k < T*LX*LY; k++) {
      shmem_double_put((double*)(l + x0),
                       (double*)(l + g_field_z_ipt_even[k]),
                       24, g_nb_z_up);
      x0++;
    }
  }
  else {
    /* Use the odd table here, mirroring the g_nb_z_dn exchange above;
     * this branch previously indexed the even table. */
    for(k = T*LX*LY/2; k < T*LX*LY; k++) {
      shmem_double_put((double*)(l + x0),
                       (double*)(l + g_field_z_ipt_odd[k]),
                       24, g_nb_z_up);
      x0++;
    }
  }
#    endif

  shmem_barrier_all();

#  endif // MPI
  return;

#ifdef _KOJAK_INST
#pragma pomp inst end(xchangefield)
#endif
}
static int test_item1(void) { int rc = TC_PASS; TYPE_VALUE* shmem_addr = NULL; TYPE_VALUE* local_addr = NULL; TYPE_VALUE my_value = 0; TYPE_VALUE peer_value = 0; TYPE_VALUE* expect_value = NULL; int num_proc = 0; int my_proc = 0; int peer_proc = 0; int tst, sst; int max_stride = MAX_ARRAY_SIZE/2-1; int *wait_variable = NULL; wait_variable = shmalloc(sizeof(int)); num_proc = _num_pes(); my_proc = _my_pe(); shmem_addr = shmalloc(sizeof(*shmem_addr)*MAX_ARRAY_SIZE); local_addr = malloc(sizeof(*local_addr)*MAX_ARRAY_SIZE); expect_value = malloc(sizeof(*expect_value)*MAX_ARRAY_SIZE); if (shmem_addr) { INT64_TYPE i = 0; INT64_TYPE j = 0; int num_to_get; my_value = 0; size_t odd_pos; for (i = 0; (i < COUNT_VALUE) && (rc == TC_PASS); i++) { tst = (i < max_stride) ? i+1 : max_stride; sst = tst; num_to_get = MAX_ARRAY_SIZE/tst; /* Set my value */ my_value = (TYPE_VALUE)(my_proc + 1); memset(shmem_addr,0,MAX_ARRAY_SIZE*SIZE_VALUE); memset(expect_value,0,MAX_ARRAY_SIZE*SIZE_VALUE); for (j = 0; j < MAX_ARRAY_SIZE; j++) local_addr[j] = my_value; /* Define peer and it value */ peer_proc = (my_proc + 1) % num_proc; peer_value = (TYPE_VALUE)((my_proc == 0) ? num_proc : my_proc); /* Define expected value */ for (j=0; j<num_to_get; j++) expect_value[j*tst] = peer_value; /* Wait is set instead of barrier to give some time to all PE for setting their values */ shmem_barrier_all(); /* Get value from peer */ FUNC_VALUE(shmem_addr,local_addr,tst,sst,num_to_get,peer_proc); wait_for_completion(wait_variable,peer_proc,&rc); if (rc == TC_PASS) { rc = (compare_buffer((unsigned char*)shmem_addr, (unsigned char*)expect_value, MAX_ARRAY_SIZE, &odd_pos) ? 
TC_PASS : TC_FAIL); } log_debug(OSH_TC, "my(#%d:%lld) peer(#%d:%lld) expected = %lld vs got = %lld\n", my_proc, (INT64_TYPE)my_value, peer_proc, (INT64_TYPE)peer_value, (INT64_TYPE)expect_value[0], (INT64_TYPE)local_addr[0]); /* Wait is set instead of barrier to give some time to all PE for setting their values */ shmem_barrier_all(); } } else { rc = TC_SETUP_FAIL; } if (local_addr) { free(local_addr); } if (expect_value) { free(expect_value); } if (shmem_addr) { shfree(shmem_addr); } if (wait_variable) { shfree(wait_variable); } return rc; }
int main(int argc, char **argv) { int i,j,iter; int my_pe,n_pes; int *flag,*one; size_t max_elements,max_elements_bytes; size_t elements[16] = {1,2,4,8,12,16,24,32,64,128,256,512,1024,2048,4096,8192}; int num_elements = 16; short *srce_short,*targ_short; int *srce_int,*targ_int; long *srce_long,*targ_long; float *srce_float,*targ_float; double *srce_double,*targ_double; shmem_init(); my_pe = shmem_my_pe(); n_pes = shmem_n_pes(); flag = shmem_malloc((size_t) sizeof(int)); one = shmem_malloc((size_t) sizeof(int)); *one = 1; /* fail if trying to use odd number of processors */ if ( (n_pes % 2) != 0 ){ fprintf(stderr, "FAIL - test requires even number of PEs\n"); exit(1); } if(my_pe == 0) fprintf(stderr, "shmem_both_put_nb_size(%s)\n", argv[0]); /* alloc arrays */ max_elements = (size_t) (MAX_SIZE / sizeof(int)); max_elements_bytes = (size_t) (sizeof(int)*max_elements); if(my_pe == 0) fprintf(stderr,"shmem_int_put_nb max_elements = %d\n",max_elements); srce_int = shmem_malloc(max_elements_bytes); targ_int = shmem_malloc(max_elements_bytes); if((srce_int == NULL) || (targ_int == NULL)) shmalloc_error(); max_elements = (size_t) (MAX_SIZE / sizeof(short)); max_elements_bytes = (size_t) (sizeof(short)*max_elements); if(my_pe == 0) fprintf(stderr,"shmem_short_put max_elements = %d\n",max_elements); srce_short = shmem_malloc(max_elements_bytes); targ_short = shmem_malloc(max_elements_bytes); if((srce_short == NULL) || (targ_short == NULL)) shmalloc_error(); max_elements = (size_t) (MAX_SIZE / sizeof(long)); max_elements_bytes = (size_t) (sizeof(long)*max_elements); if(my_pe == 0) fprintf(stderr,"shmem_long_put_nb max_elements = %d\n",max_elements); srce_long = shmem_malloc(max_elements_bytes); targ_long = shmem_malloc(max_elements_bytes); if((srce_long == NULL) || (targ_long == NULL)) shmalloc_error(); max_elements = (size_t) (MAX_SIZE / sizeof(float)); max_elements_bytes = (size_t) (sizeof(float)*max_elements); if(my_pe == 0) fprintf(stderr,"shmem_float_put_nb max_elements 
= %d\n",max_elements); srce_float = shmem_malloc(max_elements_bytes); targ_float = shmem_malloc(max_elements_bytes); if((srce_float == NULL) || (targ_float == NULL)) shmalloc_error(); max_elements = (size_t) (MAX_SIZE / sizeof(double)); max_elements_bytes = (size_t) (sizeof(double)*max_elements); if(my_pe == 0) fprintf(stderr,"shmem_double_put_nb max_elements = %d\n",max_elements); srce_double = shmem_malloc(max_elements_bytes); targ_double = shmem_malloc(max_elements_bytes); if((srce_double == NULL) || (targ_double == NULL)) shmalloc_error(); if(my_pe == 0) fprintf(stderr,"Actual value used for max_elements = %d\n",max_elements); /* try the different sizes MAX_ITER times */ for (iter = 0; iter < MAX_ITER; iter++) { for (i = 0; i < num_elements; i++) { *flag = 0; if (elements[i] <= max_elements) { if ( (my_pe % 2) == 0 ) for(j = 0; j < elements[i]; j++) { srce_short[j] = (short)(my_pe+j); srce_int[j] = (int)(iter*10000+elements[i]*100+my_pe+j); srce_long[j] = (long)(iter*10000+elements[i]*100+my_pe+j); srce_float[j] = (float)(iter*10000+elements[i]*100+my_pe+j); srce_double[j] = (double)(iter*10000+elements[i]*100+my_pe+j); } else for(j = 0; j < elements[i]; j++) { targ_short[j] = (short)(my_pe+j); targ_int[j] = (int)(iter*10000+elements[i]*100+my_pe+j); targ_long[j] = (long)(iter*10000+elements[i]*100+my_pe+j); targ_float[j] = (float)(iter*10000+elements[i]*100+my_pe+j); targ_double[j] = (double)(iter*10000+elements[i]*100+my_pe+j); } shmem_barrier_all(); if ( (my_pe % 2) == 0 ) { #ifndef OPENSHMEM shmemx_int_put_nb(targ_int,srce_int,elements[i],my_pe+1,NULL); shmemx_long_put_nb(targ_long,srce_long,elements[i],my_pe+1,NULL); shmemx_float_put_nb(targ_float,srce_float,elements[i],my_pe+1,NULL); shmemx_double_put_nb(targ_double,srce_double,elements[i],my_pe+1,NULL); #else shmem_int_put_nbi(targ_int,srce_int,elements[i],my_pe+1); shmem_long_put_nbi(targ_long,srce_long,elements[i],my_pe+1); shmem_float_put_nbi(targ_float,srce_float,elements[i],my_pe+1); 
shmem_double_put_nbi(targ_double,srce_double,elements[i],my_pe+1); #endif /* this one is blocking */ shmem_short_put(targ_short,srce_short,elements[i],my_pe+1); shmem_quiet(); shmem_int_put(flag,one,(size_t)1,my_pe+1); } else { shmem_int_wait(flag,0); for(j = 0; j < elements[i]; j++) { if ( targ_short[j] != (short)(my_pe+j-1) ) fprintf(stderr, "FAIL: PE [%d] iter=%d i=%d targ_short[%d]=%d not equal %d\n", my_pe,iter,i,j,targ_short[j],my_pe+j-1); if ( targ_int[j] != (int)(iter*10000+elements[i]*100+my_pe+j-1) ) fprintf(stderr, "FAIL: PE [%d] iter=%d i=%d targ_int[%d]=%d not equal %d\n", my_pe,iter,i,j,targ_int[j],iter*10000+elements[i]*100+my_pe+j-1); if ( targ_long[j] != (long)(iter*10000+elements[i]*100+my_pe+j-1) ) fprintf(stderr, "FAIL: PE [%d] iter=%d i=%d targ_long[%d]=%d not equal %d\n", my_pe,iter,i,j,targ_long[j],iter*10000+elements[i]*100+my_pe+j-1); if ( targ_float[j] != (float)(iter*10000+elements[i]*100+my_pe+j-1) ) fprintf(stderr, "FAIL: PE [%d] iter=%d i=%d targ_long[%d]=%f not equal %d\n", my_pe,iter,i,j,targ_float[j],iter*10000+elements[i]*100+my_pe+j-1); if ( targ_double[j] != (double)(iter*10000+elements[i]*100+my_pe+j-1) ) fprintf(stderr, "FAIL: PE [%d] iter=%d i=%d targ_double[%d]=%f not equal %d\n", my_pe,iter,i,j,targ_double[j],iter*10000+elements[i]*100+my_pe+j-1); } } } } } shmem_free(srce_short); shmem_free(targ_short); shmem_free(srce_int); shmem_free(targ_int); shmem_free(srce_long); shmem_free(targ_long); shmem_free(srce_float); shmem_free(targ_float); shmem_free(srce_double); shmem_free(targ_double); #ifdef NEEDS_FINALIZE shmem_finalize(); #endif return 0; }
static int _shmem_finalize(void) { int ret = OSHMEM_SUCCESS; shmem_barrier_all(); shmem_lock_finalize(); /* Finalize preconnect framework */ if (OSHMEM_SUCCESS != (ret = oshmem_shmem_preconnect_all_finalize())) { return ret; } /* free requests */ if (OSHMEM_SUCCESS != (ret = oshmem_request_finalize())) { return ret; } /* must free cached groups before we kill collectives */ if (OSHMEM_SUCCESS != (ret = oshmem_group_cache_list_free())) { return ret; } /* We need to call mca_scoll_base_group_unselect explicitly for each group * that are not freed by oshmem_group_cache_list_free. We can only release its collectives at this point */ mca_scoll_base_group_unselect(oshmem_group_all); mca_scoll_base_group_unselect(oshmem_group_self); /* Close down MCA modules */ if (OSHMEM_SUCCESS != (ret = mca_base_framework_close(&oshmem_atomic_base_framework) ) ) { return ret; } if (OSHMEM_SUCCESS != (ret = mca_base_framework_close(&oshmem_scoll_base_framework) ) ) { return ret; } if (OSHMEM_SUCCESS != (ret = mca_base_framework_close(&oshmem_memheap_base_framework) ) ) { return ret; } if (OSHMEM_SUCCESS != (ret = mca_base_framework_close(&oshmem_sshmem_base_framework) ) ) { return ret; } if (OSHMEM_SUCCESS != (ret = MCA_SPML_CALL(del_procs(oshmem_group_all->proc_array, oshmem_group_all->proc_count)))) { return ret; } oshmem_shmem_barrier(); /* free spml resource */ if (OSHMEM_SUCCESS != (ret = mca_spml_base_finalize())) { return ret; } if (OSHMEM_SUCCESS != (ret = mca_base_framework_close(&oshmem_spml_base_framework) ) ) { return ret; } /* free op resources */ if (OSHMEM_SUCCESS != (ret = oshmem_op_finalize())) { return ret; } /* free proc_group resources */ if (OSHMEM_SUCCESS != (ret = oshmem_proc_group_finalize())) { return ret; } /* free proc resources */ if (OSHMEM_SUCCESS != (ret = oshmem_proc_finalize())) { return ret; } /* free info resources */ if (OSHMEM_SUCCESS != (ret = oshmem_info_finalize())) { return ret; } return ret; }
/*
 * Zero-length put test.
 *
 * Every PE fills its symmetric destination arrays with -9, then issues
 * puts of length 0 to PE (me + 1) % npes.  PE 0 afterwards checks that
 * none of its destination elements changed and prints Passed/Failed per
 * routine.  The test is skipped when only one PE is running.
 *
 * Always returns 0; results are reported on stdout only.
 */
int main (int argc, char **argv)
{
  int i;
  int nextpe;
  int me, npes;
  int success1, success2, success3, success4, success5, success6, success7,
    success8;

  short src1[N];
  int src2[N];
  long src3[N];
  long double src4[N];
  long long src5[N];
  double src6[N];
  float src7[N];
  char *src8;
  short src9;
  int src10;
  long src11;
  double src12;
  float src13;

  short *dest1;
  int *dest2;
  long *dest3;
  long double *dest4;
  long long *dest5;
  double *dest6;
  float *dest7;
  char *dest8;
  short *dest9;
  int *dest10;
  long *dest11;
  double *dest12;
  float *dest13;

  shmem_init ();
  me = shmem_my_pe ();
  npes = shmem_n_pes ();

  if (npes > 1) {

    /* successN == 0 means "destination untouched", i.e. the test passed */
    success1 = 0;
    success2 = 0;
    success3 = 0;
    success4 = 0;
    success5 = 0;
    success6 = 0;
    success7 = 0;
    success8 = 0;
    src8 = (char *) malloc (N * sizeof (char));

    for (i = 0; i < N; i += 1) {
      src1[i] = (short) me;
      src2[i] = me;
      src3[i] = (long) me;
      src4[i] = (long double) me;
      src5[i] = (long long) me;
      src6[i] = (double) me;
      src7[i] = (float) me;
      src8[i] = (char) me;
    }
    src9 = (short) me;
    src10 = me;
    src11 = (long) me;
    src12 = (double) me;
    src13 = (float) me;

    dest1 = (short *) shmem_malloc (N * sizeof (*dest1));
    dest2 = (int *) shmem_malloc (N * sizeof (*dest2));
    dest3 = (long *) shmem_malloc (N * sizeof (*dest3));
    dest4 = (long double *) shmem_malloc (N * sizeof (*dest4));
    dest5 = (long long *) shmem_malloc (N * sizeof (*dest5));
    dest6 = (double *) shmem_malloc (N * sizeof (*dest6));
    dest7 = (float *) shmem_malloc (N * sizeof (*dest7));
    /* dest8 is initialised and checked for N elements below, so it needs
     * N elements like the other arrays (it was allocated with only 4). */
    dest8 = (char *) shmem_malloc (N * sizeof (*dest8));
    dest9 = (short *) shmem_malloc (sizeof (*dest9));
    dest10 = (int *) shmem_malloc (sizeof (*dest10));
    dest11 = (long *) shmem_malloc (sizeof (*dest11));
    dest12 = (double *) shmem_malloc (sizeof (*dest12));
    dest13 = (float *) shmem_malloc (sizeof (*dest13));

    for (i = 0; i < N; i += 1) {
      dest1[i] = -9;
      dest2[i] = -9;
      dest3[i] = -9;
      dest4[i] = -9;
      dest5[i] = -9;
      dest6[i] = -9;
      dest7[i] = -9.0;
      dest8[i] = -9;
    }
    *dest9 = -9;
    *dest10 = -9;
    *dest11 = -9;
    *dest12 = -9;
    *dest13 = -9.0;

    nextpe = (me + 1) % npes;

    /* Testing shmem_short_put, shmem_short_put, shmem_int_put,
       shmem_long_put, shmem_longdouble_put, shmem_longlong_put,
       shmem_double_put, shmem_float_put, shmem_putmem */
    shmem_barrier_all ();

    shmem_short_put (dest1, src1, 0, nextpe);
    shmem_int_put (dest2, src2, 0, nextpe);
    shmem_long_put (dest3, src3, 0, nextpe);
    shmem_longdouble_put (dest4, src4, 0, nextpe);
    shmem_longlong_put (dest5, src5, 0, nextpe);
    shmem_double_put (dest6, src6, 0, nextpe);
    shmem_float_put (dest7, src7, 0, nextpe);
    shmem_putmem (dest8, src8, 0, nextpe);

    shmem_barrier_all ();

    if (me == 0) {
      for (i = 0; i < N; i += 1) {
        if (dest1[i] != -9) {
          success1 = 1;
        }
        if (dest2[i] != -9) {
          success2 = 1;
        }
        if (dest3[i] != -9) {
          success3 = 1;
        }
        if (dest4[i] != -9) {
          success4 = 1;
        }
        if (dest5[i] != -9) {
          success5 = 1;
        }
        if (dest6[i] != -9) {
          success6 = 1;
        }
        if (dest7[i] != -9) {
          success7 = 1;
        }
        if (dest8[i] != -9) {
          success8 = 1;
        }
      }

      if (success1 == 0)
        printf ("Test shmem_short_put of zero length: Passed\n");
      else
        printf ("Test shmem_short_put of zero length: Failed\n");
      if (success2 == 0)
        printf ("Test shmem_int_put of zero length: Passed\n");
      else
        printf ("Test shmem_int_put of zero length: Failed\n");
      if (success3 == 0)
        printf ("Test shmem_long_put of zero length: Passed\n");
      else
        printf ("Test shmem_long_put of zero length: Failed\n");
      if (success4 == 0)
        printf ("Test shmem_longdouble_put of zero length: Passed\n");
      else
        printf ("Test shmem_longdouble_put of zero length: Failed\n");
      if (success5 == 0)
        printf ("Test shmem_longlong_put of zero length: Passed\n");
      else
        printf ("Test shmem_longlong_put of zero length: Failed\n");
      if (success6 == 0)
        printf ("Test shmem_double_put of zero length: Passed\n");
      else
        printf ("Test shmem_double_put of zero length: Failed\n");
      if (success7 == 0)
        printf ("Test shmem_float_put of zero length: Passed\n");
      else
        printf ("Test shmem_float_put of zero length: Failed\n");
      if (success8 == 0)
        printf ("Test shmem_putmem of zero length: Passed\n");
      else
        printf ("Test shmem_putmem of zero length: Failed\n");
    }
    shmem_barrier_all ();

    /* Testing shmem_put32, shmem_put64, shmem_put128 */
    if (sizeof (int) == 4) {
      for (i = 0; i < N; i += 1) {
        dest2[i] = -9;
        dest3[i] = -9;
        dest4[i] = -9;
      }
      success2 = 0;
      success3 = 0;
      success4 = 0;

      shmem_barrier_all ();

      shmem_put32 (dest2, src2, 0, nextpe);
      shmem_put64 (dest3, src3, 0, nextpe);
      shmem_put128 (dest4, src4, 0, nextpe);

      shmem_barrier_all ();

      if (me == 0) {
        for (i = 0; i < N; i += 1) {
          if (dest2[i] != -9) {
            success2 = 1;
          }
          if (dest3[i] != -9) {
            success3 = 1;
          }
          if (dest4[i] != -9) {
            success4 = 1;
          }
        }
        if (success2 == 0)
          printf ("Test shmem_put32 of zero length: Passed\n");
        else
          printf ("Test shmem_put32 of zero length: Failed\n");

        if (success3 == 0)
          printf ("Test shmem_put64 of zero length: Passed\n");
        else
          printf ("Test shmem_put64 of zero length: Failed\n");

        if (success4 == 0)
          printf ("Test shmem_put128 of zero length: Passed\n");
        else
          printf ("Test shmem_put128 of zero length: Failed\n");
      }
    }
    else if (sizeof (int) == 8) {
      for (i = 0; i < N; i += 1) {
        dest1[i] = -9;
        dest2[i] = -9;
        dest3[i] = -9;
      }
      success1 = 0;
      success2 = 0;
      success3 = 0;

      shmem_barrier_all ();

      shmem_put32 (dest1, src1, 0, nextpe);
      shmem_put64 (dest2, src2, 0, nextpe);
      shmem_put128 (dest3, src3, 0, nextpe);

      shmem_barrier_all ();

      if (me == 0) {
        for (i = 0; i < N; i += 1) {
          if (dest1[i] != -9) {
            success1 = 1;
          }
          if (dest2[i] != -9) {
            success2 = 1;
          }
          if (dest3[i] != -9) {
            success3 = 1;
          }
        }
        if (success1 == 0)
          printf ("Test shmem_put32 of zero length: Passed\n");
        else
          printf ("Test shmem_put32 of zero length: Failed\n");
        if (success2 == 0)
          printf ("Test shmem_put64 of zero length: Passed\n");
        else
          printf ("Test shmem_put64 of zero length: Failed\n");

        if (success3 == 0)
          printf ("Test shmem_put128 of zero length: Passed\n");
        else
          printf ("Test shmem_put128 of zero length: Failed\n");
      }
    }

    shmem_barrier_all ();

    shmem_free (dest1);
    shmem_free (dest2);
    shmem_free (dest3);
    shmem_free (dest4);
    shmem_free (dest5);
    shmem_free (dest6);
    shmem_free (dest7);
    shmem_free (dest8);
    shmem_free (dest9);
    shmem_free (dest10);
    shmem_free (dest11);
    shmem_free (dest12);
    shmem_free (dest13);

    /* src8 is the only private heap buffer; release it too */
    free (src8);
  }
  else {
    printf ("Number of PEs must be > 1 to test shmem put of zero length, test skipped\n");
  }

  shmem_finalize ();

  return 0;
}
int main(int argc, char *argv[]) { int myid, numprocs, i; int size; char *s_buf, *r_buf; char *s_buf_heap, *r_buf_heap; int align_size; int64_t t_start = 0, t_end = 0; int use_heap = 0; //default uses global start_pes(0); myid = _my_pe(); numprocs = _num_pes(); if(numprocs != 2) { if(myid == 0) { fprintf(stderr, "This test requires exactly two processes\n"); } return EXIT_FAILURE; } if(argc != 2) { usage(myid); return EXIT_FAILURE; } if(0 == strncmp(argv[1], "heap", strlen("heap"))){ use_heap = 1; } else if(0 == strncmp(argv[1], "global", strlen("global"))){ use_heap = 0; } else { usage(myid); return EXIT_FAILURE; } align_size = MESSAGE_ALIGNMENT; /**************Allocating Memory*********************/ if(use_heap){ s_buf_heap = shmalloc(MYBUFSIZE); r_buf_heap = shmalloc(MYBUFSIZE); s_buf = (char *) (((unsigned long) s_buf_heap + (align_size - 1)) / align_size * align_size); r_buf = (char *) (((unsigned long) r_buf_heap + (align_size - 1)) / align_size * align_size); } else { s_buf = (char *) (((unsigned long) s_buf_original + (align_size - 1)) / align_size * align_size); r_buf = (char *) (((unsigned long) r_buf_original + (align_size - 1)) / align_size * align_size); } /**************Memory Allocation Done*********************/ if(myid == 0) { fprintf(stdout, HEADER); fprintf(stdout, "%-*s%*s\n", 10, "# Size", FIELD_WIDTH, "Latency (us)"); fflush(stdout); } for(size = 1; size <= MAX_MSG_SIZE; size = (size ? 
size * 2 : 1)) { /* touch the data */ for(i = 0; i < size; i++) { s_buf[i] = 'a'; r_buf[i] = 'b'; } if(size > large_message_size) { loop = loop_large = 100; skip = skip_large = 0; } shmem_barrier_all(); if(myid == 0) { for(i = 0; i < loop + skip; i++) { if(i == skip) t_start = TIME(); shmem_getmem(r_buf, s_buf, size, 1); } t_end = TIME(); } shmem_barrier_all(); if(myid == 0) { double latency = (1.0 * (t_end-t_start)) / loop; fprintf(stdout, "%-*d%*.*f\n", 10, size, FIELD_WIDTH, FLOAT_PRECISION, latency); fflush(stdout); } } shmem_barrier_all(); if(use_heap){ shfree(s_buf_heap); shfree(r_buf_heap); } shmem_barrier_all(); return EXIT_SUCCESS; }
int main( int argc, char *argv[]) { int rc=0, my_pe, npes, neighbor; int loops=LOOPS; int j; size_t data_sz=sizeof(long) * 3; double start_time; long *data, lval=0; if (argc > 1) loops = atoi(argv[1]); shmem_init(); my_pe = shmem_my_pe(); npes = shmem_n_pes(); data = shmem_malloc(data_sz); if (!data) { fprintf(stderr,"[%d] shmem_malloc(%ld) failure? %d\n", my_pe,data_sz,errno); shmem_global_exit(1); } memset((void*)data,0,data_sz); shmem_barrier_all(); neighbor = (my_pe + 1) % npes; start_time = shmemx_wtime(); for(j=0,elapsed=0.0; j < loops; j++) { start_time = shmemx_wtime(); lval = shmem_long_finc( (void*)&data[1], neighbor ); elapsed += shmemx_wtime() - start_time; if (lval != (long) j) { fprintf(stderr,"[%d] Test: FAIL previous val %ld != %d Exit.\n", my_pe, lval, j); shmem_global_exit(1); } } shmem_barrier_all(); rc = 0; if (data[1] != (long)loops) { fprintf(stderr,"[%d] finc neighbot: FAIL data[1](%p) %ld != %d Exit.\n", my_pe, (void*)&data[1], data[1], loops); rc--; } /* check if adjancent memory locations distrubed */ assert(data[0] == 0); assert(data[2] == 0); if (my_pe == 0 ) { if (rc == 0 && Verbose) fprintf(stderr,"[%d] finc neighbor: PASSED.\n",my_pe); fprintf(stderr,"[%d] %d loops of shmem_long_finc() in %6.4f secs\n" " %2.6f usecs per shmem_long_finc()\n", my_pe,loops,elapsed,((elapsed*100000.0)/(double)loops)); } shmem_free(data); shmem_finalize(); return rc; }