void
flat_tree (STREAM_TYPE *target, STREAM_TYPE *source, int nreduce)
{
  /* Consider the root to be PE #0 */
  if (_world_rank == 0)
    {
      /* First, finish gathering */
      for (int n = 0; n < _world_size; n++)
        {
          STREAM_TYPE *ptr = (STREAM_TYPE *) shmem_ptr (source, n);

          /* Compute max */
          for (int k = 0; k < nreduce; k++)
            {
              source[k] = REDUCE_MAX (ptr[k], source[k]);
            }
        }

      /* Then, broadcast results */
      for (int n = 0; n < _world_size; n++)
        {
          STREAM_TYPE *ptr = (STREAM_TYPE *) shmem_ptr (target, n);

          for (int k = 0; k < nreduce; k++)
            {
              ptr[k] = source[k];
            }
        }
    }

  shmem_barrier_all ();
  return;
}
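/* A minimal usage sketch (not from the original source) for flat_tree above.
 * Assumptions: all PEs run on one shared-memory node, so shmem_ptr() returns
 * non-NULL for every peer; _world_rank/_world_size cache shmem_my_pe() and
 * shmem_n_pes(); STREAM_TYPE and REDUCE_MAX stand in for the benchmark's own
 * definitions; src/dst are symmetric because they have static storage. */
#include <shmem.h>

#define NREDUCE 8
typedef double STREAM_TYPE;
#define REDUCE_MAX(a, b) ((a) > (b) ? (a) : (b))

static int _world_rank, _world_size;

int
main (void)
{
  static STREAM_TYPE src[NREDUCE], dst[NREDUCE]; /* symmetric buffers */

  shmem_init ();
  _world_rank = shmem_my_pe ();
  _world_size = shmem_n_pes ();

  for (int k = 0; k < NREDUCE; k++)
    src[k] = (STREAM_TYPE) (_world_rank + k);

  /* After the call, dst[k] on every PE holds the global maximum. */
  flat_tree (dst, src, NREDUCE);

  shmem_finalize ();
  return 0;
}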
void armci_memoffset_table_newentry(void *ptr, size_t seg_size)
{
    void **ptr_arr;
    void *master_addr = NULL;
    size_t tile_size=0, offset=0;

    if(!ptr) armci_die("armci_memoffset_table_newentry : null ptr",0);

    if(seg_count >= MAX_SEGS) /* CHECK: make it dynamic */
       armci_die("armci_altix_allocate: Increase MAX_SEGS > 512", armci_me);

    if(armci_me == armci_master) master_addr = shmem_ptr(ptr, armci_me);
    armci_msg_brdcst(&master_addr, sizeof(void*), armci_master);

    ptr_arr = (void**)malloc(armci_nproc*sizeof(void*));
    armci_altix_gettilesize(ptr, ptr_arr, &tile_size);
    offset = (size_t)((char*)master_addr - (char*)ptr_arr[armci_master]);

    /* enter in memoffset table */
    armci_memoffset_table[seg_count].seg_addr   = ptr_arr[armci_master];
    armci_memoffset_table[seg_count].seg_size   = seg_size;
    armci_memoffset_table[seg_count].tile_size  = tile_size;
    armci_memoffset_table[seg_count].mem_offset = offset;

#if DEBUG_
    printf("%d: addr=%p seg_size=%ld tile_size=%ld offset=%ld\n", armci_me,
           ptr_arr[armci_master], seg_size, tile_size, offset);
#endif

    ++seg_count;
    free(ptr_arr);
}
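/* A hedged, self-contained sketch (not ARMCI code) of what a memoffset table
 * entry is for: segments are laid out as equally strided per-PE tiles, so an
 * address in PE `proc`'s tile can be relocated to the node master's view by
 * adding the recorded mem_offset, once the owning segment is found.  The
 * struct mirrors the fields used above; all names here are illustrative. */
#include <stddef.h>

typedef struct {
    void  *seg_addr;   /* master's base address of the segment */
    size_t seg_size;   /* registered size of one PE's tile */
    size_t tile_size;  /* stride between consecutive PEs' tiles */
    size_t mem_offset; /* master_addr - ptr_arr[armci_master] */
} memoffset_t;

static memoffset_t table_[512];
static int         nseg_;

static void *
translate_to_master(void *addr, int proc)
{
    int i;
    for (i = 0; i < nseg_; i++) {
        char *tile = (char *) table_[i].seg_addr + proc * table_[i].tile_size;
        /* same containment test as the SGIALTIX branch of armci_lockmem */
        if ((char *) addr >= tile && (char *) addr <= tile + table_[i].seg_size)
            return (char *) addr + table_[i].mem_offset;
    }
    return NULL; /* address not inside any registered segment */
}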
#include <stdio.h>
#include <shmem.h>

int
main(void)
{
   static int bigd[100];
   int *ptr;
   int i;

   shmem_init();

   if (shmem_my_pe() == 0) {
      /* initialize PE 1's bigd array */
      ptr = shmem_ptr(bigd, 1);
      if (ptr == NULL)
         printf("can't use pointer to directly access PE 1's array\n");
      else
         for (i = 0; i < 100; i++)
            *ptr++ = i + 1;
   }

   shmem_barrier_all();

   if (shmem_my_pe() == 1) {
      printf("bigd on PE 1 is:\n");
      for (i = 0; i < 100; i++)
         printf(" %d\n", bigd[i]);
      printf("\n");
   }

   shmem_finalize();
   return 0;
}
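/* A hedged variant (not from the original source) of the example above: when
 * shmem_ptr() returns NULL (e.g. the target PE lives on another node), the
 * same initialization can fall back to the standard put interface.
 * shmem_int_put() and shmem_quiet() are standard OpenSHMEM calls; the helper
 * name init_remote is illustrative. */
#include <shmem.h>

static int bigd[100]; /* symmetric */

void
init_remote(int pe)
{
   int *ptr = shmem_ptr(bigd, pe);
   int i;

   if (ptr != NULL) {          /* same node: plain load/store access */
      for (i = 0; i < 100; i++)
         *ptr++ = i + 1;
   } else {                    /* remote node: use a put instead */
      int buf[100];
      for (i = 0; i < 100; i++)
         buf[i] = i + 1;
      shmem_int_put(bigd, buf, 100, pe);
      shmem_quiet();           /* force remote completion of the put */
   }
}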
/* SGI Altix Stuff */
static void armci_altix_gettilesize(void *ptr, void **ptr_arr,
                                    size_t *tile_size)
{
    int i;
    size_t diff=0;
    for(i=0; i<armci_nproc; i++) {
       ptr_arr[i] = shmem_ptr(ptr, i);
       if(i>0) diff = (size_t)((char*)ptr_arr[i] - (char*)ptr_arr[i-1]);
       if(i>1 && diff!=*tile_size)
          armci_die("armci_memoffset_table_newentry:Inconsistent tile size",
                    armci_me);
       *tile_size = diff;
    }
}
void armci_altix_shm_malloc(void *ptr_arr[], armci_size_t bytes)
{
    long size=bytes;
    void *ptr;
    int i;
    ARMCI_PR_DBG("enter",0);
    armci_msg_lgop(&size,1,"max");
    ptr=kr_malloc((size_t)size, &altix_ctx_shmem);
    bzero(ptr_arr,(armci_nproc)*sizeof(void*));
    ptr_arr[armci_me] = ptr;
    if(size!=0 && ptr==NULL)
       armci_die("armci_altix_shm_malloc(): malloc failed", armci_me);
    for(i=0; i<armci_nproc; i++)
       if(i!=armci_me) ptr_arr[i] = shmem_ptr(ptr, i);
    ARMCI_PR_DBG("exit",0);
}
void armci_altix_shm_malloc_group(void *ptr_arr[], armci_size_t bytes,
                                  ARMCI_Group *group)
{
    long size=bytes;
    void *ptr;
    int i, grp_me, grp_nproc;
    armci_grp_attr_t *grp_attr=ARMCI_Group_getattr(group);
    ARMCI_PR_DBG("enter",0);
    ARMCI_Group_size(group, &grp_nproc);
    ARMCI_Group_rank(group, &grp_me);
    armci_msg_group_lgop(&size,1,"max",group);
    ptr=kr_malloc((size_t)size, &altix_ctx_shmem_grp);
    if(size!=0 && ptr==NULL)
       armci_die("armci_altix_shm_malloc_group(): malloc failed for groups. "
                 "Increase _SHMMAX_ALTIX_GRP", armci_me);
    bzero(ptr_arr,(grp_nproc)*sizeof(void*));
    ptr_arr[grp_me] = ptr;
    for(i=0; i<grp_nproc; i++)
       if(i!=grp_me) ptr_arr[i] = shmem_ptr(ptr, ARMCI_Absolute_id(group, i));
    ARMCI_PR_DBG("exit",0);
}
void *
FORTRANIFY (shmem_ptr) (void *target, int *pe)
{
  return shmem_ptr (target, *pe);
}
FORTRAN_POINTER_T *
shmem_ptr_f(FORTRAN_POINTER_T target, MPI_Fint *pe)
{
    return (FORTRAN_POINTER_T *) shmem_ptr(FPTR_2_VOID_PTR(target),
                                           OMPI_FINT_2_INT(*pe));
}
/*\ acquire exclusive LOCK to MEMORY area <pstart,pend> owned by process "proc"
 *  . only one area can be locked at a time by the calling process
 *  . must unlock it with armci_unlockmem
\*/
void armci_lockmem(void *start, void *end, int proc)
{
#ifdef ARMCIX
    ARMCIX_Lockmem (start, end, proc);
#else
    register void *pstart, *pend;
    register int slot, avail=0;
    int turn=0, conflict=0;
    memlock_t *memlock_table;
#if defined(CLUSTER) && !defined(SGIALTIX)
    int lock = (proc-armci_clus_info[armci_clus_id(proc)].master)%NUM_LOCKS;
#else
    int lock = 0;
#endif

#ifdef CORRECT_PTR
    if(! *armci_use_memlock_table) {
       /* if offset invalid, use dumb locking scheme ignoring addresses */
       armci_lockmem_(start, end, proc);
       return;
    }

#  ifndef SGIALTIX
    /* when processes are attached to a shmem region at different addresses,
     * addresses written to the memlock table must be adjusted to the node
     * master */
    if(armci_mem_offset) {
       start = armci_mem_offset + (char*)start;
       end   = armci_mem_offset + (char*)end;
    }
#  endif
#endif

    if(DEBUG_) {
       printf("%d: calling armci_lockmem for %d range %p -%p\n",
              armci_me, proc, start, end);
       fflush(stdout);
    }
    memlock_table = (memlock_t*)memlock_table_array[proc];

#ifdef ALIGN_ADDRESS
    /* align address range on cache line boundary to avoid false sharing */
    pstart = ALIGN_ADDRESS(start);
    pend   = CALGN - 1 + ALIGN_ADDRESS(end);
#else
    pstart = start;
    pend   = end;
#endif

#ifdef CRAY_SHMEM
    { /* adjust according to the remote process's raw address */
       long bytes = (long)((char*)pend - (char*)pstart);
       extern void* armci_shmalloc_remote_addr(void *ptr, int proc);
       pstart = armci_shmalloc_remote_addr(pstart, proc);
       pend   = (char*)pstart + bytes;
    }
#endif

#ifdef SGIALTIX
    if (proc == armci_me) {
       pstart = shmem_ptr(pstart, armci_me);
       pend   = shmem_ptr(pend, armci_me);
    }
    /* On SGI Altix, processes are attached to a shmem region at different
     * addresses.  Addresses written to the memlock table must be adjusted
     * to the node master. */
    if(ARMCI_Uses_shm()) {
       int i, seg_id=-1;
       size_t tile_size, offset;
       void *start_addr, *end_addr;
       for(i=0; i<seg_count; i++) {
          tile_size  = armci_memoffset_table[i].tile_size;
          start_addr = (void*)((char*)armci_memoffset_table[i].seg_addr +
                               proc*tile_size);
          end_addr   = (void*)((char*)start_addr +
                               armci_memoffset_table[i].seg_size);
          /* CHECK: because of too much "span" in armci_lockmem_patch in
           * strided.c, it is not possible to have the condition as
           * (commented): */
          /* if(pstart>=start_addr && pend<=end_addr) { seg_id=i; break; } */
          if(pstart >= start_addr && pstart <= end_addr) { seg_id=i; break; }
       }
       if(seg_id==-1) armci_die("armci_lockmem: Invalid segment", seg_id);

       offset = armci_memoffset_table[seg_id].mem_offset;
       pstart = ((char*)pstart + offset);
       pend   = ((char*)pend + offset);
    }
#endif

    while(1) {
       NATIVE_LOCK(lock, proc);

       armci_get(memlock_table, table, sizeof(table), proc);
       /* armci_copy(memlock_table, table, sizeof(table)); */

       /* inspect the table */
       conflict = 0;
       avail = -1;
       for(slot = 0; slot < MAX_SLOTS; slot++) {
          /* nonzero starting address means the slot is occupied */
          if(table[slot].start == NULL) {
             /* remember a free slot to store address range */
             avail = slot;
          } else {
             /* check for conflict: overlap between stored and current range */
             if( (pstart >= table[slot].start && pstart <= table[slot].end) ||
                 (pend   >= table[slot].start && pend   <= table[slot].end) ) {
                conflict = 1;
                break;
             }
             /* printf("%d: locking %ld-%ld (%d) conflict\n", armci_me, */
          }
       }

       if(avail != -1 && !conflict) break;

       NATIVE_UNLOCK(lock, proc);
       armci_waitsome(++turn);
    }

    /* we got the memory lock: enter address into the table */
    table[avail].start = pstart;
    table[avail].end   = pend;
    armci_put(table+avail, memlock_table+avail, sizeof(memlock_t), proc);

    FENCE_NODE(proc);
    NATIVE_UNLOCK(lock, proc);
    locked_slot = avail;

#endif /* ! ARMCIX */
}
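/* A standalone sketch (not ARMCI code) of the interval test behind the
 * conflict check above.  Two closed ranges [s1,e1] and [s2,e2] overlap iff
 * s1 <= e2 && s2 <= e1.  Note the loop above only tests whether an endpoint
 * of the incoming range falls inside a stored range, which matches this test
 * except when a stored range is strictly contained in the incoming one. */
#include <stdbool.h>

static bool
ranges_overlap(const void *s1, const void *e1, const void *s2, const void *e2)
{
    return (const char *)s1 <= (const char *)e2 &&
           (const char *)s2 <= (const char *)e1;
}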