void PARMCI_Fence(int proc) { int i; #if defined(DATA_SERVER) && !(defined(GM) && defined(ACK_FENCE)) // printf("%d [cp] fence_arr(%d)=%d\n",armci_me,proc,FENCE_ARR(proc)); if(FENCE_ARR(proc) && (armci_nclus >1)){ int cluster = armci_clus_id(proc); int master=armci_clus_info[cluster].master; armci_rem_ack(cluster); /* one ack per cluster node suffices */ /* note, in multi-threaded case it will only clear for current thread */ bzero(&FENCE_ARR(master),armci_clus_info[cluster].nslave); } #elif defined(BGML) BGML_WaitProc(proc); MEM_FENCE; #else FENCE_NODE(proc); MEM_FENCE; #endif }
/*\ Fence: wait for completion of all outstanding one-sided operations
 *  targeting remote process "proc" (variant with ARMCIX support).
\*/
void PARMCI_Fence(int proc)
{
#if defined(DATA_SERVER) && !(defined(GM) && defined(ACK_FENCE))
    /* nothing to do unless stores to a different cluster node are pending */
    if (FENCE_ARR(proc) && (armci_nclus > 1)) {
        int cid = armci_clus_id(proc);
        int cmaster = armci_clus_info[cid].master;

        /* a single ack round-trip drains everything queued for that node */
        armci_rem_ack(cid);

        /* reset the per-process fence flags for all slaves on the node
         * (in a multi-threaded build this clears the current thread only) */
        bzero(&FENCE_ARR(cmaster), armci_clus_info[cid].nslave);
    }
#elif defined(ARMCIX)
    ARMCIX_Fence (proc);
#elif defined(BGML)
    BGML_WaitProc(proc);
    MEM_FENCE;
#else
    FENCE_NODE(proc);
    MEM_FENCE;
#endif
}
/*\ Vector (scatter/gather) accumulate:  dst += scale * src  for every
 *  block described in darr[], targeting remote process "proc".
 *  Returns 0 on success, or the nonzero error code of a failed copy.
 *
 *  Under ACC_COPY a remote accumulate is emulated as get-modify-put:
 *  destination blocks are locked, fetched into the local internal buffer,
 *  updated locally, then written back.
\*/
int armci_acc_vector(int op, /* operation code */
                void *scale, /* pointer to scale factor in accumulate */
                armci_giov_t darr[], /* descriptor array */
                int len, /* length of descriptor array */
                int proc /* remote process(or) ID */
                )
{
    int i;

#if defined(ACC_COPY)
    if(proc == armci_me ){
#endif
      /* direct path: target is local (or platform accumulates remotely) --
       * accumulate each descriptor in place */
      for(i = 0; i< len; i++) armci_scatter_acc(op, scale, darr[i], proc, 1);
#if defined(ACC_COPY)
    }else{
      /* ACC_COPY remote path: get-modify-put through the local buffer */
      for(i = 0; i< len; i++){
        armci_giov_t dr = darr[i];
        int j, rc, nb;
        if(dr.bytes > BUFSIZE/2){
          /* for large segments use strided implementation */
          for(j=0; j< dr.ptr_array_len; j++){
            rc = armci_acc_copy_strided(op, scale,proc, dr.src_ptr_array[j], NULL,
                                        dr.dst_ptr_array[j],NULL, &dr.bytes, 0);
            if(rc)return(rc);
          }
        }else{
          armci_giov_t dl;
          /*lock memory:should optimize it to lock only a chunk at a time*/
          armci_lockmem_scatter(dr.dst_ptr_array, dr.ptr_array_len, dr.bytes, proc);

          /* copy as many blocks as possible into the local buffer */
          dl.bytes = dr.bytes;
          nb = ARMCI_MIN(PWORKLEN,BUFSIZE/dr.bytes);

          /* process the destination blocks nb at a time */
          for(j=0; j< dr.ptr_array_len; j+= nb){
            int nblocks = ARMCI_MIN(nb, dr.ptr_array_len -j);
            int k;

            /* setup vector descriptor for remote memory copy to bring
             * data into buffer */
            dl.ptr_array_len = nblocks;
            dl.src_ptr_array = dr.dst_ptr_array + j; /* GET destination becomes source for copy */
            /* lay the blocks out contiguously in the internal buffer */
            for(k=0; k< nblocks; k++) pwork[k] = k*dl.bytes + (char*)armci_internal_buffer;
            dl.dst_ptr_array = pwork;

            /* get data to the local buffer; unlock before bailing on error */
            rc = armci_copy_vector(GET, &dl, 1, proc);
            if(rc){ ARMCI_UNLOCKMEM(proc); return(rc);}

            /* update source array for accumulate */
            dl.src_ptr_array = dr.src_ptr_array +j;

            /* do scatter accumulate updating copy of data in buffer
             * (note: target is armci_me -- the buffered copy is local) */
            armci_scatter_acc(op, scale, dl, armci_me, 0);

            /* modify descriptor-now source becomes destination for PUT*/
            dl.dst_ptr_array = dr.dst_ptr_array + j;
            dl.src_ptr_array = pwork;

            /* put data back; fence so the write completes before the
             * buffer is reused for the next batch */
            rc = armci_copy_vector(PUT, &dl, 1, proc);
            FENCE_NODE(proc);
            if(rc){ ARMCI_UNLOCKMEM(proc); return(rc);}
          }
          ARMCI_UNLOCKMEM(proc);
        }
      }/*endfor*/
    }
#endif
    return 0;
}
/*\ acquire exclusive LOCK to MEMORY area <pstart,pend> owned by process "proc"
 *   . only one area can be locked at a time by the calling process
 *   . must unlock it with armci_unlockmem
 *
 *  Spins on a remote memlock table: under the native lock it fetches the
 *  table, scans for a free slot and for overlap with existing entries,
 *  and on success writes its own range into the free slot.
\*/
void armci_lockmem(void *start, void *end, int proc)
{
     register void* pstart, *pend;
     register int slot, avail=0;
     int turn=0, conflict=0;
     memlock_t *memlock_table;
#if defined(CLUSTER) && !defined(SGIALTIX)
    /* pick one of NUM_LOCKS native locks based on the target's rank
     * within its cluster node */
    int lock = (proc-armci_clus_info[armci_clus_id(proc)].master)%NUM_LOCKS;
#else
    int lock = 0;
#endif

#ifdef CORRECT_PTR
     if(! *armci_use_memlock_table){
       /* if offset invalid, use dumb locking scheme ignoring addresses */
       armci_lockmem_(start, end, proc);
       return;
     }

# ifndef SGIALTIX
     /* when processes are attached to a shmem region at different addresses,
      * addresses written to memlock table must be adjusted to the node master
      */
     if(armci_mem_offset){
       start = armci_mem_offset + (char*)start;
       end   = armci_mem_offset + (char*)end;
     }
# endif
#endif

     if(DEBUG_){
       printf("%d: calling armci_lockmem for %d range %p -%p\n",
              armci_me, proc, start,end);
       fflush(stdout);
     }
     memlock_table = (memlock_t*)memlock_table_array[proc];

#ifdef ALIGN_ADDRESS
     /* align address range on cache line boundary to avoid false sharing */
     pstart = ALIGN_ADDRESS(start);
     pend = CALGN -1 + ALIGN_ADDRESS(end);
#else
     pstart=start;
     pend =end;
#endif

#ifdef CRAY_SHMEM
     { /* adjust according the remote process raw address */
        long bytes = (long) ((char*)pend-(char*)pstart);
        extern void* armci_shmalloc_remote_addr(void *ptr, int proc);
        pstart = armci_shmalloc_remote_addr(pstart, proc);
        pend   = (char*)pstart + bytes;
     }
#endif

     /* spin until a free slot is found and no stored range overlaps ours */
     while(1){
        NATIVE_LOCK(lock,proc);

        /* fetch the remote memlock table under the native lock */
        armci_get(memlock_table, table, sizeof(table), proc);
        /* armci_copy(memlock_table, table, sizeof(table));*/

        /* inspect the table */
        conflict = 0; avail =-1;
        for(slot = 0; slot < MAX_SLOTS; slot ++){

            /* nonzero starting address means the slot is occupied */
            if(table[slot].start == NULL){

              /* remember a free slot to store address range */
              avail = slot;

            }else{
              /*check for conflict: overlap between stored and current range*/
              if( (pstart >= table[slot].start && pstart <= table[slot].end) ||
                  (pend >= table[slot].start && pend <= table[slot].end) ){

                  conflict = 1;
                  break;

              }
              /* printf("%d: locking %ld-%ld (%d) conflict\n", armci_me, */
            }
        }

        if(avail != -1 && !conflict) break;

        /* table full or overlap found: release the lock and back off */
        NATIVE_UNLOCK(lock,proc);
        armci_waitsome( ++turn );
     }

     /* we got the memory lock: enter address into the table */
     table[avail].start = pstart;
     table[avail].end = pend;
     armci_put(table+avail,memlock_table+avail,sizeof(memlock_t),proc);

     /* make sure the slot is visible remotely before dropping the lock */
     FENCE_NODE(proc);

     NATIVE_UNLOCK(lock,proc);
     locked_slot = avail;
}
/*\ acquire exclusive LOCK to MEMORY area <pstart,pend> owned by process "proc"
 *   . only one area can be locked at a time by the calling process
 *   . must unlock it with armci_unlockmem
 *
 *  ARMCIX-aware variant: on ARMCIX the whole operation is delegated to
 *  ARMCIX_Lockmem; otherwise it spins on the remote memlock table,
 *  scanning for a free slot with no overlap, then claims the slot.
\*/
void armci_lockmem(void *start, void *end, int proc)
{
#ifdef ARMCIX
    ARMCIX_Lockmem (start, end, proc);
#else
    register void* pstart, *pend;
    register int slot, avail=0;
    int turn=0, conflict=0;
    memlock_t *memlock_table;
#if defined(CLUSTER) && !defined(SGIALTIX)
    /* choose one of NUM_LOCKS native locks from the target's rank within
     * its cluster node */
    int lock = (proc-armci_clus_info[armci_clus_id(proc)].master)%NUM_LOCKS;
#else
    int lock = 0;
#endif

#ifdef CORRECT_PTR
    if(! *armci_use_memlock_table)
    {
        /* if offset invalid, use dumb locking scheme ignoring addresses */
        armci_lockmem_(start, end, proc);
        return;
    }

# ifndef SGIALTIX
    /* when processes are attached to a shmem region at different addresses,
     * addresses written to memlock table must be adjusted to the node master
     */
    if(armci_mem_offset)
    {
        start = armci_mem_offset + (char*)start;
        end   = armci_mem_offset + (char*)end;
    }
# endif
#endif

    if(DEBUG_)
    {
        printf("%d: calling armci_lockmem for %d range %p -%p\n",
               armci_me, proc, start,end);
        fflush(stdout);
    }
    memlock_table = (memlock_t*)memlock_table_array[proc];

#ifdef ALIGN_ADDRESS
    /* align address range on cache line boundary to avoid false sharing */
    pstart = ALIGN_ADDRESS(start);
    pend = CALGN -1 + ALIGN_ADDRESS(end);
#else
    pstart=start;
    pend =end;
#endif

#ifdef CRAY_SHMEM
    { /* adjust according the remote process raw address */
        long bytes = (long) ((char*)pend-(char*)pstart);
        extern void* armci_shmalloc_remote_addr(void *ptr, int proc);
        pstart = armci_shmalloc_remote_addr(pstart, proc);
        pend   = (char*)pstart + bytes;
    }
#endif

#ifdef SGIALTIX
    if (proc == armci_me) {
        pstart = shmem_ptr(pstart,armci_me);
        pend = shmem_ptr(pend,armci_me);
    }
    /* In SGI Altix processes are attached to a shmem region at different
       addresses. Addresses written to memlock table must be adjusted to
       the node master */
    if(ARMCI_Uses_shm())
    {
        int i, seg_id=-1;
        size_t tile_size,offset;
        void *start_addr, *end_addr;

        /* find which registered segment the range falls into */
        for(i=0; i<seg_count; i++)
        {
            tile_size = armci_memoffset_table[i].tile_size;
            start_addr = (void*) ((char*)armci_memoffset_table[i].seg_addr +
                                  proc*tile_size);
            end_addr = (void*) ((char*)start_addr +
                                armci_memoffset_table[i].seg_size);
            /* CHECK: because of too much "span" in armci_lockmem_patch in
             * strided.c, it is not possible to have condition as (commented):*/
            /*if(pstart>=start_addr && pend<=end_addr) {seg_id=i; break;}*/
            if(pstart >= start_addr && pstart <= end_addr)
            {
                seg_id=i;
                break;
            }
        }

        if(seg_id==-1)
            armci_die("armci_lockmem: Invalid segment", seg_id);

        offset = armci_memoffset_table[seg_id].mem_offset;
        pstart = ((char*)pstart + offset);
        pend = ((char*)pend + offset);
    }
#endif

    /* spin until we find a free slot and no stored range overlaps ours */
    while(1)
    {
        NATIVE_LOCK(lock,proc);

        /* fetch the remote memlock table under the native lock */
        armci_get(memlock_table, table, sizeof(table), proc);
        /* armci_copy(memlock_table, table, sizeof(table));*/

        /* inspect the table */
        conflict = 0;
        avail =-1;
        for(slot = 0; slot < MAX_SLOTS; slot ++)
        {
            /* nonzero starting address means the slot is occupied */
            if(table[slot].start == NULL)
            {
                /* remember a free slot to store address range */
                avail = slot;
            }
            else
            {
                /*check for conflict: overlap between stored and current range*/
                if( (pstart >= table[slot].start && pstart <= table[slot].end) ||
                    (pend >= table[slot].start && pend <= table[slot].end) )
                {
                    conflict = 1;
                    break;
                }
                /* printf("%d: locking %ld-%ld (%d) conflict\n", armci_me, */
            }
        }

        if(avail != -1 && !conflict)
            break;

        /* table full or overlap detected: release the lock and back off */
        NATIVE_UNLOCK(lock,proc);
        armci_waitsome( ++turn );
    }

    /* we got the memory lock: enter address into the table */
    table[avail].start = pstart;
    table[avail].end = pend;
    armci_put(table+avail,memlock_table+avail,sizeof(memlock_t),proc);

    /* ensure the slot update is visible remotely before dropping the lock */
    FENCE_NODE(proc);

    NATIVE_UNLOCK(lock,proc);
    locked_slot = avail;
#endif /* ! ARMCIX */
}