/*
 * Class:     xerial_jnuma_NumaNative
 * Method:    allocateLocal
 * Signature: (J)J
 *
 * Requests `capacity` bytes from numa_alloc_local() and hands the resulting
 * address back to Java as a jlong.
 *
 * When the allocation yields NULL, throwException(env, obj, 11) is invoked
 * and 0 is returned. (Presumably that raises a Java exception and 11 is an
 * error code defined by the project - confirm against throwException.)
 */
JNIEXPORT jlong JNICALL Java_xerial_jnuma_NumaNative_allocateLocal
  (JNIEnv *env, jobject obj, jlong capacity)
{
    void* region = numa_alloc_local((size_t) capacity);

    /* Failure path first: report it, then return the null address. */
    if (region == NULL) {
        throwException(env, obj, 11);
        return 0L;
    }

    return (jlong) region;
}
/*
 * Allocates `capacity` bytes with numa_alloc_local() and wraps them in a
 * direct ByteBuffer of the same capacity.
 *
 * Returns the new buffer object, or NULL when numa_alloc_local() yields NULL.
 * In that case a diagnostic is printed to stdout and no buffer is created,
 * so a ByteBuffer backed by a NULL address is never handed to Java.
 *
 * A negative `capacity` is converted to a very large size_t by the cast and
 * is therefore expected to end up on the failure path as well.
 */
JNIEXPORT jobject JNICALL Java_xerial_jnuma_NumaNative_allocLocal
  (JNIEnv *env, jobject jobj, jint capacity)
{
    void* mem = numa_alloc_local((size_t) capacity);

    if (mem == NULL) {
        printf("failed to allocate local memory\n");
        /* Do not wrap a NULL address: any access through such a buffer
         * would fault inside the JVM. */
        return NULL;
    }

    return (*env)->NewDirectByteBuffer(env, mem, (jlong) capacity);
}
/* nrealloc() - allocates a new buffer */ void numa_allocator::nrealloc(void) { // increase size of our old_buffers to store the previously allocated memory num_buffers++; if(other_buffers == NULL) { assert(num_buffers == 1); other_buffers = (void**)malloc(num_buffers * sizeof(void*)); *other_buffers = buf_start; } else { void** new_bufs = (void**)malloc(num_buffers * sizeof(void*)); for(int i = 0; i < num_buffers - 1; ++i) { new_bufs[i] = other_buffers[i]; } new_bufs[num_buffers-1] = buf_start; free(other_buffers); other_buffers = new_bufs; } // allocate new buffer & update pointers and total size buf_cur = buf_start = numa_alloc_local(buf_size); }
/*
 * Constructor
 *
 * Records `ssize` as the per-buffer size, resets the bookkeeping members to
 * their empty defaults and obtains the first buffer from numa_alloc_local().
 */
numa_allocator::numa_allocator(unsigned ssize)
    : buf_size(ssize),
      empty(false),
      num_buffers(0),
      buf_old(NULL),
      other_buffers(NULL),
      last_alloc_half(false),
      cache_size(CACHE_LINE_SIZE)
{
    // The first buffer doubles as the current allocation cursor.
    buf_start = numa_alloc_local(buf_size);
    buf_cur = buf_start;
}
int main(int argc, const char **argv) { int num_cpus = numa_num_task_cpus(); printf("num cpus: %d\n", num_cpus); printf("numa available: %d\n", numa_available()); numa_set_localalloc(); struct bitmask *bm = numa_bitmask_alloc(num_cpus); for (int i=0; i<=numa_max_node(); ++i) { numa_node_to_cpus(i, bm); printf("numa node %d ", i); print_bitmask(bm); printf(" - %g GiB\n", numa_node_size(i, 0) / (1024.*1024*1024.)); } numa_bitmask_free(bm); puts(""); char *x; const size_t cache_line_size = 64; const size_t array_size = 100*1000*1000; size_t ntrips = 2; #pragma omp parallel { assert(omp_get_num_threads() == num_cpus); int tid = omp_get_thread_num(); pin_to_core(tid); if(tid == 0) x = (char *) numa_alloc_local(array_size); // {{{ single access #pragma omp barrier for (size_t i = 0; i<num_cpus; ++i) { if (tid == i) { double t = measure_access(x, array_size, ntrips); printf("sequential core %d -> core 0 : BW %g MB/s\n", i, array_size*ntrips*cache_line_size / t / 1e6); } #pragma omp barrier } // }}} // {{{ everybody contends for one { if (tid == 0) puts(""); #pragma omp barrier double t = measure_access(x, array_size, ntrips); #pragma omp barrier for (size_t i = 0; i<num_cpus; ++i) { if (tid == i) printf("all-contention core %d -> core 0 : BW %g MB/s\n", tid, array_size*ntrips*cache_line_size / t / 1e6); #pragma omp barrier } } // }}} // {{{ zero and someone else contending if (tid == 0) puts(""); #pragma omp barrier for (size_t i = 1; i<num_cpus; ++i) { double t; if (tid == i || tid == 0) t = measure_access(x, array_size, ntrips); #pragma omp barrier if (tid == 0) { printf("two-contention core %d -> core 0 : BW %g MB/s\n", tid, array_size*ntrips*cache_line_size / t / 1e6); } #pragma omp barrier if (tid == i) { printf("two-contention core %d -> core 0 : BW %g MB/s\n\n", tid, array_size*ntrips*cache_line_size / t / 1e6); } #pragma omp barrier } } numa_free(x, array_size); return 0; }
int csr_to_blk(struct csr_mat_t *csr, struct blk_mat_t *blk) { blk->rows = csr->rows; blk->cols = csr->cols; blk->non_zeros = csr->non_zeros; int blk_row = (csr->rows + BLOCK_SIZE - 1) / BLOCK_SIZE; int blk_col = (csr->cols + BLOCK_SIZE - 1) / BLOCK_SIZE; printf("Notify: blk_row = %d, blk_col = %d.\n", blk_row, blk_col); int blk_num = blk_row * blk_col; blk->types = (blk_type_t*)numa_alloc_local(blk_num * sizeof(blk_type_t)); blk->row_id = (DWORD*)numa_alloc_local((blk_num + 1) * sizeof(DWORD)); blk->col_idx = (WORD*)numa_alloc_local(blk->non_zeros * sizeof(DWORD)); blk->vals = (FLOAT*)numa_alloc_local(blk->non_zeros * sizeof(FLOAT)); blk->row_id[0] = 0; int *blk_cnt = (int*)calloc(blk_num, sizeof(int)); int idx; int i, j; int x, y; for (i = 0; i < csr->rows; i++) { x = i / BLOCK_SIZE; for (j = csr->row_ptr[i]; j < csr->row_ptr[i + 1]; j++) { y = csr->col_idx[j] / BLOCK_SIZE; idx = x * blk_col + y; blk_cnt[idx]++; } } int block_size; for (i = 0; i < blk_row; i++) { for (j = 0; j < blk_col; j++) { // get the real block size block_size = blk->rows - i * BLOCK_SIZE; if (block_size > BLOCK_SIZE) { block_size = BLOCK_SIZE; } idx = i * blk_col + j; if (blk_cnt[idx] >= block_size * CSR_THRESHOLD) { blk->row_id[idx + 1] = blk->row_id[idx] + block_size; blk->types[idx]= BLK_CSR; } else { blk->row_id[idx + 1] = blk->row_id[idx] + blk_cnt[idx]; blk->types[idx]= BLK_COO; } } } blk->row_info = (WORD*)numa_alloc_local(blk->row_id[blk_num] * sizeof(WORD)); memset(blk->row_info, 0, blk->row_id[blk_num] * sizeof(WORD)); int *blk_idx = (int*)calloc(blk_num, sizeof(int)); INT64 *blk_pos = (INT64*)malloc(blk_num * sizeof(INT64)); blk_pos[0] = 0; for (i = 1; i < blk_num; i++) { blk_pos[i] = blk_pos[i - 1] + blk_cnt[i - 1]; } WORD *cur_row_info; WORD *cur_col_idx; FLOAT *cur_vals; for (i = 0; i < csr->rows; i++) { x = i / BLOCK_SIZE; for (j = csr->row_ptr[i]; j < csr->row_ptr[i + 1]; j++) { y = csr->col_idx[j] / BLOCK_SIZE; idx = x * blk_col + y; cur_row_info = blk->row_info + 
blk->row_id[idx]; cur_col_idx = blk->col_idx + blk_pos[idx]; cur_vals = blk->vals + blk_pos[idx]; if (blk->types[idx] == BLK_COO) { cur_row_info[blk_idx[idx]] = i - x * BLOCK_SIZE; } else if (blk->types[idx] == BLK_CSR) { cur_row_info[i - x * BLOCK_SIZE]++; } cur_col_idx[blk_idx[idx]] = csr->col_idx[j] - y * BLOCK_SIZE; cur_vals[blk_idx[idx]] = csr->vals[j]; blk_idx[idx]++; } } free(blk_cnt); free(blk_idx); free(blk_pos); return 0; }