/*
 * Tear down a CSR container: release each contained CSR matrix in index
 * order, then hand the container's two arrays back to libnuma.
 * The sizes passed to numa_free() are (count + 1) ints for split_idx and
 * count matrices for csrs.
 */
void release_csr_cont(struct csr_cont_t *csr_cont)
{
    int idx = 0;

    while (idx < csr_cont->count) {
        release_csr_mat(&csr_cont->csrs[idx]);
        ++idx;
    }

    numa_free(csr_cont->split_idx,
              (csr_cont->count + 1) * sizeof(int));
    numa_free(csr_cont->csrs,
              csr_cont->count * sizeof(struct csr_mat_t));
}
void release_blk_mat(struct blk_mat_t *mat) { int blk_row = (mat->rows + BLOCK_SIZE - 1) / BLOCK_SIZE; int blk_col = (mat->cols + BLOCK_SIZE - 1) / BLOCK_SIZE; int blk_num = blk_row * blk_col; numa_free(mat->types, blk_num * sizeof(blk_type_t)); numa_free(mat->row_info, mat->row_id[blk_num] * sizeof(WORD)); numa_free(mat->col_idx, mat->non_zeros * sizeof(WORD)); numa_free(mat->vals, mat->non_zeros * sizeof(FLOAT)); numa_free(mat->row_id, (blk_num + 1) * sizeof(DWORD)); }
/**
 * Destroys an IPv4 flow table.
 *
 * For every partition, each bucket in the partition's index range is
 * drained by repeatedly deleting the flow that follows the bucket head
 * until the head points back to itself. When the memory pool is compiled
 * in, the partition's pool storage is released as well. Finally the
 * partition array, the bucket array and the database object are freed.
 *
 * @param db                    The table to destroy; NULL is a no-op.
 * @param flow_cleaner_callback Forwarded to mc_dpi_flow_table_delete_flow_v4
 *                              for every deleted flow.
 */
void dpi_flow_table_delete_v4(
		dpi_flow_DB_v4_t* db,
		dpi_flow_cleaner_callback* flow_cleaner_callback){
	u_int16_t part;
	u_int32_t bucket;

	if(db==NULL){
		return;
	}

	if(db->table!=NULL){
		for(part=0; part<db->num_partitions; ++part){
			bucket=db->partitions[part].partition.informations.
			       lowest_index;
			while(bucket<=db->partitions[part].partition.
			              informations.highest_index){
				/* A bucket is empty once its head links to itself. */
				while(db->table[bucket].next!=&(db->table[bucket])){
					mc_dpi_flow_table_delete_flow_v4(
							db, flow_cleaner_callback, part,
							db->table[bucket].next);
				}
				++bucket;
			}

#if DPI_FLOW_TABLE_USE_MEMORY_POOL
#if DPI_NUMA_AWARE
			/*
			 * NOTE(review): the chunk size is computed with
			 * sizeof(ipv6_flow_t) although this is the v4 table -
			 * confirm it matches the size used at allocation time.
			 */
			numa_free(db->partitions[part].partition.
			          memory_chunk_lower_bound,
			          sizeof(ipv6_flow_t)*db->individual_pool_size);
			numa_free(db->partitions[part].partition.pool,
			          sizeof(u_int32_t)*db->individual_pool_size);
#else
			free(db->partitions[part].partition.
			     memory_chunk_lower_bound);
			free(db->partitions[part].partition.pool);
#endif
#endif
		}
	}

#if DPI_NUMA_AWARE
	numa_free(db->partitions,
	          sizeof(dpi_flow_DB_v4_partition_t)*db->num_partitions);
	numa_free(db->table, sizeof(ipv4_flow_t)*db->total_size);
#else
	free(db->partitions);
	free(db->table);
#endif
	free(db);
}
void *SyncThreadWriteOrRead(void *arg) { struct thread_data *data = arg; bind2node_id(data->node_id); int node_id = data->node_id; int num = data->num; int off_start = data->off_start; int i; char *buffer = (char *) numa_alloc_onnode(block_size, node_id); ssd_file_desc_t fd = ssd_open(data->file_name, node_id, 0); printf("thread %d: access %d blocks\n", data->idx, num); for (i = 0; i < num; i++) { off_t offset = offs[off_start + i]; if (access == READ) ssd_read(fd, (void *) buffer, block_size, offset); else ssd_write(fd, (void *) buffer, block_size, offset); } numa_free(buffer, block_size); ssd_close(fd); return NULL; }
/* nreset() - frees all memory buffers */ void numa_allocator::nreset(void) { if(!empty) { empty = true; // free other_buffers, if used if(other_buffers != NULL) { int i = num_buffers - 1; while(i >= 0) { numa_free(other_buffers[i], buf_size); i--; } free(other_buffers); } // free primary buffer numa_free(buf_start, buf_size); } }
/*
 * Run the random memory stress test on a working area placed on the
 * destination node described by mbinfo.
 *
 * The working area is allocated on mbinfo->destnode, zero-filled, passed
 * to memory_stress_rand() and released again. Allocation failure prints
 * the error and terminates the process.
 */
void numa_membench(mem_bench_info_t *mbinfo)
{
	long node_total;
	long node_free;

	assert(mbinfo->destnode <= numa_max_node());

	/* The node must report enough free memory for the working area. */
	node_total = numa_node_size(mbinfo->destnode, &node_free);
	assert(node_free >= mbinfo->working_size);

	mbinfo->working_area =
		(long *)numa_alloc_onnode(mbinfo->working_size, mbinfo->destnode);
	if (mbinfo->working_area == NULL) {
		perror("numa_alloc_onnode");
		exit(EXIT_FAILURE);
	}
	memset(mbinfo->working_area, 0, mbinfo->working_size);

	memory_stress_rand(&mbinfo->pc, mbinfo->working_area,
			   mbinfo->working_size);

	/* give the working area back */
	numa_free(mbinfo->working_area, mbinfo->working_size);
}
/**
 * Releases a single IPv4 flow record, using numa_free() in NUMA-aware
 * builds and free() otherwise.
 */
static inline void v4_flow_free(ipv4_flow_t* flow){
#if !DPI_NUMA_AWARE
	free(flow);
#else
	numa_free(flow, sizeof(ipv4_flow_t));
#endif
}
/*
 * JNI entry point: releases the NUMA memory that backs a direct
 * ByteBuffer.
 *
 * The buffer's address and capacity are obtained through JNI. Nothing is
 * freed unless the address is non-null and the capacity is positive:
 * GetDirectBufferCapacity reports -1 on failure, and casting that to
 * size_t would ask numa_free() to unmap an enormous range.
 */
JNIEXPORT void JNICALL Java_xerial_jnuma_NumaNative_free__Ljava_nio_ByteBuffer_2
  (JNIEnv *env, jobject jobj, jobject buf) {
    void *mem = (*env)->GetDirectBufferAddress(env, buf);
    jlong capacity = (*env)->GetDirectBufferCapacity(env, buf);

    (void) jobj;

    if (mem == 0 || capacity <= 0) {
        return;
    }
    numa_free(mem, (size_t) capacity);
}
/*
 * Finaliser: destroys the mspace (if one exists) and releases the
 * NUMA-allocated slab (if one exists).
 *
 * Both globals are reset to NULL after release so that a second call is
 * a no-op instead of a double destroy / double free. In verbose mode the
 * value returned by destroy_mspace() is printed.
 */
void myhbwmalloc_final(void)
{
    if (myhbwmalloc_mspace != NULL) {
        size_t bytes_avail = destroy_mspace(myhbwmalloc_mspace);
        myhbwmalloc_mspace = NULL;
        if (myhbwmalloc_verbose) {
            printf("hbwmalloc: destroy_mspace returned = %zu\n", bytes_avail);
        }
    }

    if (myhbwmalloc_slab != NULL) {
        numa_free(myhbwmalloc_slab, myhbwmalloc_slab_size);
        myhbwmalloc_slab = NULL;
    }
}
/*
 * Free a pointer through the matching deallocator.
 *
 * If the address is found by s_find_addr(), the size it reports is used
 * for numa_free() and the address record is deleted; otherwise the
 * pointer is passed to free(). NULL is ignored.
 */
static void s_numa_free(void *p) {
  size_t sz = 0;

  if (!likely(p != NULL)) {
    return;
  }

  if (likely(s_find_addr(p, &sz) == LAGOPUS_RESULT_OK)) {
    numa_free(p, sz);
    s_delete_addr(p);
    return;
  }

  free(p);
}
static inline #endif void pfwl_free_task(mc_pfwl_task_t *task) { #if PFWL_NUMA_AWARE numa_free(task, sizeof(mc_pfwl_task_t)); #else #if PFWL_MULTICORE_ALIGN_TASKS free(task); #else delete task; #endif #endif }
/*
 * free_pages() - free an array of pages
 * @pages: array of page pointers to be freed (NULL entries are skipped;
 *         a NULL array is ignored)
 * @num: no. of pages in the array
 *
 * Each non-NULL entry is released with numa_free() using the size
 * reported by get_page_size(). Without HAVE_NUMA_H this is a no-op.
 */
void free_pages(void **pages, unsigned int num)
{
#if HAVE_NUMA_H
	/* Index is unsigned to match @num and avoid a signed/unsigned
	 * comparison (and signed overflow for very large counts). */
	unsigned int i;
	size_t onepage;

	if (pages == NULL)
		return;

	onepage = get_page_size();

	for (i = 0; i < num; i++) {
		if (pages[i] != NULL) {
			numa_free(pages[i], onepage);
		}
	}
#else
	(void)pages;
	(void)num;
#endif
}
/*
 * Per-entry callback (presumably for a hash table walk - confirm against
 * the caller): the key is treated as an address and the value as its
 * size in bytes. Entries with a non-NULL address and a non-zero size are
 * released with numa_free(). Always returns true.
 */
static bool s_free_all(void *key, void *val, lagopus_hashentry_t he, void *arg) {
  const size_t sz = (size_t)val;

  (void)he;
  (void)arg;

  if (likely(key != NULL && sz > 0)) {
    numa_free(key, sz);
  }

  return true;
}
void *dynarray_destroy(struct dynarray *da) { void *ret = da->elems; //printf("destroy realloc: idx:%lu nr:%lu realloc size:%lu\n", da->next_idx, da->elems_nr, (da->next_idx+1)*da->elem_size); if (da->numa) { ret = numa_realloc(ret, da->elems_nr*da->elem_size, da->next_idx*da->elem_size); numa_free(da, sizeof(*da)); } else { ret = realloc(ret, da->next_idx*da->elem_size); free(da); } return ret; }
//----------------------------------------------------------------------
//-- release memory obtained from AllocOnNode; the caller passes both
//-- the pointer and the size. Compiles to nothing unless THERON_NUMA is
//-- enabled together with THERON_WINDOWS or THERON_GCC.
//----------------------------------------------------------------------
inline void FreeOnNode(void *mem, const size_t size)
{
#if THERON_NUMA && THERON_WINDOWS
    VirtualFree(mem, size, MEM_RELEASE);
#elif THERON_NUMA && THERON_GCC
    numa_free(mem, size);
#endif
}
// Destructor: walks every block group of every domain and releases the
// blocks on each group's free list. Allocator types 0 and 3 release with
// ::free(); every other type releases with numa_free() using the group's
// block size.
~MemBlockAlloc() {
    if (verbosity > 1) {
        std::cout << "NUMABlockAlloc EXIT" << std::endl;
    }

    for (auto& domain : domain_block_groups) {
        for (auto& group : domain.block_groups) {
#if NUMA_BLOCK_ALLOCATOR_TYPE == 0 || NUMA_BLOCK_ALLOCATOR_TYPE == 3
            for (auto& blk : group.free_blocks) {
                ::free(blk);
            }
#else
            for (auto& blk : group.free_blocks) {
                numa_free(blk, group.block_size);
            }
#endif
        }
    }
}
/*
 * Thin wrapper around numa_free(): releases `size` bytes starting at
 * `start`. The caller supplies the size because numa_free() requires it.
 */
void pfree(void* start, size_t size) { numa_free(start, size); }
/*
 * NUMA memory-bandwidth micro-benchmark.
 *
 * Prints the NUMA topology, then lets one OpenMP thread per CPU (each
 * pinned to the core matching its thread id) access a single array that
 * thread 0 allocates with numa_alloc_local(). Three phases are timed
 * with measure_access():
 *   1. each core on its own, one after another,
 *   2. all cores at once,
 *   3. core 0 together with one other core at a time.
 *
 * Returns 0 on success, 1 if NUMA is unavailable or the array could not
 * be allocated.
 */
int main(int argc, const char **argv)
{
  (void) argc;
  (void) argv;

  /* numa_available() must succeed before any other libnuma call. */
  int numa_status = numa_available();
  if (numa_status < 0)
  {
    printf("numa available: %d\n", numa_status);
    return 1;
  }

  int num_cpus = numa_num_task_cpus();
  printf("num cpus: %d\n", num_cpus);
  printf("numa available: %d\n", numa_status);

  numa_set_localalloc();

  struct bitmask *bm = numa_bitmask_alloc(num_cpus);
  for (int i=0; i<=numa_max_node(); ++i)
  {
    numa_node_to_cpus(i, bm);
    printf("numa node %d ", i);
    print_bitmask(bm);
    printf(" - %g GiB\n", numa_node_size(i, 0) / (1024.*1024*1024.));
  }
  numa_bitmask_free(bm);

  puts("");

  char *x = NULL;
  const size_t cache_line_size = 64;
  const size_t array_size = 100*1000*1000;
  size_t ntrips = 2;

#pragma omp parallel
  {
    assert(omp_get_num_threads() == num_cpus);
    int tid = omp_get_thread_num();
    pin_to_core(tid);

    if (tid == 0)
      x = (char *) numa_alloc_local(array_size);

    /* After this barrier every thread sees the same value of x, so all
     * threads take the same branch and meet the same barriers below. */
#pragma omp barrier

    if (x != NULL)
    {
      // {{{ single access
      /* Loop indices are int so they match tid and the %d conversions. */
      for (int i = 0; i<num_cpus; ++i)
      {
        if (tid == i)
        {
          double t = measure_access(x, array_size, ntrips);
          printf("sequential core %d -> core 0 : BW %g MB/s\n",
              i, array_size*ntrips*cache_line_size / t / 1e6);
        }
#pragma omp barrier
      }
      // }}}

      // {{{ everybody contends for one
      {
        if (tid == 0) puts("");

#pragma omp barrier
        double t = measure_access(x, array_size, ntrips);
#pragma omp barrier
        for (int i = 0; i<num_cpus; ++i)
        {
          if (tid == i)
            printf("all-contention core %d -> core 0 : BW %g MB/s\n",
                tid, array_size*ntrips*cache_line_size / t / 1e6);
#pragma omp barrier
        }
      }
      // }}}

      // {{{ zero and someone else contending
      if (tid == 0) puts("");

#pragma omp barrier
      for (int i = 1; i<num_cpus; ++i)
      {
        /* Only threads 0 and i measure; only they read t afterwards. */
        double t = 0.0;
        if (tid == i || tid == 0)
          t = measure_access(x, array_size, ntrips);

#pragma omp barrier
        if (tid == 0)
        {
          printf("two-contention core %d -> core 0 : BW %g MB/s\n",
              tid, array_size*ntrips*cache_line_size / t / 1e6);
        }
#pragma omp barrier
        if (tid == i)
        {
          printf("two-contention core %d -> core 0 : BW %g MB/s\n\n",
              tid, array_size*ntrips*cache_line_size / t / 1e6);
        }
#pragma omp barrier
      }
      // }}}
    }
  }

  if (x == NULL)
  {
    printf("numa_alloc_local failed\n");
    return 1;
  }

  numa_free(x, array_size);
  return 0;
}
int main(void) { int node_id = 0; int arrival_lambda = 10; int thread_cpu_map[N_THREADS] = {1,6}; int i; int j; /* pthread_spinlock_t *spinlock_ptr = malloc(sizeof(pthread_spinlock_t)); if(spinlock_ptr == NULL) //error handling of the malloc of the spinlock { printf("malloc of spinlock failed.\n"); } else { printf("malloc of spinlock succeeded.\n"); } free(spinlock_ptr); */ //pthread_t threads[N_THREADS]; //cpu_set_t cpuset[N_THREADS]; //for setting the affinity of threads thread_params para[N_THREADS]; //The parameters to pass to the threads //printf("The return value of numa_get_run_node_mask(void) is %d\n",numa_get_run_node_mask()); //printf("The return value of numa_max_node(void) is %d\n",numa_max_node()); //numa_tonode_memory((void *)spinlock_ptr,sizeof(pthread_spinlock_t),node_id); //This doesn't work //initilize the spinlock pointer and put it on a specific node pthread_spinlock_t *spinlock_ptr = numa_alloc_onnode(sizeof(pthread_spinlock_t),node_id); if(spinlock_ptr == NULL) //error handling of the allocating of a spinlock pointer on a specific node { printf("alloc of spinlock on a node failed.\n"); } else { printf("alloc of spinlock on a node succeeded.\n"); } for(j = 0; j < 100000; j++) { //initlize spinlock pthread_spin_init(spinlock_ptr,0); //create the threads for(i = 0; i < N_THREADS; i++) { int rc; int s; para[i].thread_id = i; para[i].arrival_lambda = arrival_lambda; para[i].spinlock_ptr = spinlock_ptr; CPU_ZERO(&cpuset[i]); CPU_SET(thread_cpu_map[i],&cpuset[i]); rc = pthread_create(&threads[i],NULL,work,(void*)¶[i]); if(rc) { printf("ERROR: return code from pthread_create() is %d for thread %d \n",rc,i); exit(-1); } /* s = pthread_setaffinity_np(threads[i], sizeof(cpu_set_t), &cpuset[i]); if (s != 0) perror("set affinity error\n"); */ flag = 1; } /* for(i = 0; i < N_THREADS; i++) { int s; CPU_ZERO(&cpuset[i]); CPU_SET(thread_cpu_map[i],&cpuset[i]); s = pthread_setaffinity_np(threads[i], sizeof(cpu_set_t), &cpuset[i]); if (s != 0) 
perror("sjfljkl\n"); } */ for(i = 0; i < N_THREADS; i++) { pthread_join(threads[i],NULL); } } for(i = 0; i < N_THREADS; i++) { printf("The time to get one lock for thread %d is : %.9f\n",i,time_in_cs[i]/100000); } pthread_spin_destroy(spinlock_ptr); numa_free(spinlock_ptr,sizeof(pthread_spinlock_t)); pthread_exit(NULL); return 0; }
static void setup(void) { int ret, i, j; int pagesize = getpagesize(); void *p; tst_require_root(NULL); TEST(ltp_syscall(__NR_migrate_pages, 0, 0, NULL, NULL)); if (numa_available() == -1) tst_brkm(TCONF, NULL, "NUMA not available"); ret = get_allowed_nodes_arr(NH_MEMS, &num_nodes, &nodes); if (ret < 0) tst_brkm(TBROK | TERRNO, NULL, "get_allowed_nodes(): %d", ret); if (num_nodes < 2) tst_brkm(TCONF, NULL, "at least 2 allowed NUMA nodes" " are required"); else if (tst_kvercmp(2, 6, 18) < 0) tst_brkm(TCONF, NULL, "2.6.18 or greater kernel required"); /* * find 2 nodes, which can hold NODE_MIN_FREEMEM bytes * The reason is that: * 1. migrate_pages() is expected to succeed * 2. this test avoids hitting: * Bug 870326 - migrate_pages() reports success, but pages are * not moved to desired node * https://bugzilla.redhat.com/show_bug.cgi?id=870326 */ nodeA = nodeB = -1; for (i = 0; i < num_nodes; i++) { p = numa_alloc_onnode(NODE_MIN_FREEMEM, nodes[i]); if (p == NULL) break; memset(p, 0xff, NODE_MIN_FREEMEM); j = 0; while (j < NODE_MIN_FREEMEM) { if (addr_on_node(p + j) != nodes[i]) break; j += pagesize; } numa_free(p, NODE_MIN_FREEMEM); if (j >= NODE_MIN_FREEMEM) { if (nodeA == -1) nodeA = nodes[i]; else if (nodeB == -1) nodeB = nodes[i]; else break; } } if (nodeA == -1 || nodeB == -1) tst_brkm(TCONF, NULL, "at least 2 NUMA nodes with " "free mem > %d are needed", NODE_MIN_FREEMEM); tst_resm(TINFO, "Using nodes: %d %d", nodeA, nodeB); ltpuser = getpwnam(nobody_uid); if (ltpuser == NULL) tst_brkm(TBROK | TERRNO, NULL, "getpwnam failed"); TEST_PAUSE; }
int main(int argc, char *argv[]) { int node_id = 0; int arrival_lambda = 10; int thread_cpu_map[N_THREADS]; int i,j,k; int n_threads; int n_left; int n_right; int next_index_left = 3; int next_index_right = 7; float local_square = 0.0, remote_square = 0.0; /***************** make sure #args is correct and get the n_threads, n_left and n_right */ if(argc < 4) { printf("Usage: ./test_numa_comb n_of_threads n_of_threads_on_node0 n_of_threads_on_node1\n"); exit(-1); } n_threads = atoi(argv[1]); n_left = atoi(argv[2]); n_right = atoi(argv[3]); /******************* Set the thread_cpu_map according to the n_left and n_right */ printf("n_threads: %d, n_left: %d, n_right: %d\n",n_threads,n_left,n_right); for(i = 0; i < n_left; i++) { thread_cpu_map[i] = next_index_left; next_index_left--; } for(i = n_left; i < n_threads; i++) { thread_cpu_map[i] = next_index_right; next_index_right--; } for(i = 0; i < n_threads; i++) { printf("Thread %d is on cpu %d\n",i,thread_cpu_map[i]); } thread_params para[n_threads]; //The parameters to pass to the threads //printf("The return value of numa_get_run_node_mask(void) is %d\n",numa_get_run_node_mask()); //printf("The return value of numa_max_node(void) is %d\n",numa_max_node()); //numa_tonode_memory((void *)spinlock_ptr,sizeof(pthread_spinlock_t),node_id); //This doesn't work //initilize the spinlock pointer and put it on a specific node pthread_spinlock_t *spinlock_ptr = numa_alloc_onnode(sizeof(pthread_spinlock_t),node_id); if(spinlock_ptr == NULL) //error handling of the allocating of a spinlock pointer on a specific node { printf("alloc of spinlock on a node failed.\n"); exit(-1); } /* initialise syncs */ pthread_barrier_init(&fin_barrier, NULL, n_threads); pthread_spin_init(spinlock_ptr,0); int rc; //create the threads for(i = 0; i < n_threads; i++) { para[i].thread_id = i; para[i].arrival_lambda = arrival_lambda; para[i].spinlock_ptr = spinlock_ptr; CPU_ZERO(&cpuset[i]); CPU_SET(thread_cpu_map[i],&cpuset[i]); rc = 
pthread_create(&threads[i],NULL,work,(void*)¶[i]); E (rc); } start_work_flag = 1; /* wait here */ for(i = 0; i < n_threads; i++) pthread_join(threads[i],NULL); pthread_barrier_destroy(&fin_barrier); /* for(i = 0; i < n_threads; i++) { printf("The time to get one lock for thread %d is : %.9f\n",i,time_in_cs[i]/num_access_each_thread[i]); printf("The number of lock accesses for thread %d is : %d\n",i,num_access_each_thread[i]); } */ qsort((void*)g_tss,(size_t)access_count,(size_t)sizeof(timestamp),cmp_timestamp); /* for (i = 0; i < access_count; i++) printf("%lu with id %d\n",g_tss[i].ts,g_tss[i].id); */ /* for (i = 0; i < access_count; i++) * { * printf ("%lu %d\n", g_tss[i].ts, g_tss[i].id); * } */ /* */ int cs_order[access_count/2]; for(i = 0; i < access_count/2; i++) { cs_order[i] = g_tss[i*2].id; //printf("%d in cs\n",cs_order[i]); } int cs_matrix[n_threads][n_threads]; uint64_t delay_matrix[n_threads][n_threads]; float prob_matrix[n_threads][n_threads]; float rate_matrix[n_threads][n_threads]; // zero out all the matrices memset(&cs_matrix, '\0', n_threads*n_threads*sizeof(int)); memset(&delay_matrix, '\0', n_threads*n_threads*sizeof(uint64_t)); memset(&prob_matrix, '\0', n_threads*n_threads*sizeof(float)); int local_count2 = 0, remote_count2 = 0; uint64_t diff; for(i = 0; i < n_threads; i++) for(j = 0; j < n_threads; j++) for(k = 0; k < access_count/2 -1 ; k++) { if(cs_order[k] == i && cs_order[k+1] == j) { cs_matrix[i][j]++; diff = g_tss[2*k+2].ts - g_tss[2*k+1].ts; delay_matrix[i][j] += diff; if(is_on_same_node(i, j, n_threads, n_left, n_right)) { dprintf("local_delay: %lu\n", diff); local_square += sqr(diff); local_count2++; } else { dprintf("remote_delay: %lu\n", diff); remote_square += sqr(diff); remote_count2++; } } } int num_access[n_threads]; for(i = 0; i < access_count/2 -1; i++) for(j = 0; j < n_threads; j++) { if (cs_order[i] == j) num_access[j]++; } for(i = 0; i < n_threads; i++) printf("num_access[%d]:%d\n",i,num_access[i]); for(i = 0; i < 
n_threads; i++) for(j = 0; j < n_threads ; j++) { prob_matrix[i][j] = (float)cs_matrix[i][j]/(float)num_access[i]; rate_matrix[i][j] = 1.0/((delay_matrix[i][j]/(float)cs_matrix[i][j])/CPU_FREQ); } printf ("\n***************** PROBS *******************\n"); printf ("Lock is on LP, [L, R] is [%d, %d]:\n", n_left - 1, n_right); // tl printf ("L -> L\n"); print_mtx (n_threads, n_threads, prob_matrix, 0, 0, n_left, n_left, 0); // tr printf ("L -> R\n"); print_mtx (n_threads, n_threads, prob_matrix, n_left, 0, n_threads, n_left, 0); printf ("Lock is on RP, [L, R] is [%d, %d]:\n", n_left, n_right - 1); // br printf ("R -> R\n"); print_mtx (n_threads, n_threads, prob_matrix, n_left, n_left, n_threads, n_threads, 0); // bl printf ("R -> L\n"); print_mtx (n_threads, n_threads, prob_matrix, 0, n_left, n_left, n_threads, 0); printf ("\n***************** RATES *******************\n"); printf ("Lock is on LP, [L, R] is [%d, %d]:\n", n_left - 1, n_right); // tl printf ("L -> L\n"); print_mtx (n_threads, n_threads, rate_matrix, 0, 0, n_left, n_left, 1); // tr printf ("L -> R\n"); print_mtx (n_threads, n_threads, rate_matrix, n_left, 0, n_threads, n_left, 1); printf ("Lock is on RP, [L, R] is [%d, %d]:\n", n_left, n_right - 1); // br printf ("R -> R\n"); print_mtx (n_threads, n_threads, rate_matrix, n_left, n_left, n_threads, n_threads, 1); // bl printf ("R -> \n"); print_mtx (n_threads, n_threads, rate_matrix, 0, n_left, n_left, n_threads, 1); //print the intra-core and inter-core delay //thread 0 - n_left -1 are on the left core, n_left to n_threads are on the right core uint64_t local_delay = 0, remote_delay = 0; int local_count = 0, remote_count = 0; float local_prob = 0.0, remote_prob = 0.0; for(i = 0; i < n_threads; i++) for(j = 0; j < n_threads; j++) { if (j == i) continue; if(is_on_same_node(i, j, n_threads, n_left, n_right)) { //printf("%d and %d on the same node\n",i,j); local_delay += delay_matrix[i][j]; local_count += cs_matrix[i][j]; local_prob += prob_matrix[j][i]; } 
else { //printf("%d and %d not the same node\n",i,j); remote_delay += delay_matrix[i][j]; remote_count += cs_matrix[i][j]; remote_prob += prob_matrix[j][i]; } } float local = (float)local_delay/(local_count); float remote = (float)remote_delay/(remote_count); printf("\n\n**************************** Aggregates ***************************\n"); printf("local delay: %f, remote_delay: %f, local_count: %d, remote_count: %d\n",(float)local_delay/(local_count),(float)remote_delay/(remote_count),local_count,remote_count); printf("local prob:%f, remote prob: %f\n",local_prob/n_threads, remote_prob/n_threads); printf("local delay variance:%f, remote delay variance: %f\n",local_square/local_count - local*local, remote_square/remote_count - remote*remote); printf("local count2: %d, remote_count2:%d\n",local_count2, remote_count2); pthread_spin_destroy(spinlock_ptr); numa_free((void *)spinlock_ptr,sizeof(pthread_spinlock_t)); pthread_exit(NULL); return 0; }
int main( int argc, char** argv ) { struct timespec start, stop; double time; #ifndef NDEBUG std::cout << "-->WARNING: COMPILED *WITH* ASSERTIONS!<--" << std::endl; #endif if( argc<=3 ) { std::cout << "Usage: " << argv[0] << " <mtx> <scheme> <x> <REP1> <REP2>" << std::endl << std::endl; std::cout << "calculates Ax=y and reports average time taken as well as the mean of y." << std::endl; std::cout << "with\t\t <mtx> filename of the matrix A in matrix-market or binary triplet format." << std::endl; std::cout << " \t\t <scheme> number of a sparse scheme to use, see below." << std::endl; std::cout << " \t\t <x> 0 for taking x to be the 1-vector, 1 for taking x to be random (fixed seed)." << std::endl; std::cout << " \t\t <REP1> (optional, default is 1) number of repititions of the entire experiment." << std::endl; std::cout << " \t\t <REP2> (optional, default is 1) number of repititions of the in-place SpMV multiplication, per experiment." << std::endl; std::cout << std::endl << "Possible schemes:" << std::endl; std::cout << " 0: TS (triplet scheme)" << std::endl; std::cout << " 1: CRS (also known as CSR)" << std::endl; std::cout << " 2: ICRS (Incremental CRS)" << std::endl; std::cout << " 3: ZZ-CRS (Zig-zag CRS)" << std::endl; std::cout << " 4: ZZ-ICRS (Zig-zag ICRS)" << std::endl; std::cout << " 5: SVM (Sparse vector matrix)" << std::endl; std::cout << " 6: HTS (Hilbert-ordered triplet scheme)" << std::endl; std::cout << " 7: BICRS (Bi-directional Incremental CRS)" << std::endl; std::cout << " 8: Hilbert (Hilbert-ordered triplets backed by BICRS)" << std::endl; std::cout << " 9: Block Hilbert (Sparse matrix blocking, backed by Hilbert and HBICRS)" << std::endl; std::cout << "10: Bisection Hilbert (Sparse matrix blocking by bisection, backed by Hilbert and HBICRS)" << std::endl; std::cout << "11: CBICRS (Compressed Bi-directional Incremental CRS)" << std::endl; std::cout << "12: Beta Hilbert (known as Block CO-H+ in the paper by Yzelman & Roose, 2012: parallel 
compressed blocked Hilbert with BICRS)" << std::endl; std::cout << "13: Row-distributed Beta Hilbert (known as Row-distributed block CO-H in the paper by Yzelman & Roose, 2012: same as 12, but simpler distribution)" << std::endl; #ifdef WITH_CSB std::cout << "14: Row-distributed CSB (Uses CSB sequentially within the row-distributed scheme of 13)" << std::endl; #endif std::cout << "15: Row-distributed Hilbert (Parallel row-distributed Hilbert scheme, see also 8)" << std::endl; std::cout << "16: Row-distributed parallel CRS (using OpenMP, known as OpenMP CRS in the paper by Yzelman & Roose, 2012)" << std::endl; std::cout << "17: Row-distributed SpMV using compressed Hilbert indices." << std::endl; #ifdef WITH_MKL std::cout << "18: Intel MKL SpMV based on the CRS data structure." << std::endl; #endif std::cout << "19: Optimised ICRS." << std::endl; #ifdef WITH_CUDA std::cout << "20: CUDA CuSparse HYB format." << std::endl; #endif std::cout << std::endl << "The in-place Ax=y calculation is preceded by a quasi pre-fetch." << std::endl; std::cout << "Add a minus sign before the scheme number to enable use of the CCS wrapper (making each CRS-based structure CCS-based instead)" << std::endl; std::cout << "Note: binary triplet format is machine-dependent. "; std::cout << "Take care when using the same binary files on different machine architectures." << std::endl; return EXIT_FAILURE; } std::string file = std::string( argv[1] ); int scheme = atoi( argv[2] ); int ccs = scheme < 0 ? 
1 : 0; if( ccs ) scheme = -scheme; int x_mode = atoi( argv[3] ); unsigned long int rep1 = 1; unsigned long int rep2 = 1; if( argc >= 5 ) rep1 = static_cast< unsigned long int >( atoi( argv[4] ) ); if( argc >= 6 ) rep2 = static_cast< unsigned long int >( atoi( argv[5] ) ); if( scheme != 16 && scheme != -16 && //pin master thread to a single core scheme != 18 && scheme != -18 ) { //but not when OpenMP is used (otherwise serialised computations) cpu_set_t mask; CPU_ZERO( &mask ); CPU_SET ( 0, &mask ); if( pthread_setaffinity_np( pthread_self(), sizeof( mask ), &mask ) != 0 ) { std::cerr << "Error setting main thread affinity!" << std::endl; exit( 1 ); } } else { omp_set_num_threads( MachineInfo::getInstance().cores() ); } #ifdef WITH_MKL if( scheme == 18 ) { mkl_set_num_threads( MachineInfo::getInstance().cores() ); } #endif std::cout << argv[0] << " called with matrix input file " << file << ", scheme number "; std::cout << scheme << " and x being " << (x_mode?"random":"the 1-vector") << "." << std::endl; std::cout << "Number of repititions of in-place zax is " << rep2 << std::endl; std::cout << "Number of repititions of the " << rep2 << " in-place zax(es) is " << rep1 << std::endl; Matrix< double >* checkm = new TS< double >( file ); clock_gettime( CLOCK_ID, &start); Matrix< double >* matrix = selectMatrix( scheme, ccs, file ); clock_gettime( CLOCK_ID, &stop); time = (stop.tv_sec-start.tv_sec)*1000; time += (stop.tv_nsec-start.tv_nsec)/1000000.0; if( matrix == NULL ) { std::cerr << "Error during sparse scheme loading, exiting." << std::endl; return EXIT_FAILURE; } std::cout << "Matrix dimensions: " << matrix->m() << " times " << matrix->n() << "." << std::endl; std::cout << "Datastructure loading time: " << time << " ms." 
<< std::endl << std::endl; srand( FIXED_SEED ); double* x = NULL; #ifdef INTERLEAVE_X if( scheme == 13 || scheme == 14 || scheme == 15 || scheme == 16 || scheme == 17 || scheme == 18 ) x = (double*) numa_alloc_interleaved( matrix->n() * sizeof( double ) ); else #endif x = (double*) _mm_malloc( matrix->n() * sizeof( double ), 64 ); //initialise input vector for( unsigned long int j=0; j<matrix->n(); j++ ) { x[ j ] = x_mode?(rand()/(double)RAND_MAX):1.0; } //do one trial run, also for verification double* c = checkm->mv( x ); clock_gettime( CLOCK_ID, &start ); double* z = matrix->mv( x ); clock_gettime( CLOCK_ID, &stop); time = (stop.tv_sec-start.tv_sec)*1000; time += (stop.tv_nsec-start.tv_nsec)/1000000.0; double checkMSE = 0; unsigned long int max_e_index = 0; double max_e = fabs( z[0] - c[0] ); for( unsigned long int j=0; j<matrix->m(); j++ ) { double curdiff = fabs( z[j] - c[j] ); if( curdiff > max_e ) { max_e = curdiff; max_e_index = j; } curdiff *= curdiff; curdiff /= (double)(matrix->m()); checkMSE += curdiff; } #ifdef OUTPUT_Z for( unsigned long int j=0; j<matrix->m(); j++ ) { std::cout << z[ j ] << std::endl; } #endif std::cout << "out-of-place z=Ax: mean= " << checksum( z, matrix->m() ) << ", "; std::cout << "MSE = " << checkMSE << ", "; std::cout << "max abs error = " << max_e << " while comparing y[ " << max_e_index << " ] = " << z[max_e_index] << " and c[ " << max_e_index << " ] = " << c[max_e_index] << ", "; std::cout << "time= " << time << " ms." << std::endl; #ifdef RDBH_NO_COLLECT if( scheme == 13 ) { std::cout << "WARNING: MSE and max abs error are not correct for the Row-distributed Beta Hilbert scheme; please see the RDBHilbert.hpp file, and look for the RDBH_NO_COLLECT flag." << std::endl; } #else if( scheme == 13 ) { std::cout << "WARNING: timings are pessimistic for the Row-distributed Beta Hilbert scheme; each spmv a (syncing) collect is executed to write local data to the global output vector as required by this library. 
To get the correct timings, turn this collect off via the RDBH_NO_COLLECT flag in the RDBHilbert.hpp file. Note that this causes the verification process to fail, since all data is kept in private local output subvectors." << std::endl; } #endif double *times = new double[ rep1 ]; //Run rep*rep instances for( unsigned long int run = 0; run < rep1; run++ ) { sleep( 1 ); time = 0.0; //"prefetch" matrix->zax( x, z ); matrix->zax( x, z, rep2, CLOCK_ID, &time ); time /= static_cast<double>( rep2 ); times[ run ] = time; } //calculate statistics double meantime, mintime, vartime; meantime = vartime = 0.0; mintime = times[ 0 ]; for( unsigned long int run = 0; run < rep1; run++ ) { if( times[ run ] < mintime ) mintime = times[ run ]; meantime += times[ run ] / static_cast< double >( rep1 ); } for( unsigned long int run = 0; run < rep1; run++ ) { vartime += ( times[ run ] - meantime ) * ( times[ run ] - meantime ) / static_cast< double >( rep1 - 1 ); } vartime = sqrt( vartime ); std::cout << "In-place:" << std::endl; std::cout << "Mean = " << checksum( z, matrix->m() ) << std::endl; std::cout << "Time = " << meantime << " (average), \t" << mintime << " (fastest), \t" << vartime << " (stddev) ms. " << std::endl; const double avgspeed = static_cast< double >( 2*matrix->nzs() ) / meantime / 1000000.0; const double minspeed = static_cast< double >( 2*matrix->nzs() ) / mintime / 1000000.0; const double varspeed = fabs( avgspeed - static_cast< double >( 2*matrix->nzs() ) / (meantime - vartime) / 1000000.0 ); std::cout << "Speed = " << avgspeed << " (average), \t" << minspeed << " (fastest), \t" << varspeed << " (variance) Gflop/s." 
<< std::endl; const size_t memuse1 = matrix->bytesUsed() + sizeof( double ) * 2 * matrix->nzs(); const double avgmem1 = static_cast< double >( 1000*memuse1 ) / meantime / 1073741824.0; const double minmem1 = static_cast< double >( 1000*memuse1 ) / mintime / 1073741824.0; const double varmem1 = fabs( avgmem1 - static_cast< double >( 1000*memuse1 ) / (meantime-vartime) / 1073741824.0 ); std::cout << " " << avgmem1 << " (average), \t" << minmem1 << " (fastest), \t" << varmem1 << " (variance) Gbyte/s (upper bound)." << std::endl; const size_t memuse2 = matrix->bytesUsed() + sizeof( double ) * ( matrix->m() + matrix->n() ); const double avgmem2 = static_cast< double >( 1000*memuse2 ) / meantime / 1073741824.0; const double minmem2 = static_cast< double >( 1000*memuse2 ) / mintime / 1073741824.0; const double varmem2 = fabs( avgmem2 - static_cast< double >( 1000*memuse2 ) / (meantime-vartime) / 1073741824.0 ); std::cout << " " << avgmem2 << " (average), \t" << minmem2 << " (fastest), \t" << varmem2 << " (variance) Gbyte/s (lower bound)." << std::endl; delete [] times; #ifdef INTERLEAVE_X if( scheme == 13 || scheme == 14 || scheme == 15 || scheme == 16 || scheme == 17 || scheme == 18 ) { numa_free( x, matrix->n() * sizeof( double ) ); } else #endif _mm_free( x ); if( scheme == 12 || scheme == 13 || scheme == 14 || scheme == 15 || scheme == 16 || scheme == 17 || scheme == 18 ) { #ifdef _NO_LIBNUMA _mm_free( z ); #else numa_free( z, matrix->m() * sizeof( double ) ); #endif } else { _mm_free( z ); } _mm_free( c ); delete matrix; delete checkm; return EXIT_SUCCESS; }
int main(int argc, char* argv[]) { printf("\n NODE_BIND:%d, NUMA:%d, CPU_BIND:%d, FIRST_TOUCH:%d\n",NODE_BIND, NUMA, CPU_BIND, FIRST_TOUCH); int repetitions, // number of repetition maxThreads, // max number of threads it, N; // array size; int bitCount = 1; int * key; // array of keys long * dataIn; // input data long * dataSTL; // input stl data long * dataRadix; // input radix data repetitions = 1; #pragma omp parallel maxThreads = omp_get_num_threads(); if(argc ==1 ){ printf("prog input_file number_of_elements bit_count number_of_repetitions\n"); printf("NO INPUT FILE"); return 0; } if(argc == 2){ printf("prog input_file number_of_elements bit_count number_of_repetitions\n"); printf("NO ELEMENT COUNT\n"); return 0; } if(argc >2 ){ N = (int) strtol(argv[2], NULL, 10); } if(argc >3){ int tmp; tmp = (int) strtol(argv[3], NULL, 10); if ((tmp > 0) && (tmp<=16 )) // limit bit count bitCount = tmp; } if(argc >4){ int tmp; tmp = (int) strtol(argv[4], NULL, 10); if ((tmp > 0) && (tmp<=10000 )) // limit repetitions repetitions = tmp; } int *input; size_t N2; printf( "Reading data from file.\n" ); if( readIntArrayFile( argv[1], &input, &N2 ) ) return 1; printf( "Data reading done.\n" ); if( (N2<(size_t)N) || (N<=0) ) N = N2; printf( "\nPARALLEL STL SORT for N=%d, max threads = %d, test repetitions: %d\n", N, maxThreads, repetitions); dataIn = new long[N]; dataSTL = new long[N]; #ifdef _WIN32 dataRadix = new long[N]; key = new int[N]; #endif #ifdef linux key = new int[N]; #if NUMA==0 dataRadix = new long[N]; #elif NUMA==1 dataRadix = (long*) numa_alloc_interleaved(N * sizeof(long)); #elif NUMA==2 dataRadix = (long*)numa_alloc_onnode(sizeof(long)*N,1); #endif #endif VTimer stlTimes(maxThreads); VTimer radixTimes(maxThreads); #if TIME_COUNT==1 VTimer partTimes(TIMERS_COUNT); #endif #if FLUSH_CACHE==1 #ifdef linux CacheFlusher cf; #endif #endif for(long i=0;i<N;i++) dataIn[i]=input[i]; delete[] input; // loop from 1 to maxThreads for (int t = 1; t <= maxThreads; t++) { int i; 
#if TIME_COUNT==1 partTimes.reset(); #endif #if CALC_REF==1 // parallel STL for (it = 0; it < repetitions; it++) { setThreadsNo(t, maxThreads); #pragma omp parallel for private(i) for (i = 0; i < N; i++) dataSTL[i] = dataIn[i]; #if FLUSH_CACHE==1 #ifdef linux cf.flush(); #endif #endif stlTimes.timerStart(t-1); #ifdef linux __gnu_parallel::sort(dataSTL, dataSTL + N); #endif #ifdef _WIN32 std::sort(dataSTL, dataSTL + N); #endif stlTimes.timerEnd(t-1); } #if FLUSH_CACHE==1 #ifdef linux cf.flush(); #endif #endif #endif // radix sort V1 for (it = 0; it < repetitions; it++) { setThreadsNo(t, maxThreads); #pragma omp parallel for private(i) default(shared) for (i = 0; i < N; i++){ dataRadix[i] = dataIn[i]; key[i]=i; } #if FLUSH_CACHE==1 #ifdef linux cf.flush(); #endif #endif omp_set_num_threads(t); radixTimes.timerStart(t-1); #if TIME_COUNT==1 prsort::pradsort<long,int>(dataRadix,key, N, bitCount,&partTimes); #else prsort::pradsort<long,int>(dataRadix,key, N,bitCount,NULL); #endif radixTimes.timerEnd(t-1); } #if CALC_REF==1 printf("|STL SORT(th=%2d) : %1.3fs |\t", t, stlTimes.getTime(t-1)); #endif #if TIME_COUNT==1 for (int i = 0; i < TIMERS_COUNT; i++) { #if CREATE_OUTPUT==1 printf("%d %d %d %d %d %d %d %f\n", NUMA, NODE_BIND, CPU_BIND, FIRST_TOUCH,bitCount , t, i ,partTimes.getTime(i)); #else printf("part%d :%f ", i, partTimes.getTime(i)); #endif } #endif #if CREATE_OUTPUT ==1 printf("%d %d %d %d %d %d calosc %1.3f", NUMA,NODE_BIND,CPU_BIND,FIRST_TOUCH,bitCount, t ,radixTimes.getTime(t-1)); #else printf("|RADIX SORT (th=%2d) : %1.3fs |\t", t, radixTimes.getTime(t-1)); #endif // Attention: checking result only from the last function usage #if CALC_REF==1 checkResults(dataSTL, dataRadix, N); #else printf("\n"); #endif #if CHECK_KEY==1 if(checkKey(dataIn,dataRadix,key,N))printf("Keys are good\n"); #endif } #ifdef linux delete[] key; #if NUMA>0 numa_free(dataRadix, sizeof(long) * N); #else delete[] dataRadix; #endif #endif #ifdef _WIN32 delete[] dataRadix; #endif delete[] 
dataIn; delete[] dataSTL; #if TIME_COUNT==1 #endif return 0; }
/*
 * Release a memory region through libnuma.
 *
 * ptr   - start of the region to release
 * bytes - size of the region; passed to numa_free() unchanged
 */
void INTERNAL qt_affinity_free(void  *ptr,
                               size_t bytes)
{   /*{{{ */
    numa_free(ptr, bytes);
}   /*}}} */
static void * s_numa_alloc(size_t sz, int cpu) { void *ret = NULL; if (likely(sz > 0)) { if (likely(cpu >= 0)) { if (likely(s_numa_nodes != NULL && s_n_cpus > 0)) { unsigned int node = s_numa_nodes[cpu]; unsigned int allocd_node = UINT_MAX; struct bitmask *bmp; int r; bmp = numa_allocate_nodemask(); numa_bitmask_setbit(bmp, node); errno = 0; r = (int)set_mempolicy(MPOL_BIND, bmp->maskp, bmp->size + 1); if (likely(r == 0)) { errno = 0; ret = numa_alloc_onnode(sz, (int)node); if (likely(ret != NULL)) { lagopus_result_t rl; /* * We need this "first touch" even using the * numa_alloc_onnode(). */ (void)memset(ret, 0, sz); errno = 0; r = (int)get_mempolicy((int *)&allocd_node, NULL, 0, ret, MPOL_F_NODE|MPOL_F_ADDR); if (likely(r == 0)) { if (unlikely(node != allocd_node)) { /* * The memory is not allocated on the node, but it is * still usable. Just return it. */ lagopus_msg_warning("can't allocate " PFSZ(u) " bytes memory " "for CPU %d (NUMA node %d).\n", sz, cpu, node); } } else { lagopus_perror(LAGOPUS_RESULT_POSIX_API_ERROR); lagopus_msg_error("get_mempolicy() returned %d.\n", r); } rl = s_add_addr(ret, sz); if (unlikely(rl != LAGOPUS_RESULT_OK)) { lagopus_perror(rl); lagopus_msg_error("can't register the allocated address.\n"); numa_free(ret, sz); ret = NULL; } } } else { /* r == 0 */ lagopus_perror(LAGOPUS_RESULT_POSIX_API_ERROR); lagopus_msg_error("set_mempolicy() returned %d.\n", r); } numa_free_nodemask(bmp); set_mempolicy(MPOL_DEFAULT, NULL, 0); } else { /* s_numa_nodes != NULL && s_n_cpus > 0 */ /* * Not initialized or initialization failure. */ lagopus_msg_warning("The NUMA related information is not initialized. " "Use malloc(3) instead.\n"); ret = malloc(sz); } } else { /* cpu >= 0 */ /* * Use pure malloc(3). */ ret = malloc(sz); } } return ret; }
/*
 * Return the three arrays of a CSR matrix to libnuma.
 *
 * The byte counts handed to numa_free() are derived from the matrix itself:
 * rows + 1 DWORD entries for row_ptr, and non_zeros entries each for
 * col_idx (sized as int) and vals (sized as FLOAT).
 *
 * Only the arrays are released; the struct itself and its fields are left
 * untouched.
 */
void release_csr_mat(struct csr_mat_t *mat)
{
    const size_t row_ptr_bytes = (mat->rows + 1) * sizeof(DWORD);
    const size_t col_idx_bytes = mat->non_zeros * sizeof(int);
    const size_t vals_bytes    = mat->non_zeros * sizeof(FLOAT);

    numa_free(mat->row_ptr, row_ptr_bytes);
    numa_free(mat->col_idx, col_idx_bytes);
    numa_free(mat->vals,    vals_bytes);
}
/*
 * JNI native whose symbol name maps to method `free` of class
 * xerial.jnuma.NumaNative.
 *
 * address  - start of the region, carried as a jlong; 0 means there is
 *            nothing to release and the call does nothing
 * capacity - size of the region in bytes, forwarded to numa_free()
 *
 * env and jobj are not used.
 */
JNIEXPORT void JNICALL Java_xerial_jnuma_NumaNative_free
  (JNIEnv *env, jobject jobj, jlong address, jlong capacity)
{
    void *region;

    if (address == 0) {
        return;
    }

    region = (void*) address;
    numa_free(region, (size_t) capacity);
}
int main(void) { int node_id = 0; int arrival_lambda = 10; int thread_cpu_map[N_THREADS] = {1,6}; int n = 10000; int i; int j; /* pthread_spinlock_t *spinlock_ptr = malloc(sizeof(pthread_spinlock_t)); if(spinlock_ptr == NULL) //error handling of the malloc of the spinlock { printf("malloc of spinlock failed.\n"); } else { printf("malloc of spinlock succeeded.\n"); } free(spinlock_ptr); */ //pthread_t threads[N_THREADS]; //cpu_set_t cpuset[N_THREADS]; //for setting the affinity of threads thread_params para[N_THREADS]; //The parameters to pass to the threads //printf("The return value of numa_get_run_node_mask(void) is %d\n",numa_get_run_node_mask()); //printf("The return value of numa_max_node(void) is %d\n",numa_max_node()); //numa_tonode_memory((void *)spinlock_ptr,sizeof(pthread_spinlock_t),node_id); //This doesn't work //initilize the spinlock pointer and put it on a specific node pthread_spinlock_t *spinlock_ptr = numa_alloc_onnode(sizeof(pthread_spinlock_t),node_id); if(spinlock_ptr == NULL) //error handling of the allocating of a spinlock pointer on a specific node { printf("alloc of spinlock on a node failed.\n"); } else { printf("alloc of spinlock on a node succeeded.\n"); } for(j = 0; j < n; j++) { //initlize spinlock pthread_spin_init(spinlock_ptr,0); //create the threads for(i = 0; i < N_THREADS; i++) { int rc; int s; para[i].thread_id = i; para[i].arrival_lambda = arrival_lambda; para[i].spinlock_ptr = spinlock_ptr; CPU_ZERO(&cpuset[i]); CPU_SET(thread_cpu_map[i],&cpuset[i]); rc = pthread_create(&threads[i],NULL,work,(void*)¶[i]); if(rc) { printf("ERROR: return code from pthread_create() is %d for thread %d \n",rc,i); exit(-1); } flag = 1; } for(i = 0; i < N_THREADS; i++) { pthread_join(threads[i],NULL); } } for(i = 0; i < N_THREADS; i++) { printf("The time to get one lock for thread %d is : %.9f\n",i,time_in_cs[i]/n); } double diff = abs_value(time_in_cs[0] - time_in_cs[1])/n; printf("The difference of the time to get one lock is : %.9f (%f) 
cycles\n",diff,diff*CPU_FREQ); //this is assuming there are only two processors (needs to be changed if there are more) pthread_spin_destroy(spinlock_ptr); numa_free((void *)spinlock_ptr,sizeof(pthread_spinlock_t)); pthread_exit(NULL); return 0; }