int main(int argc, char** argv) { if(argc < 3) { std::cout << "Usage:" << std::endl << "gland <localNodeId> <remoteNodeId> [--human]" <<std::endl; exit(EXIT_FAILURE); } if(argc == 4 && strncmp(argv[3],"--human",7) == 0) { std::cout << "enabling human readable output" << std::endl; humanreadable = true; } size_t localNode = std::stoi(argv[1]); size_t remoteNode = std::stoi(argv[2]); numa_run_on_node(localNode); double *a = (double*) numa_alloc_onnode( N * sizeof(double), localNode ); double *b = (double*) numa_alloc_onnode( N * sizeof(double), remoteNode ); for(int i = 0; i<N ; i++) { a[i]=(double)i; b[i]=(double)-i; } while(1) memcpy_task(a, b); }
static dynarray_t *do_create(unsigned long elem_size, unsigned long alloc_grain, unsigned long elems_nr, int numa) { struct dynarray *da; int node = 0; if (numa) { int cpu = sched_getcpu(); /* Numa-aware allocation */ if (cpu < 0) { perror("dynarray_create: sched_getcpu"); exit(1); } node = numa_node_of_cpu(cpu); if (node < 0) { perror("dynarray_create: numa_node_of_cpu"); exit(1); } da = numa_alloc_onnode(sizeof(*da), node); } else { da = malloc(sizeof(*da)); } if ( !da ) { fprintf(stderr, "dynarray_create: malloc\n"); exit(1); } da->numa = numa; da->next_idx = 0; da->elem_size = elem_size; if (elems_nr <= alloc_grain) { da->elems_nr = alloc_grain; } else { unsigned long rem = elems_nr % alloc_grain; da->elems_nr = elems_nr; if (rem) da->elems_nr += alloc_grain - rem; } da->alloc_grain = alloc_grain; if (numa) { da->elems = numa_alloc_onnode(elem_size*da->elems_nr, node); } else { da->elems = malloc(elem_size*da->elems_nr); } if ( !da->elems ){ fprintf(stderr, "dynarray_create: malloc\n"); exit(1); } return da; }
/** * Each thread initializes the buffers in the local NUMA node for memory copy * in the next phase. */ void *buf_init_func(void *arg) { int i, j; struct buf_init_data *data = (struct buf_init_data *) arg; bind2node_id(data->node_id); for (i = 0; i < NUM_NODES; i++) { for (j = 0; j < NUM_THREADS; j++) { /* * For remote memory access, NUM_NODES * NUM_THREADS pieces of * memory are allocated, even though only (NUM_NODES - 1) * NUM_THREADS * pieces of memory are actually used. * For local memory access, only NUM_THREADS pieces of memory * are allocated. */ if (/*(i == data->node_id && use_remote) ||*/ (i != data->node_id && !use_remote)) { init_buffer(&data->src_bufs[i][j]); init_buffer(&data->local_bufs[i][j]); } if ((/*i != data->node_id && */use_remote) || (i == data->node_id && !use_remote)) { char *buf; if (data->mode == MEMCPY_PULL || data->mode == MEMCPY_PUSH || data->mode == MEMCPY_R2R || data->mode == MEMREAD) { buf = (char *) numa_alloc_onnode(data->buf_size, data->node_id); materialize_buf(buf, data->buf_size); set_buffer(&data->src_bufs[i][j], buf, data->buf_size, data->node_id); } else init_buffer(&data->src_bufs[i][j]); if (data->mode == MEMCPY_PULL || data->mode == MEMCPY_PUSH || data->mode == MEMCPY_R2R || data->mode == MEMWRITE) { buf = (char *) numa_alloc_onnode(data->buf_size, data->node_id); materialize_buf(buf, data->buf_size); set_buffer(&data->local_bufs[i][j], buf, data->buf_size, data->node_id); } else init_buffer(&data->local_bufs[i][j]); } } } return NULL; }
//----------------------------------------------------------------------
//-- Small cross-platform NUMA allocator built on the existing Theron
//-- configuration defines.
//--
//-- Returns memory of the given size from the given node, or NULL when
//-- NUMA support is compiled out, unavailable at run time, or (on
//-- Windows) the target is older than _WIN32_WINNT 0x0600.
//----------------------------------------------------------------------
inline void *AllocOnNode(const long node, const size_t size)
{
#if THERON_NUMA
#if THERON_WINDOWS
#if _WIN32_WINNT >= 0x0600
    return VirtualAllocExNuma(
        GetCurrentProcess(),
        NULL,
        size,
        MEM_RESERVE | MEM_COMMIT,
        PAGE_READWRITE,
        node
    );
#else
    return NULL;
#endif
#elif THERON_GCC
    // Only touch libnuma when it reports itself usable.
    if (numa_available() >= 0)
    {
        return numa_alloc_onnode(size, node);
    }
    return NULL;
#endif
#endif // THERON_NUMA
    return NULL;
}
/* * alloc_pages_on_nodes() - allocate pages on specified NUMA nodes * @pages: array in which the page pointers will be stored * @num: no. of pages to allocate * @nodes: array of NUMA nodes * * A page will be allocated in each node specified by @nodes, and the * page pointers will be stored in @pages array. * * RETURNS: * 0 on success, -1 on allocation failure. */ int alloc_pages_on_nodes(void **pages, unsigned int num, int *nodes) { int i; #if HAVE_NUMA_ALLOC_ONNODE size_t onepage = get_page_size(); #endif for (i = 0; i < num; i++) { pages[i] = NULL; } for (i = 0; i < num; i++) { char *page; #if HAVE_NUMA_ALLOC_ONNODE pages[i] = numa_alloc_onnode(onepage, nodes[i]); #endif if (pages[i] == NULL) { tst_resm(TBROK, "allocation of page on node " "%d failed", nodes[i]); break; } /* Touch the page, to force allocation. */ page = pages[i]; page[0] = i; } if (i == num) return 0; free_pages(pages, num); return -1; }
/*
 * Run the random memory stress benchmark against a working area placed on
 * NUMA node mbinfo->destnode.
 *
 * Asserts that the node id does not exceed numa_max_node() and that the
 * node reports at least mbinfo->working_size free bytes. The working area
 * is allocated, zero-filled, handed to memory_stress_rand() and released
 * again. Exits with EXIT_FAILURE if the allocation fails.
 */
void numa_membench(mem_bench_info_t *mbinfo)
{
    long node_total, node_free;

    assert(mbinfo->destnode <= numa_max_node());

    node_total = numa_node_size(mbinfo->destnode, &node_free);
    (void)node_total; /* only the free amount is checked */
    assert(node_free >= mbinfo->working_size);

    mbinfo->working_area =
        (long *)numa_alloc_onnode(mbinfo->working_size, mbinfo->destnode);
    if (mbinfo->working_area == NULL) {
        perror("numa_alloc_onnode");
        exit(EXIT_FAILURE);
    }
    memset(mbinfo->working_area, 0, mbinfo->working_size);

    memory_stress_rand(&mbinfo->pc, mbinfo->working_area, mbinfo->working_size);

    /* release resources */
    numa_free(mbinfo->working_area, mbinfo->working_size);
}
void *SyncThreadWriteOrRead(void *arg) { struct thread_data *data = arg; bind2node_id(data->node_id); int node_id = data->node_id; int num = data->num; int off_start = data->off_start; int i; char *buffer = (char *) numa_alloc_onnode(block_size, node_id); ssd_file_desc_t fd = ssd_open(data->file_name, node_id, 0); printf("thread %d: access %d blocks\n", data->idx, num); for (i = 0; i < num; i++) { off_t offset = offs[off_start + i]; if (access == READ) ssd_read(fd, (void *) buffer, block_size, offset); else ssd_write(fd, (void *) buffer, block_size, offset); } numa_free(buffer, block_size); ssd_close(fd); return NULL; }
/**
 * \brief allocates size bytes of memory on the node reported by
 *        numa_current_node()
 *
 * \param size      size of the memory region in bytes
 * \param pagesize  page size to be used for the mapping
 *
 * \returns pointer to memory region
 *
 * The memory must be freed with numa_free(). On errors NULL is returned.
 */
void *numa_alloc_local(size_t size, size_t pagesize)
{
    const nodeid_t local = numa_current_node();

    NUMA_DEBUG_ALLOC("allocate on local node %" PRIuNODEID "\n", local);

    void *region = numa_alloc_onnode(size, local, pagesize);
    return region;
}
/*
 * Class:     xerial_jnuma_NumaNative
 * Method:    allocateOnNode
 * Signature: (JI)J
 *
 * Allocates `capacity` bytes on NUMA node `node` and returns the address
 * as a jlong. On failure throwException(env, obj, 11) is called and 0 is
 * returned.
 */
JNIEXPORT jlong JNICALL Java_xerial_jnuma_NumaNative_allocateOnNode
  (JNIEnv *env, jobject obj, jlong capacity, jint node)
{
    void* region = numa_alloc_onnode((size_t) capacity, node);

    if (region == NULL) {
        throwException(env, obj, 11);
        return 0L;
    }
    return (jlong) region;
}
/*
 * Allocates `capacity` bytes on NUMA node `node` and wraps them in a
 * direct ByteBuffer.
 *
 * Returns the new buffer, or NULL (Java null) when the native allocation
 * fails. Previously a failed allocation was only logged and a buffer
 * around a NULL address was still created.
 */
JNIEXPORT jobject JNICALL Java_xerial_jnuma_NumaNative_allocOnNode
  (JNIEnv *env, jobject jobj, jint capacity, jint node)
{
    void* mem = numa_alloc_onnode((size_t) capacity, (int) node);

    if (mem == NULL) {
        printf("failed to allocate memory on node %d\n", (int) node);
        return NULL;
    }

    return (*env)->NewDirectByteBuffer(env, mem, (jlong) capacity);
}
dpi_flow_DB_v4_t* dpi_flow_table_create_v4( u_int32_t size, u_int32_t max_active_v4_flows, u_int16_t num_partitions){ #endif u_int32_t i; dpi_flow_DB_v4_t* table; if(size!=0){ assert((table=(dpi_flow_DB_v4_t*) malloc(sizeof(dpi_flow_DB_v4_t)))!=NULL); table->table=(ipv4_flow_t*) malloc(sizeof(ipv4_flow_t)*size); assert(table->table); table->total_size=size; table->num_partitions=num_partitions; table->max_active_flows=max_active_v4_flows; #if DPI_FLOW_TABLE_USE_MEMORY_POOL table->start_pool_size=start_pool_size; #endif for(i=0; i<table->total_size; i++){ /** Creation of sentinel node. **/ table->table[i].next=&(table->table[i]); table->table[i].prev=&(table->table[i]); } #if DPI_NUMA_AWARE table->partitions=numa_alloc_onnode( sizeof(dpi_flow_DB_v4_partition_t)*table->num_partitions, DPI_NUMA_AWARE_FLOW_TABLE_NODE); assert(table->partitions); #else assert(posix_memalign( (void**) &(table->partitions), DPI_CACHE_LINE_SIZE, sizeof(dpi_flow_DB_v4_partition_t)*table->num_partitions)==0); #endif #if DPI_FLOW_TABLE_HASH_VERSION == DPI_MURMUR3_HASH srand((unsigned int) time(NULL)); table->seed=rand(); #endif dpi_flow_table_setup_partitions_v4(table, table->num_partitions); }else table=NULL; return table; }
static inline ipv6_flow_t* v6_flow_alloc(){ void* r; #if DPI_NUMA_AWARE r=numa_alloc_onnode(sizeof(ipv6_flow_t), DPI_NUMA_AWARE_FLOW_TABLE_NODE); assert(r); #else #if DPI_FLOW_TABLE_ALIGN_FLOWS assert(posix_memalign((void**) &r, DPI_CACHE_LINE_SIZE, sizeof(ipv6_flow_t))==0); #else r=malloc(sizeof(ipv6_flow_t)); assert(r); #endif #endif return (ipv6_flow_t*) r; }
// Sets up the per-node bookkeeping (one NodeData per NUMA node when built
// with USE_NUMA, a single one otherwise), resets the job state and creates
// the Paje trace object. The trace file name is empty unless
// use_paje_trace is set.
//
// Throws std::bad_alloc if a NodeData cannot be allocated.
TaskManager :: TaskManager()
{
  num_threads = GetMaxThreads();
  // if (MyMPI_GetNTasks() > 1) num_threads = 1;

#ifdef USE_NUMA
  numa_available();
  num_nodes = numa_max_node() + 1;
  if (num_nodes > num_threads) num_nodes = num_threads;

  for (int j = 0; j < num_nodes; j++)
    {
      void * mem = numa_alloc_onnode (sizeof(NodeData), j);
      // Constructing an object at a null address is undefined behaviour;
      // report the failure the same way a plain `new` would.
      if (!mem) throw std::bad_alloc();
      nodedata[j] = new (mem) NodeData;
      complete[j] = -1;
      workers_on_node[j] = 0;
    }
#else
  num_nodes = 1;
  nodedata[0] = new NodeData;
  complete[0] = -1;
  workers_on_node[0] = 0;
#endif

  jobnr = 0;
  done = 0;
  sleep = false;
  sleep_usecs = 1000;
  active_workers = 0;

  static int cnt = 0;
  char buf[100];
  if (use_paje_trace)
    {
      // Bounded formatting so a large counter/rank cannot overrun buf.
#ifdef PARALLEL
      snprintf(buf, sizeof(buf), "ng%d_rank%d.trace", cnt++, MyMPI_GetId());
#else
      snprintf(buf, sizeof(buf), "ng%d.trace", cnt++);
#endif
    }
  else
    buf[0] = 0;
  //sprintf(buf, "");
  trace = new PajeTrace(num_threads, buf);
}
static inline #endif mc_pfwl_task_t * pfwl_allocate_task() { mc_pfwl_task_t *r; #if PFWL_NUMA_AWARE r = (mc_pfwl_task_t *) numa_alloc_onnode(sizeof(mc_pfwl_task_t), PFWL_NUMA_AWARE_TASKS_NODE); #else #if PFWL_MULTICORE_ALIGN_TASKS if (posix_memalign((void **) &r, PFWL_CACHE_LINE_SIZE, sizeof(mc_pfwl_task_t))) { throw std::runtime_error("posix_memalign failed."); } #else r = new mc_pfwl_task_t; #endif #endif return r; }
/** * \brief allocates size bytes of memory with the current NUMA policy. * * \param size size of the memory region in bytes * \param pagesize preferred page size to be used * \returns pointer to the mapped memory region * * The memory must be freed with numa_free(). On errors NULL is returned. */ void *numa_alloc(size_t size, size_t pagesize) { NUMA_DEBUG_ALLOC("allocate according to policy\n"); /* check if we use interleaved mode */ if (bitmap_get_weight(numa_alloc_interleave_mask)) { return numa_alloc_interleaved_subset(size, pagesize, numa_alloc_interleave_mask); } /* check membind */ if (bitmap_get_weight(numa_alloc_bind_mask) == 1) { nodeid_t node = (nodeid_t) bitmap_get_first(numa_alloc_bind_mask); return numa_alloc_onnode(size, node, pagesize); } /* TODO: * - handle the case where multiple nodes are set in membind */ /* just return some memory */ return malloc(size); }
int main(int argc, char *argv[]) { int node_id = 0; int arrival_lambda = 10; int thread_cpu_map[N_THREADS]; int i,j,k; int n_threads; int n_left; int n_right; int next_index_left = 3; int next_index_right = 7; float local_square = 0.0, remote_square = 0.0; /***************** make sure #args is correct and get the n_threads, n_left and n_right */ if(argc < 4) { printf("Usage: ./test_numa_comb n_of_threads n_of_threads_on_node0 n_of_threads_on_node1\n"); exit(-1); } n_threads = atoi(argv[1]); n_left = atoi(argv[2]); n_right = atoi(argv[3]); /******************* Set the thread_cpu_map according to the n_left and n_right */ printf("n_threads: %d, n_left: %d, n_right: %d\n",n_threads,n_left,n_right); for(i = 0; i < n_left; i++) { thread_cpu_map[i] = next_index_left; next_index_left--; } for(i = n_left; i < n_threads; i++) { thread_cpu_map[i] = next_index_right; next_index_right--; } for(i = 0; i < n_threads; i++) { printf("Thread %d is on cpu %d\n",i,thread_cpu_map[i]); } thread_params para[n_threads]; //The parameters to pass to the threads //printf("The return value of numa_get_run_node_mask(void) is %d\n",numa_get_run_node_mask()); //printf("The return value of numa_max_node(void) is %d\n",numa_max_node()); //numa_tonode_memory((void *)spinlock_ptr,sizeof(pthread_spinlock_t),node_id); //This doesn't work //initilize the spinlock pointer and put it on a specific node pthread_spinlock_t *spinlock_ptr = numa_alloc_onnode(sizeof(pthread_spinlock_t),node_id); if(spinlock_ptr == NULL) //error handling of the allocating of a spinlock pointer on a specific node { printf("alloc of spinlock on a node failed.\n"); exit(-1); } /* initialise syncs */ pthread_barrier_init(&fin_barrier, NULL, n_threads); pthread_spin_init(spinlock_ptr,0); int rc; //create the threads for(i = 0; i < n_threads; i++) { para[i].thread_id = i; para[i].arrival_lambda = arrival_lambda; para[i].spinlock_ptr = spinlock_ptr; CPU_ZERO(&cpuset[i]); CPU_SET(thread_cpu_map[i],&cpuset[i]); rc = 
pthread_create(&threads[i],NULL,work,(void*)¶[i]); E (rc); } start_work_flag = 1; /* wait here */ for(i = 0; i < n_threads; i++) pthread_join(threads[i],NULL); pthread_barrier_destroy(&fin_barrier); /* for(i = 0; i < n_threads; i++) { printf("The time to get one lock for thread %d is : %.9f\n",i,time_in_cs[i]/num_access_each_thread[i]); printf("The number of lock accesses for thread %d is : %d\n",i,num_access_each_thread[i]); } */ qsort((void*)g_tss,(size_t)access_count,(size_t)sizeof(timestamp),cmp_timestamp); /* for (i = 0; i < access_count; i++) printf("%lu with id %d\n",g_tss[i].ts,g_tss[i].id); */ /* for (i = 0; i < access_count; i++) * { * printf ("%lu %d\n", g_tss[i].ts, g_tss[i].id); * } */ /* */ int cs_order[access_count/2]; for(i = 0; i < access_count/2; i++) { cs_order[i] = g_tss[i*2].id; //printf("%d in cs\n",cs_order[i]); } int cs_matrix[n_threads][n_threads]; uint64_t delay_matrix[n_threads][n_threads]; float prob_matrix[n_threads][n_threads]; float rate_matrix[n_threads][n_threads]; // zero out all the matrices memset(&cs_matrix, '\0', n_threads*n_threads*sizeof(int)); memset(&delay_matrix, '\0', n_threads*n_threads*sizeof(uint64_t)); memset(&prob_matrix, '\0', n_threads*n_threads*sizeof(float)); int local_count2 = 0, remote_count2 = 0; uint64_t diff; for(i = 0; i < n_threads; i++) for(j = 0; j < n_threads; j++) for(k = 0; k < access_count/2 -1 ; k++) { if(cs_order[k] == i && cs_order[k+1] == j) { cs_matrix[i][j]++; diff = g_tss[2*k+2].ts - g_tss[2*k+1].ts; delay_matrix[i][j] += diff; if(is_on_same_node(i, j, n_threads, n_left, n_right)) { dprintf("local_delay: %lu\n", diff); local_square += sqr(diff); local_count2++; } else { dprintf("remote_delay: %lu\n", diff); remote_square += sqr(diff); remote_count2++; } } } int num_access[n_threads]; for(i = 0; i < access_count/2 -1; i++) for(j = 0; j < n_threads; j++) { if (cs_order[i] == j) num_access[j]++; } for(i = 0; i < n_threads; i++) printf("num_access[%d]:%d\n",i,num_access[i]); for(i = 0; i < 
n_threads; i++) for(j = 0; j < n_threads ; j++) { prob_matrix[i][j] = (float)cs_matrix[i][j]/(float)num_access[i]; rate_matrix[i][j] = 1.0/((delay_matrix[i][j]/(float)cs_matrix[i][j])/CPU_FREQ); } printf ("\n***************** PROBS *******************\n"); printf ("Lock is on LP, [L, R] is [%d, %d]:\n", n_left - 1, n_right); // tl printf ("L -> L\n"); print_mtx (n_threads, n_threads, prob_matrix, 0, 0, n_left, n_left, 0); // tr printf ("L -> R\n"); print_mtx (n_threads, n_threads, prob_matrix, n_left, 0, n_threads, n_left, 0); printf ("Lock is on RP, [L, R] is [%d, %d]:\n", n_left, n_right - 1); // br printf ("R -> R\n"); print_mtx (n_threads, n_threads, prob_matrix, n_left, n_left, n_threads, n_threads, 0); // bl printf ("R -> L\n"); print_mtx (n_threads, n_threads, prob_matrix, 0, n_left, n_left, n_threads, 0); printf ("\n***************** RATES *******************\n"); printf ("Lock is on LP, [L, R] is [%d, %d]:\n", n_left - 1, n_right); // tl printf ("L -> L\n"); print_mtx (n_threads, n_threads, rate_matrix, 0, 0, n_left, n_left, 1); // tr printf ("L -> R\n"); print_mtx (n_threads, n_threads, rate_matrix, n_left, 0, n_threads, n_left, 1); printf ("Lock is on RP, [L, R] is [%d, %d]:\n", n_left, n_right - 1); // br printf ("R -> R\n"); print_mtx (n_threads, n_threads, rate_matrix, n_left, n_left, n_threads, n_threads, 1); // bl printf ("R -> \n"); print_mtx (n_threads, n_threads, rate_matrix, 0, n_left, n_left, n_threads, 1); //print the intra-core and inter-core delay //thread 0 - n_left -1 are on the left core, n_left to n_threads are on the right core uint64_t local_delay = 0, remote_delay = 0; int local_count = 0, remote_count = 0; float local_prob = 0.0, remote_prob = 0.0; for(i = 0; i < n_threads; i++) for(j = 0; j < n_threads; j++) { if (j == i) continue; if(is_on_same_node(i, j, n_threads, n_left, n_right)) { //printf("%d and %d on the same node\n",i,j); local_delay += delay_matrix[i][j]; local_count += cs_matrix[i][j]; local_prob += prob_matrix[j][i]; } 
else { //printf("%d and %d not the same node\n",i,j); remote_delay += delay_matrix[i][j]; remote_count += cs_matrix[i][j]; remote_prob += prob_matrix[j][i]; } } float local = (float)local_delay/(local_count); float remote = (float)remote_delay/(remote_count); printf("\n\n**************************** Aggregates ***************************\n"); printf("local delay: %f, remote_delay: %f, local_count: %d, remote_count: %d\n",(float)local_delay/(local_count),(float)remote_delay/(remote_count),local_count,remote_count); printf("local prob:%f, remote prob: %f\n",local_prob/n_threads, remote_prob/n_threads); printf("local delay variance:%f, remote delay variance: %f\n",local_square/local_count - local*local, remote_square/remote_count - remote*remote); printf("local count2: %d, remote_count2:%d\n",local_count2, remote_count2); pthread_spin_destroy(spinlock_ptr); numa_free((void *)spinlock_ptr,sizeof(pthread_spinlock_t)); pthread_exit(NULL); return 0; }
int main(int argc, char **argv) { char *msg; /* message returned from parse_opts */ /* parse standard options */ msg = parse_opts(argc, argv, NULL, NULL); if (msg != NULL) { tst_brkm(TBROK, NULL, "OPTION PARSING ERROR - %s", msg); } setup(); #if HAVE_NUMA_MOVE_PAGES unsigned int i; int lc; /* loop counter */ unsigned int from_node; unsigned int to_node; int ret; ret = get_allowed_nodes(NH_MEMS, 2, &from_node, &to_node); if (ret < 0) tst_brkm(TBROK|TERRNO, cleanup, "get_allowed_nodes: %d", ret); /* check for looping state if -i option is given */ for (lc = 0; TEST_LOOPING(lc); lc++) { void *pages[TEST_PAGES] = { 0 }; int nodes[TEST_PAGES]; int status[TEST_PAGES]; unsigned long onepage = get_page_size(); /* reset Tst_count in case we are looping */ Tst_count = 0; ret = alloc_pages_on_node(pages, TOUCHED_PAGES, from_node); if (ret == -1) continue; /* Allocate page and do not touch it. */ pages[UNTOUCHED_PAGE] = numa_alloc_onnode(onepage, from_node); if (pages[UNTOUCHED_PAGE] == NULL) { tst_resm(TBROK, "failed allocating page on node %d", from_node); goto err_free_pages; } for (i = 0; i < TEST_PAGES; i++) nodes[i] = to_node; ret = numa_move_pages(0, TEST_PAGES, pages, nodes, status, MPOL_MF_MOVE); TEST_ERRNO = errno; if (ret == -1) { tst_resm(TFAIL | TERRNO, "move_pages unexpectedly failed"); goto err_free_pages; } if (status[UNTOUCHED_PAGE] == -ENOENT) tst_resm(TPASS, "status[%d] set to expected -ENOENT", UNTOUCHED_PAGE); else tst_resm(TFAIL, "status[%d] is %d", UNTOUCHED_PAGE, status[UNTOUCHED_PAGE]); err_free_pages: /* This is capable of freeing both the touched and * untouched pages. */ free_pages(pages, TEST_PAGES); } #else tst_resm(TCONF, "move_pages support not found."); #endif cleanup(); tst_exit(); }
void myhbwmalloc_init(void) { /* set to NULL before trying to initialize. if we return before * successful creation of the mspace, then it will still be NULL, * and we can use that in subsequent library calls to determine * that the library failed to initialize. */ myhbwmalloc_mspace = NULL; /* verbose printout? */ myhbwmalloc_verbose = 0; { char * env_char = getenv("HBWMALLOC_VERBOSE"); if (env_char != NULL) { myhbwmalloc_verbose = 1; printf("hbwmalloc: HBWMALLOC_VERBOSE set\n"); } } /* fail hard or soft? */ myhbwmalloc_hardfail = 1; { char * env_char = getenv("HBWMALLOC_SOFTFAIL"); if (env_char != NULL) { myhbwmalloc_hardfail = 0; printf("hbwmalloc: HBWMALLOC_SOFTFAIL set\n"); } } /* set the atexit handler that will destroy the mspace and free the numa allocation */ atexit(myhbwmalloc_final); /* detect and configure use of NUMA memory nodes */ { int max_possible_node = numa_max_possible_node(); int num_possible_nodes = numa_num_possible_nodes(); int max_numa_nodes = numa_max_node(); int num_configured_nodes = numa_num_configured_nodes(); int num_configured_cpus = numa_num_configured_cpus(); if (myhbwmalloc_verbose) { printf("hbwmalloc: numa_max_possible_node() = %d\n", max_possible_node); printf("hbwmalloc: numa_num_possible_nodes() = %d\n", num_possible_nodes); printf("hbwmalloc: numa_max_node() = %d\n", max_numa_nodes); printf("hbwmalloc: numa_num_configured_nodes() = %d\n", num_configured_nodes); printf("hbwmalloc: numa_num_configured_cpus() = %d\n", num_configured_cpus); } /* FIXME this is a hack. assumes HBW is only numa node 1. 
*/ if (num_configured_nodes <= 2) { myhbwmalloc_numa_node = num_configured_nodes-1; } else { fprintf(stderr,"hbwmalloc: we support only 2 numa nodes, not %d\n", num_configured_nodes); } if (myhbwmalloc_verbose) { for (int i=0; i<num_configured_nodes; i++) { unsigned max_numa_cpus = numa_num_configured_cpus(); struct bitmask * mask = numa_bitmask_alloc( max_numa_cpus ); int rc = numa_node_to_cpus(i, mask); if (rc != 0) { fprintf(stderr, "hbwmalloc: numa_node_to_cpus failed\n"); } else { printf("hbwmalloc: numa node %d cpu mask:", i); for (unsigned j=0; j<max_numa_cpus; j++) { int bit = numa_bitmask_isbitset(mask,j); printf(" %d", bit); } printf("\n"); } numa_bitmask_free(mask); } fflush(stdout); } } #if 0 /* unused */ /* see if the user specifies a slab size */ size_t slab_size_requested = 0; { char * env_char = getenv("HBWMALLOC_BYTES"); if (env_char!=NULL) { long units = 1L; if ( NULL != strstr(env_char,"G") ) units = 1000000000L; else if ( NULL != strstr(env_char,"M") ) units = 1000000L; else if ( NULL != strstr(env_char,"K") ) units = 1000L; else units = 1L; int num_count = strspn(env_char, "0123456789"); memset( &env_char[num_count], ' ', strlen(env_char)-num_count); slab_size_requested = units * atol(env_char); } if (myhbwmalloc_verbose) { printf("hbwmalloc: requested slab_size_requested = %zu\n", slab_size_requested); } } #endif /* see what libnuma says is available */ size_t myhbwmalloc_slab_size; { int node = myhbwmalloc_numa_node; long long freemem; long long maxmem = numa_node_size64(node, &freemem); if (myhbwmalloc_verbose) { printf("hbwmalloc: numa_node_size64 says maxmem=%lld freemem=%lld for numa node %d\n", maxmem, freemem, node); } myhbwmalloc_slab_size = freemem; } /* assume threads, disable if MPI knows otherwise, then allow user to override. 
*/ int multithreaded = 1; #ifdef HAVE_MPI int nprocs; { int is_init, is_final; MPI_Initialized(&is_init); MPI_Finalized(&is_final); if (is_init && !is_final) { MPI_Comm_size(MPI_COMM_WORLD, &nprocs); } /* give equal portion to every MPI process */ myhbwmalloc_slab_size /= nprocs; /* if the user initializes MPI with MPI_Init or * MPI_Init_thread(MPI_THREAD_SINGLE), they assert there * are no threads at all, which means we can skip the * malloc mspace lock. * * if the user lies to MPI, they deserve any bad thing * that comes of it. */ int provided; MPI_Query_thread(&provided); if (provided==MPI_THREAD_SINGLE) { multithreaded = 0; } else { multithreaded = 1; } if (myhbwmalloc_verbose) { printf("hbwmalloc: MPI processes = %d (threaded = %d)\n", nprocs, multithreaded); printf("hbwmalloc: myhbwmalloc_slab_size = %d\n", myhbwmalloc_slab_size); } } #endif /* user can assert that hbwmalloc and friends need not be thread-safe */ { char * env_char = getenv("HBWMALLOC_LOCKLESS"); if (env_char != NULL) { multithreaded = 0; if (myhbwmalloc_verbose) { printf("hbwmalloc: user has disabled locking in mspaces by setting HBWMALLOC_LOCKLESS\n"); } } } myhbwmalloc_slab = numa_alloc_onnode( myhbwmalloc_slab_size, myhbwmalloc_numa_node); if (myhbwmalloc_slab==NULL) { fprintf(stderr, "hbwmalloc: numa_alloc_onnode returned NULL for size = %zu\n", myhbwmalloc_slab_size); return; } else { if (myhbwmalloc_verbose) { printf("hbwmalloc: numa_alloc_onnode succeeded for size %zu\n", myhbwmalloc_slab_size); } /* part (less than 128*sizeof(size_t) bytes) of this space is used for bookkeeping, * so the capacity must be at least this large */ if (myhbwmalloc_slab_size < 128*sizeof(size_t)) { fprintf(stderr, "hbwmalloc: not enough space for mspace bookkeeping\n"); return; } /* see above regarding if the user lies to MPI. 
*/ int locked = multithreaded; myhbwmalloc_mspace = create_mspace_with_base( myhbwmalloc_slab, myhbwmalloc_slab_size, locked); if (myhbwmalloc_mspace == NULL) { fprintf(stderr, "hbwmalloc: create_mspace_with_base returned NULL\n"); return; } else if (myhbwmalloc_verbose) { printf("hbwmalloc: create_mspace_with_base succeeded for size %zu\n", myhbwmalloc_slab_size); } } }
/* Allocate `bytes` bytes on NUMA node `node`; thin wrapper that forwards
 * to numa_alloc_onnode() and returns its result unchanged. */
void INTERNAL *qt_affinity_alloc_onnode(size_t bytes,
                                        int    node)
{                                      /*{{{ */
    void *region = numa_alloc_onnode(bytes, node);

    return region;
}                                      /*}}} */
static void * s_numa_alloc(size_t sz, int cpu) { void *ret = NULL; if (likely(sz > 0)) { if (likely(cpu >= 0)) { if (likely(s_numa_nodes != NULL && s_n_cpus > 0)) { unsigned int node = s_numa_nodes[cpu]; unsigned int allocd_node = UINT_MAX; struct bitmask *bmp; int r; bmp = numa_allocate_nodemask(); numa_bitmask_setbit(bmp, node); errno = 0; r = (int)set_mempolicy(MPOL_BIND, bmp->maskp, bmp->size + 1); if (likely(r == 0)) { errno = 0; ret = numa_alloc_onnode(sz, (int)node); if (likely(ret != NULL)) { lagopus_result_t rl; /* * We need this "first touch" even using the * numa_alloc_onnode(). */ (void)memset(ret, 0, sz); errno = 0; r = (int)get_mempolicy((int *)&allocd_node, NULL, 0, ret, MPOL_F_NODE|MPOL_F_ADDR); if (likely(r == 0)) { if (unlikely(node != allocd_node)) { /* * The memory is not allocated on the node, but it is * still usable. Just return it. */ lagopus_msg_warning("can't allocate " PFSZ(u) " bytes memory " "for CPU %d (NUMA node %d).\n", sz, cpu, node); } } else { lagopus_perror(LAGOPUS_RESULT_POSIX_API_ERROR); lagopus_msg_error("get_mempolicy() returned %d.\n", r); } rl = s_add_addr(ret, sz); if (unlikely(rl != LAGOPUS_RESULT_OK)) { lagopus_perror(rl); lagopus_msg_error("can't register the allocated address.\n"); numa_free(ret, sz); ret = NULL; } } } else { /* r == 0 */ lagopus_perror(LAGOPUS_RESULT_POSIX_API_ERROR); lagopus_msg_error("set_mempolicy() returned %d.\n", r); } numa_free_nodemask(bmp); set_mempolicy(MPOL_DEFAULT, NULL, 0); } else { /* s_numa_nodes != NULL && s_n_cpus > 0 */ /* * Not initialized or initialization failure. */ lagopus_msg_warning("The NUMA related information is not initialized. " "Use malloc(3) instead.\n"); ret = malloc(sz); } } else { /* cpu >= 0 */ /* * Use pure malloc(3). */ ret = malloc(sz); } } return ret; }
int main(int argc, char **argv) { tst_parse_opts(argc, argv, NULL, NULL); setup(); #if HAVE_NUMA_MOVE_PAGES unsigned int i; int lc; unsigned int from_node; unsigned int to_node; int ret, exp_status; if ((tst_kvercmp(4, 3, 0)) >= 0) exp_status = -EFAULT; else exp_status = -ENOENT; ret = get_allowed_nodes(NH_MEMS, 2, &from_node, &to_node); if (ret < 0) tst_brkm(TBROK | TERRNO, cleanup, "get_allowed_nodes: %d", ret); /* check for looping state if -i option is given */ for (lc = 0; TEST_LOOPING(lc); lc++) { void *pages[TEST_PAGES] = { 0 }; int nodes[TEST_PAGES]; int status[TEST_PAGES]; unsigned long onepage = get_page_size(); /* reset tst_count in case we are looping */ tst_count = 0; ret = alloc_pages_on_node(pages, TOUCHED_PAGES, from_node); if (ret == -1) continue; /* Allocate page and do not touch it. */ pages[UNTOUCHED_PAGE] = numa_alloc_onnode(onepage, from_node); if (pages[UNTOUCHED_PAGE] == NULL) { tst_resm(TBROK, "failed allocating page on node %d", from_node); goto err_free_pages; } for (i = 0; i < TEST_PAGES; i++) nodes[i] = to_node; ret = numa_move_pages(0, TEST_PAGES, pages, nodes, status, MPOL_MF_MOVE); if (ret == -1) { tst_resm(TFAIL | TERRNO, "move_pages unexpectedly failed"); goto err_free_pages; } if (status[UNTOUCHED_PAGE] == exp_status) { tst_resm(TPASS, "status[%d] has expected value", UNTOUCHED_PAGE); } else { tst_resm(TFAIL, "status[%d] is %s, expected %s", UNTOUCHED_PAGE, tst_strerrno(-status[UNTOUCHED_PAGE]), tst_strerrno(-exp_status)); } err_free_pages: /* This is capable of freeing both the touched and * untouched pages. */ free_pages(pages, TEST_PAGES); } #else tst_resm(TCONF, "move_pages support not found."); #endif cleanup(); tst_exit(); }
/*
 * Allocate `size` bytes on the NUMA node recorded as the NVRAM node of the
 * calling thread's virtual node. Returns whatever numa_alloc_onnode()
 * returns.
 */
void* pmalloc(size_t size)
{
    thread_t* caller = thread_self();
    void* region;

    region = numa_alloc_onnode(size,
                               caller->virtual_node->nvram_node->node_id);
    return region;
}
/*
 * Splits the IPv6 flow table into table->num_partitions index ranges and
 * initialises the bookkeeping of each partition. When memory pools are
 * enabled, also allocates each partition's flow pool and its index array.
 * Finally recomputes the active flow count of the table.
 *
 * The partition count is read from table->num_partitions; the
 * num_partitions argument is not consulted.
 *
 * Allocations are performed outside of assert() so they still happen when
 * the code is compiled with NDEBUG.
 */
void dpi_flow_table_setup_partitions_v6(dpi_flow_DB_v6_t* table,
                                        u_int16_t num_partitions){
	/** Partitions management. **/
	u_int32_t partition_size=ceil((float)table->total_size/(float)table->num_partitions);
	u_int32_t partition_max_active_v6_flows=
			table->max_active_flows/table->num_partitions;

	u_int16_t j;
	u_int32_t lowest_index=0;
	u_int32_t highest_index=lowest_index+partition_size-1;
	for(j=0; j<table->num_partitions; ++j){
		dpi_flow_table_initialize_informations(
				&(table->partitions[j].partition.informations),
				lowest_index,
				highest_index,
				partition_max_active_v6_flows);
		lowest_index=highest_index+1;
		/**
		 * The last partition gets the entries up to the end of the
		 * table. Indeed, when the size is not a multiple of the
		 * number of partitions, the last partition may be smaller.
		 */
		if(j==table->num_partitions-2)
			highest_index=table->total_size-1;
		else
			highest_index+=partition_size;

#if DPI_FLOW_TABLE_USE_MEMORY_POOL
		ipv6_flow_t* flow_pool;
		u_int32_t i=0;
		table->individual_pool_size=table->start_pool_size/table->num_partitions;
#if DPI_NUMA_AWARE
		flow_pool=numa_alloc_onnode(
				sizeof(ipv6_flow_t)*table->individual_pool_size,
				DPI_NUMA_AWARE_FLOW_TABLE_NODE);
		assert(flow_pool);
		table->partitions[j].partition.pool=numa_alloc_onnode(
				sizeof(u_int32_t)*table->individual_pool_size,
				DPI_NUMA_AWARE_FLOW_TABLE_NODE);
		assert(table->partitions[j].partition.pool);
#else
		{
			int rc;
			rc=posix_memalign(
					(void**) &flow_pool,
					DPI_CACHE_LINE_SIZE,
					(sizeof(ipv6_flow_t)*table->individual_pool_size)+
					DPI_CACHE_LINE_SIZE);
			assert(rc==0);
			rc=posix_memalign(
					(void**) &(table->partitions[j].partition.pool),
					DPI_CACHE_LINE_SIZE,
					(sizeof(u_int32_t)*table->individual_pool_size)+
					DPI_CACHE_LINE_SIZE);
			assert(rc==0);
			(void) rc; /* silence unused warning under NDEBUG */
		}
#endif
		for(i=0; i<table->individual_pool_size; i++){
			table->partitions[j].partition.pool[i]=i;
		}
		table->partitions[j].partition.pool_size=
				table->individual_pool_size;
		table->partitions[j].partition.memory_chunk_lower_bound=
				flow_pool;
		table->partitions[j].partition.memory_chunk_upper_bound=
				flow_pool+table->individual_pool_size;
#endif
	}
	debug_print("%s\n", "[flow_table.c]: Computing active v6 flows.");
	dpi_flow_table_update_flow_count_v6(table);
	debug_print("%s\n", "[flow_table.c]: Active v6 flows computation finished.");
}
int main(void) { int node_id = 0; int arrival_lambda = 10; int thread_cpu_map[N_THREADS] = {1,6}; int i; int j; /* pthread_spinlock_t *spinlock_ptr = malloc(sizeof(pthread_spinlock_t)); if(spinlock_ptr == NULL) //error handling of the malloc of the spinlock { printf("malloc of spinlock failed.\n"); } else { printf("malloc of spinlock succeeded.\n"); } free(spinlock_ptr); */ //pthread_t threads[N_THREADS]; //cpu_set_t cpuset[N_THREADS]; //for setting the affinity of threads thread_params para[N_THREADS]; //The parameters to pass to the threads //printf("The return value of numa_get_run_node_mask(void) is %d\n",numa_get_run_node_mask()); //printf("The return value of numa_max_node(void) is %d\n",numa_max_node()); //numa_tonode_memory((void *)spinlock_ptr,sizeof(pthread_spinlock_t),node_id); //This doesn't work //initilize the spinlock pointer and put it on a specific node pthread_spinlock_t *spinlock_ptr = numa_alloc_onnode(sizeof(pthread_spinlock_t),node_id); if(spinlock_ptr == NULL) //error handling of the allocating of a spinlock pointer on a specific node { printf("alloc of spinlock on a node failed.\n"); } else { printf("alloc of spinlock on a node succeeded.\n"); } for(j = 0; j < 100000; j++) { //initlize spinlock pthread_spin_init(spinlock_ptr,0); //create the threads for(i = 0; i < N_THREADS; i++) { int rc; int s; para[i].thread_id = i; para[i].arrival_lambda = arrival_lambda; para[i].spinlock_ptr = spinlock_ptr; CPU_ZERO(&cpuset[i]); CPU_SET(thread_cpu_map[i],&cpuset[i]); rc = pthread_create(&threads[i],NULL,work,(void*)¶[i]); if(rc) { printf("ERROR: return code from pthread_create() is %d for thread %d \n",rc,i); exit(-1); } /* s = pthread_setaffinity_np(threads[i], sizeof(cpu_set_t), &cpuset[i]); if (s != 0) perror("set affinity error\n"); */ flag = 1; } /* for(i = 0; i < N_THREADS; i++) { int s; CPU_ZERO(&cpuset[i]); CPU_SET(thread_cpu_map[i],&cpuset[i]); s = pthread_setaffinity_np(threads[i], sizeof(cpu_set_t), &cpuset[i]); if (s != 0) 
perror("sjfljkl\n"); } */ for(i = 0; i < N_THREADS; i++) { pthread_join(threads[i],NULL); } } for(i = 0; i < N_THREADS; i++) { printf("The time to get one lock for thread %d is : %.9f\n",i,time_in_cs[i]/100000); } pthread_spin_destroy(spinlock_ptr); numa_free(spinlock_ptr,sizeof(pthread_spinlock_t)); pthread_exit(NULL); return 0; }
int main(int argc,char *argv[]) { const char *label[4] = {"Copy", "Scale","Add", "Triad"}; const double bytes[4] = {2 * sizeof(double) * N, 2 * sizeof(double) * N, 3 * sizeof(double) * N, 3 * sizeof(double) * N}; double rmstime[4] = {0},maxtime[4] = {0},mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; int quantum; int BytesPerWord,j,k,size; PetscInt node = -1; double scalar, t, times[4][NTIMES]; #if !STATIC_ALLOC double *PETSC_RESTRICT a,*PETSC_RESTRICT b,*PETSC_RESTRICT c; #endif PetscInitialize(&argc,&argv,0,help); MPI_Comm_size(PETSC_COMM_WORLD,&size); PetscOptionsGetInt(NULL,"-node",&node,NULL); /* --- SETUP --- determine precision and check timing --- */ PetscPrintf(PETSC_COMM_WORLD,HLINE); BytesPerWord = sizeof(double); PetscPrintf(PETSC_COMM_WORLD,"This system uses %d bytes per DOUBLE PRECISION word.\n", BytesPerWord); PetscPrintf(PETSC_COMM_WORLD,HLINE); PetscPrintf(PETSC_COMM_WORLD,"Array size = %d, Offset = %d\n", N, OFFSET); PetscPrintf(PETSC_COMM_WORLD,"Total memory required = %.1f MB per process.\n", (3 * N * BytesPerWord) / 1048576.0); PetscPrintf(PETSC_COMM_WORLD,"Each test is run %d times, but only\n", NTIMES); PetscPrintf(PETSC_COMM_WORLD,"the *best* time for each is used.\n"); /* Get initial value for system clock. 
*/ #if !STATIC_ALLOC if (node == -1) { posix_memalign((void**)&a,64,N*sizeof(double)); posix_memalign((void**)&b,64,N*sizeof(double)); posix_memalign((void**)&c,64,N*sizeof(double)); } else if (node == -2) { a = malloc(N*sizeof(double)); b = malloc(N*sizeof(double)); c = malloc(N*sizeof(double)); #if defined(HAVE_NUMA) } else { a = numa_alloc_onnode(N*sizeof(double),node); b = numa_alloc_onnode(N*sizeof(double),node); c = numa_alloc_onnode(N*sizeof(double),node); #endif } #endif #if FAULT_TOGETHER for (j=0; j<N; j++) { a[j] = 1.0; b[j] = 2.0; c[j] = 0.0; } #else for (j=0; j<N; j++) a[j] = 1.0; for (j=0; j<N; j++) b[j] = 2.0; for (j=0; j<N; j++) c[j] = 0.0; #endif PetscPrintf(PETSC_COMM_WORLD,HLINE); if ((quantum = checktick()) >= 1) PetscPrintf(PETSC_COMM_WORLD,"Your clock granularity/precision appears to be %d microseconds.\n", quantum); else PetscPrintf(PETSC_COMM_WORLD,"Your clock granularity appears to be less than one microsecond.\n"); t = Second(); for (j = 0; j < N; j++) a[j] = 2.0E0 * a[j]; t = 1.0E6 * (Second() - t); PetscPrintf(PETSC_COMM_WORLD,"Each test below will take on the order" " of %d microseconds.\n", (int) t); PetscPrintf(PETSC_COMM_WORLD," (= %d clock ticks)\n", (int) (t/quantum)); PetscPrintf(PETSC_COMM_WORLD,"Increase the size of the arrays if this shows that\n"); PetscPrintf(PETSC_COMM_WORLD,"you are not getting at least 20 clock ticks per test.\n"); PetscPrintf(PETSC_COMM_WORLD,HLINE); PetscPrintf(PETSC_COMM_WORLD,"WARNING -- The above is only a rough guideline.\n"); PetscPrintf(PETSC_COMM_WORLD,"For best results, please be sure you know the\n"); PetscPrintf(PETSC_COMM_WORLD,"precision of your system timer.\n"); PetscPrintf(PETSC_COMM_WORLD,HLINE); /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ scalar = 3.0; for (k=0; k<NTIMES; k++) { MPI_Barrier(PETSC_COMM_WORLD); /* ### COPY: c <- a ### */ times[0][k] = Second(); MPI_Barrier(PETSC_COMM_WORLD); #if USE_MEMCPY memcpy(c,a,N*sizeof(double)); #elif SSE2 for (j=0; j<N; j+=8) { 
_mm_stream_pd(c+j+0,_mm_load_pd(a+j+0)); _mm_stream_pd(c+j+2,_mm_load_pd(a+j+2)); _mm_stream_pd(c+j+4,_mm_load_pd(a+j+4)); _mm_stream_pd(c+j+6,_mm_load_pd(a+j+6)); # if PREFETCH_NTA _mm_prefetch(a+j+64,_MM_HINT_NTA); # endif } #else for (j=0; j<N; j++) c[j] = a[j]; #endif MPI_Barrier(PETSC_COMM_WORLD); times[0][k] = Second() - times[0][k]; /* ### SCALE: b <- scalar * c ### */ times[1][k] = Second(); MPI_Barrier(PETSC_COMM_WORLD); #if SSE2 { __m128d scalar2 = _mm_set1_pd(scalar); for (j=0; j<N; j+=8) { _mm_stream_pd(b+j+0,_mm_mul_pd(scalar2,_mm_load_pd(c+j+0))); _mm_stream_pd(b+j+2,_mm_mul_pd(scalar2,_mm_load_pd(c+j+2))); _mm_stream_pd(b+j+4,_mm_mul_pd(scalar2,_mm_load_pd(c+j+4))); _mm_stream_pd(b+j+6,_mm_mul_pd(scalar2,_mm_load_pd(c+j+6))); # if PREFETCH_NTA _mm_prefetch(c+j+64,_MM_HINT_NTA); # endif } } #else for (j=0; j<N; j++) b[j] = scalar*c[j]; #endif MPI_Barrier(PETSC_COMM_WORLD); times[1][k] = Second() - times[1][k]; /* ### ADD: c <- a + b ### */ times[2][k] = Second(); MPI_Barrier(PETSC_COMM_WORLD); #if SSE2 { for (j=0; j<N; j+=8) { _mm_stream_pd(c+j+0,_mm_add_pd(_mm_load_pd(a+j+0),_mm_load_pd(b+j+0))); _mm_stream_pd(c+j+2,_mm_add_pd(_mm_load_pd(a+j+2),_mm_load_pd(b+j+2))); _mm_stream_pd(c+j+4,_mm_add_pd(_mm_load_pd(a+j+4),_mm_load_pd(b+j+4))); _mm_stream_pd(c+j+6,_mm_add_pd(_mm_load_pd(a+j+6),_mm_load_pd(b+j+6))); # if PREFETCH_NTA _mm_prefetch(a+j+64,_MM_HINT_NTA); _mm_prefetch(b+j+64,_MM_HINT_NTA); # endif } } #else for (j=0; j<N; j++) c[j] = a[j]+b[j]; #endif MPI_Barrier(PETSC_COMM_WORLD); times[2][k] = Second() - times[2][k]; /* ### TRIAD: a <- b + scalar * c ### */ times[3][k] = Second(); MPI_Barrier(PETSC_COMM_WORLD); #if SSE2 { __m128d scalar2 = _mm_set1_pd(scalar); for (j=0; j<N; j+=8) { _mm_stream_pd(a+j+0,_mm_add_pd(_mm_load_pd(b+j+0),_mm_mul_pd(scalar2,_mm_load_pd(c+j+0)))); _mm_stream_pd(a+j+2,_mm_add_pd(_mm_load_pd(b+j+2),_mm_mul_pd(scalar2,_mm_load_pd(c+j+2)))); 
_mm_stream_pd(a+j+4,_mm_add_pd(_mm_load_pd(b+j+4),_mm_mul_pd(scalar2,_mm_load_pd(c+j+4)))); _mm_stream_pd(a+j+6,_mm_add_pd(_mm_load_pd(b+j+6),_mm_mul_pd(scalar2,_mm_load_pd(c+j+6)))); # if PREFETCH_NTA _mm_prefetch(b+j+64,_MM_HINT_NTA); _mm_prefetch(c+j+64,_MM_HINT_NTA); # endif } } #else for (j=0; j<N; j++) a[j] = b[j]+scalar*c[j]; #endif MPI_Barrier(PETSC_COMM_WORLD); times[3][k] = Second() - times[3][k]; } /* --- SUMMARY --- */ for (k=0; k<NTIMES; k++) for (j=0; j<4; j++) { rmstime[j] = rmstime[j] + (times[j][k] * times[j][k]); mintime[j] = MIN(mintime[j], times[j][k]); maxtime[j] = MAX(maxtime[j], times[j][k]); } PetscPrintf(PETSC_COMM_WORLD,"%8s: %11s %11s %11s %11s %11s\n","Function","Rate (MB/s)","Total (MB/s)","RMS time","Min time","Max time"); for (j=0; j<4; j++) { rmstime[j] = sqrt(rmstime[j]/(double)NTIMES); PetscPrintf(PETSC_COMM_WORLD,"%8s: %11.4f %11.4f %11.4f %11.4f %11.4f\n", label[j], 1.0e-06*bytes[j]/mintime[j], size*1.0e-06*bytes[j]/mintime[j], rmstime[j], mintime[j], maxtime[j]); } PetscFinalize(); return 0; }
int main(int argc, char* argv[]) { printf("\n NODE_BIND:%d, NUMA:%d, CPU_BIND:%d, FIRST_TOUCH:%d\n",NODE_BIND, NUMA, CPU_BIND, FIRST_TOUCH); int repetitions, // number of repetition maxThreads, // max number of threads it, N; // array size; int bitCount = 1; int * key; // array of keys long * dataIn; // input data long * dataSTL; // input stl data long * dataRadix; // input radix data repetitions = 1; #pragma omp parallel maxThreads = omp_get_num_threads(); if(argc ==1 ){ printf("prog input_file number_of_elements bit_count number_of_repetitions\n"); printf("NO INPUT FILE"); return 0; } if(argc == 2){ printf("prog input_file number_of_elements bit_count number_of_repetitions\n"); printf("NO ELEMENT COUNT\n"); return 0; } if(argc >2 ){ N = (int) strtol(argv[2], NULL, 10); } if(argc >3){ int tmp; tmp = (int) strtol(argv[3], NULL, 10); if ((tmp > 0) && (tmp<=16 )) // limit bit count bitCount = tmp; } if(argc >4){ int tmp; tmp = (int) strtol(argv[4], NULL, 10); if ((tmp > 0) && (tmp<=10000 )) // limit repetitions repetitions = tmp; } int *input; size_t N2; printf( "Reading data from file.\n" ); if( readIntArrayFile( argv[1], &input, &N2 ) ) return 1; printf( "Data reading done.\n" ); if( (N2<(size_t)N) || (N<=0) ) N = N2; printf( "\nPARALLEL STL SORT for N=%d, max threads = %d, test repetitions: %d\n", N, maxThreads, repetitions); dataIn = new long[N]; dataSTL = new long[N]; #ifdef _WIN32 dataRadix = new long[N]; key = new int[N]; #endif #ifdef linux key = new int[N]; #if NUMA==0 dataRadix = new long[N]; #elif NUMA==1 dataRadix = (long*) numa_alloc_interleaved(N * sizeof(long)); #elif NUMA==2 dataRadix = (long*)numa_alloc_onnode(sizeof(long)*N,1); #endif #endif VTimer stlTimes(maxThreads); VTimer radixTimes(maxThreads); #if TIME_COUNT==1 VTimer partTimes(TIMERS_COUNT); #endif #if FLUSH_CACHE==1 #ifdef linux CacheFlusher cf; #endif #endif for(long i=0;i<N;i++) dataIn[i]=input[i]; delete[] input; // loop from 1 to maxThreads for (int t = 1; t <= maxThreads; t++) { int i; 
#if TIME_COUNT==1 partTimes.reset(); #endif #if CALC_REF==1 // parallel STL for (it = 0; it < repetitions; it++) { setThreadsNo(t, maxThreads); #pragma omp parallel for private(i) for (i = 0; i < N; i++) dataSTL[i] = dataIn[i]; #if FLUSH_CACHE==1 #ifdef linux cf.flush(); #endif #endif stlTimes.timerStart(t-1); #ifdef linux __gnu_parallel::sort(dataSTL, dataSTL + N); #endif #ifdef _WIN32 std::sort(dataSTL, dataSTL + N); #endif stlTimes.timerEnd(t-1); } #if FLUSH_CACHE==1 #ifdef linux cf.flush(); #endif #endif #endif // radix sort V1 for (it = 0; it < repetitions; it++) { setThreadsNo(t, maxThreads); #pragma omp parallel for private(i) default(shared) for (i = 0; i < N; i++){ dataRadix[i] = dataIn[i]; key[i]=i; } #if FLUSH_CACHE==1 #ifdef linux cf.flush(); #endif #endif omp_set_num_threads(t); radixTimes.timerStart(t-1); #if TIME_COUNT==1 prsort::pradsort<long,int>(dataRadix,key, N, bitCount,&partTimes); #else prsort::pradsort<long,int>(dataRadix,key, N,bitCount,NULL); #endif radixTimes.timerEnd(t-1); } #if CALC_REF==1 printf("|STL SORT(th=%2d) : %1.3fs |\t", t, stlTimes.getTime(t-1)); #endif #if TIME_COUNT==1 for (int i = 0; i < TIMERS_COUNT; i++) { #if CREATE_OUTPUT==1 printf("%d %d %d %d %d %d %d %f\n", NUMA, NODE_BIND, CPU_BIND, FIRST_TOUCH,bitCount , t, i ,partTimes.getTime(i)); #else printf("part%d :%f ", i, partTimes.getTime(i)); #endif } #endif #if CREATE_OUTPUT ==1 printf("%d %d %d %d %d %d calosc %1.3f", NUMA,NODE_BIND,CPU_BIND,FIRST_TOUCH,bitCount, t ,radixTimes.getTime(t-1)); #else printf("|RADIX SORT (th=%2d) : %1.3fs |\t", t, radixTimes.getTime(t-1)); #endif // Attention: checking result only from the last function usage #if CALC_REF==1 checkResults(dataSTL, dataRadix, N); #else printf("\n"); #endif #if CHECK_KEY==1 if(checkKey(dataIn,dataRadix,key,N))printf("Keys are good\n"); #endif } #ifdef linux delete[] key; #if NUMA>0 numa_free(dataRadix, sizeof(long) * N); #else delete[] dataRadix; #endif #endif #ifdef _WIN32 delete[] dataRadix; #endif delete[] 
dataIn; delete[] dataSTL; #if TIME_COUNT==1 #endif return 0; }
int main(void) { int node_id = 0; int arrival_lambda = 10; int thread_cpu_map[N_THREADS] = {1,6}; int n = 10000; int i; int j; /* pthread_spinlock_t *spinlock_ptr = malloc(sizeof(pthread_spinlock_t)); if(spinlock_ptr == NULL) //error handling of the malloc of the spinlock { printf("malloc of spinlock failed.\n"); } else { printf("malloc of spinlock succeeded.\n"); } free(spinlock_ptr); */ //pthread_t threads[N_THREADS]; //cpu_set_t cpuset[N_THREADS]; //for setting the affinity of threads thread_params para[N_THREADS]; //The parameters to pass to the threads //printf("The return value of numa_get_run_node_mask(void) is %d\n",numa_get_run_node_mask()); //printf("The return value of numa_max_node(void) is %d\n",numa_max_node()); //numa_tonode_memory((void *)spinlock_ptr,sizeof(pthread_spinlock_t),node_id); //This doesn't work //initilize the spinlock pointer and put it on a specific node pthread_spinlock_t *spinlock_ptr = numa_alloc_onnode(sizeof(pthread_spinlock_t),node_id); if(spinlock_ptr == NULL) //error handling of the allocating of a spinlock pointer on a specific node { printf("alloc of spinlock on a node failed.\n"); } else { printf("alloc of spinlock on a node succeeded.\n"); } for(j = 0; j < n; j++) { //initlize spinlock pthread_spin_init(spinlock_ptr,0); //create the threads for(i = 0; i < N_THREADS; i++) { int rc; int s; para[i].thread_id = i; para[i].arrival_lambda = arrival_lambda; para[i].spinlock_ptr = spinlock_ptr; CPU_ZERO(&cpuset[i]); CPU_SET(thread_cpu_map[i],&cpuset[i]); rc = pthread_create(&threads[i],NULL,work,(void*)¶[i]); if(rc) { printf("ERROR: return code from pthread_create() is %d for thread %d \n",rc,i); exit(-1); } flag = 1; } for(i = 0; i < N_THREADS; i++) { pthread_join(threads[i],NULL); } } for(i = 0; i < N_THREADS; i++) { printf("The time to get one lock for thread %d is : %.9f\n",i,time_in_cs[i]/n); } double diff = abs_value(time_in_cs[0] - time_in_cs[1])/n; printf("The difference of the time to get one lock is : %.9f (%f) 
cycles\n",diff,diff*CPU_FREQ); //this is assuming there are only two processors (needs to be changed if there are more) pthread_spin_destroy(spinlock_ptr); numa_free((void *)spinlock_ptr,sizeof(pthread_spinlock_t)); pthread_exit(NULL); return 0; }
static void setup(void) { int ret, i, j; int pagesize = getpagesize(); void *p; tst_require_root(NULL); TEST(ltp_syscall(__NR_migrate_pages, 0, 0, NULL, NULL)); if (numa_available() == -1) tst_brkm(TCONF, NULL, "NUMA not available"); ret = get_allowed_nodes_arr(NH_MEMS, &num_nodes, &nodes); if (ret < 0) tst_brkm(TBROK | TERRNO, NULL, "get_allowed_nodes(): %d", ret); if (num_nodes < 2) tst_brkm(TCONF, NULL, "at least 2 allowed NUMA nodes" " are required"); else if (tst_kvercmp(2, 6, 18) < 0) tst_brkm(TCONF, NULL, "2.6.18 or greater kernel required"); /* * find 2 nodes, which can hold NODE_MIN_FREEMEM bytes * The reason is that: * 1. migrate_pages() is expected to succeed * 2. this test avoids hitting: * Bug 870326 - migrate_pages() reports success, but pages are * not moved to desired node * https://bugzilla.redhat.com/show_bug.cgi?id=870326 */ nodeA = nodeB = -1; for (i = 0; i < num_nodes; i++) { p = numa_alloc_onnode(NODE_MIN_FREEMEM, nodes[i]); if (p == NULL) break; memset(p, 0xff, NODE_MIN_FREEMEM); j = 0; while (j < NODE_MIN_FREEMEM) { if (addr_on_node(p + j) != nodes[i]) break; j += pagesize; } numa_free(p, NODE_MIN_FREEMEM); if (j >= NODE_MIN_FREEMEM) { if (nodeA == -1) nodeA = nodes[i]; else if (nodeB == -1) nodeB = nodes[i]; else break; } } if (nodeA == -1 || nodeB == -1) tst_brkm(TCONF, NULL, "at least 2 NUMA nodes with " "free mem > %d are needed", NODE_MIN_FREEMEM); tst_resm(TINFO, "Using nodes: %d %d", nodeA, nodeB); ltpuser = getpwnam(nobody_uid); if (ltpuser == NULL) tst_brkm(TBROK | TERRNO, NULL, "getpwnam failed"); TEST_PAUSE; }