Example #1
int main(int argc, char** argv)
{
	if(argc < 3)
	{	
		std::cout << "Usage:" << std::endl
			  << "gland <localNodeId> <remoteNodeId> [--human]" <<std::endl;
		exit(EXIT_FAILURE);
	}
	if(argc == 4 && strncmp(argv[3],"--human",7) == 0)
	{
		std::cout << "enabling human readable output" << std::endl;
		humanreadable = true;
	}

	size_t localNode = std::stoi(argv[1]);
	size_t remoteNode = std::stoi(argv[2]);
	
	numa_run_on_node(localNode);
	
	double *a = (double*) numa_alloc_onnode( N * sizeof(double), localNode );
	double *b = (double*) numa_alloc_onnode( N * sizeof(double), remoteNode );

	for(int i = 0; i<N ; i++)
	{
		a[i]=(double)i;
		b[i]=(double)-i;
	}

	while(1) memcpy_task(a, b);
}
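The excerpt relies on globals and a helper defined elsewhere in that file. A minimal sketch of what it assumes follows; the buffer length and the helper body are guesses, not the original source:

/* Hypothetical support definitions assumed by the excerpt above. */
#include <cstring>   // memcpy

static const size_t N = 1 << 24;    // assumed buffer length
static bool humanreadable = false;  // toggled by --human

// Copies the remote buffer into the local one; the real benchmark
// presumably also times the copy and reports bandwidth.
static void memcpy_task(double *dst, const double *src)
{
    memcpy(dst, src, N * sizeof(double));
}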
Example #2
static dynarray_t *do_create(unsigned long elem_size,
                             unsigned long alloc_grain,
                             unsigned long elems_nr,
                             int numa)
{
	struct dynarray *da;
	int node = 0;
	if (numa) {
		int cpu = sched_getcpu();
		/* Numa-aware allocation */
		if (cpu < 0) {
			perror("dynarray_create: sched_getcpu");
			exit(1);
		}

		node = numa_node_of_cpu(cpu);
		if (node < 0) {
			perror("dynarray_create: numa_node_of_cpu");
			exit(1);
		}

		da = numa_alloc_onnode(sizeof(*da), node);
	} else {
		da = malloc(sizeof(*da));
	}

	if (!da) {
		fprintf(stderr, "dynarray_create: allocation failed\n");
		exit(1);
	}
	
	da->numa = numa;
	da->next_idx = 0;
	da->elem_size = elem_size;
	if (elems_nr <= alloc_grain) {
		da->elems_nr = alloc_grain;
	} else {
		unsigned long rem = elems_nr % alloc_grain;
		da->elems_nr = elems_nr;
		if (rem)
			da->elems_nr += alloc_grain - rem;
	}
	da->alloc_grain = alloc_grain;

	if (numa) {
		da->elems = numa_alloc_onnode(elem_size*da->elems_nr, node);
	} else {
		da->elems = malloc(elem_size*da->elems_nr);
	}

	if (!da->elems) {
		fprintf(stderr, "dynarray_create: allocation failed\n");
		exit(1);
	}

	return da;
}
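Because the constructor picks between numa_alloc_onnode() and malloc(), the matching destructor has to branch the same way, and numa_free() must be given the allocation size. A hedged sketch of that counterpart (not shown in the original excerpt):

static void do_destroy(struct dynarray *da)
{
	if (da->numa) {
		numa_free(da->elems, da->elem_size * da->elems_nr);
		numa_free(da, sizeof(*da));
	} else {
		free(da->elems);
		free(da);
	}
}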
Example #3
/**
 * Each thread initializes the buffers in the local NUMA node for memory copy
 * in the next phase.
 */
void *buf_init_func(void *arg)
{
	int i, j;
	struct buf_init_data *data = (struct buf_init_data *) arg;

	bind2node_id(data->node_id);
	for (i = 0; i < NUM_NODES; i++) {
		for (j = 0; j < NUM_THREADS; j++) {
			/*
			 * For remote memory access, NUM_NODES * NUM_THREADS pieces of
			 * memory are allocated, even though only (NUM_NODES - 1) * NUM_THREADS
			 * pieces of memory are actually used.
			 * For local memory access, only NUM_THREADS pieces of memory
			 * are allocated.
			 */
			if (/*(i == data->node_id && use_remote)
					||*/ (i != data->node_id && !use_remote)) {
				init_buffer(&data->src_bufs[i][j]);
				init_buffer(&data->local_bufs[i][j]);
			}
			if ((/*i != data->node_id && */use_remote)
					|| (i == data->node_id && !use_remote)) {
				char *buf;
				
				if (data->mode == MEMCPY_PULL || data->mode == MEMCPY_PUSH
						|| data->mode == MEMCPY_R2R || data->mode == MEMREAD) {
					buf = (char *) numa_alloc_onnode(data->buf_size,
							data->node_id);
					materialize_buf(buf, data->buf_size);
					set_buffer(&data->src_bufs[i][j], buf, data->buf_size,
							data->node_id);
				}
				else
					init_buffer(&data->src_bufs[i][j]);

				if (data->mode == MEMCPY_PULL || data->mode == MEMCPY_PUSH
						|| data->mode == MEMCPY_R2R || data->mode == MEMWRITE) {
					buf = (char *) numa_alloc_onnode(data->buf_size,
							data->node_id);
					materialize_buf(buf, data->buf_size);
					set_buffer(&data->local_bufs[i][j], buf, data->buf_size,
							data->node_id);
				}
				else
					init_buffer(&data->local_bufs[i][j]);
			}
		}
	}
	return NULL;
}
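init_buffer(), set_buffer() and the buffer type itself are defined elsewhere in that benchmark. A plausible shape for them, shown only to make the excerpt readable; the struct layout and field names are assumptions:

/* Hypothetical buffer bookkeeping assumed by the excerpt above. */
struct buffer {
	char *addr;   /* start of the region, NULL if unused */
	size_t size;  /* region size in bytes */
	int node_id;  /* NUMA node the region lives on */
};

static void init_buffer(struct buffer *b)
{
	b->addr = NULL;
	b->size = 0;
	b->node_id = -1;
}

static void set_buffer(struct buffer *b, char *addr, size_t size, int node_id)
{
	b->addr = addr;
	b->size = size;
	b->node_id = node_id;
}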
Example #4
//----------------------------------------------------------------------
//-- a little cross platform numa allocator
//-- use the existing theron defines for convenience
//----------------------------------------------------------------------
inline void *AllocOnNode(const long node, const size_t size)
{

#if THERON_NUMA

#if THERON_WINDOWS

    #if _WIN32_WINNT >= 0x0600
    return VirtualAllocExNuma(
        GetCurrentProcess(),
        NULL,
        size,
        MEM_RESERVE | MEM_COMMIT,
        PAGE_READWRITE,
        node
    );
    #else
    return NULL;
    #endif

#elif THERON_GCC

    if (numa_available() < 0)
    {
        return NULL;
    }

    return numa_alloc_onnode(size, node);

#endif

#endif // THERON_NUMA

    return NULL;
}
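An allocator like this needs an equally cross-platform release path: VirtualAllocExNuma() memory goes back through VirtualFree(), while numa_alloc_onnode() memory must be returned with numa_free() and its original size. A hedged sketch of the counterpart (not part of the original excerpt):

inline void FreeOnNode(void *const mem, const size_t size)
{
#if THERON_NUMA

#if THERON_WINDOWS
    (void) size;
    VirtualFree(mem, 0, MEM_RELEASE);   // size must be 0 with MEM_RELEASE
    return;
#elif THERON_GCC
    numa_free(mem, size);               // libnuma needs the original size
    return;
#endif

#endif // THERON_NUMA
    (void) mem;
    (void) size;
}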
Example #5
/*
 * alloc_pages_on_nodes() - allocate pages on specified NUMA nodes
 * @pages: array in which the page pointers will be stored
 * @num: no. of pages to allocate
 * @nodes: array of NUMA nodes
 *
 * A page will be allocated in each node specified by @nodes, and the
 * page pointers will be stored in @pages array.
 *
 * RETURNS:
 * 0 on success, -1 on allocation failure.
 */
int alloc_pages_on_nodes(void **pages, unsigned int num, int *nodes)
{
	int i;
#if HAVE_NUMA_ALLOC_ONNODE
	size_t onepage = get_page_size();
#endif

	for (i = 0; i < num; i++) {
		pages[i] = NULL;
	}

	for (i = 0; i < num; i++) {
		char *page;

#if HAVE_NUMA_ALLOC_ONNODE
		pages[i] = numa_alloc_onnode(onepage, nodes[i]);
#endif
		if (pages[i] == NULL) {
			tst_resm(TBROK, "allocation of page on node "
				 "%d failed", nodes[i]);
			break;
		}

		/* Touch the page, to force allocation. */
		page = pages[i];
		page[0] = i;
	}

	if (i == num)
		return 0;

	free_pages(pages, num);

	return -1;
}
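free_pages() is the cleanup counterpart used on the failure path above. A plausible sketch is below; LTP's real helper lives elsewhere, so treat the body as an assumption:

void free_pages(void **pages, unsigned int num)
{
	unsigned int i;
	size_t onepage = get_page_size();

	for (i = 0; i < num; i++) {
		if (pages[i] != NULL)
			numa_free(pages[i], onepage);
	}
}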
Example #6
void
numa_membench(mem_bench_info_t *mbinfo)
{

  assert(mbinfo->destnode <= numa_max_node());

  {
    long size, freep;
    size = numa_node_size(mbinfo->destnode, &freep);
    //printf("node %d : total = %ld(B), free = %ld(B)\n", mbinfo->destnode, size, freep);
    assert(freep >= mbinfo->working_size);

    mbinfo->working_area =
      (long *)numa_alloc_onnode(mbinfo->working_size, mbinfo->destnode);
    if (NULL == mbinfo->working_area) {
      perror("numa_alloc_onnode");
      exit(EXIT_FAILURE);
    }
    memset(mbinfo->working_area, 0, mbinfo->working_size);
  }

  memory_stress_rand(&mbinfo->pc, mbinfo->working_area, mbinfo->working_size);

  // release resources
  numa_free(mbinfo->working_area, mbinfo->working_size);
}
Example #7
void *SyncThreadWriteOrRead(void *arg)
{
	struct thread_data *data = arg;
	bind2node_id(data->node_id);
	int node_id = data->node_id;
	int num = data->num;
	int off_start = data->off_start;
	int i;
	char *buffer = (char *) numa_alloc_onnode(block_size, node_id);

	ssd_file_desc_t fd = ssd_open(data->file_name, node_id, 0);

	printf("thread %d: access %d blocks\n", data->idx, num);
	for (i = 0; i < num; i++) {
		off_t offset = offs[off_start + i];
		if (access == READ)
			ssd_read(fd, (void *) buffer, block_size, offset);
		else
			ssd_write(fd, (void *) buffer, block_size, offset);
	}
	numa_free(buffer, block_size);

	ssd_close(fd);
	return NULL;
}
Example #8
/**
 * \brief allocates size bytes of memory on the local node
 *
 * \param size  size of the memory region in bytes
 * \param pagesize  page size to be used for the mapping
 *
 * \returns pointer to memory region
 *
 * The memory must be freed with numa_free(). On errors NULL is returned.
 */
void *numa_alloc_local(size_t size, size_t pagesize)
{
    nodeid_t node = numa_current_node();

    NUMA_DEBUG_ALLOC("allocate on local node %" PRIuNODEID "\n", node);

    return numa_alloc_onnode(size, node, pagesize);
}
Example #9
/*
 * Class:     xerial_jnuma_NumaNative
 * Method:    allocateOnNode
 * Signature: (JI)J
 */
JNIEXPORT jlong JNICALL Java_xerial_jnuma_NumaNative_allocateOnNode
    (JNIEnv *env, jobject obj, jlong capacity, jint node) {
  void* mem = numa_alloc_onnode((size_t) capacity, node);
  if(mem != NULL) {
    return (jlong) mem;
  }
  throwException(env, obj, 11);
  return 0L;
}
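Memory handed out this way must eventually be released with numa_free() and the same capacity. A sketch of what the matching native free might look like; the method name and signature are assumptions, not copied from jnuma:

JNIEXPORT void JNICALL Java_xerial_jnuma_NumaNative_free
    (JNIEnv *env, jobject obj, jlong address, jlong capacity) {
  if (address != 0L)
    numa_free((void *) address, (size_t) capacity);
}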
Example #10
JNIEXPORT jobject JNICALL Java_xerial_jnuma_NumaNative_allocOnNode
(JNIEnv *env, jobject jobj, jint capacity, jint node)
{
    jobject b;
    void* mem = numa_alloc_onnode((size_t) capacity, (int) node);
    if(mem == NULL) {
        printf("failed to allocate memory on node %d\n", (int) node);
        return NULL; /* don't wrap a NULL address in a ByteBuffer */
    }
    b = (*env)->NewDirectByteBuffer(env, mem, (jlong) capacity);
    return b;
}
Example #11
#if DPI_FLOW_TABLE_USE_MEMORY_POOL
dpi_flow_DB_v4_t* dpi_flow_table_create_v4(
		u_int32_t size, u_int32_t max_active_v4_flows,
		u_int16_t num_partitions, u_int32_t start_pool_size){
#else
dpi_flow_DB_v4_t* dpi_flow_table_create_v4(
		u_int32_t size, u_int32_t max_active_v4_flows,
		u_int16_t num_partitions){
#endif
	u_int32_t i;
	dpi_flow_DB_v4_t* table;

	if(size!=0){
		assert((table=(dpi_flow_DB_v4_t*)
				  malloc(sizeof(dpi_flow_DB_v4_t)))!=NULL);
		table->table=(ipv4_flow_t*)
				  malloc(sizeof(ipv4_flow_t)*size);
		assert(table->table);
		table->total_size=size;
		table->num_partitions=num_partitions;
		table->max_active_flows=max_active_v4_flows;
#if DPI_FLOW_TABLE_USE_MEMORY_POOL
		table->start_pool_size=start_pool_size;
#endif

		for(i=0; i<table->total_size; i++){
			/** Creation of sentinel node. **/
			table->table[i].next=&(table->table[i]);
			table->table[i].prev=&(table->table[i]);
		}
		

#if DPI_NUMA_AWARE
		table->partitions=numa_alloc_onnode(
				sizeof(dpi_flow_DB_v4_partition_t)*table->num_partitions,
				DPI_NUMA_AWARE_FLOW_TABLE_NODE);
		assert(table->partitions);
#else
		assert(posix_memalign(
				(void**) &(table->partitions), DPI_CACHE_LINE_SIZE,
				sizeof(dpi_flow_DB_v4_partition_t)*table->num_partitions)==0);
#endif

#if DPI_FLOW_TABLE_HASH_VERSION == DPI_MURMUR3_HASH
		srand((unsigned int) time(NULL));
		table->seed=rand();
#endif

		dpi_flow_table_setup_partitions_v4(table, table->num_partitions);
	}else
		table=NULL;
	return table;
}
Example #12
static inline ipv6_flow_t* v6_flow_alloc(){
	void* r;
#if DPI_NUMA_AWARE
	r=numa_alloc_onnode(sizeof(ipv6_flow_t),
			            DPI_NUMA_AWARE_FLOW_TABLE_NODE);
	assert(r);
#else
	#if DPI_FLOW_TABLE_ALIGN_FLOWS
		assert(posix_memalign((void**) &r, DPI_CACHE_LINE_SIZE,
			   sizeof(ipv6_flow_t))==0);
	#else
		r=malloc(sizeof(ipv6_flow_t));
		assert(r);
	#endif
#endif
	return (ipv6_flow_t*) r;
}
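Whatever strategy allocated the flow must also free it. A hedged sketch of the symmetric release function, mirroring the three branches above (the project's real counterpart is not shown in this excerpt):

static inline void v6_flow_free(ipv6_flow_t* flow){
#if DPI_NUMA_AWARE
	numa_free(flow, sizeof(ipv6_flow_t));
#else
	/* posix_memalign() and malloc() memory are both released with free(). */
	free(flow);
#endif
}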
Example #13
  TaskManager :: TaskManager()
    {
      num_threads = GetMaxThreads();
      // if (MyMPI_GetNTasks() > 1) num_threads = 1;

#ifdef USE_NUMA
      numa_available();
      num_nodes = numa_max_node() + 1;
      if (num_nodes > num_threads) num_nodes = num_threads;

      for (int j = 0; j < num_nodes; j++)
        {
          void * mem = numa_alloc_onnode (sizeof(NodeData), j);
          nodedata[j] = new (mem) NodeData;
          complete[j] = -1;
          workers_on_node[j] = 0;
        }
#else
      num_nodes = 1;
      nodedata[0] = new NodeData;
      complete[0] = -1;
      workers_on_node[0] = 0;
#endif

      jobnr = 0;
      done = 0;
      sleep = false;
      sleep_usecs = 1000;
      active_workers = 0;

      static int cnt = 0;
      char buf[100];
      if (use_paje_trace)
        {
#ifdef PARALLEL
          sprintf(buf, "ng%d_rank%d.trace", cnt++, MyMPI_GetId());
#else
          sprintf(buf, "ng%d.trace", cnt++);
#endif
        }
      else
        buf[0] = 0;
      //sprintf(buf, "");
      trace = new PajeTrace(num_threads, buf);
    }
Example #14
#ifndef PFWL_DEBUG
static inline
#endif
    mc_pfwl_task_t *
    pfwl_allocate_task() {
  mc_pfwl_task_t *r;
#if PFWL_NUMA_AWARE
  r = (mc_pfwl_task_t *) numa_alloc_onnode(sizeof(mc_pfwl_task_t),
                                           PFWL_NUMA_AWARE_TASKS_NODE);
#else
#if PFWL_MULTICORE_ALIGN_TASKS
  if (posix_memalign((void **) &r, PFWL_CACHE_LINE_SIZE,
                     sizeof(mc_pfwl_task_t))) {
    throw std::runtime_error("posix_memalign failed.");
  }
#else
  r = new mc_pfwl_task_t;
#endif
#endif
  return r;
}
Example #15
/**
 * \brief allocates size bytes of memory with the current NUMA policy.
 *
 * \param size      size of the memory region in bytes
 * \param pagesize  preferred page size to be used
 * \returns pointer to the mapped memory region
 *
 * The memory must be freed with numa_free(). On errors NULL is returned.
 */
void *numa_alloc(size_t size, size_t pagesize)
{
    NUMA_DEBUG_ALLOC("allocate according to policy\n");

    /* check if we use interleaved mode */
    if (bitmap_get_weight(numa_alloc_interleave_mask)) {
        return numa_alloc_interleaved_subset(size, pagesize,
                                             numa_alloc_interleave_mask);
    }

    /* check membind */
    if (bitmap_get_weight(numa_alloc_bind_mask) == 1) {
        nodeid_t node = (nodeid_t) bitmap_get_first(numa_alloc_bind_mask);
        return numa_alloc_onnode(size, node, pagesize);
    }

    /* TODO:
     * - handle the case where multiple nodes are set in membind
     */

    /* just return some memory */
    return malloc(size);

}
Example #16
int main(int argc, char *argv[])
{
	int node_id = 0;
	int arrival_lambda = 10;
	int thread_cpu_map[N_THREADS];
	int i,j,k;
	int n_threads;
	int n_left;
	int n_right;
	int next_index_left = 3;
	int next_index_right = 7;
	float local_square = 0.0, remote_square = 0.0;


	/***************** make sure #args is correct and get the n_threads, n_left and n_right */
	if(argc < 4)
	{
		printf("Usage: ./test_numa_comb n_of_threads n_of_threads_on_node0 n_of_threads_on_node1\n");
		exit(-1);
	}
	n_threads = atoi(argv[1]);
	n_left = atoi(argv[2]);
	n_right = atoi(argv[3]);
	/******************* Set the thread_cpu_map according to the n_left and n_right */
	printf("n_threads: %d, n_left: %d, n_right: %d\n",n_threads,n_left,n_right);
	for(i = 0; i < n_left; i++)
	{
		thread_cpu_map[i] = next_index_left;
		next_index_left--;
	}
	for(i = n_left; i < n_threads; i++)
	{
		thread_cpu_map[i] = next_index_right;
		next_index_right--;
	}
	for(i = 0; i < n_threads; i++)
	{
		printf("Thread %d is on cpu %d\n",i,thread_cpu_map[i]);
	}



	thread_params para[n_threads]; //The parameters to pass to the threads

	//printf("The return value of numa_get_run_node_mask(void) is %d\n",numa_get_run_node_mask());
	//printf("The return value of numa_max_node(void) is %d\n",numa_max_node());
	//numa_tonode_memory((void *)spinlock_ptr,sizeof(pthread_spinlock_t),node_id); //This doesn't work

	//initialize the spinlock pointer and put it on a specific node
	pthread_spinlock_t *spinlock_ptr = numa_alloc_onnode(sizeof(pthread_spinlock_t),node_id);

	if(spinlock_ptr == NULL) //error handling of the allocating of a spinlock pointer on a specific node
	{
		printf("alloc of spinlock on a node failed.\n");
		exit(-1);
	}

	/* initialise  syncs */
	pthread_barrier_init(&fin_barrier, NULL, n_threads);
	pthread_spin_init(spinlock_ptr,0);
	int rc;
	//create the threads
	for(i = 0; i < n_threads; i++)
	{

		para[i].thread_id = i;
		para[i].arrival_lambda = arrival_lambda;
		para[i].spinlock_ptr = spinlock_ptr;
		CPU_ZERO(&cpuset[i]);
		CPU_SET(thread_cpu_map[i],&cpuset[i]);
		rc = pthread_create(&threads[i],NULL,work,(void*)&para[i]);
		E (rc);


	}
	start_work_flag = 1; 

	/* wait here */
	for(i = 0; i < n_threads; i++)
	    pthread_join(threads[i],NULL);


	pthread_barrier_destroy(&fin_barrier);

	/*
	for(i = 0; i < n_threads; i++)
	{
		printf("The time to get one lock for thread %d is : %.9f\n",i,time_in_cs[i]/num_access_each_thread[i]);
		printf("The number of lock accesses for thread %d is : %d\n",i,num_access_each_thread[i]);
	}
	*/

	qsort((void*)g_tss,(size_t)access_count,(size_t)sizeof(timestamp),cmp_timestamp);
	/*
	for (i = 0; i < access_count; i++)
		printf("%lu with id %d\n",g_tss[i].ts,g_tss[i].id);
	*/

	/* for (i = 0; i < access_count; i++)
	 * {
	 *     printf ("%lu %d\n", g_tss[i].ts, g_tss[i].id);
	 * } */

	/* */
	
	int cs_order[access_count/2];
	for(i = 0; i < access_count/2; i++)
	{
		cs_order[i] = g_tss[i*2].id;
		//printf("%d in cs\n",cs_order[i]);
	}
	int cs_matrix[n_threads][n_threads];
	uint64_t delay_matrix[n_threads][n_threads];
	float prob_matrix[n_threads][n_threads];
	float rate_matrix[n_threads][n_threads];

	// zero out all the matrices
	memset(&cs_matrix, '\0', n_threads*n_threads*sizeof(int));
	memset(&delay_matrix, '\0', n_threads*n_threads*sizeof(uint64_t));
	memset(&prob_matrix, '\0', n_threads*n_threads*sizeof(float));


	int local_count2 = 0, remote_count2 = 0;
	uint64_t diff;
	for(i = 0; i < n_threads; i++)
	    for(j = 0; j < n_threads; j++)
		for(k = 0; k < access_count/2 -1 ; k++)
		{
		    if(cs_order[k] == i && cs_order[k+1] == j)
		    {
			cs_matrix[i][j]++;
			diff = g_tss[2*k+2].ts - g_tss[2*k+1].ts; 
			delay_matrix[i][j] += diff;
			if(is_on_same_node(i, j, n_threads, n_left, n_right))
			{
			    dprintf("local_delay: %lu\n", diff);
			    local_square += sqr(diff);
			    local_count2++;
			}
			else
			{
			    dprintf("remote_delay: %lu\n", diff);
			    remote_square += sqr(diff);
			    remote_count2++;
			}
		    }
		}

	int num_access[n_threads];
	memset(num_access, 0, sizeof(num_access)); /* counters must start at zero */
	for(i = 0; i < access_count/2 -1; i++)
	    for(j = 0; j < n_threads; j++)
	    {
		if (cs_order[i] == j) num_access[j]++;
	    }

	for(i = 0; i < n_threads; i++)
		printf("num_access[%d]:%d\n",i,num_access[i]);

	for(i = 0; i < n_threads; i++)
		for(j = 0; j < n_threads ; j++)
		{
			prob_matrix[i][j] = (float)cs_matrix[i][j]/(float)num_access[i];
			rate_matrix[i][j] = 1.0/((delay_matrix[i][j]/(float)cs_matrix[i][j])/CPU_FREQ);
		}


	printf ("\n***************** PROBS *******************\n");
	printf ("Lock is on LP, [L, R] is [%d, %d]:\n", n_left - 1, n_right);
	// tl
	printf ("L -> L\n");
	print_mtx (n_threads, n_threads, prob_matrix,
			   0, 0, n_left, n_left, 0);
    // tr
	printf ("L -> R\n");
	print_mtx (n_threads, n_threads, prob_matrix,
			   n_left, 0, n_threads, n_left, 0);

	printf ("Lock is on RP, [L, R] is [%d, %d]:\n", n_left, n_right - 1);
	// br
	printf ("R -> R\n");
	print_mtx (n_threads, n_threads, prob_matrix,
			   n_left, n_left, n_threads, n_threads, 0);
	// bl
	printf ("R -> L\n");
	print_mtx (n_threads, n_threads, prob_matrix,
			   0, n_left, n_left, n_threads, 0);
	

	printf ("\n***************** RATES *******************\n");

	printf ("Lock is on LP, [L, R] is [%d, %d]:\n", n_left - 1, n_right);
	// tl
	printf ("L -> L\n");
	print_mtx (n_threads, n_threads, rate_matrix,
			   0, 0, n_left, n_left, 1);
    // tr
	printf ("L -> R\n");
	print_mtx (n_threads, n_threads, rate_matrix,
			   n_left, 0, n_threads, n_left, 1);

	printf ("Lock is on RP, [L, R] is [%d, %d]:\n", n_left, n_right - 1);
	// br
	printf ("R -> R\n");
	print_mtx (n_threads, n_threads, rate_matrix,
			   n_left, n_left, n_threads, n_threads, 1);
	// bl
	printf ("R -> \n");
	print_mtx (n_threads, n_threads, rate_matrix,
			   0, n_left, n_left, n_threads, 1);




	//print the intra-core and inter-core delay
	//thread 0 - n_left -1 are on the left core, n_left to n_threads are on the right core
	uint64_t local_delay = 0, remote_delay = 0;
	int local_count = 0, remote_count = 0;
	float local_prob = 0.0, remote_prob = 0.0;

	for(i = 0; i < n_threads; i++)
	    for(j = 0; j < n_threads; j++)
	    {
			if (j == i)
				continue;
			if(is_on_same_node(i, j, n_threads, n_left, n_right))
			{
				//printf("%d and %d on the same node\n",i,j);
				local_delay += delay_matrix[i][j];
				local_count += cs_matrix[i][j];
				local_prob += prob_matrix[j][i];
			}
			else
			{
				//printf("%d and %d not the same node\n",i,j);
				remote_delay += delay_matrix[i][j];
				remote_count += cs_matrix[i][j];
				remote_prob += prob_matrix[j][i];
			}
	    }


	float local = (float)local_delay/(local_count);
	float remote = (float)remote_delay/(remote_count);

	printf("\n\n**************************** Aggregates ***************************\n");
	printf("local delay: %f, remote_delay: %f, local_count: %d, remote_count: %d\n",(float)local_delay/(local_count),(float)remote_delay/(remote_count),local_count,remote_count);
	printf("local prob:%f, remote prob: %f\n",local_prob/n_threads, remote_prob/n_threads);
	printf("local delay variance:%f, remote delay variance: %f\n",local_square/local_count - local*local, remote_square/remote_count - remote*remote);
	printf("local count2: %d, remote_count2:%d\n",local_count2, remote_count2);
	pthread_spin_destroy(spinlock_ptr);
	numa_free((void *)spinlock_ptr,sizeof(pthread_spinlock_t));
	pthread_exit(NULL);
	return 0;
}
Example #17
int main(int argc, char **argv)
{
	char *msg;		/* message returned from parse_opts */

	/* parse standard options */
	msg = parse_opts(argc, argv, NULL, NULL);
	if (msg != NULL) {
		tst_brkm(TBROK, NULL, "OPTION PARSING ERROR - %s", msg);

	}

	setup();

#if HAVE_NUMA_MOVE_PAGES
	unsigned int i;
	int lc;			/* loop counter */
	unsigned int from_node;
	unsigned int to_node;
	int ret;

	ret = get_allowed_nodes(NH_MEMS, 2, &from_node, &to_node);
	if (ret < 0)
		tst_brkm(TBROK|TERRNO, cleanup, "get_allowed_nodes: %d", ret);

	/* check for looping state if -i option is given */
	for (lc = 0; TEST_LOOPING(lc); lc++) {
		void *pages[TEST_PAGES] = { 0 };
		int nodes[TEST_PAGES];
		int status[TEST_PAGES];
		unsigned long onepage = get_page_size();

		/* reset Tst_count in case we are looping */
		Tst_count = 0;

		ret = alloc_pages_on_node(pages, TOUCHED_PAGES, from_node);
		if (ret == -1)
			continue;

		/* Allocate page and do not touch it. */
		pages[UNTOUCHED_PAGE] = numa_alloc_onnode(onepage, from_node);
		if (pages[UNTOUCHED_PAGE] == NULL) {
			tst_resm(TBROK, "failed allocating page on node %d",
				 from_node);
			goto err_free_pages;
		}

		for (i = 0; i < TEST_PAGES; i++)
			nodes[i] = to_node;

		ret = numa_move_pages(0, TEST_PAGES, pages, nodes,
				      status, MPOL_MF_MOVE);
		TEST_ERRNO = errno;
		if (ret == -1) {
			tst_resm(TFAIL | TERRNO, "move_pages unexpectedly failed");
			goto err_free_pages;
		}

		if (status[UNTOUCHED_PAGE] == -ENOENT)
			tst_resm(TPASS, "status[%d] set to expected -ENOENT",
				 UNTOUCHED_PAGE);
		else
			tst_resm(TFAIL, "status[%d] is %d", UNTOUCHED_PAGE,
				 status[UNTOUCHED_PAGE]);

		err_free_pages:
		    /* This is capable of freeing both the touched and
		     * untouched pages.
		     */
		    free_pages(pages, TEST_PAGES);
	}
#else
	tst_resm(TCONF, "move_pages support not found.");
#endif

	cleanup();
	tst_exit();

}
Example #18
void myhbwmalloc_init(void)
{
    /* set to NULL before trying to initialize.  if we return before
     * successful creation of the mspace, then it will still be NULL,
     * and we can use that in subsequent library calls to determine
     * that the library failed to initialize. */
    myhbwmalloc_mspace = NULL;

    /* verbose printout? */
    myhbwmalloc_verbose = 0;
    {
        char * env_char = getenv("HBWMALLOC_VERBOSE");
        if (env_char != NULL) {
            myhbwmalloc_verbose = 1;
            printf("hbwmalloc: HBWMALLOC_VERBOSE set\n");
        }
    }

    /* fail hard or soft? */
    myhbwmalloc_hardfail = 1;
    {
        char * env_char = getenv("HBWMALLOC_SOFTFAIL");
        if (env_char != NULL) {
            myhbwmalloc_hardfail = 0;
            printf("hbwmalloc: HBWMALLOC_SOFTFAIL set\n");
        }
    }

    /* set the atexit handler that will destroy the mspace and free the numa allocation */
    atexit(myhbwmalloc_final);

    /* detect and configure use of NUMA memory nodes */
    {
        int max_possible_node        = numa_max_possible_node();
        int num_possible_nodes       = numa_num_possible_nodes();
        int max_numa_nodes           = numa_max_node();
        int num_configured_nodes     = numa_num_configured_nodes();
        int num_configured_cpus      = numa_num_configured_cpus();
        if (myhbwmalloc_verbose) {
            printf("hbwmalloc: numa_max_possible_node()    = %d\n", max_possible_node);
            printf("hbwmalloc: numa_num_possible_nodes()   = %d\n", num_possible_nodes);
            printf("hbwmalloc: numa_max_node()             = %d\n", max_numa_nodes);
            printf("hbwmalloc: numa_num_configured_nodes() = %d\n", num_configured_nodes);
            printf("hbwmalloc: numa_num_configured_cpus()  = %d\n", num_configured_cpus);
        }
        /* FIXME this is a hack.  assumes HBW is only numa node 1. */
        if (num_configured_nodes <= 2) {
            myhbwmalloc_numa_node = num_configured_nodes-1;
        } else {
            fprintf(stderr,"hbwmalloc: we support only 2 numa nodes, not %d\n", num_configured_nodes);
        }

        if (myhbwmalloc_verbose) {
            for (int i=0; i<num_configured_nodes; i++) {
                unsigned max_numa_cpus = numa_num_configured_cpus();
                struct bitmask * mask = numa_bitmask_alloc( max_numa_cpus );
                int rc = numa_node_to_cpus(i, mask);
                if (rc != 0) {
                    fprintf(stderr, "hbwmalloc: numa_node_to_cpus failed\n");
                } else {
                    printf("hbwmalloc: numa node %d cpu mask:", i);
                    for (unsigned j=0; j<max_numa_cpus; j++) {
                        int bit = numa_bitmask_isbitset(mask,j);
                        printf(" %d", bit);
                    }
                    printf("\n");
                }
                numa_bitmask_free(mask);
            }
            fflush(stdout);
        }
    }

#if 0 /* unused */
    /* see if the user specifies a slab size */
    size_t slab_size_requested = 0;
    {
        char * env_char = getenv("HBWMALLOC_BYTES");
        if (env_char!=NULL) {
            long units = 1L;
            if      ( NULL != strstr(env_char,"G") ) units = 1000000000L;
            else if ( NULL != strstr(env_char,"M") ) units = 1000000L;
            else if ( NULL != strstr(env_char,"K") ) units = 1000L;
            else                                     units = 1L;

            int num_count = strspn(env_char, "0123456789");
            memset( &env_char[num_count], ' ', strlen(env_char)-num_count);
            slab_size_requested = units * atol(env_char);
        }
        if (myhbwmalloc_verbose) {
            printf("hbwmalloc: requested slab_size_requested = %zu\n", slab_size_requested);
        }
    }
#endif

    /* see what libnuma says is available */
    size_t myhbwmalloc_slab_size;
    {
        int node = myhbwmalloc_numa_node;
        long long freemem;
        long long maxmem = numa_node_size64(node, &freemem);
        if (myhbwmalloc_verbose) {
            printf("hbwmalloc: numa_node_size64 says maxmem=%lld freemem=%lld for numa node %d\n",
                    maxmem, freemem, node);
        }
        myhbwmalloc_slab_size = freemem;
    }

    /* assume threads, disable if MPI knows otherwise, then allow user to override. */
    int multithreaded = 1;
#ifdef HAVE_MPI
    int nprocs = 1; /* stays 1 if MPI is not initialized */
    {
        int is_init, is_final;
        MPI_Initialized(&is_init);
        MPI_Finalized(&is_final);
        if (is_init && !is_final) {
            MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
        }

        /* give equal portion to every MPI process */
        myhbwmalloc_slab_size /= nprocs;

        /* if the user initializes MPI with MPI_Init or
         * MPI_Init_thread(MPI_THREAD_SINGLE), they assert there
         * are no threads at all, which means we can skip the
         * malloc mspace lock.
         *
         * if the user lies to MPI, they deserve any bad thing
         * that comes of it. */
        int provided;
        MPI_Query_thread(&provided);
        if (provided==MPI_THREAD_SINGLE) {
            multithreaded = 0;
        } else {
            multithreaded = 1;
        }

        if (myhbwmalloc_verbose) {
            printf("hbwmalloc: MPI processes = %d (threaded = %d)\n", nprocs, multithreaded);
            printf("hbwmalloc: myhbwmalloc_slab_size = %d\n", myhbwmalloc_slab_size);
        }
    }
#endif

    /* user can assert that hbwmalloc and friends need not be thread-safe */
    {
        char * env_char = getenv("HBWMALLOC_LOCKLESS");
        if (env_char != NULL) {
            multithreaded = 0;
            if (myhbwmalloc_verbose) {
                printf("hbwmalloc: user has disabled locking in mspaces by setting HBWMALLOC_LOCKLESS\n");
            }
        }
    }

    myhbwmalloc_slab = numa_alloc_onnode( myhbwmalloc_slab_size, myhbwmalloc_numa_node);
    if (myhbwmalloc_slab==NULL) {
        fprintf(stderr, "hbwmalloc: numa_alloc_onnode returned NULL for size = %zu\n", myhbwmalloc_slab_size);
        return;
    } else {
        if (myhbwmalloc_verbose) {
            printf("hbwmalloc: numa_alloc_onnode succeeded for size %zu\n", myhbwmalloc_slab_size);
        }

        /* part (less than 128*sizeof(size_t) bytes) of this space is used for bookkeeping,
         * so the capacity must be at least this large */
        if (myhbwmalloc_slab_size < 128*sizeof(size_t)) {
            fprintf(stderr, "hbwmalloc: not enough space for mspace bookkeeping\n");
            return;
        }

        /* see above regarding if the user lies to MPI. */
        int locked = multithreaded;
        myhbwmalloc_mspace = create_mspace_with_base( myhbwmalloc_slab, myhbwmalloc_slab_size, locked);
        if (myhbwmalloc_mspace == NULL) {
            fprintf(stderr, "hbwmalloc: create_mspace_with_base returned NULL\n");
            return;
        } else if (myhbwmalloc_verbose) {
            printf("hbwmalloc: create_mspace_with_base succeeded for size %zu\n", myhbwmalloc_slab_size);
        }
    }
}
Example #19
void INTERNAL *qt_affinity_alloc_onnode(size_t bytes,
                                        int    node)
{                                      /*{{{ */
    return numa_alloc_onnode(bytes, node);
}                                      /*}}} */
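numa_alloc_onnode() hands back page-granular memory that must be returned with numa_free() and the original size, so the matching wrapper is presumably just as thin. The function name and INTERNAL marker below are assumptions chosen to mirror the allocator above:

void INTERNAL qt_affinity_free(void  *ptr,
                               size_t bytes)
{                                      /*{{{ */
    numa_free(ptr, bytes);
}                                      /*}}} */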
Example #20
static void *
s_numa_alloc(size_t sz, int cpu) {
  void *ret = NULL;

  if (likely(sz > 0)) {
    if (likely(cpu >= 0)) {
      if (likely(s_numa_nodes != NULL && s_n_cpus > 0)) {
        unsigned int node = s_numa_nodes[cpu];
        unsigned int allocd_node = UINT_MAX;
        struct bitmask *bmp;
        int r;
  
        bmp = numa_allocate_nodemask();
        numa_bitmask_setbit(bmp, node);

        errno = 0;
        r = (int)set_mempolicy(MPOL_BIND, bmp->maskp, bmp->size + 1);
        if (likely(r == 0)) {
          errno = 0;
          ret = numa_alloc_onnode(sz, (int)node);
          if (likely(ret != NULL)) {
            lagopus_result_t rl;

            /*
             * We need this "first touch" even using the
             * numa_alloc_onnode().
             */
            (void)memset(ret, 0, sz);

            errno = 0;
            r = (int)get_mempolicy((int *)&allocd_node, NULL, 0, ret,
                                   MPOL_F_NODE|MPOL_F_ADDR);
            if (likely(r == 0)) {
              if (unlikely(node != allocd_node)) {
                /*
                 * The memory is not allocated on the node, but it is
                 * still usable. Just return it.
                 */
                lagopus_msg_warning("can't allocate " PFSZ(u) " bytes memory "
                                    "for CPU %d (NUMA node %d).\n",
                                    sz, cpu, node);
              }
            } else {
              lagopus_perror(LAGOPUS_RESULT_POSIX_API_ERROR);
              lagopus_msg_error("get_mempolicy() returned %d.\n", r);
            }

            rl = s_add_addr(ret, sz);
            if (unlikely(rl != LAGOPUS_RESULT_OK)) {
              lagopus_perror(rl);
              lagopus_msg_error("can't register the allocated address.\n");
              numa_free(ret, sz);
              ret = NULL;
            }
          }

        } else {	/* r != 0 */
          lagopus_perror(LAGOPUS_RESULT_POSIX_API_ERROR);
          lagopus_msg_error("set_mempolicy() returned %d.\n", r);
        }

        numa_free_nodemask(bmp);
        set_mempolicy(MPOL_DEFAULT, NULL, 0);

      } else {	/* s_numa_nodes != NULL && s_n_cpus > 0 */
        /*
         * Not initialized or initialization failure.
         */
        lagopus_msg_warning("The NUMA related information is not initialized. "
                            "Use malloc(3) instead.\n");
        ret = malloc(sz);
      }

    } else {	/* cpu >= 0 */
      /*
       * Use pure malloc(3).
       */
      ret = malloc(sz);
    }

  }

  return ret;
}
Example #21
int main(int argc, char **argv)
{

	tst_parse_opts(argc, argv, NULL, NULL);

	setup();

#if HAVE_NUMA_MOVE_PAGES
	unsigned int i;
	int lc;
	unsigned int from_node;
	unsigned int to_node;
	int ret, exp_status;

	if ((tst_kvercmp(4, 3, 0)) >= 0)
		exp_status = -EFAULT;
	else
		exp_status = -ENOENT;

	ret = get_allowed_nodes(NH_MEMS, 2, &from_node, &to_node);
	if (ret < 0)
		tst_brkm(TBROK | TERRNO, cleanup, "get_allowed_nodes: %d", ret);

	/* check for looping state if -i option is given */
	for (lc = 0; TEST_LOOPING(lc); lc++) {
		void *pages[TEST_PAGES] = { 0 };
		int nodes[TEST_PAGES];
		int status[TEST_PAGES];
		unsigned long onepage = get_page_size();

		/* reset tst_count in case we are looping */
		tst_count = 0;

		ret = alloc_pages_on_node(pages, TOUCHED_PAGES, from_node);
		if (ret == -1)
			continue;

		/* Allocate page and do not touch it. */
		pages[UNTOUCHED_PAGE] = numa_alloc_onnode(onepage, from_node);
		if (pages[UNTOUCHED_PAGE] == NULL) {
			tst_resm(TBROK, "failed allocating page on node %d",
				 from_node);
			goto err_free_pages;
		}

		for (i = 0; i < TEST_PAGES; i++)
			nodes[i] = to_node;

		ret = numa_move_pages(0, TEST_PAGES, pages, nodes,
				      status, MPOL_MF_MOVE);
		if (ret == -1) {
			tst_resm(TFAIL | TERRNO,
				 "move_pages unexpectedly failed");
			goto err_free_pages;
		}

		if (status[UNTOUCHED_PAGE] == exp_status) {
			tst_resm(TPASS, "status[%d] has expected value",
				 UNTOUCHED_PAGE);
		} else {
			tst_resm(TFAIL, "status[%d] is %s, expected %s",
				UNTOUCHED_PAGE,
				tst_strerrno(-status[UNTOUCHED_PAGE]),
				tst_strerrno(-exp_status));
		}

err_free_pages:
		/* This is capable of freeing both the touched and
		 * untouched pages.
		 */
		free_pages(pages, TEST_PAGES);
	}
#else
	tst_resm(TCONF, "move_pages support not found.");
#endif

	cleanup();
	tst_exit();

}
Example #22
void* pmalloc(size_t size)
{
    thread_t* thread = thread_self();
    return numa_alloc_onnode(size, thread->virtual_node->nvram_node->node_id);
}
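Since numa_free() must be told the region's size, the matching pfree() cannot have the usual free() signature. A hedged sketch (the name and signature are assumptions):

void pfree(void* ptr, size_t size)
{
    /* numa_alloc_onnode() memory is returned with numa_free(ptr, size). */
    numa_free(ptr, size);
}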
Example #23
void dpi_flow_table_setup_partitions_v6(dpi_flow_DB_v6_t* table, u_int16_t num_partitions){
	/** Partitions management. **/
	u_int32_t partition_size=ceil((float)table->total_size/(float)table->num_partitions);
	u_int32_t partition_max_active_v6_flows=
			     table->max_active_flows/table->num_partitions;

	u_int16_t j;
	u_int32_t lowest_index=0;
	u_int32_t highest_index=lowest_index+partition_size-1;
	for(j=0; j<table->num_partitions; ++j){
		dpi_flow_table_initialize_informations(
				&(table->partitions[j].partition.informations),
				lowest_index, highest_index,
				partition_max_active_v6_flows);
		lowest_index=highest_index+1;

		/**
		 * The last partition gets the entries up to the end of the
		 * table. Indeed, when the size is not a multiple of the
		 * number of partitions, the last partition may be smaller.
		 */
		if(j==table->num_partitions-2)
			highest_index=table->total_size-1;
		else
			highest_index+=partition_size;

#if DPI_FLOW_TABLE_USE_MEMORY_POOL
		ipv6_flow_t* flow_pool;
		u_int32_t i=0;
		table->individual_pool_size=table->start_pool_size/table->num_partitions;
#if DPI_NUMA_AWARE
		flow_pool=numa_alloc_onnode(
				      sizeof(ipv6_flow_t)*table->individual_pool_size,
				      DPI_NUMA_AWARE_FLOW_TABLE_NODE);
		assert(flow_pool);
		table->partitions[j].partition.pool=numa_alloc_onnode(
				      sizeof(u_int32_t)*table->individual_pool_size,
				      DPI_NUMA_AWARE_FLOW_TABLE_NODE);
		assert(table->partitions[j].partition.pool);
#else
		assert(posix_memalign(
				     (void**) &flow_pool,
				     DPI_CACHE_LINE_SIZE,
				     (sizeof(ipv6_flow_t)*table->individual_pool_size)+
				     DPI_CACHE_LINE_SIZE)==0);
		assert(posix_memalign(
				     (void**) &(table->partitions[j].partition.pool),
				     DPI_CACHE_LINE_SIZE,
				     (sizeof(u_int32_t)*table->individual_pool_size)+
				     DPI_CACHE_LINE_SIZE)==0);
#endif
		for(i=0; i<table->individual_pool_size; i++){
			table->partitions[j].partition.pool[i]=i;
		}
		table->partitions[j].partition.pool_size=
				table->individual_pool_size;
		table->partitions[j].partition.memory_chunk_lower_bound=
				flow_pool;
		table->partitions[j].partition.memory_chunk_upper_bound=
				flow_pool+table->individual_pool_size;
#endif
	}
	debug_print("%s\n", "[flow_table.c]: Computing active v6 flows.");
	dpi_flow_table_update_flow_count_v6(table);
	debug_print("%s\n", "[flow_table.c]: Active v6 flows computation finished.");
}
Example #24
int main(void)
{
	int node_id = 0;
	int arrival_lambda = 10;
	int thread_cpu_map[N_THREADS] = {1,6};
	int i;
	int j;
	/*
	   pthread_spinlock_t *spinlock_ptr = malloc(sizeof(pthread_spinlock_t));
	   if(spinlock_ptr == NULL) //error handling of the malloc of the spinlock
	   {
	   printf("malloc of spinlock failed.\n");
	   }
	   else
	   {
	   printf("malloc of spinlock succeeded.\n");
	   }
	   free(spinlock_ptr);
	 */
	//pthread_t threads[N_THREADS];
	//cpu_set_t cpuset[N_THREADS]; //for setting the affinity of threads
	thread_params para[N_THREADS]; //The parameters to pass to the threads

	//printf("The return value of numa_get_run_node_mask(void) is %d\n",numa_get_run_node_mask());
	//printf("The return value of numa_max_node(void) is %d\n",numa_max_node());
	//numa_tonode_memory((void *)spinlock_ptr,sizeof(pthread_spinlock_t),node_id); //This doesn't work

	//initialize the spinlock pointer and put it on a specific node
	pthread_spinlock_t *spinlock_ptr = numa_alloc_onnode(sizeof(pthread_spinlock_t),node_id);
	if(spinlock_ptr == NULL) //error handling of the allocating of a spinlock pointer on a specific node
	{
		printf("alloc of spinlock on a node failed.\n");
	}
	else
	{
		printf("alloc of spinlock on a node succeeded.\n");
	}
	for(j = 0; j  < 100000; j++)
	{
		//initialize spinlock
		pthread_spin_init(spinlock_ptr,0);
		//create the threads
		for(i = 0; i < N_THREADS; i++)
		{
			int rc;
			int s;
			para[i].thread_id = i;
			para[i].arrival_lambda = arrival_lambda;
			para[i].spinlock_ptr = spinlock_ptr;
			CPU_ZERO(&cpuset[i]);
			CPU_SET(thread_cpu_map[i],&cpuset[i]);
			rc = pthread_create(&threads[i],NULL,work,(void*)&para[i]);
			if(rc)
			{
				printf("ERROR: return code from pthread_create() is %d for thread %d \n",rc,i);
				exit(-1);
			}
			/*
			s = pthread_setaffinity_np(threads[i], sizeof(cpu_set_t), &cpuset[i]);
			if (s != 0)
				perror("set affinity error\n");
			*/
			flag = 1; 
		}
	/*
		for(i = 0; i < N_THREADS; i++)
		{
			int s;
			CPU_ZERO(&cpuset[i]);
			CPU_SET(thread_cpu_map[i],&cpuset[i]);
			s = pthread_setaffinity_np(threads[i], sizeof(cpu_set_t), &cpuset[i]);
			if (s != 0)
				perror("sjfljkl\n");
		}
	*/
		for(i = 0; i < N_THREADS; i++)
		{
			pthread_join(threads[i],NULL);
		}
	}
	for(i = 0; i < N_THREADS; i++)
	{
		printf("The time to get one lock for thread %d is : %.9f\n",i,time_in_cs[i]/100000);
	}
	pthread_spin_destroy(spinlock_ptr);
	numa_free(spinlock_ptr,sizeof(pthread_spinlock_t));
	pthread_exit(NULL);
	return 0;
}
Example #25
int main(int argc,char *argv[])
{
  const char   *label[4] = {"Copy", "Scale","Add", "Triad"};
  const double bytes[4]  = {2 * sizeof(double) * N,
                            2 * sizeof(double) * N,
                            3 * sizeof(double) * N,
                            3 * sizeof(double) * N};
  double       rmstime[4] = {0},maxtime[4] = {0},mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
  int          quantum;
  int          BytesPerWord,j,k,size;
  PetscInt     node = -1;
  double       scalar, t, times[4][NTIMES];
#if !STATIC_ALLOC
  double       *PETSC_RESTRICT a,*PETSC_RESTRICT b,*PETSC_RESTRICT c;
#endif

  PetscInitialize(&argc,&argv,0,help);
  MPI_Comm_size(PETSC_COMM_WORLD,&size);
  PetscOptionsGetInt(NULL,"-node",&node,NULL);
  /* --- SETUP --- determine precision and check timing --- */

  PetscPrintf(PETSC_COMM_WORLD,HLINE);
  BytesPerWord = sizeof(double);
  PetscPrintf(PETSC_COMM_WORLD,"This system uses %d bytes per DOUBLE PRECISION word.\n",
              BytesPerWord);

  PetscPrintf(PETSC_COMM_WORLD,HLINE);
  PetscPrintf(PETSC_COMM_WORLD,"Array size = %d, Offset = %d\n", N, OFFSET);
  PetscPrintf(PETSC_COMM_WORLD,"Total memory required = %.1f MB per process.\n",
              (3 * N * BytesPerWord) / 1048576.0);
  PetscPrintf(PETSC_COMM_WORLD,"Each test is run %d times, but only\n", NTIMES);
  PetscPrintf(PETSC_COMM_WORLD,"the *best* time for each is used.\n");

  /* Get initial value for system clock. */

#if !STATIC_ALLOC
  if (node == -1) {
    posix_memalign((void**)&a,64,N*sizeof(double));
    posix_memalign((void**)&b,64,N*sizeof(double));
    posix_memalign((void**)&c,64,N*sizeof(double));
  } else if (node == -2) {
    a = malloc(N*sizeof(double));
    b = malloc(N*sizeof(double));
    c = malloc(N*sizeof(double));
#if defined(HAVE_NUMA)
  } else {
    a = numa_alloc_onnode(N*sizeof(double),node);
    b = numa_alloc_onnode(N*sizeof(double),node);
    c = numa_alloc_onnode(N*sizeof(double),node);
#endif
  }
#endif
#if FAULT_TOGETHER
  for (j=0; j<N; j++) {
    a[j] = 1.0;
    b[j] = 2.0;
    c[j] = 0.0;
  }
#else
  for (j=0; j<N; j++) a[j] = 1.0;
  for (j=0; j<N; j++) b[j] = 2.0;
  for (j=0; j<N; j++) c[j] = 0.0;
#endif

  PetscPrintf(PETSC_COMM_WORLD,HLINE);

  if  ((quantum = checktick()) >= 1) PetscPrintf(PETSC_COMM_WORLD,"Your clock granularity/precision appears to be %d microseconds.\n", quantum);
  else PetscPrintf(PETSC_COMM_WORLD,"Your clock granularity appears to be less than one microsecond.\n");

  t = Second();
  for (j = 0; j < N; j++) a[j] = 2.0E0 * a[j];
  t = 1.0E6 * (Second() - t);

  PetscPrintf(PETSC_COMM_WORLD,"Each test below will take on the order"
              " of %d microseconds.\n", (int) t);
  PetscPrintf(PETSC_COMM_WORLD,"   (= %d clock ticks)\n", (int) (t/quantum));
  PetscPrintf(PETSC_COMM_WORLD,"Increase the size of the arrays if this shows that\n");
  PetscPrintf(PETSC_COMM_WORLD,"you are not getting at least 20 clock ticks per test.\n");

  PetscPrintf(PETSC_COMM_WORLD,HLINE);

  PetscPrintf(PETSC_COMM_WORLD,"WARNING -- The above is only a rough guideline.\n");
  PetscPrintf(PETSC_COMM_WORLD,"For best results, please be sure you know the\n");
  PetscPrintf(PETSC_COMM_WORLD,"precision of your system timer.\n");
  PetscPrintf(PETSC_COMM_WORLD,HLINE);

  /* --- MAIN LOOP --- repeat test cases NTIMES times --- */

  scalar = 3.0;
  for (k=0; k<NTIMES; k++) {
    MPI_Barrier(PETSC_COMM_WORLD);
    /* ### COPY: c <- a ### */
    times[0][k] = Second();
    MPI_Barrier(PETSC_COMM_WORLD);
#if USE_MEMCPY
    memcpy(c,a,N*sizeof(double));
#elif SSE2
    for (j=0; j<N; j+=8) {
      _mm_stream_pd(c+j+0,_mm_load_pd(a+j+0));
      _mm_stream_pd(c+j+2,_mm_load_pd(a+j+2));
      _mm_stream_pd(c+j+4,_mm_load_pd(a+j+4));
      _mm_stream_pd(c+j+6,_mm_load_pd(a+j+6));
#  if PREFETCH_NTA
      _mm_prefetch(a+j+64,_MM_HINT_NTA);
#  endif
    }
#else
    for (j=0; j<N; j++) c[j] = a[j];
#endif
    MPI_Barrier(PETSC_COMM_WORLD);
    times[0][k] = Second() - times[0][k];

    /* ### SCALE: b <- scalar * c ### */
    times[1][k] = Second();
    MPI_Barrier(PETSC_COMM_WORLD);
#if SSE2
    {
      __m128d scalar2 = _mm_set1_pd(scalar);
      for (j=0; j<N; j+=8) {
        _mm_stream_pd(b+j+0,_mm_mul_pd(scalar2,_mm_load_pd(c+j+0)));
        _mm_stream_pd(b+j+2,_mm_mul_pd(scalar2,_mm_load_pd(c+j+2)));
        _mm_stream_pd(b+j+4,_mm_mul_pd(scalar2,_mm_load_pd(c+j+4)));
        _mm_stream_pd(b+j+6,_mm_mul_pd(scalar2,_mm_load_pd(c+j+6)));
#  if PREFETCH_NTA
        _mm_prefetch(c+j+64,_MM_HINT_NTA);
#  endif
      }
    }
#else
    for (j=0; j<N; j++) b[j] = scalar*c[j];
#endif
    MPI_Barrier(PETSC_COMM_WORLD);
    times[1][k] = Second() - times[1][k];

    /* ### ADD: c <- a + b ### */
    times[2][k] = Second();
    MPI_Barrier(PETSC_COMM_WORLD);
#if SSE2
    {
      for (j=0; j<N; j+=8) {
        _mm_stream_pd(c+j+0,_mm_add_pd(_mm_load_pd(a+j+0),_mm_load_pd(b+j+0)));
        _mm_stream_pd(c+j+2,_mm_add_pd(_mm_load_pd(a+j+2),_mm_load_pd(b+j+2)));
        _mm_stream_pd(c+j+4,_mm_add_pd(_mm_load_pd(a+j+4),_mm_load_pd(b+j+4)));
        _mm_stream_pd(c+j+6,_mm_add_pd(_mm_load_pd(a+j+6),_mm_load_pd(b+j+6)));
#  if PREFETCH_NTA
        _mm_prefetch(a+j+64,_MM_HINT_NTA);
        _mm_prefetch(b+j+64,_MM_HINT_NTA);
#  endif
      }
    }
#else
    for (j=0; j<N; j++) c[j] = a[j]+b[j];
#endif
    MPI_Barrier(PETSC_COMM_WORLD);
    times[2][k] = Second() - times[2][k];

    /* ### TRIAD: a <- b + scalar * c ### */
    times[3][k] = Second();
    MPI_Barrier(PETSC_COMM_WORLD);
#if SSE2
    {
      __m128d scalar2 = _mm_set1_pd(scalar);
      for (j=0; j<N; j+=8) {
        _mm_stream_pd(a+j+0,_mm_add_pd(_mm_load_pd(b+j+0),_mm_mul_pd(scalar2,_mm_load_pd(c+j+0))));
        _mm_stream_pd(a+j+2,_mm_add_pd(_mm_load_pd(b+j+2),_mm_mul_pd(scalar2,_mm_load_pd(c+j+2))));
        _mm_stream_pd(a+j+4,_mm_add_pd(_mm_load_pd(b+j+4),_mm_mul_pd(scalar2,_mm_load_pd(c+j+4))));
        _mm_stream_pd(a+j+6,_mm_add_pd(_mm_load_pd(b+j+6),_mm_mul_pd(scalar2,_mm_load_pd(c+j+6))));
#  if PREFETCH_NTA
        _mm_prefetch(b+j+64,_MM_HINT_NTA);
        _mm_prefetch(c+j+64,_MM_HINT_NTA);
#  endif
      }
    }
#else
    for (j=0; j<N; j++) a[j] = b[j]+scalar*c[j];
#endif
    MPI_Barrier(PETSC_COMM_WORLD);
    times[3][k] = Second() - times[3][k];
  }

  /* --- SUMMARY --- */

  for (k=0; k<NTIMES; k++)
    for (j=0; j<4; j++) {
      rmstime[j] = rmstime[j] + (times[j][k] * times[j][k]);
      mintime[j] = MIN(mintime[j], times[j][k]);
      maxtime[j] = MAX(maxtime[j], times[j][k]);
    }


  PetscPrintf(PETSC_COMM_WORLD,"%8s:  %11s  %11s  %11s  %11s  %11s\n","Function","Rate (MB/s)","Total (MB/s)","RMS time","Min time","Max time");
  for (j=0; j<4; j++) {
    rmstime[j] = sqrt(rmstime[j]/(double)NTIMES);
    PetscPrintf(PETSC_COMM_WORLD,"%8s: %11.4f  %11.4f  %11.4f  %11.4f  %11.4f\n", label[j], 1.0e-06*bytes[j]/mintime[j], size*1.0e-06*bytes[j]/mintime[j], rmstime[j], mintime[j], maxtime[j]);
  }
  PetscFinalize();
  return 0;
}
Example #26
int main(int argc, char* argv[]) {
	printf("\n NODE_BIND:%d, NUMA:%d, CPU_BIND:%d, FIRST_TOUCH:%d\n",NODE_BIND, NUMA, CPU_BIND, FIRST_TOUCH);

	int repetitions;  // number of repetitions
	int maxThreads;   // max number of threads
	int it;
	int N;            // array size
	int bitCount = 1;
	int *key;         // array of keys
	long *dataIn;     // input data
	long *dataSTL;    // input STL data
	long *dataRadix;  // input radix data

        repetitions = 1;
#pragma omp parallel
	maxThreads = omp_get_num_threads();

        if(argc ==1 ){
            printf("prog input_file number_of_elements bit_count number_of_repetitions\n");
            printf("NO INPUT FILE");
            return 0;
        }
        if(argc == 2){
            printf("prog input_file number_of_elements bit_count number_of_repetitions\n");
            printf("NO ELEMENT COUNT\n");
            return 0;
        }
        if(argc >2 ){
	    N = (int) strtol(argv[2], NULL, 10);
        }
        if(argc >3){
             int tmp;
	    tmp = (int) strtol(argv[3], NULL, 10);
	    if ((tmp > 0) && (tmp<=16 )) // limit bit count
		bitCount = tmp;
        }        
        if(argc >4){
             int tmp;
	    tmp = (int) strtol(argv[4], NULL, 10);
	    if ((tmp > 0) && (tmp<=10000 )) // limit repetitions
		repetitions = tmp;
        }

        int *input;
	size_t N2;
	printf( "Reading data from file.\n" );
        if( readIntArrayFile( argv[1], &input, &N2 ) )
           return 1; 
	printf( "Data reading done.\n" );

        if( (N2<(size_t)N) || (N<=0) )
		N = N2;


       	printf( "\nPARALLEL STL SORT for N=%d, max threads = %d, test repetitions: %d\n", N, maxThreads, repetitions);

	dataIn = new long[N]; 
	dataSTL = new long[N];

#ifdef _WIN32

	dataRadix = new long[N];
	key = new int[N];
#endif
#ifdef linux

	key = new int[N];
#if NUMA==0

	dataRadix = new long[N]; 

#elif NUMA==1
			dataRadix = (long*) numa_alloc_interleaved(N * sizeof(long));
#elif NUMA==2
			dataRadix = (long*)numa_alloc_onnode(sizeof(long)*N,1);
#endif
#endif
	VTimer stlTimes(maxThreads);
	VTimer radixTimes(maxThreads);
#if TIME_COUNT==1
	VTimer partTimes(TIMERS_COUNT);
#endif
#if FLUSH_CACHE==1
#ifdef linux
        CacheFlusher cf;
#endif
#endif

        for(long i=0;i<N;i++)
		dataIn[i]=input[i];
	delete[] input;

// loop from 1 to maxThreads
	for (int t = 1; t <= maxThreads; t++) {
		int i;
#if TIME_COUNT==1
                partTimes.reset();
#endif
#if CALC_REF==1
// parallel STL
		for (it = 0; it < repetitions; it++) {
			setThreadsNo(t, maxThreads);
#pragma omp parallel for private(i)
			for (i = 0; i < N; i++)
				dataSTL[i] = dataIn[i];
#if FLUSH_CACHE==1
#ifdef linux
                        cf.flush();
#endif
#endif
			stlTimes.timerStart(t-1);

#ifdef linux
			__gnu_parallel::sort(dataSTL, dataSTL + N);
#endif
#ifdef _WIN32
			std::sort(dataSTL, dataSTL + N);
#endif
			stlTimes.timerEnd(t-1);
		}

#if FLUSH_CACHE==1
#ifdef linux
                cf.flush();
#endif
#endif
#endif

// radix sort V1
		for (it = 0; it < repetitions; it++) {
			setThreadsNo(t, maxThreads);
#pragma omp parallel for private(i) default(shared)

			for (i = 0; i < N; i++){
				dataRadix[i] = dataIn[i];
				key[i]=i;
			}

#if FLUSH_CACHE==1
#ifdef linux
                        cf.flush();
#endif
#endif
			omp_set_num_threads(t);
			radixTimes.timerStart(t-1);
#if TIME_COUNT==1
                        prsort::pradsort<long,int>(dataRadix,key, N, bitCount,&partTimes);
#else
                        prsort::pradsort<long,int>(dataRadix,key, N,bitCount,NULL);
#endif
			radixTimes.timerEnd(t-1);

		}

       
#if CALC_REF==1
		printf("|STL   SORT(th=%2d)  : %1.3fs  |\t", t,
				stlTimes.getTime(t-1));
#endif
#if TIME_COUNT==1
		for (int i = 0; i < TIMERS_COUNT; i++) {
#if CREATE_OUTPUT==1
			printf("%d %d %d %d %d %d %d %f\n", NUMA, NODE_BIND, CPU_BIND, FIRST_TOUCH,bitCount , t, i ,partTimes.getTime(i));
#else
			printf("part%d :%f ", i, partTimes.getTime(i));
#endif

		}
#endif
#if CREATE_OUTPUT ==1
		        printf("%d %d %d %d %d %d calosc %1.3f", NUMA,NODE_BIND,CPU_BIND,FIRST_TOUCH,bitCount, t ,radixTimes.getTime(t-1));
#else
		printf("|RADIX SORT (th=%2d)  : %1.3fs  |\t", t,
				radixTimes.getTime(t-1));
#endif

// Attention: checking result only from the last function usage 

#if CALC_REF==1
		checkResults(dataSTL, dataRadix, N);
#else
		printf("\n");
#endif

#if CHECK_KEY==1
	if(checkKey(dataIn,dataRadix,key,N))printf("Keys are good\n");

#endif
	}

#ifdef linux
	delete[] key;
#if NUMA>0
	numa_free(dataRadix, sizeof(long) * N);
        
#else

	delete[] dataRadix;
#endif
#endif
#ifdef _WIN32
	delete[] dataRadix;
#endif

	delete[] dataIn;
	delete[] dataSTL;
	
#if TIME_COUNT==1
	
        
        
#endif
	return 0;
}
Example #27
int main(void)
{
	int node_id = 0;
	int arrival_lambda = 10;
	int thread_cpu_map[N_THREADS] = {1,6};
	int n = 10000;
	int i;
	int j;
	/*
	   pthread_spinlock_t *spinlock_ptr = malloc(sizeof(pthread_spinlock_t));
	   if(spinlock_ptr == NULL) //error handling of the malloc of the spinlock
	   {
	   printf("malloc of spinlock failed.\n");
	   }
	   else
	   {
	   printf("malloc of spinlock succeeded.\n");
	   }
	   free(spinlock_ptr);
	 */
	//pthread_t threads[N_THREADS];
	//cpu_set_t cpuset[N_THREADS]; //for setting the affinity of threads
	thread_params para[N_THREADS]; //The parameters to pass to the threads

	//printf("The return value of numa_get_run_node_mask(void) is %d\n",numa_get_run_node_mask());
	//printf("The return value of numa_max_node(void) is %d\n",numa_max_node());
	//numa_tonode_memory((void *)spinlock_ptr,sizeof(pthread_spinlock_t),node_id); //This doesn't work

	//initialize the spinlock pointer and put it on a specific node
	pthread_spinlock_t *spinlock_ptr = numa_alloc_onnode(sizeof(pthread_spinlock_t),node_id);
	if(spinlock_ptr == NULL) //error handling of the allocating of a spinlock pointer on a specific node
	{
		printf("alloc of spinlock on a node failed.\n");
	}
	else
	{
		printf("alloc of spinlock on a node succeeded.\n");
	}
	for(j = 0; j  < n; j++)
	{
		//initialize spinlock
		pthread_spin_init(spinlock_ptr,0);
		//create the threads
		for(i = 0; i < N_THREADS; i++)
		{
			int rc;
			int s;
			para[i].thread_id = i;
			para[i].arrival_lambda = arrival_lambda;
			para[i].spinlock_ptr = spinlock_ptr;
			CPU_ZERO(&cpuset[i]);
			CPU_SET(thread_cpu_map[i],&cpuset[i]);
			rc = pthread_create(&threads[i],NULL,work,(void*)&para[i]);
			if(rc)
			{
				printf("ERROR: return code from pthread_create() is %d for thread %d \n",rc,i);
				exit(-1);
			}
			flag = 1; 
		}
		for(i = 0; i < N_THREADS; i++)
		{
			pthread_join(threads[i],NULL);
		}
	}
	for(i = 0; i < N_THREADS; i++)
	{
		printf("The time to get one lock for thread %d is : %.9f\n",i,time_in_cs[i]/n);
	}
	double diff = abs_value(time_in_cs[0] - time_in_cs[1])/n;
	printf("The difference of the time to get one lock is : %.9f (%f) cycles\n",diff,diff*CPU_FREQ); //this is assuming there are only two processors (needs to be changed if there are more)
	pthread_spin_destroy(spinlock_ptr);
	numa_free((void *)spinlock_ptr,sizeof(pthread_spinlock_t));
	pthread_exit(NULL);
	return 0;
}
Example #28
static void setup(void)
{
	int ret, i, j;
	int pagesize = getpagesize();
	void *p;

	tst_require_root(NULL);
	TEST(ltp_syscall(__NR_migrate_pages, 0, 0, NULL, NULL));

	if (numa_available() == -1)
		tst_brkm(TCONF, NULL, "NUMA not available");

	ret = get_allowed_nodes_arr(NH_MEMS, &num_nodes, &nodes);
	if (ret < 0)
		tst_brkm(TBROK | TERRNO, NULL, "get_allowed_nodes(): %d", ret);

	if (num_nodes < 2)
		tst_brkm(TCONF, NULL, "at least 2 allowed NUMA nodes"
			 " are required");
	else if (tst_kvercmp(2, 6, 18) < 0)
		tst_brkm(TCONF, NULL, "2.6.18 or greater kernel required");

	/*
	 * find 2 nodes, which can hold NODE_MIN_FREEMEM bytes
	 * The reason is that:
	 * 1. migrate_pages() is expected to succeed
	 * 2. this test avoids hitting:
	 *    Bug 870326 - migrate_pages() reports success, but pages are
	 *                 not moved to desired node
	 *    https://bugzilla.redhat.com/show_bug.cgi?id=870326
	 */
	nodeA = nodeB = -1;
	for (i = 0; i < num_nodes; i++) {
		p = numa_alloc_onnode(NODE_MIN_FREEMEM, nodes[i]);
		if (p == NULL)
			break;
		memset(p, 0xff, NODE_MIN_FREEMEM);

		j = 0;
		while (j < NODE_MIN_FREEMEM) {
			if (addr_on_node(p + j) != nodes[i])
				break;
			j += pagesize;
		}
		numa_free(p, NODE_MIN_FREEMEM);

		if (j >= NODE_MIN_FREEMEM) {
			if (nodeA == -1)
				nodeA = nodes[i];
			else if (nodeB == -1)
				nodeB = nodes[i];
			else
				break;
		}
	}

	if (nodeA == -1 || nodeB == -1)
		tst_brkm(TCONF, NULL, "at least 2 NUMA nodes with "
			 "free mem > %d are needed", NODE_MIN_FREEMEM);
	tst_resm(TINFO, "Using nodes: %d %d", nodeA, nodeB);

	ltpuser = getpwnam(nobody_uid);
	if (ltpuser == NULL)
		tst_brkm(TBROK | TERRNO, NULL, "getpwnam failed");

	TEST_PAUSE;
}
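The test depends on an addr_on_node() helper to verify where each page actually landed. A plausible implementation in terms of get_mempolicy() follows; this is a sketch under that assumption, as LTP's own version lives in its NUMA helpers:

#include <numaif.h>  /* get_mempolicy, MPOL_F_NODE, MPOL_F_ADDR */

static int addr_on_node(void *addr)
{
	int node = -1;

	/* With MPOL_F_NODE|MPOL_F_ADDR, get_mempolicy() stores the node
	 * backing 'addr' into 'node' instead of a policy. */
	if (get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR) == -1)
		return -1;
	return node;
}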