Example #1
void release_csr_cont(struct csr_cont_t *csr_cont)
{
    int i;
    for (i = 0; i < csr_cont->count; i++)
    {
        release_csr_mat(csr_cont->csrs + i);
    }
    numa_free(csr_cont->split_idx, (csr_cont->count + 1) * sizeof(int));
    numa_free(csr_cont->csrs, csr_cont->count * sizeof(struct csr_mat_t));
}
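A convention all of these release helpers depend on: numa_free() must receive the same byte count that was passed to the matching numa_alloc_*() call. A minimal allocate/release pair sketching this, using an illustrative struct rather than the project's csr types:

#include <numa.h>
#include <stddef.h>

/* Illustrative container (not from the project above): the size expression
 * used at allocation time is repeated verbatim at release time. */
struct int_vec_t {
    int *data;
    int  count;
};

static int alloc_int_vec(struct int_vec_t *v, int count, int node)
{
    v->count = count;
    v->data = numa_alloc_onnode(count * sizeof(int), node);
    return v->data != NULL ? 0 : -1;
}

static void release_int_vec(struct int_vec_t *v)
{
    numa_free(v->data, v->count * sizeof(int));  /* same size as the allocation */
    v->data = NULL;
}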
Example #2
void release_blk_mat(struct blk_mat_t *mat)
{
    int blk_row = (mat->rows + BLOCK_SIZE - 1) / BLOCK_SIZE;
    int blk_col = (mat->cols + BLOCK_SIZE - 1) / BLOCK_SIZE;
    int blk_num = blk_row * blk_col;

    numa_free(mat->types, blk_num * sizeof(blk_type_t));
    numa_free(mat->row_info, mat->row_id[blk_num] * sizeof(WORD));
    numa_free(mat->col_idx, mat->non_zeros * sizeof(WORD));
    numa_free(mat->vals, mat->non_zeros * sizeof(FLOAT));
    numa_free(mat->row_id, (blk_num + 1) * sizeof(DWORD));
}
Example #3
void dpi_flow_table_delete_v4(
		dpi_flow_DB_v4_t* db,
		dpi_flow_cleaner_callback* flow_cleaner_callback){


	u_int32_t i;
	u_int16_t j;
	if(db!=NULL){
		if(db->table!=NULL){
			for(j=0; j<db->num_partitions; ++j){
				for(i=db->partitions[j].partition.informations.lowest_index;
					i<=db->partitions[j].partition.informations.highest_index;
					++i){
					while(db->table[i].next!=&(db->table[i])){
						mc_dpi_flow_table_delete_flow_v4(
								db, flow_cleaner_callback,
								j, db->table[i].next);
					}
				}
#if DPI_FLOW_TABLE_USE_MEMORY_POOL
#if DPI_NUMA_AWARE
				numa_free(db->partitions[j].partition.memory_chunk_lower_bound,
				          sizeof(ipv4_flow_t)*db->individual_pool_size);
				numa_free(db->partitions[j].partition.pool,
				          sizeof(u_int32_t)*db->individual_pool_size);
#else
				free(db->partitions[j].partition.memory_chunk_lower_bound);
				free(db->partitions[j].partition.pool);
#endif
#endif
			}
		}



#if DPI_NUMA_AWARE
		numa_free(db->partitions,
				  sizeof(dpi_flow_DB_v4_partition_t)*db->num_partitions);
		numa_free(db->table, sizeof(ipv4_flow_t)*db->total_size);
#else
		free(db->partitions);
		free(db->table);
#endif
		free(db);
	}
}
Example #4
void *SyncThreadWriteOrRead(void *arg)
{
	struct thread_data *data = arg;
	bind2node_id(data->node_id);
	int node_id = data->node_id;
	int num = data->num;
	int off_start = data->off_start;
	int i;
	char *buffer = (char *) numa_alloc_onnode(block_size, node_id);

	ssd_file_desc_t fd = ssd_open(data->file_name, node_id, 0);

	printf("thread %d: access %d blocks\n", data->idx, num);
	for (i = 0; i < num; i++) {
		off_t offset = offs[off_start + i];
		if (access == READ)
			ssd_read(fd, (void *) buffer, block_size, offset);
		else
			ssd_write(fd, (void *) buffer, block_size, offset);
	}
	numa_free(buffer, block_size);

	ssd_close(fd);
	return NULL;
}
Example #5
/* nreset() - frees all memory buffers */
void numa_allocator::nreset(void) {
	if(!empty) {
		empty = true;
		// free other_buffers, if used
		if(other_buffers != NULL) {
			int i = num_buffers - 1;
			while(i >= 0) {
				numa_free(other_buffers[i], buf_size);
				i--;
			}
			free(other_buffers);
		}
		// free primary buffer
		numa_free(buf_start, buf_size);
	}
}
Example #6
void
numa_membench(mem_bench_info_t *mbinfo)
{

  assert(mbinfo->destnode <= numa_max_node());

  {
    long size, freep;
    size = numa_node_size(mbinfo->destnode, &freep);
    //printf("node %d : total = %ld(B), free = %ld(B)\n", mbinfo->destnode, size, freep);
    assert(freep >= mbinfo->working_size);

    mbinfo->working_area =
      (long *)numa_alloc_onnode(mbinfo->working_size, mbinfo->destnode);
    if (NULL == mbinfo->working_area) {
      perror("numa_alloc_onnode");
      exit(EXIT_FAILURE);
    }
    memset(mbinfo->working_area, 0, mbinfo->working_size);
  }

  memory_stress_rand(&mbinfo->pc, mbinfo->working_area, mbinfo->working_size);

  // release resources
  numa_free(mbinfo->working_area, mbinfo->working_size);
}
Example #7
static inline void v4_flow_free(ipv4_flow_t* flow){
#if DPI_NUMA_AWARE
	numa_free(flow, sizeof(ipv4_flow_t));
#else
	free(flow);
#endif
}
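The allocation side of this pattern typically mirrors the same preprocessor switch, so every flow is released by the allocator that created it. A sketch of such a counterpart (v4_flow_alloc and its node argument are assumptions for illustration, not taken from the project; ipv4_flow_t is the project's type):

static inline ipv4_flow_t* v4_flow_alloc(int numa_node){
#if DPI_NUMA_AWARE
	/* later released with numa_free(flow, sizeof(ipv4_flow_t)) */
	return (ipv4_flow_t*) numa_alloc_onnode(sizeof(ipv4_flow_t), numa_node);
#else
	/* later released with free(flow) */
	return (ipv4_flow_t*) malloc(sizeof(ipv4_flow_t));
#endif
}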
Example #8
JNIEXPORT void JNICALL Java_xerial_jnuma_NumaNative_free__Ljava_nio_ByteBuffer_2
(JNIEnv *env, jobject jobj, jobject buf) {
    //printf("free is called\n");
    void* mem = (*env)->GetDirectBufferAddress(env, buf);
    jlong capacity = (*env)->GetDirectBufferCapacity(env, buf);
    if(mem != 0) {
        //printf("free capacity:%d\n", capacity);
        numa_free(mem, (size_t) capacity);
    }
}
Example #9
void myhbwmalloc_final(void)
{
    if (myhbwmalloc_mspace != NULL) {
        size_t bytes_avail = destroy_mspace(myhbwmalloc_mspace);
        if (myhbwmalloc_verbose) {
            printf("hbwmalloc: destroy_mspace returned = %zu\n", bytes_avail);
        }
    }
    if (myhbwmalloc_slab != NULL) {
        numa_free(myhbwmalloc_slab, myhbwmalloc_slab_size);
    }
}
Example #10
File: numa.c  Project: roccen/lagopus
static void
s_numa_free(void *p) {
  if (likely(p != NULL)) {
    size_t sz = 0;
    lagopus_result_t r = s_find_addr(p, &sz);

    if (likely(r == LAGOPUS_RESULT_OK)) {
      numa_free(p, sz);
      s_delete_addr(p);
    } else {
      free(p);
    }
  }
}
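What makes a free()-style wrapper such as s_numa_free() possible is bookkeeping that maps each allocated address back to its size (lagopus does this with a hash table via s_add_addr/s_find_addr/s_delete_addr; see Examples #13 and #25). A self-contained sketch of the same idea, using a plain linked list only to keep the example short:

#include <numa.h>
#include <stdlib.h>

struct size_rec { void *addr; size_t sz; struct size_rec *next; };
static struct size_rec *recs;

void *tracked_numa_alloc(size_t sz, int node)
{
  void *p = numa_alloc_onnode(sz, node);
  if (p != NULL) {
    struct size_rec *r = malloc(sizeof(*r));
    if (r == NULL) { numa_free(p, sz); return NULL; }
    r->addr = p;
    r->sz = sz;
    r->next = recs;
    recs = r;
  }
  return p;
}

void tracked_numa_free(void *p)
{
  struct size_rec **cur;
  if (p == NULL)
    return;
  for (cur = &recs; *cur != NULL; cur = &(*cur)->next) {
    if ((*cur)->addr == p) {
      struct size_rec *hit = *cur;
      *cur = hit->next;
      numa_free(p, hit->sz);   /* size recovered from the record */
      free(hit);
      return;
    }
  }
  free(p);   /* not tracked: fall back to free(), as s_numa_free() does */
}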
Example #11
static inline
#endif
    void
    pfwl_free_task(mc_pfwl_task_t *task) {
#if PFWL_NUMA_AWARE
  numa_free(task, sizeof(mc_pfwl_task_t));
#else
#if PFWL_MULTICORE_ALIGN_TASKS
  free(task);
#else
  delete task;
#endif
#endif
}
Example #12
/*
 * free_pages() - free an array of pages
 * @pages: array of page pointers to be freed
 * @num: no. of pages in the array
 */
void free_pages(void **pages, unsigned int num)
{

#if HAVE_NUMA_H
	int i;
	size_t onepage = get_page_size();

	for (i = 0; i < num; i++) {
		if (pages[i] != NULL) {
			numa_free(pages[i], onepage);
		}
	}
#endif
}
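A hedged sketch of the allocation counterpart this helper assumes: one page per pointer, placed on a chosen node, so each entry can later be released with numa_free(page, pagesize). The name alloc_pages_onnode is illustrative and not part of the test suite above:

#include <numa.h>
#include <string.h>
#include <unistd.h>

static int alloc_pages_onnode(void **pages, unsigned int num, int node)
{
	unsigned int i;
	size_t onepage = (size_t) sysconf(_SC_PAGESIZE);

	for (i = 0; i < num; i++) {
		pages[i] = numa_alloc_onnode(onepage, node);
		if (pages[i] == NULL)
			return -1;
		memset(pages[i], 0, onepage);	/* touch so the page is actually placed */
	}
	return 0;
}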
Example #13
File: numa.c  Project: roccen/lagopus
static bool
s_free_all(void *key, void *val, lagopus_hashentry_t he, void *arg) {
  void *addr = key;
  size_t sz = (size_t)val;

  (void)he;
  (void)arg;

  if (likely(addr != NULL && sz > 0)) {
    numa_free(addr, sz);
  }

  return true;
}
Example #14
File: dynarray.c  Project: cslab-ntua/csx
void *dynarray_destroy(struct dynarray *da)
{
	void *ret = da->elems;
	//printf("destroy realloc: idx:%lu nr:%lu realloc size:%lu\n", da->next_idx, da->elems_nr, (da->next_idx+1)*da->elem_size);
	if (da->numa) {
		ret = numa_realloc(ret, da->elems_nr*da->elem_size,
	                       da->next_idx*da->elem_size);
		numa_free(da, sizeof(*da));
	} else {
		ret = realloc(ret, da->next_idx*da->elem_size);
		free(da);
	}

	return ret;
}
Example #15
//----------------------------------------------------------------------
//-- free memory allocated by AllocOnNode -- needs mem and size
//----------------------------------------------------------------------
inline void FreeOnNode(void *mem, const size_t size)
{
#if THERON_NUMA

#if THERON_WINDOWS

    // VirtualFree with MEM_RELEASE frees the whole reservation and requires a zero size
    (void) size;
    VirtualFree(mem, 0, MEM_RELEASE);

#elif THERON_GCC

    numa_free(mem, size);

#endif

#endif // THERON_NUMA
}
Example #16
	~MemBlockAlloc()
	{
		if (verbosity > 1)
			std::cout << "NUMABlockAlloc EXIT" << std::endl;

		for (auto& n : domain_block_groups)
		{
			for (auto& g : n.block_groups)
			{
//				std::cout << "cleaning up " << g.free_blocks.size() << " blocks of size " << g.block_size << std::endl;

				for (auto& b : g.free_blocks)
				{
#if NUMA_BLOCK_ALLOCATOR_TYPE == 0 || NUMA_BLOCK_ALLOCATOR_TYPE == 3
					::free(b);
#else
					numa_free(b, g.block_size);
#endif
				}
			}
		}
	}
Example #17
void pfree(void* start, size_t size)
{
    numa_free(start, size);
}
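pfree() relies on the caller remembering the allocation size; the matching allocation half could be as small as the sketch below (palloc is an illustrative name, and numa_alloc_local() is just one possible placement policy):

#include <numa.h>

void* palloc(size_t size)
{
    /* the caller keeps `size` and passes it back to pfree(start, size) */
    return numa_alloc_local(size);
}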
Example #18
int main(int argc, const char **argv)
{
  int num_cpus = numa_num_task_cpus();
  printf("num cpus: %d\n", num_cpus);

  printf("numa available: %d\n", numa_available());
  numa_set_localalloc();

  struct bitmask *bm = numa_bitmask_alloc(num_cpus);
  for (int i=0; i<=numa_max_node(); ++i)
  {
    numa_node_to_cpus(i, bm);
    printf("numa node %d ", i);
    print_bitmask(bm);
    printf(" - %g GiB\n", numa_node_size(i, 0) / (1024.*1024*1024.));
  }
  numa_bitmask_free(bm);

  puts("");

  char *x;
  const size_t cache_line_size = 64;
  const size_t array_size = 100*1000*1000;
  size_t ntrips = 2;

#pragma omp parallel
  {
    assert(omp_get_num_threads() == num_cpus);
    int tid = omp_get_thread_num();

    pin_to_core(tid);
    if(tid == 0)
      x = (char *) numa_alloc_local(array_size);

    // {{{ single access
#pragma omp barrier
    for (int i = 0; i < num_cpus; ++i)
    {
      if (tid == i)
      {
        double t = measure_access(x, array_size, ntrips);
        printf("sequential core %d -> core 0 : BW %g MB/s\n",
            i, array_size*ntrips*cache_line_size / t / 1e6);
      }
#pragma omp barrier
    }
    // }}}

    // {{{ everybody contends for one

    {
      if (tid == 0) puts("");

#pragma omp barrier
      double t = measure_access(x, array_size, ntrips);
#pragma omp barrier
      for (int i = 0; i < num_cpus; ++i)
      {
        if (tid == i)
          printf("all-contention core %d -> core 0 : BW %g MB/s\n",
              tid, array_size*ntrips*cache_line_size / t / 1e6);
#pragma omp barrier
      }
    }

    // }}}

    // {{{ zero and someone else contending

    if (tid == 0) puts("");

#pragma omp barrier
    for (int i = 1; i < num_cpus; ++i)
    {
      double t;
      if (tid == i || tid == 0)
        t = measure_access(x, array_size, ntrips);

#pragma omp barrier
      if (tid == 0)
      {
        printf("two-contention core %d -> core 0 : BW %g MB/s\n",
            tid, array_size*ntrips*cache_line_size / t / 1e6);
      }
#pragma omp barrier
      if (tid == i)
      {
        printf("two-contention core %d -> core 0 : BW %g MB/s\n\n",
            tid, array_size*ntrips*cache_line_size / t / 1e6);
      }
#pragma omp barrier
    }
  }
  numa_free(x, array_size);

  return 0;
}
Example #19
int main(void)
{
	int node_id = 0;
	int arrival_lambda = 10;
	int thread_cpu_map[N_THREADS] = {1,6};
	int i;
	int j;
	/*
	   pthread_spinlock_t *spinlock_ptr = malloc(sizeof(pthread_spinlock_t));
	   if(spinlock_ptr == NULL) //error handling of the malloc of the spinlock
	   {
	   printf("malloc of spinlock failed.\n");
	   }
	   else
	   {
	   printf("malloc of spinlock succeeded.\n");
	   }
	   free(spinlock_ptr);
	 */
	//pthread_t threads[N_THREADS];
	//cpu_set_t cpuset[N_THREADS]; //for setting the affinity of threads
	thread_params para[N_THREADS]; //The parameters to pass to the threads

	//printf("The return value of numa_get_run_node_mask(void) is %d\n",numa_get_run_node_mask());
	//printf("The return value of numa_max_node(void) is %d\n",numa_max_node());
	//numa_tonode_memory((void *)spinlock_ptr,sizeof(pthread_spinlock_t),node_id); //This doesn't work

	//initilize the spinlock pointer and put it on a specific node
	pthread_spinlock_t *spinlock_ptr = numa_alloc_onnode(sizeof(pthread_spinlock_t),node_id);
	if(spinlock_ptr == NULL) //error handling of the allocating of a spinlock pointer on a specific node
	{
		printf("alloc of spinlock on a node failed.\n");
	}
	else
	{
		printf("alloc of spinlock on a node succeeded.\n");
	}
	for(j = 0; j  < 100000; j++)
	{
		//initlize spinlock
		pthread_spin_init(spinlock_ptr,0);
		//create the threads
		for(i = 0; i < N_THREADS; i++)
		{
			int rc;
			int s;
			para[i].thread_id = i;
			para[i].arrival_lambda = arrival_lambda;
			para[i].spinlock_ptr = spinlock_ptr;
			CPU_ZERO(&cpuset[i]);
			CPU_SET(thread_cpu_map[i],&cpuset[i]);
			rc = pthread_create(&threads[i],NULL,work,(void*)&para[i]);
			if(rc)
			{
				printf("ERROR: return code from pthread_create() is %d for thread %d \n",rc,i);
				exit(-1);
			}
			/*
			s = pthread_setaffinity_np(threads[i], sizeof(cpu_set_t), &cpuset[i]);
			if (s != 0)
				perror("set affinity error\n");
			*/
			flag = 1; 
		}
	/*
		for(i = 0; i < N_THREADS; i++)
		{
			int s;
			CPU_ZERO(&cpuset[i]);
			CPU_SET(thread_cpu_map[i],&cpuset[i]);
			s = pthread_setaffinity_np(threads[i], sizeof(cpu_set_t), &cpuset[i]);
			if (s != 0)
				perror("sjfljkl\n");
		}
	*/
		for(i = 0; i < N_THREADS; i++)
		{
			pthread_join(threads[i],NULL);
		}
	}
	for(i = 0; i < N_THREADS; i++)
	{
		printf("The time to get one lock for thread %d is : %.9f\n",i,time_in_cs[i]/100000);
	}
	pthread_spin_destroy(spinlock_ptr);
	numa_free(spinlock_ptr,sizeof(pthread_spinlock_t));
	pthread_exit(NULL);
	return 0;
}
Example #20
static void setup(void)
{
	int ret, i, j;
	int pagesize = getpagesize();
	void *p;

	tst_require_root(NULL);
	TEST(ltp_syscall(__NR_migrate_pages, 0, 0, NULL, NULL));

	if (numa_available() == -1)
		tst_brkm(TCONF, NULL, "NUMA not available");

	ret = get_allowed_nodes_arr(NH_MEMS, &num_nodes, &nodes);
	if (ret < 0)
		tst_brkm(TBROK | TERRNO, NULL, "get_allowed_nodes(): %d", ret);

	if (num_nodes < 2)
		tst_brkm(TCONF, NULL, "at least 2 allowed NUMA nodes"
			 " are required");
	else if (tst_kvercmp(2, 6, 18) < 0)
		tst_brkm(TCONF, NULL, "2.6.18 or greater kernel required");

	/*
	 * find 2 nodes, which can hold NODE_MIN_FREEMEM bytes
	 * The reason is that:
	 * 1. migrate_pages() is expected to succeed
	 * 2. this test avoids hitting:
	 *    Bug 870326 - migrate_pages() reports success, but pages are
	 *                 not moved to desired node
	 *    https://bugzilla.redhat.com/show_bug.cgi?id=870326
	 */
	nodeA = nodeB = -1;
	for (i = 0; i < num_nodes; i++) {
		p = numa_alloc_onnode(NODE_MIN_FREEMEM, nodes[i]);
		if (p == NULL)
			break;
		memset(p, 0xff, NODE_MIN_FREEMEM);

		j = 0;
		while (j < NODE_MIN_FREEMEM) {
			if (addr_on_node(p + j) != nodes[i])
				break;
			j += pagesize;
		}
		numa_free(p, NODE_MIN_FREEMEM);

		if (j >= NODE_MIN_FREEMEM) {
			if (nodeA == -1)
				nodeA = nodes[i];
			else if (nodeB == -1)
				nodeB = nodes[i];
			else
				break;
		}
	}

	if (nodeA == -1 || nodeB == -1)
		tst_brkm(TCONF, NULL, "at least 2 NUMA nodes with "
			 "free mem > %d are needed", NODE_MIN_FREEMEM);
	tst_resm(TINFO, "Using nodes: %d %d", nodeA, nodeB);

	ltpuser = getpwnam(nobody_uid);
	if (ltpuser == NULL)
		tst_brkm(TBROK | TERRNO, NULL, "getpwnam failed");

	TEST_PAUSE;
}
Example #21
int main(int argc, char *argv[])
{
	int node_id = 0;
	int arrival_lambda = 10;
	int thread_cpu_map[N_THREADS];
	int i,j,k;
	int n_threads;
	int n_left;
	int n_right;
	int next_index_left = 3;
	int next_index_right = 7;
	float local_square = 0.0, remote_square = 0.0;


	/***************** make sure #args is correct and get the n_threads, n_left and n_right */
	if(argc < 4)
	{
		printf("Usage: ./test_numa_comb n_of_threads n_of_threads_on_node0 n_of_threads_on_node1\n");
		exit(-1);
	}
	n_threads = atoi(argv[1]);
	n_left = atoi(argv[2]);
	n_right = atoi(argv[3]);
	/******************* Set the thread_cpu_map according to the n_left and n_right */
	printf("n_threads: %d, n_left: %d, n_right: %d\n",n_threads,n_left,n_right);
	for(i = 0; i < n_left; i++)
	{
		thread_cpu_map[i] = next_index_left;
		next_index_left--;
	}
	for(i = n_left; i < n_threads; i++)
	{
		thread_cpu_map[i] = next_index_right;
		next_index_right--;
	}
	for(i = 0; i < n_threads; i++)
	{
		printf("Thread %d is on cpu %d\n",i,thread_cpu_map[i]);
	}



	thread_params para[n_threads]; //The parameters to pass to the threads

	//printf("The return value of numa_get_run_node_mask(void) is %d\n",numa_get_run_node_mask());
	//printf("The return value of numa_max_node(void) is %d\n",numa_max_node());
	//numa_tonode_memory((void *)spinlock_ptr,sizeof(pthread_spinlock_t),node_id); //This doesn't work

	//initilize the spinlock pointer and put it on a specific node
	pthread_spinlock_t *spinlock_ptr = numa_alloc_onnode(sizeof(pthread_spinlock_t),node_id);

	if(spinlock_ptr == NULL) //error handling of the allocating of a spinlock pointer on a specific node
	{
		printf("alloc of spinlock on a node failed.\n");
		exit(-1);
	}

	/* initialise  syncs */
	pthread_barrier_init(&fin_barrier, NULL, n_threads);
	pthread_spin_init(spinlock_ptr,0);
	int rc;
	//create the threads
	for(i = 0; i < n_threads; i++)
	{

		para[i].thread_id = i;
		para[i].arrival_lambda = arrival_lambda;
		para[i].spinlock_ptr = spinlock_ptr;
		CPU_ZERO(&cpuset[i]);
		CPU_SET(thread_cpu_map[i],&cpuset[i]);
		rc = pthread_create(&threads[i],NULL,work,(void*)&para[i]);
		E (rc);


	}
	start_work_flag = 1; 

	/* wait here */
	for(i = 0; i < n_threads; i++)
	    pthread_join(threads[i],NULL);


	pthread_barrier_destroy(&fin_barrier);

	/*
	for(i = 0; i < n_threads; i++)
	{
		printf("The time to get one lock for thread %d is : %.9f\n",i,time_in_cs[i]/num_access_each_thread[i]);
		printf("The number of lock accesses for thread %d is : %d\n",i,num_access_each_thread[i]);
	}
	*/

	qsort((void*)g_tss,(size_t)access_count,(size_t)sizeof(timestamp),cmp_timestamp);
	/*
	for (i = 0; i < access_count; i++)
		printf("%lu with id %d\n",g_tss[i].ts,g_tss[i].id);
	*/

	/* for (i = 0; i < access_count; i++)
	 * {
	 *     printf ("%lu %d\n", g_tss[i].ts, g_tss[i].id);
	 * } */

	/* */
	
	int cs_order[access_count/2];
	for(i = 0; i < access_count/2; i++)
	{
		cs_order[i] = g_tss[i*2].id;
		//printf("%d in cs\n",cs_order[i]);
	}
	int cs_matrix[n_threads][n_threads];
	uint64_t delay_matrix[n_threads][n_threads];
	float prob_matrix[n_threads][n_threads];
	float rate_matrix[n_threads][n_threads];

	// zero out all the matrices
	memset(&cs_matrix, '\0', n_threads*n_threads*sizeof(int));
	memset(&delay_matrix, '\0', n_threads*n_threads*sizeof(uint64_t));
	memset(&prob_matrix, '\0', n_threads*n_threads*sizeof(float));


	int local_count2 = 0, remote_count2 = 0;
	uint64_t diff;
	for(i = 0; i < n_threads; i++)
	    for(j = 0; j < n_threads; j++)
		for(k = 0; k < access_count/2 -1 ; k++)
		{
		    if(cs_order[k] == i && cs_order[k+1] == j)
		    {
			cs_matrix[i][j]++;
			diff = g_tss[2*k+2].ts - g_tss[2*k+1].ts; 
			delay_matrix[i][j] += diff;
			if(is_on_same_node(i, j, n_threads, n_left, n_right))
			{
			    dprintf("local_delay: %lu\n", diff);
			    local_square += sqr(diff);
			    local_count2++;
			}
			else
			{
			    dprintf("remote_delay: %lu\n", diff);
			    remote_square += sqr(diff);
			    remote_count2++;
			}
		    }
		}

	int num_access[n_threads];
	memset(num_access, 0, n_threads*sizeof(int));   /* counters must start at zero */
	for(i = 0; i < access_count/2 -1; i++)
	    for(j = 0; j < n_threads; j++)
	    {
		if (cs_order[i] == j) num_access[j]++;
	    }

	for(i = 0; i < n_threads; i++)
		printf("num_access[%d]:%d\n",i,num_access[i]);

	for(i = 0; i < n_threads; i++)
		for(j = 0; j < n_threads ; j++)
		{
			prob_matrix[i][j] = (float)cs_matrix[i][j]/(float)num_access[i];
			rate_matrix[i][j] = 1.0/((delay_matrix[i][j]/(float)cs_matrix[i][j])/CPU_FREQ);
		}


	printf ("\n***************** PROBS *******************\n");
	printf ("Lock is on LP, [L, R] is [%d, %d]:\n", n_left - 1, n_right);
	// tl
	printf ("L -> L\n");
	print_mtx (n_threads, n_threads, prob_matrix,
			   0, 0, n_left, n_left, 0);
    // tr
	printf ("L -> R\n");
	print_mtx (n_threads, n_threads, prob_matrix,
			   n_left, 0, n_threads, n_left, 0);

	printf ("Lock is on RP, [L, R] is [%d, %d]:\n", n_left, n_right - 1);
	// br
	printf ("R -> R\n");
	print_mtx (n_threads, n_threads, prob_matrix,
			   n_left, n_left, n_threads, n_threads, 0);
	// bl
	printf ("R -> L\n");
	print_mtx (n_threads, n_threads, prob_matrix,
			   0, n_left, n_left, n_threads, 0);
	

	printf ("\n***************** RATES *******************\n");

	printf ("Lock is on LP, [L, R] is [%d, %d]:\n", n_left - 1, n_right);
	// tl
	printf ("L -> L\n");
	print_mtx (n_threads, n_threads, rate_matrix,
			   0, 0, n_left, n_left, 1);
    // tr
	printf ("L -> R\n");
	print_mtx (n_threads, n_threads, rate_matrix,
			   n_left, 0, n_threads, n_left, 1);

	printf ("Lock is on RP, [L, R] is [%d, %d]:\n", n_left, n_right - 1);
	// br
	printf ("R -> R\n");
	print_mtx (n_threads, n_threads, rate_matrix,
			   n_left, n_left, n_threads, n_threads, 1);
	// bl
	printf ("R -> L\n");
	print_mtx (n_threads, n_threads, rate_matrix,
			   0, n_left, n_left, n_threads, 1);




	//print the intra-core and inter-core delay
	//thread 0 - n_left -1 are on the left core, n_left to n_threads are on the right core
	uint64_t local_delay = 0, remote_delay = 0;
	int local_count = 0, remote_count = 0;
	float local_prob = 0.0, remote_prob = 0.0;

	for(i = 0; i < n_threads; i++)
	    for(j = 0; j < n_threads; j++)
	    {
			if (j == i)
				continue;
			if(is_on_same_node(i, j, n_threads, n_left, n_right))
			{
				//printf("%d and %d on the same node\n",i,j);
				local_delay += delay_matrix[i][j];
				local_count += cs_matrix[i][j];
				local_prob += prob_matrix[j][i];
			}
			else
			{
				//printf("%d and %d not the same node\n",i,j);
				remote_delay += delay_matrix[i][j];
				remote_count += cs_matrix[i][j];
				remote_prob += prob_matrix[j][i];
			}
	    }


	float local = (float)local_delay/(local_count);
	float remote = (float)remote_delay/(remote_count);

	printf("\n\n**************************** Aggregates ***************************\n");
	printf("local delay: %f, remote_delay: %f, local_count: %d, remote_count: %d\n",(float)local_delay/(local_count),(float)remote_delay/(remote_count),local_count,remote_count);
	printf("local prob:%f, remote prob: %f\n",local_prob/n_threads, remote_prob/n_threads);
	printf("local delay variance:%f, remote delay variance: %f\n",local_square/local_count - local*local, remote_square/remote_count - remote*remote);
	printf("local count2: %d, remote_count2:%d\n",local_count2, remote_count2);
	pthread_spin_destroy(spinlock_ptr);
	numa_free((void *)spinlock_ptr,sizeof(pthread_spinlock_t));
	pthread_exit(NULL);
	return 0;
}
Example #22
int main( int argc, char** argv ) {
	struct timespec start, stop; 
	double time;

#ifndef NDEBUG
	std::cout << "-->WARNING: COMPILED *WITH* ASSERTIONS!<--" << std::endl;
#endif
	
	if( argc<=3 ) {
		std::cout << "Usage: " << argv[0] << " <mtx> <scheme> <x> <REP1> <REP2>" << std::endl << std::endl;
		std::cout << "calculates Ax=y and reports average time taken as well as the mean of y." << std::endl;
		std::cout << "with\t\t <mtx> filename of the matrix A in matrix-market or binary triplet format." << std::endl;
		std::cout << "    \t\t <scheme> number of a sparse scheme to use, see below." << std::endl;
		std::cout << "    \t\t <x> 0 for taking x to be the 1-vector, 1 for taking x to be random (fixed seed)." << std::endl;
		std::cout << "    \t\t <REP1> (optional, default is 1) number of repetitions of the entire experiment." << std::endl;
		std::cout << "    \t\t <REP2> (optional, default is 1) number of repetitions of the in-place SpMV multiplication, per experiment." << std::endl;
		std::cout << std::endl << "Possible schemes:" << std::endl;
		std::cout << " 0: TS (triplet scheme)" << std::endl;
		std::cout << " 1: CRS (also known as CSR)" << std::endl;
		std::cout << " 2: ICRS (Incremental CRS)" << std::endl;
		std::cout << " 3: ZZ-CRS (Zig-zag CRS)" << std::endl;
		std::cout << " 4: ZZ-ICRS (Zig-zag ICRS)" << std::endl;
		std::cout << " 5: SVM (Sparse vector matrix)" << std::endl;
		std::cout << " 6: HTS (Hilbert-ordered triplet scheme)" << std::endl;
		std::cout << " 7: BICRS (Bi-directional Incremental CRS)" << std::endl;
		std::cout << " 8: Hilbert (Hilbert-ordered triplets backed by BICRS)" << std::endl;
		std::cout << " 9: Block Hilbert (Sparse matrix blocking, backed by Hilbert and HBICRS)" << std::endl;
		std::cout << "10: Bisection Hilbert (Sparse matrix blocking by bisection, backed by Hilbert and HBICRS)" << std::endl;
		std::cout << "11: CBICRS (Compressed Bi-directional Incremental CRS)" << std::endl;
		std::cout << "12: Beta Hilbert (known as Block CO-H+ in the paper by Yzelman & Roose, 2012: parallel compressed blocked Hilbert with BICRS)" << std::endl;
		std::cout << "13: Row-distributed Beta Hilbert (known as Row-distributed block CO-H in the paper by Yzelman & Roose, 2012: same as 12, but simpler distribution)" << std::endl;
#ifdef WITH_CSB
		std::cout << "14: Row-distributed CSB (Uses CSB sequentially within the row-distributed scheme of 13)" << std::endl;
#endif
		std::cout << "15: Row-distributed Hilbert (Parallel row-distributed Hilbert scheme, see also 8)" << std::endl;
		std::cout << "16: Row-distributed parallel CRS (using OpenMP, known as OpenMP CRS in the paper by Yzelman & Roose, 2012)" << std::endl;
		std::cout << "17: Row-distributed SpMV using compressed Hilbert indices." << std::endl;
#ifdef WITH_MKL
		std::cout << "18: Intel MKL SpMV based on the CRS data structure." << std::endl;
#endif
		std::cout << "19: Optimised ICRS." << std::endl;
#ifdef WITH_CUDA
		std::cout << "20: CUDA CuSparse HYB format." << std::endl;
#endif
		std::cout << std::endl << "The in-place Ax=y calculation is preceded by a quasi pre-fetch." << std::endl;
		std::cout << "Add a minus sign before the scheme number to enable use of the CCS wrapper (making each CRS-based structure CCS-based instead)" << std::endl;
		std::cout << "Note: binary triplet format is machine-dependent. ";
		std::cout << "Take care when using the same binary files on different machine architectures." << std::endl;
		return EXIT_FAILURE;
	}

	std::string file = std::string( argv[1] );
	int scheme = atoi( argv[2] );
	int ccs    = scheme < 0 ? 1 : 0;
	if( ccs ) scheme = -scheme;
	int x_mode = atoi( argv[3] );
	unsigned long int rep1 = 1;
	unsigned long int rep2 = 1;
	if( argc >= 5 )
		rep1 = static_cast< unsigned long int >( atoi( argv[4] ) );
	if( argc >= 6 )
		rep2 = static_cast< unsigned long int >( atoi( argv[5] ) );

	if( scheme != 16 && scheme != -16 && //pin master thread to a single core
		scheme != 18 && scheme != -18 ) { //but not when OpenMP is used (otherwise serialised computations)
		cpu_set_t mask;
		CPU_ZERO( &mask );
		CPU_SET ( 0, &mask );
		if( pthread_setaffinity_np( pthread_self(), sizeof( mask ), &mask ) != 0 ) {
			std::cerr << "Error setting main thread affinity!" << std::endl;
			exit( 1 );
		}
	} else {
		omp_set_num_threads( MachineInfo::getInstance().cores() );
	}

#ifdef WITH_MKL
	if( scheme == 18 ) {
		mkl_set_num_threads( MachineInfo::getInstance().cores() );
	}
#endif
	std::cout << argv[0] << " called with matrix input file " << file << ", scheme number ";
	std::cout << scheme << " and x being " << (x_mode?"random":"the 1-vector") << "." << std::endl;
	std::cout << "Number of repetitions of in-place zax is " << rep2 << std::endl;
	std::cout << "Number of repetitions of the " << rep2 << " in-place zax(es) is " << rep1 << std::endl;

	Matrix< double >* checkm = new TS< double >( file );
	clock_gettime( CLOCK_ID, &start);
	Matrix< double >* matrix = selectMatrix( scheme, ccs, file );
	clock_gettime( CLOCK_ID, &stop);
	time  = (stop.tv_sec-start.tv_sec)*1000;
	time += (stop.tv_nsec-start.tv_nsec)/1000000.0;
	if( matrix == NULL ) {
		std::cerr << "Error during sparse scheme loading, exiting." << std::endl;
		return EXIT_FAILURE;
	}

	std::cout << "Matrix dimensions: " << matrix->m() << " times " << matrix->n() << "." << std::endl;
	std::cout << "Datastructure loading time: " << time << " ms." << std::endl << std::endl;

	srand( FIXED_SEED );
	double* x = NULL;
#ifdef INTERLEAVE_X
	if( scheme == 13 || scheme == 14 || scheme == 15 || scheme == 16 || scheme == 17 || scheme == 18 )
		x = (double*) numa_alloc_interleaved( matrix->n() * sizeof( double ) );
	else
#endif
		x = (double*) _mm_malloc( matrix->n() * sizeof( double ), 64 );

	//initialise input vector
	for( unsigned long int j=0; j<matrix->n(); j++ ) {
		x[ j ] = x_mode?(rand()/(double)RAND_MAX):1.0;
	}

	//do one trial run, also for verification
	double* c = checkm->mv( x );
	clock_gettime( CLOCK_ID, &start );
	double* z = matrix->mv( x );
	clock_gettime( CLOCK_ID, &stop);
	time = (stop.tv_sec-start.tv_sec)*1000;
	time += (stop.tv_nsec-start.tv_nsec)/1000000.0;
	double checkMSE = 0;
	unsigned long int max_e_index = 0;
	double max_e = fabs( z[0] - c[0] );
	for( unsigned long int j=0; j<matrix->m(); j++ ) {
		double curdiff = fabs( z[j] - c[j] );
		if( curdiff > max_e ) {
			max_e = curdiff;
			max_e_index = j;
		}
		curdiff  *= curdiff;
		curdiff  /= (double)(matrix->m());
		checkMSE += curdiff;
	}
#ifdef OUTPUT_Z
	for( unsigned long int j=0; j<matrix->m(); j++ ) {
		std::cout << z[ j ] << std::endl;
	}
#endif
	std::cout << "out-of-place z=Ax: mean= " << checksum( z, matrix->m() ) << ", ";
	std::cout << "MSE = " << checkMSE << ", ";
	std::cout << "max abs error = " << max_e << " while comparing y[ " << max_e_index << " ] = " <<  z[max_e_index] << " and c[ " << max_e_index << " ] = " <<  c[max_e_index] << ", ";
	std::cout << "time= " << time << " ms." << std::endl;
#ifdef RDBH_NO_COLLECT
	if( scheme == 13 ) {
		std::cout << "WARNING: MSE and max abs error are not correct for the Row-distributed Beta Hilbert scheme; please see the RDBHilbert.hpp file, and look for the RDBH_NO_COLLECT flag." << std::endl;
	}
#else
	if( scheme == 13 ) {
		std::cout << "WARNING: timings are pessimistic for the Row-distributed Beta Hilbert scheme; each spmv a (syncing) collect is executed to write local data to the global output vector as required by this library. To get the correct timings, turn this collect off via the RDBH_NO_COLLECT flag in the RDBHilbert.hpp file. Note that this causes the verification process to fail, since all data is kept in private local output subvectors." << std::endl;
	}
#endif
	double *times = new double[ rep1 ];

	//Run rep*rep instances
	for( unsigned long int run = 0; run < rep1; run++ ) {
		sleep( 1 );
		time = 0.0;
		//"prefetch"
		matrix->zax( x, z );
		matrix->zax( x, z, rep2, CLOCK_ID, &time );
		time /= static_cast<double>( rep2 );
		times[ run ] = time;
	}

	//calculate statistics
	double meantime, mintime, vartime;
	meantime = vartime = 0.0;
	mintime = times[ 0 ];
	for( unsigned long int run = 0; run < rep1; run++ ) {
		if( times[ run ] < mintime ) mintime = times[ run ];
		meantime += times[ run ] / static_cast< double >( rep1 );
	}
	for( unsigned long int run = 0; run < rep1; run++ ) {
		vartime += ( times[ run ] - meantime ) * ( times[ run ] - meantime ) / static_cast< double >( rep1 - 1 );
	}
	vartime = sqrt( vartime );

	std::cout << "In-place:" << std::endl;
	std::cout << "Mean  = " << checksum( z, matrix->m() ) << std::endl;
	std::cout << "Time  = " << meantime << " (average), \t" <<  mintime << " (fastest), \t" << vartime << " (stddev) ms. " << std::endl;
	const double avgspeed = static_cast< double >( 2*matrix->nzs() ) / meantime / 1000000.0;
	const double minspeed = static_cast< double >( 2*matrix->nzs() ) / mintime / 1000000.0;
	const double varspeed = fabs( avgspeed - static_cast< double >( 2*matrix->nzs() ) / (meantime - vartime) / 1000000.0 );
	std::cout << "Speed = " << avgspeed << " (average), \t"
					<< minspeed << " (fastest), \t"
					<< varspeed << " (variance) Gflop/s." << std::endl;
	const size_t memuse1 = matrix->bytesUsed() + sizeof( double ) * 2 * matrix->nzs();
	const double avgmem1 = static_cast< double >( 1000*memuse1 ) / meantime / 1073741824.0;
	const double minmem1 = static_cast< double >( 1000*memuse1 ) / mintime / 1073741824.0;
	const double varmem1 = fabs( avgmem1 - static_cast< double >( 1000*memuse1 ) / (meantime-vartime) / 1073741824.0 );
	std::cout << "        " << avgmem1 << " (average), \t"
					<< minmem1 << " (fastest), \t"
					<< varmem1 << " (variance) Gbyte/s (upper bound)." << std::endl;
	const size_t memuse2 = matrix->bytesUsed() + sizeof( double ) * ( matrix->m() + matrix->n() );
	const double avgmem2 = static_cast< double >( 1000*memuse2 ) / meantime / 1073741824.0;
	const double minmem2 = static_cast< double >( 1000*memuse2 ) / mintime / 1073741824.0;
	const double varmem2 = fabs( avgmem2 - static_cast< double >( 1000*memuse2 ) / (meantime-vartime) / 1073741824.0 );
	std::cout << "        " << avgmem2 << " (average), \t"
					<< minmem2 << " (fastest), \t"
					<< varmem2 << " (variance) Gbyte/s (lower bound)." << std::endl;

	delete [] times;
#ifdef INTERLEAVE_X
	if( scheme == 13 || scheme == 14 || scheme == 15 || scheme == 16 || scheme == 17 || scheme == 18 ) {
		numa_free( x, matrix->n() * sizeof( double ) );
	} else
#endif
		_mm_free( x );

	if( scheme == 12 || scheme == 13 || scheme == 14 || scheme == 15 || scheme == 16 || scheme == 17 || scheme == 18 ) {
#ifdef _NO_LIBNUMA
		_mm_free( z );
#else
		numa_free( z, matrix->m() * sizeof( double ) );
#endif
	} else {
		_mm_free( z );
	}
	_mm_free( c );
	delete matrix;
	delete checkm;

	return EXIT_SUCCESS;
}
Example #23
int main(int argc, char* argv[]) {
	printf("\n NODE_BIND:%d, NUMA:%d, CPU_BIND:%d, FIRST_TOUCH:%d\n",NODE_BIND, NUMA, CPU_BIND, FIRST_TOUCH);

	int repetitions, // number of repetitions
	    maxThreads,  // max number of threads
	    it,
	    N;           // array size
	int bitCount = 1;
	int * key;         // array of keys
	long * dataIn;     // input data
	long * dataSTL;    // input stl data
	long * dataRadix;  // input radix data

	repetitions = 1;
#pragma omp parallel
	maxThreads = omp_get_num_threads();

	if(argc == 1){
		printf("prog input_file number_of_elements bit_count number_of_repetitions\n");
		printf("NO INPUT FILE\n");
		return 0;
	}
	if(argc == 2){
		printf("prog input_file number_of_elements bit_count number_of_repetitions\n");
		printf("NO ELEMENT COUNT\n");
		return 0;
	}
	if(argc > 2){
		N = (int) strtol(argv[2], NULL, 10);
	}
	if(argc > 3){
		int tmp;
		tmp = (int) strtol(argv[3], NULL, 10);
		if ((tmp > 0) && (tmp <= 16)) // limit bit count
			bitCount = tmp;
	}
	if(argc > 4){
		int tmp;
		tmp = (int) strtol(argv[4], NULL, 10);
		if ((tmp > 0) && (tmp <= 10000)) // limit repetitions
			repetitions = tmp;
	}

	int *input;
	size_t N2;
	printf( "Reading data from file.\n" );
	if( readIntArrayFile( argv[1], &input, &N2 ) )
		return 1;
	printf( "Data reading done.\n" );

	if( (N2 < (size_t)N) || (N <= 0) )
		N = N2;

	printf( "\nPARALLEL STL SORT for N=%d, max threads = %d, test repetitions: %d\n", N, maxThreads, repetitions);

	dataIn = new long[N]; 
	dataSTL = new long[N];

#ifdef _WIN32

	dataRadix = new long[N];
	key = new int[N];
#endif
#ifdef linux

	key = new int[N];
#if NUMA==0

	dataRadix = new long[N]; 

#elif NUMA==1
			dataRadix = (long*) numa_alloc_interleaved(N * sizeof(long));
#elif NUMA==2
			dataRadix = (long*)numa_alloc_onnode(sizeof(long)*N,1);
#endif
#endif
	VTimer stlTimes(maxThreads);
	VTimer radixTimes(maxThreads);
#if TIME_COUNT==1
	VTimer partTimes(TIMERS_COUNT);
#endif
#if FLUSH_CACHE==1
#ifdef linux
        CacheFlusher cf;
#endif
#endif

        for(long i=0;i<N;i++)
		dataIn[i]=input[i];
	delete[] input;

// loop from 1 to maxThreads
	for (int t = 1; t <= maxThreads; t++) {
		int i;
#if TIME_COUNT==1
                partTimes.reset();
#endif
#if CALC_REF==1
// parallel STL
		for (it = 0; it < repetitions; it++) {
			setThreadsNo(t, maxThreads);
#pragma omp parallel for private(i)
			for (i = 0; i < N; i++)
				dataSTL[i] = dataIn[i];
#if FLUSH_CACHE==1
#ifdef linux
                        cf.flush();
#endif
#endif
			stlTimes.timerStart(t-1);

#ifdef linux
			__gnu_parallel::sort(dataSTL, dataSTL + N);
#endif
#ifdef _WIN32
			std::sort(dataSTL, dataSTL + N);
#endif
			stlTimes.timerEnd(t-1);
		}

#if FLUSH_CACHE==1
#ifdef linux
                cf.flush();
#endif
#endif
#endif

// radix sort V1
		for (it = 0; it < repetitions; it++) {
			setThreadsNo(t, maxThreads);
#pragma omp parallel for private(i) default(shared)

			for (i = 0; i < N; i++){
				dataRadix[i] = dataIn[i];
				key[i]=i;
			}

#if FLUSH_CACHE==1
#ifdef linux
                        cf.flush();
#endif
#endif
			omp_set_num_threads(t);
			radixTimes.timerStart(t-1);
#if TIME_COUNT==1
                        prsort::pradsort<long,int>(dataRadix,key, N, bitCount,&partTimes);
#else
                        prsort::pradsort<long,int>(dataRadix,key, N,bitCount,NULL);
#endif
			radixTimes.timerEnd(t-1);

		}

       
#if CALC_REF==1
		printf("|STL   SORT(th=%2d)  : %1.3fs  |\t", t,
				stlTimes.getTime(t-1));
#endif
#if TIME_COUNT==1
		for (int i = 0; i < TIMERS_COUNT; i++) {
#if CREATE_OUTPUT==1
			printf("%d %d %d %d %d %d %d %f\n", NUMA, NODE_BIND, CPU_BIND, FIRST_TOUCH,bitCount , t, i ,partTimes.getTime(i));
#else
			printf("part%d :%f ", i, partTimes.getTime(i));
#endif

		}
#endif
#if CREATE_OUTPUT ==1
		printf("%d %d %d %d %d %d total %1.3f", NUMA,NODE_BIND,CPU_BIND,FIRST_TOUCH,bitCount, t ,radixTimes.getTime(t-1));
#else
		printf("|RADIX SORT (th=%2d)  : %1.3fs  |\t", t,
				radixTimes.getTime(t-1));
#endif

// Attention: checking result only from the last function usage 

#if CALC_REF==1
		checkResults(dataSTL, dataRadix, N);
#else
		printf("\n");
#endif

#if CHECK_KEY==1
	if(checkKey(dataIn,dataRadix,key,N))printf("Keys are good\n");

#endif
	}

#ifdef linux
	delete[] key;
#if NUMA>0
	numa_free(dataRadix, sizeof(long) * N);
        
#else

	delete[] dataRadix;
#endif
#endif
#ifdef _WIN32
	delete[] dataRadix;
#endif

	delete[] dataIn;
	delete[] dataSTL;
	
#if TIME_COUNT==1
	
        
        
#endif
	return 0;
}
Example #24
File: libnuma.c  Project: Agobin/chapel
void INTERNAL qt_affinity_free(void  *ptr,
                               size_t bytes)
{                                      /*{{{ */
    numa_free(ptr, bytes);
}                                      /*}}} */
Example #25
File: numa.c  Project: roccen/lagopus
static void *
s_numa_alloc(size_t sz, int cpu) {
  void *ret = NULL;

  if (likely(sz > 0)) {
    if (likely(cpu >= 0)) {
      if (likely(s_numa_nodes != NULL && s_n_cpus > 0)) {
        unsigned int node = s_numa_nodes[cpu];
        unsigned int allocd_node = UINT_MAX;
        struct bitmask *bmp;
        int r;
  
        bmp = numa_allocate_nodemask();
        numa_bitmask_setbit(bmp, node);

        errno = 0;
        r = (int)set_mempolicy(MPOL_BIND, bmp->maskp, bmp->size + 1);
        if (likely(r == 0)) {
          errno = 0;
          ret = numa_alloc_onnode(sz, (int)node);
          if (likely(ret != NULL)) {
            lagopus_result_t rl;

            /*
             * We need this "first touch" even using the
             * numa_alloc_onnode().
             */
            (void)memset(ret, 0, sz);

            errno = 0;
            r = (int)get_mempolicy((int *)&allocd_node, NULL, 0, ret,
                                   MPOL_F_NODE|MPOL_F_ADDR);
            if (likely(r == 0)) {
              if (unlikely(node != allocd_node)) {
                /*
                 * The memory is not allocated on the node, but it is
                 * still usable. Just return it.
                 */
                lagopus_msg_warning("can't allocate " PFSZ(u) " bytes memory "
                                    "for CPU %d (NUMA node %d).\n",
                                    sz, cpu, node);
              }
            } else {
              lagopus_perror(LAGOPUS_RESULT_POSIX_API_ERROR);
              lagopus_msg_error("get_mempolicy() returned %d.\n", r);
            }

            rl = s_add_addr(ret, sz);
            if (unlikely(rl != LAGOPUS_RESULT_OK)) {
              lagopus_perror(rl);
              lagopus_msg_error("can't register the allocated address.\n");
              numa_free(ret, sz);
              ret = NULL;
            }
          }

        } else {	/* r == 0 */
          lagopus_perror(LAGOPUS_RESULT_POSIX_API_ERROR);
          lagopus_msg_error("set_mempolicy() returned %d.\n", r);
        }

        numa_free_nodemask(bmp);
        set_mempolicy(MPOL_DEFAULT, NULL, 0);

      } else {	/* s_numa_nodes != NULL && s_n_cpus > 0 */
        /*
         * Not initialized or initialization failure.
         */
        lagopus_msg_warning("The NUMA related information is not initialized. "
                            "Use malloc(3) instead.\n");
        ret = malloc(sz);
      }

    } else {	/* cpu >= 0 */
      /*
       * Use pure malloc(3).
       */
      ret = malloc(sz);
    }

  }

  return ret;
}
Example #26
void release_csr_mat(struct csr_mat_t *mat)
{
    numa_free(mat->row_ptr, (mat->rows + 1) * sizeof(DWORD));
    numa_free(mat->col_idx, mat->non_zeros * sizeof(int));
    numa_free(mat->vals, mat->non_zeros * sizeof(FLOAT));
}
Example #27
File: NumaNative.c  Project: maropu/jnuma
JNIEXPORT void JNICALL Java_xerial_jnuma_NumaNative_free
    (JNIEnv *env, jobject jobj, jlong address, jlong capacity) {
  if(address != 0) {
    numa_free((void*) address, (size_t) capacity);
  }
}
Example #28
int main(void)
{
	int node_id = 0;
	int arrival_lambda = 10;
	int thread_cpu_map[N_THREADS] = {1,6};
	int n = 10000;
	int i;
	int j;
	/*
	   pthread_spinlock_t *spinlock_ptr = malloc(sizeof(pthread_spinlock_t));
	   if(spinlock_ptr == NULL) //error handling of the malloc of the spinlock
	   {
	   printf("malloc of spinlock failed.\n");
	   }
	   else
	   {
	   printf("malloc of spinlock succeeded.\n");
	   }
	   free(spinlock_ptr);
	 */
	//pthread_t threads[N_THREADS];
	//cpu_set_t cpuset[N_THREADS]; //for setting the affinity of threads
	thread_params para[N_THREADS]; //The parameters to pass to the threads

	//printf("The return value of numa_get_run_node_mask(void) is %d\n",numa_get_run_node_mask());
	//printf("The return value of numa_max_node(void) is %d\n",numa_max_node());
	//numa_tonode_memory((void *)spinlock_ptr,sizeof(pthread_spinlock_t),node_id); //This doesn't work

	//initilize the spinlock pointer and put it on a specific node
	pthread_spinlock_t *spinlock_ptr = numa_alloc_onnode(sizeof(pthread_spinlock_t),node_id);
	if(spinlock_ptr == NULL) //error handling of the allocating of a spinlock pointer on a specific node
	{
		printf("alloc of spinlock on a node failed.\n");
	}
	else
	{
		printf("alloc of spinlock on a node succeeded.\n");
	}
	for(j = 0; j  < n; j++)
	{
		//initlize spinlock
		pthread_spin_init(spinlock_ptr,0);
		//create the threads
		for(i = 0; i < N_THREADS; i++)
		{
			int rc;
			int s;
			para[i].thread_id = i;
			para[i].arrival_lambda = arrival_lambda;
			para[i].spinlock_ptr = spinlock_ptr;
			CPU_ZERO(&cpuset[i]);
			CPU_SET(thread_cpu_map[i],&cpuset[i]);
			rc = pthread_create(&threads[i],NULL,work,(void*)&para[i]);
			if(rc)
			{
				printf("ERROR: return code from pthread_create() is %d for thread %d \n",rc,i);
				exit(-1);
			}
			flag = 1; 
		}
		for(i = 0; i < N_THREADS; i++)
		{
			pthread_join(threads[i],NULL);
		}
	}
	for(i = 0; i < N_THREADS; i++)
	{
		printf("The time to get one lock for thread %d is : %.9f\n",i,time_in_cs[i]/n);
	}
	double diff = abs_value(time_in_cs[0] - time_in_cs[1])/n;
	printf("The difference of the time to get one lock is : %.9f (%f) cycles\n",diff,diff*CPU_FREQ); //this is assuming there are only two processors (needs to be changed if there are more)
	pthread_spin_destroy(spinlock_ptr);
	numa_free((void *)spinlock_ptr,sizeof(pthread_spinlock_t));
	pthread_exit(NULL);
	return 0;
}