Example #1
0
/*
 * Acquire the MCS lock L on behalf of the caller, using I as the caller's
 * queue node.
 *
 * I is installed in *L through SWAP_PTR and the value previously stored there
 * is taken as the predecessor. A NULL predecessor means the lock was free and
 * the call returns immediately; otherwise I is linked behind the predecessor
 * and the caller spins until I->waiting reads 0.
 *
 * NOTE(review): SWAP_PTR is presumably an atomic exchange returning the old
 * value -- confirm against the macro definition.
 */
void mcs_acquire(mcs_lock *L, mcs_qnode_ptr I) 
{
    /* this node has no successor yet */
    I->next = NULL;
#ifndef  __tile__
    mcs_qnode_ptr pred = (mcs_qnode*) SWAP_PTR((volatile void*) L, (void*) I);
#else
    /* on __tile__ builds a barrier is issued before the swap */
    MEM_BARRIER;
    mcs_qnode_ptr pred = (mcs_qnode*) SWAP_PTR( L, I);
#endif
    if (pred == NULL) 		/* lock was free */
        return;
    I->waiting = 1; // word on which to spin
    /* presumably orders the waiting=1 store before the node is published
       through pred->next -- confirm MEM_BARRIER semantics */
    MEM_BARRIER;
    pred->next = I; // make pred point to me

#if defined(OPTERON_OPTIMIZE)
    PREFETCHW(I);
#endif	/* OPTERON_OPTIMIZE */
    /* spin on our own node until the waiting flag reads 0 */
    while (I->waiting != 0) 
    {
#ifndef __MIC__
    PAUSE;
#endif
#if defined(OPTERON_OPTIMIZE)
        pause_rep(23);
        PREFETCHW(I);
#endif	/* OPTERON_OPTIMIZE */
    }

}
Example #2
0
int *MallocPlus::memory_reorder(int *malloc_mem_ptr, int *iorder){
   map <void *, malloc_plus_memory_entry*>::iterator it = memory_ptr_dict.find(malloc_mem_ptr);

   if (it != memory_ptr_dict.end() ){
      malloc_plus_memory_entry *memory_item = it->second;
      int *ptr;

      memory_ptr_dict.erase(it);

      if (DEBUG) printf("Found memory item ptr %p name %s\n",memory_item->mem_ptr,memory_item->mem_name);
      int *tmp = (int *)malloc(memory_item->mem_nelem[0]*memory_item->mem_elsize);
#ifdef _OPENMP
#pragma omp parallel for
#endif
      for (uint ic = 0; ic < memory_item->mem_nelem[0]; ic++){
         tmp[ic] = malloc_mem_ptr[iorder[ic]];
      }
      SWAP_PTR(malloc_mem_ptr, tmp, ptr);
      free(tmp);
      memory_item->mem_ptr = malloc_mem_ptr;

      memory_ptr_dict.insert(std::pair<void*, malloc_plus_memory_entry*>(malloc_mem_ptr, memory_item) );
   } else {
      if (DEBUG) printf("Warning -- memory pointer %p not found\n",malloc_mem_ptr);
   }

   return(malloc_mem_ptr);
}
Example #3
0
/*
 * Acquire the CLH lock L using I as the caller's queue node.
 *
 * I is marked locked and installed in *L through SWAP_PTR; the value
 * previously stored there is taken as the predecessor. The caller then spins
 * until the predecessor's locked field reads 0.
 *
 * Returns NULL when the lock was free (no predecessor), otherwise the
 * predecessor node.
 *
 * NOTE(review): SWAP_PTR is presumably an atomic exchange returning the old
 * value -- confirm against the macro definition.
 */
volatile clh_qnode* clh_acquire(clh_lock *L, clh_qnode* I )
{
    /* mark our node as locked before it is installed in the lock */
    I->locked=1;
    clh_qnode_ptr pred = (clh_qnode*) SWAP_PTR((volatile void*) (L), (void*) I);
    if (pred == NULL) 		/* lock was free */
        return NULL;
    /* spin on the predecessor's node, not on our own */
    while (pred->locked != 0)
    {
        PAUSE;
    }

    return pred;
}
Example #4
0
/**
 * Apply a chain of filter stages to a real signal in the frequency domain.
 *
 * The input is transformed with realFFT2, each stage is applied in turn with
 * applyFilterStage (the two work buffers are swapped after every stage so the
 * output of one stage feeds the next), and the result is transformed back
 * with INVERSE_FFT and copied to outputData.
 *
 * @param outputData   receives `rows` filtered samples
 * @param inputData    `rows` real input samples
 * @param timeData     `rows` time values; only the first and last are read,
 *                     to derive the frequency spacing
 * @param rows         number of samples
 * @param filterStage  array of `filterStages` stage descriptors
 * @param filterStages number of stages to apply
 * @return 1 on success, 0 if any stage fails. Allocation failure is reported
 *         through SDDS_Bomb.
 */
long applyFilters(double *outputData, double *inputData, double *timeData,
    long rows, FILTER_STAGE *filterStage, long filterStages)
{
    long stage, row, frequencies;
    double *realimagInput, *realimagOutput;
    double length, dfrequency;

    realimagOutput = NULL;
    if (!(realimagInput = (double*)malloc(sizeof(*realimagInput)*(rows+2))) ||
        !(realimagOutput = (double*)malloc(sizeof(*realimagOutput)*(rows+2))) )
        SDDS_Bomb("allocation failure");

    /* input data is real and has "rows" rows */
    /* result is interleaved real and imaginary */
    realFFT2(realimagInput, inputData, rows, 0);
    frequencies = rows/2 + 1;

    /* length is the assumed length of the periodic signal,
       which is one interval longer than the actual data entered */
    length = ((double)rows)*(timeData[rows-1]-timeData[0])/((double)rows-1.0);
    dfrequency = 1.0/length;

    for (stage=0; stage<filterStages; stage++) {
        if (!applyFilterStage(realimagOutput, realimagInput, frequencies, dfrequency, filterStage+stage)) {
            /* release both work buffers before reporting the failure */
            free(realimagOutput);
            free(realimagInput);
            return 0;
        }
        /* the stage output becomes the input of the next stage */
        SWAP_PTR(realimagOutput, realimagInput);
    }
    /* input is still interleaved real and imaginary. The flag
       INVERSE_FFT ensures that the array is interpreted correctly. */
    /* output is interleaved real and imaginary */
#if DEBUG
    /* step by two: each pair is (real, imaginary) */
    for (row=0; row<rows; row+=2) {
        fprintf(stdout, "Real: %f, Imag: %f\n", realimagInput[row], realimagInput[row+1]);
    }
#endif
    realFFT2(realimagOutput, realimagInput, rows, INVERSE_FFT);

    for (row=0; row<rows; row++)
        outputData[row] = realimagOutput[row];

    free(realimagOutput);
    free(realimagInput);

    return 1;
}
Example #5
0
/**************************************************************************
 Function: reference_jacobi

 This routine contains the main iteration loop for the Jacobi iteration
 reference implementation (no OpenCL).

 params:
    a           two arrays (indexed by OLD and NEW) to compute solution into
    max_iter    maximum number of iterations
    size        size of the array in each dimension
    tolerance   iteration stops once the maximum difference is no longer
                greater than this value
    d           discretization size per dimension (forwarded to
                set_boundary_conditions and print_array)
 **************************************************************************/
static void reference_jacobi(value_type *a[2],
                            unsigned int max_iter,
                            size_t size[DIMENSIONS],
                            value_type tolerance,
                            value_type d[DIMENSIONS]) {

    unsigned int iter = 0;
    value_type max_diff;
    /* No timing is performed in this routine; zero-initialise so the final
       report does not read an indeterminate value. */
    value_type timer = 0;

    /* init arrays by setting the initial value and the boundary conditions */
    set_initial_solution(a[OLD], size, INITIAL_GUESS);
    set_initial_solution(a[NEW], size, INITIAL_GUESS);
    set_boundary_conditions(a[OLD], size, d);
    set_boundary_conditions(a[NEW], size, d);

    /* print the initial solution guess */
    print_array("Init ", a[NEW], size, d);

    /*  iterate until maximum difference is less than the given tolerance
        or number of iterations is too high
     */
    do {
        /* swap array pointers for next iteration */
        SWAP_PTR(a[OLD], a[NEW]);

        /* iterate using a[OLD] as the input and a[NEW] as the output */
        max_diff = reference_jacobi_kernel(a[OLD], a[NEW], size);

        /* output status for user, overwrite the same line */
        if (0 == iter % 100) {
            printf("Iteration=%5d, max difference=%0.7f, target=%0.7f\r",
                iter, max_diff, tolerance);
            fflush(stdout);
        }

        /* increment counter */
        iter++;
    } while (max_diff > tolerance && max_iter > iter); /* do loop */

    /* output final iteration count and maximum difference value */
    printf("Iteration=%5d, max difference=%0.7f, execution time=%.3f seconds\n",
                    iter, max_diff, timer);

}
Example #6
0
void StencilSwap::fire(void){
	LOAD_FRAME(Stencil2DPartition);
	uint64_t timestep = FRAME(timeStep);
	double *src = FRAME(Initial); //matrix pointer initial Matrix[M][N]
	const uint64_t InitialM = FRAME(nRows); // matrix M row
	const uint64_t InitialN = FRAME(nCols); // Matrix N column
	double *dst = FRAME(New);
	timestep --;

	typedef double (*Array2D)[InitialN];
	Array2D DST = (Array2D) dst,
			SRC = (Array2D) src;
	SWAP_PTR(&DST,&SRC);

	if (timestep == 0)
		SIGNAL(signalUP);
	else if (timestep !=0){
	//	INVOKE(Stencil2D,NewMatrix,InitialM,InitialN,InitialMatrix,timestep,&Runtime::finalSignal);

		INVOKE(Stencil2DPartition,src,InitialM,InitialN,dst,timestep,&Runtime::finalSignal);
	}
	EXIT_TP();
}
Example #7
0
/*
 * Reorder the real_t array registered under malloc_mem_ptr so that new
 * element ic holds old element iorder[ic].
 *
 * memory_list is scanned linearly for an entry whose mem_ptr equals
 * malloc_mem_ptr. When found, a new buffer is filled in the permuted order,
 * the old buffer is freed, and the entry is updated to the new pointer. When
 * not found, nothing is changed (a warning is printed if DEBUG is set).
 *
 * Returns the pointer to the reordered data, or the unchanged input pointer
 * when it is not registered.
 */
real_t *MallocPlus::memory_reorder(real_t *malloc_mem_ptr, int *iorder){
   list<malloc_plus_memory_entry>::iterator entry = memory_list.begin();

   while (entry != memory_list.end()){
      if (DEBUG) printf("Testing it ptr %p ptr in %p name %s\n",entry->mem_ptr,malloc_mem_ptr,entry->mem_name);
      if (malloc_mem_ptr == entry->mem_ptr) break;
      entry++;
   }

   if (entry == memory_list.end() ){
      if (DEBUG) printf("Warning -- memory pointer %p not found\n",malloc_mem_ptr);
      return(malloc_mem_ptr);
   }

   if (DEBUG) printf("Found it ptr %p name %s\n",entry->mem_ptr,entry->mem_name);

   real_t *reordered = (real_t *)malloc(entry->mem_nelem*entry->mem_elsize);
   for (uint idx = 0; idx < entry->mem_nelem; idx++){
      reordered[idx] = malloc_mem_ptr[iorder[idx]];
   }

   /* after the swap malloc_mem_ptr names the new buffer and reordered the old one */
   real_t *scratch;
   SWAP_PTR(malloc_mem_ptr, reordered, scratch);
   free(reordered);
   entry->mem_ptr = malloc_mem_ptr;

   return(malloc_mem_ptr);
}
Example #8
0
/*
 * Reorder every array registered in memory_ptr_dict according to iorder, so
 * that new element ic holds old element iorder[ic].
 *
 * The loop walks a snapshot copy of the dictionary because each entry is
 * erased from, and re-inserted into, memory_ptr_dict under its new pointer.
 * Entries are handled in three ways:
 *   - mem_flags & 0x100 set: treated as int data whose values are themselves
 *     remapped through the inverse of iorder;
 *   - mem_elsize == 8: treated as double data;
 *   - otherwise: treated as float data.
 */
void MallocPlus::memory_reorder_all(int *iorder){
   /* snapshot, so iteration is not disturbed by the erase/insert below */
   map <void *, malloc_plus_memory_entry*> memory_ptr_dict_old = memory_ptr_dict;
   map <void *, malloc_plus_memory_entry*>::iterator it_old;
   /* inverse permutation of iorder, built lazily in the flagged-int branch */
   vector<int> inv_iorder;

   for ( it_old=memory_ptr_dict_old.begin(); it_old != memory_ptr_dict_old.end(); it_old++){
      malloc_plus_memory_entry *memory_item_old = it_old->second;

      /* NOTE(review): the result of find() is erased without comparing it to
         end() -- verify the entry is always present in the live dictionary */
      map <void *, malloc_plus_memory_entry*>::iterator it = memory_ptr_dict.find(memory_item_old->mem_ptr);
      /* same entry object as memory_item_old (both come from it_old->second) */
      malloc_plus_memory_entry *memory_item = it_old->second;
      memory_ptr_dict.erase(it);

      /* NOTE(review): the meaning of flag 0x100 is not visible here; this
         branch handles the data as int values that index into the reordered
         arrays -- confirm against the flag definitions */
      if (memory_item_old->mem_flags & 0x100) {
         /* (re)build the inverse permutation only when it is too short */
         if (inv_iorder.size() < memory_item_old->mem_nelem[0]) {
            inv_iorder.resize(memory_item_old->mem_nelem[0]);
            for (int ic = 0; ic < (int)memory_item_old->mem_nelem[0]; ic++){
               inv_iorder[iorder[ic]] = ic;
            }
         }
         int *ptr;
         int *malloc_mem_ptr = (int *)memory_item_old->mem_ptr;
         int *tmp = (int *)malloc(memory_item_old->mem_nelem[0]*memory_item_old->mem_elsize);
         /* move each element to its new slot and translate its value to the new numbering */
         for (uint ic = 0; ic < memory_item_old->mem_nelem[0]; ic++){
            tmp[ic] = inv_iorder[malloc_mem_ptr[iorder[ic]]];
         }
         /* NOTE(review): memory_replace is called here and in the float branch
            but not in the double branch, and its effect is not visible in this
            function -- confirm it does not conflict with the erase above or
            the free/insert below */
         memory_replace(malloc_mem_ptr, tmp);
         /* after the swap malloc_mem_ptr names the new buffer and tmp the old one */
         SWAP_PTR(malloc_mem_ptr, tmp, ptr);
         free(tmp);
         memory_item->mem_ptr = malloc_mem_ptr;
         memory_ptr_dict.insert(std::pair<void*, malloc_plus_memory_entry*>(malloc_mem_ptr, memory_item) );
      } else if (memory_item_old->mem_elsize == 8){
         /* 8-byte elements are handled as double */
         double *ptr;
         double *malloc_mem_ptr = (double *)memory_item_old->mem_ptr;
         double *tmp = (double *)malloc(memory_item_old->mem_nelem[0]*memory_item_old->mem_elsize);

         for (uint ic = 0; ic < memory_item_old->mem_nelem[0]; ic++){
            tmp[ic] = malloc_mem_ptr[iorder[ic]];
         }

         /* after the swap malloc_mem_ptr names the new buffer and tmp the old one */
         SWAP_PTR(malloc_mem_ptr, tmp, ptr);
         free(tmp);
         memory_item->mem_ptr = malloc_mem_ptr;
         memory_ptr_dict.insert(std::pair<void*, malloc_plus_memory_entry*>(malloc_mem_ptr, memory_item) );
      } else {
         /* every other element size is handled as float --
            NOTE(review): presumably only 4-byte elements reach here; confirm */
         float *ptr;
         float *malloc_mem_ptr = (float *)memory_item_old->mem_ptr;
         float *tmp = (float *)malloc(memory_item_old->mem_nelem[0]*memory_item_old->mem_elsize);
         for (uint ic = 0; ic < memory_item_old->mem_nelem[0]; ic++){
            tmp[ic] = malloc_mem_ptr[iorder[ic]];
         }
         memory_replace(malloc_mem_ptr, tmp);
         /* after the swap malloc_mem_ptr names the new buffer and tmp the old one */
         SWAP_PTR(malloc_mem_ptr, tmp, ptr);
         free(tmp);
         memory_item->mem_ptr = malloc_mem_ptr;
         memory_ptr_dict.insert(std::pair<void*, malloc_plus_memory_entry*>(malloc_mem_ptr, memory_item) );
      }

   }

   inv_iorder.clear();
}
Example #9
0
/**
 * Reload the prefix set of `m1ps` from the META0 service at `m0_addr`.
 *
 * The full META1 list is fetched with meta0_remote_get_meta1_all, turned into
 * a new managed-prefix cache and a new by-prefix array, and both are swapped
 * into `m1ps` under m1ps->lock. The previous cache and array are released
 * after the lock is dropped.
 *
 * @param m1ps             prefix set to update (must not be NULL)
 * @param ns_name          unused
 * @param local_addr       address passed to _cache_from_m0l to build the cache
 * @param m0_addr          address of the META0 to query
 * @param updated_prefixes when `m1ps` already held a by-prefix array, receives
 *                         a newly created GArray of guint16 listing every
 *                         prefix whose managed state differs between the old
 *                         and the new cache; left untouched otherwise
 * @param meta0_ok         set to TRUE once META0 returned a non-empty list
 * @return NULL on success or when META0 has no prefix configured, otherwise
 *         the remote error prefixed with "Remote error: ".
 */
static GError*
_cache_load_from_m0(struct meta1_prefixes_set_s *m1ps,
		const gchar *ns_name,
		const struct addr_info_s *local_addr,
		struct addr_info_s *m0_addr,
		GArray **updated_prefixes,
		gboolean *meta0_ok)
{
	GError *err = NULL;
	GSList *m0info_list = NULL;

	EXTRA_ASSERT(m1ps != NULL);
	GRID_TRACE2("%s(%p,%s,%p,%p)", __FUNCTION__, m1ps, ns_name, local_addr,
			m0_addr);

	(void)ns_name;
	gchar m0[STRLEN_ADDRINFO];
	grid_addrinfo_to_string (m0_addr, m0, sizeof(m0));
	err = meta0_remote_get_meta1_all(m0, &m0info_list);
	if (err) {
		g_prefix_error(&err, "Remote error: ");
		return err;
	}
	if (!m0info_list) {
		GRID_DEBUG("META0 has no prefix configured!");
		return NULL;
	}

	*meta0_ok = TRUE;
	guint8 *cache = _cache_from_m0l(m0info_list, local_addr);
	GPtrArray *by_prefix = meta0_utils_list_to_array(m0info_list);

	g_mutex_lock(&m1ps->lock);
	GRID_DEBUG("Got %u prefixes from M0, %u in place",
			by_prefix->len, m1ps->by_prefix ? m1ps->by_prefix->len : 0);

	if ( m1ps->by_prefix ) {
		guint prefix;
		*updated_prefixes = g_array_new(FALSE, FALSE, sizeof(guint16));
		for( prefix=0 ; prefix <65536 ;prefix++) {
			/* Use a real 16-bit object: the array elements are guint16 and
			 * the cache helpers read the prefix byte-wise, so taking the
			 * address of the 32-bit counter only works on little-endian
			 * hosts. */
			guint16 key = (guint16) prefix;
			if ( _cache_is_managed(m1ps->cache,(guint8 *)&key) != _cache_is_managed( cache,(guint8 *)&key)) {
				g_array_append_vals(*updated_prefixes, &key, 1);
			}
		}
	}

	/* after the swaps the locals hold the previous array and cache */
	SWAP_PTR(m1ps->by_prefix, by_prefix);
	SWAP_PTR(m1ps->cache, cache);
	g_mutex_unlock(&m1ps->lock);

	if (by_prefix)
		meta0_utils_array_clean(by_prefix);
	by_prefix = NULL;

	g_free(cache); /* g_free accepts NULL */
	cache = NULL;

	g_slist_foreach(m0info_list, meta0_info_gclean, NULL);
	g_slist_free(m0info_list);
	return NULL;
}
Example #10
0
/**************************************************************************
 Function: ocl_jacobi

  This routine contains the main iteration loop for the Jacobi iteration
  using OpenCL kernel.

 params:
    a                       two arrays (indexed by OLD and NEW) to compute
                            solution into
    max_iter                maximum number of iterations
    size                    size of the array in each dimension
    tolerance               iteration stops once the maximum difference is no
                            longer greater than this value
    d                       discretization size per dimension (forwarded to
                            set_boundary_conditions and print_array)
    local_workblock_size    size of local workblock for OpenCL kernel
    device_type             OpenCL device type
    full_copy               boolean if full buffer copy is to be done
                            (currently not read by this routine)
 **************************************************************************/
static void ocl_jacobi(value_type *a[2],
                        unsigned int max_iter,
                        size_t size[DIMENSIONS],
                        value_type tolerance,
                        value_type d[DIMENSIONS],
                        size_t local_workblock_size[DIMENSIONS],
                        cl_device_type device_type,
                        unsigned int full_copy) {

    size_t array_size;
    unsigned int i, j, iter = 0;
    size_t delta_buffer_size, delta_size[DIMENSIONS];
    size_t tile_delta_size, tile_cache_size;
    value_type max_diff;
    /* No timing is performed in this routine; zero-initialise so the final
       report does not read an indeterminate value. */
    value_type timer = 0;
    icl_device* device_id;
    icl_kernel* kernel;
    icl_buffer *a_buf[2], *delta_buf;
    value_type *delta;

    /* convenience for y stride in array */
    cl_uint ystride = size[Y]+2*GHOST_CELL_WIDTH;

    /* init devices */
    icl_init_devices(device_type);

    /* find OpenCL device */
    device_id  = icl_get_device(0);


    /* build the kernel and verify the kernel */
    kernel = icl_create_kernel(device_id, "jacsolver_kernel.cl", "ocl_jacobi_local_copy", "", ICL_SOURCE);

    /* calculate size of kernel local memory  - also used later for kernel params */
    tile_delta_size = local_workblock_size[X] * local_workblock_size[Y];
    tile_cache_size = (local_workblock_size[X]+2*GHOST_CELL_WIDTH) * (local_workblock_size[Y]+2*GHOST_CELL_WIDTH);

    /* verify the device has enough resources for this device */
/*  I'm an optimist, we just hope for the best
  	if ((cluGetAvailableLocalMem(device_id, kernel) < tile_delta_size + tile_cache_size) ||
        (! cluCheckLocalWorkgroupSize(device_id, kernel, DIMENSIONS, local_workblock_size))) {
        local_workblock_size[X] = 1;
        local_workblock_size[Y] = 1;
    }
*/
    printf("Estimating solution using OpenCL Jacobi iteration with %d x %d workblock.\n", (int)local_workblock_size[X], (int)local_workblock_size[Y]);
    fflush(stdout);

    /* init arrays by setting the initial value and the boundary conditions */
    set_initial_solution(a[OLD], size, INITIAL_GUESS);
    set_initial_solution(a[NEW], size, INITIAL_GUESS);
    set_boundary_conditions(a[OLD], size, d);
    set_boundary_conditions(a[NEW], size, d);

    /* print the initial solution guess */
    print_array("Init ", a[NEW], size, d);

    /* allocate memory for differences */
    delta_size[X] = size[X] / local_workblock_size[X];
    delta_size[Y] = size[Y] / local_workblock_size[Y];
    delta_buffer_size = delta_size[X] * delta_size[Y];
    delta = (value_type *)malloc(sizeof(value_type) * delta_buffer_size);
    if (delta == NULL) {
        /* the array is written immediately below, so do not continue without it */
        fprintf(stderr, "ocl_jacobi: unable to allocate %zu delta values\n", delta_buffer_size);
        icl_release_kernel(kernel);
        return;
    }

    /* initialize deltas so that first execution of kernel with overlapping
     * reduction on the host will work correctly and not prematurely exit
     */
    for (i=0; i<delta_size[X]; ++i) {
        for (j=0; j<delta_size[Y]; ++j) {
            delta[i * delta_size[Y] + j] = 1.0;
        }
    }

    /* create buffers for OpenCL device using host memory */
    array_size = (size[X]+2*GHOST_CELL_WIDTH) * ystride;
    a_buf[OLD] = icl_create_buffer(device_id, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(value_type) * array_size);
    a_buf[NEW] = icl_create_buffer(device_id, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(value_type) * array_size);
    delta_buf = icl_create_buffer(device_id, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(value_type) * delta_buffer_size);

    /* copy over buffers to device */
    icl_write_buffer(a_buf[OLD], CL_TRUE, sizeof(value_type) * array_size, a[OLD], NULL, NULL);
    icl_write_buffer(a_buf[NEW], CL_TRUE, sizeof(value_type) * array_size, a[NEW], NULL, NULL);

    /* set the kernel execution type  - data parallel */
 //   cluSetKernelNDRange(clu, kernel, DIMENSIONS, NULL, size, local_workblock_size);

    /*  iterate until maximum difference is less than the given tolerance
        or number of iterations is too high */
    do {
        /* swap array pointers for next iteration */
        SWAP_PTR(a[OLD], a[NEW]);
        SWAP_BUF(a_buf[OLD], a_buf[NEW]);
        icl_run_kernel(kernel, DIMENSIONS, size, local_workblock_size, NULL, NULL, 6,
                    (size_t)0,(void *) a_buf[OLD],
                    (size_t)0, (void *) a_buf[NEW],
                    sizeof(value_type) * tile_delta_size, NULL,
                    sizeof(value_type) * tile_cache_size, NULL,
                    (size_t)0, (void *) delta_buf,
                    sizeof(cl_uint), (void *) &ystride);

        /* while the kernel is running, calculate the reduction for the previous iteration */
        max_diff = ocl_jacobi_reduce(delta, delta_size);

        /* enqueue a synchronous copy of the delta. This will not occur until the kernel
         * has finished. The deltas for each workgroup is a much smaller array to process
         */
        icl_read_buffer(delta_buf, CL_TRUE, sizeof(value_type) * delta_buffer_size, delta, NULL, NULL);
//        clEnqueueReadBuffer(queue, a_buf[NEW], CL_TRUE,    0, sizeof(value_type) * array_size, a[NEW], 0, NULL, NULL));

        /* output status for user, overwrite the same line */
        if ((0 == iter % 100)) {
            printf("Iteration=%5d, max difference=%0.7f, target=%0.7f\r",
                        iter, max_diff, tolerance);
            fflush(stdout);
        }


        /* increment the iteration counter */
        iter++;
    } while (max_diff > tolerance && max_iter >= iter); /* do loop */

    /* read back the final result */
    icl_read_buffer(a_buf[NEW], CL_TRUE, sizeof(value_type) * array_size, a[NEW], NULL, NULL);

    /* output final iteration count and maximum difference value;
       max_diff is reduced one iteration behind the kernel, hence iter-1 */
    printf("Iteration=%5d, max difference=%0.7f, execution time=%.3f seconds\n", iter-1, max_diff, timer);
    fflush(stdout);

    /* finish usage of OpenCL device */
    icl_release_buffers(3, a_buf[OLD], a_buf[NEW], delta_buf);
    icl_release_kernel(kernel);
    free(delta);
}