/*
 * Acquire the MCS queue lock L on behalf of the caller, using I as the
 * caller's own queue node.
 *
 * Steps, in the order the code performs them:
 *   1. I->next is cleared.
 *   2. I is exchanged into the lock word with SWAP_PTR; the value that comes
 *      back is kept in pred (presumably the previous tail of the waiter
 *      queue -- SWAP_PTR is defined elsewhere, confirm it is an atomic
 *      exchange).
 *   3. If pred is NULL the function returns at once: nobody held the lock.
 *   4. Otherwise I->waiting is set to 1, a MEM_BARRIER is issued, and only
 *      then is the node published through pred->next.
 *   5. The caller spins until I->waiting reads 0. Nothing in this function
 *      clears the flag; presumably the matching release routine does.
 *
 * Parameters:
 *   L  the lock to acquire.
 *   I  the caller's queue node; its next and waiting fields are overwritten.
 */
void mcs_acquire(mcs_lock *L, mcs_qnode_ptr I)
{
    I->next = NULL;
#ifndef __tile__
    mcs_qnode_ptr pred = (mcs_qnode*) SWAP_PTR((volatile void*) L, (void*) I);
#else
    /* On __tile__ builds an explicit barrier precedes the exchange so that
     * the store to I->next above is ordered before it. */
    MEM_BARRIER;
    mcs_qnode_ptr pred = (mcs_qnode*) SWAP_PTR( L, I);
#endif
    if (pred == NULL) /* lock was free */
        return;
    I->waiting = 1; // word on which to spin
    /* The flag must be set before the node becomes reachable through
     * pred->next, hence the barrier between the two stores. */
    MEM_BARRIER;
    pred->next = I; // make pred point to me
#if defined(OPTERON_OPTIMIZE)
    /* Prefetch the node's line for writing before entering the spin loop. */
    PREFETCHW(I);
#endif /* OPTERON_OPTIMIZE */
    while (I->waiting != 0)
    {
#ifndef __MIC__
        /* No PAUSE is issued on __MIC__ builds. */
        PAUSE;
#endif
#if defined(OPTERON_OPTIMIZE)
        pause_rep(23);
        PREFETCHW(I);
#endif /* OPTERON_OPTIMIZE */
    }
}
int *MallocPlus::memory_reorder(int *malloc_mem_ptr, int *iorder){ map <void *, malloc_plus_memory_entry*>::iterator it = memory_ptr_dict.find(malloc_mem_ptr); if (it != memory_ptr_dict.end() ){ malloc_plus_memory_entry *memory_item = it->second; int *ptr; memory_ptr_dict.erase(it); if (DEBUG) printf("Found memory item ptr %p name %s\n",memory_item->mem_ptr,memory_item->mem_name); int *tmp = (int *)malloc(memory_item->mem_nelem[0]*memory_item->mem_elsize); #ifdef _OPENMP #pragma omp parallel for #endif for (uint ic = 0; ic < memory_item->mem_nelem[0]; ic++){ tmp[ic] = malloc_mem_ptr[iorder[ic]]; } SWAP_PTR(malloc_mem_ptr, tmp, ptr); free(tmp); memory_item->mem_ptr = malloc_mem_ptr; memory_ptr_dict.insert(std::pair<void*, malloc_plus_memory_entry*>(malloc_mem_ptr, memory_item) ); } else { if (DEBUG) printf("Warning -- memory pointer %p not found\n",malloc_mem_ptr); } return(malloc_mem_ptr); }
/*
 * Acquire the CLH lock L using I as the caller's queue node.
 *
 * I->locked is set to 1, then I is exchanged into the lock word with
 * SWAP_PTR. If a previous node comes back, the caller spins until that
 * node's locked field reads 0.
 *
 * Returns the node obtained from the exchange, which is NULL when the lock
 * word held no node.
 */
volatile clh_qnode* clh_acquire(clh_lock *L, clh_qnode* I )
{
    I->locked = 1;

    clh_qnode_ptr prev = (clh_qnode*) SWAP_PTR((volatile void*) (L), (void*) I);

    /* Only wait when there actually is a predecessor; a NULL result means
     * the lock was free and is returned as-is. */
    if (prev != NULL)
    {
        while (prev->locked != 0)
        {
            PAUSE;
        }
    }

    return prev;
}
/*
 * Run a real-valued signal through a sequence of frequency-domain filter
 * stages.
 *
 * The input is transformed with realFFT2, every stage is applied in turn
 * with applyFilterStage (the two work buffers are swapped after each stage
 * so the latest result is always in realimagInput), and the result is
 * transformed back with realFFT2(..., INVERSE_FFT).
 *
 * Parameters:
 *   outputData    receives "rows" filtered samples.
 *   inputData     "rows" real input samples.
 *   timeData      "rows" time values; only the first and last are read, to
 *                 derive the frequency spacing.
 *   rows          number of samples.
 *   filterStage   array of "filterStages" stage descriptions.
 *   filterStages  number of stages to apply.
 *
 * Returns 1 on success, 0 if any stage fails. Allocation failure is reported
 * through SDDS_Bomb. The work buffers are released on every return path.
 */
long applyFilters(double *outputData, double *inputData, double *timeData,
                  long rows, FILTER_STAGE *filterStage, long filterStages)
{
  long stage, row, frequencies;
  double *realimagInput, *realimagOutput;
  double length, dfrequency;

  realimagOutput = NULL;
  if (!(realimagInput = (double*)malloc(sizeof(*realimagInput)*(rows+2))) ||
      !(realimagOutput = (double*)malloc(sizeof(*realimagOutput)*(rows+2))) )
    SDDS_Bomb("allocation failure");

  /* input data is real and has "rows" rows */
  /* result is interleaved real and imaginary */
  realFFT2(realimagInput, inputData, rows, 0);
  frequencies = rows/2 + 1;

  /* length is the assumed length of the periodic signal,
     which is one interval longer than the actual data entered */
  length = ((double)rows)*(timeData[rows-1]-timeData[0])/((double)rows-1.0);
  dfrequency = 1.0/length;

  for (stage=0; stage<filterStages; stage++) {
    if (!applyFilterStage(realimagOutput, realimagInput, frequencies, dfrequency,
                          filterStage+stage)) {
      /* Release the work buffers before reporting failure; returning
         straight away would leak both of them. */
      free(realimagOutput);
      free(realimagInput);
      return 0;
    }
    /* The stage output becomes the input of the next stage. */
    SWAP_PTR(realimagOutput, realimagInput);
  }

  /* input is still interleaved real and imaginary. The flag
     INVERSE_FFT ensures that the array is interpreted correctly. */
  /* output is interleaved real and imaginary */
#if DEBUG
  /* Values are stored as (real, imaginary) pairs, so step two at a time. */
  for (row=0; row<rows; row+=2) {
    fprintf(stdout, "Real: %f, Imag: %f\n", realimagInput[row], realimagInput[row+1]);
  }
#endif
  realFFT2(realimagOutput, realimagInput, rows, INVERSE_FFT);

  for (row=0; row<rows; row++)
    outputData[row] = realimagOutput[row];

  free(realimagOutput);
  free(realimagInput);

  return 1;
}
/**************************************************************************
 Function: reference_jacobi

 This routine contains the main iteration loop for the Jacobi iteration
 reference implementation (no OpenCL).

 Both arrays are initialised with INITIAL_GUESS and the boundary conditions,
 then the loop swaps the OLD/NEW pointers and runs reference_jacobi_kernel
 until the maximum difference is no longer above the tolerance or max_iter
 iterations have been done.

 params:
    a                       two arrays to compute solution into
    max_iter                maximum number of iterations
    size                    size of array for this MPI rank
    tolerance               all differences should be less than this
                            tolerance value
    d                       discretion size (forwarded to
                            set_boundary_conditions and print_array)
**************************************************************************/
static void reference_jacobi(value_type *a[2],
                             unsigned int max_iter,
                             size_t size[DIMENSIONS],
                             value_type tolerance,
                             value_type d[DIMENSIONS]) {
    unsigned int iter = 0;
    value_type max_diff;
    /* NOTE: no timing is performed in this routine. The value is
       zero-initialised so that the final report below does not read an
       indeterminate variable; the output format is left unchanged. */
    value_type timer = 0;

    /* init arrays by setting the initial value and the boundary conditions */
    set_initial_solution(a[OLD], size, INITIAL_GUESS);
    set_initial_solution(a[NEW], size, INITIAL_GUESS);
    set_boundary_conditions(a[OLD], size, d);
    set_boundary_conditions(a[NEW], size, d);

    /* print the initial solution guess */
    print_array("Init ", a[NEW], size, d);

    /* iterate until maximum difference is less than the given tolerance
       or number of iterations is too high */
    do {
        /* swap array pointers for next iteration */
        SWAP_PTR(a[OLD], a[NEW]);

        /* iterate using a[OLD] as the input and a[NEW] as the output */
        max_diff = reference_jacobi_kernel(a[OLD], a[NEW], size);

        /* output status for user, overwrite the same line */
        if (0 == iter % 100) {
            printf("Iteration=%5d, max difference=%0.7f, target=%0.7f\r",
                   iter, max_diff, tolerance);
            fflush(stdout);
        }

        /* increment counter */
        iter++;
    } while (max_diff > tolerance && max_iter > iter); /* do loop */

    /* output final iteration count and maximum difference value */
    printf("Iteration=%5d, max difference=%0.7f, execution time=%.3f seconds\n",
           iter, max_diff, timer);
}
/*
 * Codelet body run between stencil time steps.
 *
 * Reads the source/destination matrix pointers, the matrix dimensions and
 * the remaining step count from the Stencil2DPartition frame, decrements the
 * step count, and then either signals signalUP (no steps left) or invokes a
 * new Stencil2DPartition for the remaining steps.
 */
void StencilSwap::fire(void){
	LOAD_FRAME(Stencil2DPartition);
	uint64_t timestep = FRAME(timeStep);
	double *src = FRAME(Initial); //matrix pointer initial Matrix[M][N]
	const uint64_t InitialM = FRAME(nRows); // matrix M row
	const uint64_t InitialN = FRAME(nCols); // Matrix N column
	double *dst = FRAME(New);
	// Only the local copy of the step count is decremented here.
	timestep --;
	// View both flat buffers as 2D arrays with InitialN columns per row.
	typedef double (*Array2D)[InitialN];
	Array2D DST = (Array2D) dst, SRC = (Array2D) src;
	// NOTE(review): this exchanges the local views DST and SRC, neither of
	// which is read again in this function, while INVOKE below is handed
	// src and dst. Confirm against the SWAP_PTR definition and the
	// Stencil2DPartition constructor that the buffers really end up swapped.
	SWAP_PTR(&DST,&SRC);
	if (timestep == 0)
		SIGNAL(signalUP);
	else if (timestep !=0){
		// Earlier variant, kept for reference:
		// INVOKE(Stencil2D,NewMatrix,InitialM,InitialN,InitialMatrix,timestep,&Runtime::finalSignal);
		INVOKE(Stencil2DPartition,src,InitialM,InitialN,dst,timestep,&Runtime::finalSignal);
	}
	EXIT_TP();
}
/*
 * Permute a tracked real_t array according to iorder.
 *
 * memory_list is scanned for an entry whose mem_ptr equals malloc_mem_ptr.
 * When one is found, a new buffer is filled so that element i holds the old
 * element iorder[i], the old buffer is freed, and the entry is pointed at
 * the new buffer. When none is found, nothing is changed (a warning is
 * printed if DEBUG is set).
 *
 * Parameters:
 *   malloc_mem_ptr  address of the array to reorder.
 *   iorder          gather indices; at least mem_nelem entries are read.
 *
 * Returns the address of the reordered array, or malloc_mem_ptr unchanged
 * when the pointer is not tracked.
 */
real_t *MallocPlus::memory_reorder(real_t *malloc_mem_ptr, int *iorder){
   // Linear search for the entry that owns this pointer.
   list<malloc_plus_memory_entry>::iterator node = memory_list.begin();
   while (node != memory_list.end()) {
      if (DEBUG) printf("Testing it ptr %p ptr in %p name %s\n",node->mem_ptr,malloc_mem_ptr,node->mem_name);
      if (malloc_mem_ptr == node->mem_ptr) break;
      node++;
   }

   // Guard clause: untracked pointers are handed back untouched.
   if (node == memory_list.end()) {
      if (DEBUG) printf("Warning -- memory pointer %p not found\n",malloc_mem_ptr);
      return(malloc_mem_ptr);
   }

   if (DEBUG) printf("Found it ptr %p name %s\n",node->mem_ptr,node->mem_name);

   // Gather the elements into a freshly allocated buffer.
   real_t *reordered = (real_t *)malloc(node->mem_nelem*node->mem_elsize);
   for (uint idx = 0; idx < node->mem_nelem; idx++){
      reordered[idx] = malloc_mem_ptr[iorder[idx]];
   }

   // After the swap malloc_mem_ptr names the new buffer and reordered
   // names the old one, which is released.
   real_t *scratch;
   SWAP_PTR(malloc_mem_ptr, reordered, scratch);
   free(reordered);

   node->mem_ptr = malloc_mem_ptr;

   return(malloc_mem_ptr);
}
/*
 * Apply the permutation iorder to every array tracked in memory_ptr_dict.
 *
 * For each tracked entry a new buffer is allocated, filled so that element i
 * holds the old element iorder[i], the old buffer is freed, and the entry is
 * re-inserted into memory_ptr_dict under the new address. The element type
 * used for the copy is chosen per entry:
 *   - mem_flags has bit 0x100 set: int; the gathered values are additionally
 *     mapped through the inverse of iorder;
 *   - otherwise mem_elsize == 8:    double;
 *   - otherwise:                    float.
 *
 * Parameters:
 *   iorder  gather indices; at least mem_nelem[0] entries are read per array.
 */
void MallocPlus::memory_reorder_all(int *iorder){
   // Iterate over a copy of the dictionary: the live one is modified
   // (erase/insert) inside the loop.
   map <void *, malloc_plus_memory_entry*> memory_ptr_dict_old = memory_ptr_dict;
   map <void *, malloc_plus_memory_entry*>::iterator it_old;

   // Inverse permutation, built lazily by the 0x100 branch below.
   vector<int> inv_iorder;

   for ( it_old=memory_ptr_dict_old.begin(); it_old != memory_ptr_dict_old.end(); it_old++){
      malloc_plus_memory_entry *memory_item_old = it_old->second;
      map <void *, malloc_plus_memory_entry*>::iterator it = memory_ptr_dict.find(memory_item_old->mem_ptr);
      // memory_item and memory_item_old refer to the same entry object.
      malloc_plus_memory_entry *memory_item = it_old->second;
      // NOTE(review): `it` is erased without comparing it to end(); this
      // assumes every entry of the copy is still keyed by its mem_ptr in the
      // live dictionary -- confirm.
      memory_ptr_dict.erase(it);

      if (memory_item_old->mem_flags & 0x100) {
         // (Re)build the inverse permutation only when the cached one is
         // shorter than this array.
         if (inv_iorder.size() < memory_item_old->mem_nelem[0]) {
            inv_iorder.resize(memory_item_old->mem_nelem[0]);
            for (int ic = 0; ic < (int)memory_item_old->mem_nelem[0]; ic++){
               inv_iorder[iorder[ic]] = ic;
            }
         }
         int *ptr;
         int *malloc_mem_ptr = (int *)memory_item_old->mem_ptr;
         int *tmp = (int *)malloc(memory_item_old->mem_nelem[0]*memory_item_old->mem_elsize);
         // Gather, then translate each stored value through inv_iorder.
         for (uint ic = 0; ic < memory_item_old->mem_nelem[0]; ic++){
            tmp[ic] = inv_iorder[malloc_mem_ptr[iorder[ic]]];
         }
         // NOTE(review): memory_replace is defined elsewhere and is called
         // here after the entry has already been erased from the live
         // dictionary; the double branch below does not call it at all.
         // Confirm what it does with the old pointer, given that tmp is
         // freed right after the swap.
         memory_replace(malloc_mem_ptr, tmp);
         // After the swap malloc_mem_ptr names the new buffer and tmp the
         // old one.
         SWAP_PTR(malloc_mem_ptr, tmp, ptr);
         free(tmp);
         memory_item->mem_ptr = malloc_mem_ptr;
         memory_ptr_dict.insert(std::pair<void*, malloc_plus_memory_entry*>(malloc_mem_ptr, memory_item) );
      } else if (memory_item_old->mem_elsize == 8){
         double *ptr;
         double *malloc_mem_ptr = (double *)memory_item_old->mem_ptr;
         double *tmp = (double *)malloc(memory_item_old->mem_nelem[0]*memory_item_old->mem_elsize);
         for (uint ic = 0; ic < memory_item_old->mem_nelem[0]; ic++){
            tmp[ic] = malloc_mem_ptr[iorder[ic]];
         }
         SWAP_PTR(malloc_mem_ptr, tmp, ptr);
         free(tmp);
         memory_item->mem_ptr = malloc_mem_ptr;
         memory_ptr_dict.insert(std::pair<void*, malloc_plus_memory_entry*>(malloc_mem_ptr, memory_item) );
      } else {
         float *ptr;
         float *malloc_mem_ptr = (float *)memory_item_old->mem_ptr;
         float *tmp = (float *)malloc(memory_item_old->mem_nelem[0]*memory_item_old->mem_elsize);
         for (uint ic = 0; ic < memory_item_old->mem_nelem[0]; ic++){
            tmp[ic] = malloc_mem_ptr[iorder[ic]];
         }
         // NOTE(review): see the remark on memory_replace in the first
         // branch.
         memory_replace(malloc_mem_ptr, tmp);
         SWAP_PTR(malloc_mem_ptr, tmp, ptr);
         free(tmp);
         memory_item->mem_ptr = malloc_mem_ptr;
         memory_ptr_dict.insert(std::pair<void*, malloc_plus_memory_entry*>(malloc_mem_ptr, memory_item) );
      }
   }
   inv_iorder.clear();
}
/*
 * Reload the prefix set of m1ps from the META0 service at m0_addr.
 *
 * The full prefix mapping is fetched with meta0_remote_get_meta1_all, a new
 * cache and by-prefix array are built from it, and both are swapped into
 * m1ps while m1ps->lock is held. The structures that were in place before
 * the swap are released after the lock is dropped.
 *
 * Parameters:
 *   m1ps              prefix set to update; must not be NULL.
 *   ns_name           unused apart from tracing.
 *   local_addr        address forwarded to _cache_from_m0l.
 *   m0_addr           address of the META0 service to query.
 *   updated_prefixes  when m1ps already held a by_prefix array, receives a
 *                     new GArray of guint16 listing every prefix for which
 *                     _cache_is_managed differs between the old and the new
 *                     cache. Left untouched otherwise.
 *   meta0_ok          set to TRUE once META0 has returned a non-empty list.
 *
 * Returns NULL on success and also when META0 reports no prefix (in which
 * case meta0_ok is not set); returns the prefixed remote error otherwise.
 */
static GError*
_cache_load_from_m0(struct meta1_prefixes_set_s *m1ps,
		const gchar *ns_name,
		const struct addr_info_s *local_addr,
		struct addr_info_s *m0_addr,
		GArray **updated_prefixes,
		gboolean *meta0_ok)
{
	GError *err = NULL;
	GSList *m0info_list = NULL;

	EXTRA_ASSERT(m1ps != NULL);
	GRID_TRACE2("%s(%p,%s,%p,%p)", __FUNCTION__, m1ps, ns_name, local_addr,
			m0_addr);

	(void)ns_name;
	gchar m0[STRLEN_ADDRINFO];
	grid_addrinfo_to_string (m0_addr, m0, sizeof(m0));
	err = meta0_remote_get_meta1_all(m0, &m0info_list);
	if (err) {
		g_prefix_error(&err, "Remote error: ");
		return err;
	}
	if (!m0info_list) {
		GRID_DEBUG("META0 has no prefix configured!");
		return NULL;
	}

	*meta0_ok = TRUE;
	guint8 *cache = _cache_from_m0l(m0info_list, local_addr);
	GPtrArray *by_prefix = meta0_utils_list_to_array(m0info_list);

	g_mutex_lock(&m1ps->lock);
	GRID_DEBUG("Got %u prefixes from M0, %u in place",
			by_prefix->len, m1ps->by_prefix ? m1ps->by_prefix->len : 0);

	if ( m1ps->by_prefix ) {
		guint prefix;
		*updated_prefixes = g_array_new(FALSE, FALSE, sizeof(guint16));
		/* The counter must be wider than 16 bits for the loop to end,
		 * but the lookups and the array both consume exactly two bytes.
		 * Reading those bytes straight out of a guint picks the wrong
		 * half on big-endian hosts, so go through a guint16 copy. */
		for ( prefix=0 ; prefix <65536 ; prefix++) {
			guint16 key = (guint16) prefix;
			if ( _cache_is_managed(m1ps->cache, (guint8 *)&key) !=
					_cache_is_managed(cache, (guint8 *)&key)) {
				g_array_append_vals(*updated_prefixes, &key, 1);
			}
		}
	}

	/* Install the new structures; the locals now hold the old ones. */
	SWAP_PTR(m1ps->by_prefix, by_prefix);
	SWAP_PTR(m1ps->cache, cache);
	g_mutex_unlock(&m1ps->lock);

	/* Release whatever was in place before the swap. */
	if (by_prefix)
		meta0_utils_array_clean(by_prefix);
	by_prefix = NULL;

	if (cache)
		g_free(cache);
	cache = NULL;

	g_slist_foreach(m0info_list, meta0_info_gclean, NULL);
	g_slist_free(m0info_list);
	return NULL;
}
/************************************************************************** Function: ocl_jacobi This routine contains the main iteration loop for the Jacobi iteration using OpenCL kernel. params: a two arrays to compute solution into max_iter maximum number of iterations size size of array for this MPI rank tolerance all differences should be les than this tolerance value mpi_ranks number of MPI ranks in each dimension rank_pos cartesian position of this rank origin origin for this rank d discretion size mpi_comm MPI communications structure local_workblock_size size of local workblock for OpenCL kernel device_type OpenCL device type full_copy boolean if full buffer copy is to be done **************************************************************************/ static void ocl_jacobi(value_type *a[2], unsigned int max_iter, size_t size[DIMENSIONS], value_type tolerance, value_type d[DIMENSIONS], size_t local_workblock_size[DIMENSIONS], cl_device_type device_type, unsigned int full_copy) { size_t array_size; unsigned int i, j, rc, iter = 0; size_t delta_buffer_size, delta_size[DIMENSIONS]; size_t tile_delta_size, tile_cache_size; value_type max_diff, timer; icl_device* device_id; icl_kernel* kernel; cl_int err; icl_buffer *a_buf[2], *delta_buf; value_type *delta; /* convenience for y stride in array */ cl_uint ystride = size[Y]+2*GHOST_CELL_WIDTH; /* init devices */ icl_init_devices(device_type); /* find OpenCL device */ device_id = icl_get_device(0); /* build the kernel and verify the kernel */ kernel = icl_create_kernel(device_id, "jacsolver_kernel.cl", "ocl_jacobi_local_copy", "", ICL_SOURCE); /* calculate size of kernel local memory - also used later for kernel params */ tile_delta_size = local_workblock_size[X] * local_workblock_size[Y]; tile_cache_size = (local_workblock_size[X]+2*GHOST_CELL_WIDTH) * (local_workblock_size[Y]+2*GHOST_CELL_WIDTH); /* verify the device has enough resources for this device */ /* I'm an optimist, we just hope for the best if 
((cluGetAvailableLocalMem(device_id, kernel) < tile_delta_size + tile_cache_size) || (! cluCheckLocalWorkgroupSize(device_id, kernel, DIMENSIONS, local_workblock_size))) { local_workblock_size[X] = 1; local_workblock_size[Y] = 1; } */ printf("Estimating solution using OpenCL Jacobi iteration with %d x %d workblock.\n", (int)local_workblock_size[X], (int)local_workblock_size[Y]); fflush(stdout); /* init arrays by setting the initial value and the boundary conditions */ set_initial_solution(a[OLD], size, INITIAL_GUESS); set_initial_solution(a[NEW], size, INITIAL_GUESS); set_boundary_conditions(a[OLD], size, d); set_boundary_conditions(a[NEW], size, d); /* print the initial solution guess */ print_array("Init ", a[NEW], size, d); /* allocate memory for differences */ delta_size[X] = size[X] / local_workblock_size[X]; delta_size[Y] = size[Y] / local_workblock_size[Y]; delta_buffer_size = delta_size[X] * delta_size[Y]; delta = (value_type *)malloc(sizeof(value_type) * delta_buffer_size); /* initialize deltas so that first execution of kernel with overlapping * reduction on the host will work correctly and not prematurely exit */ for (i=0; i<delta_size[X]; ++i) { for (j=0; j<delta_size[Y]; ++j) { delta[i * delta_size[Y] + j] = 1.0; } } /* create buffers for OpenCL device using host memory */ array_size = (size[X]+2*GHOST_CELL_WIDTH) * ystride; a_buf[OLD] = icl_create_buffer(device_id, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(value_type) * array_size); a_buf[NEW] = icl_create_buffer(device_id, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(value_type) * array_size); delta_buf = icl_create_buffer(device_id, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(value_type) * delta_buffer_size); /* copy over buffers to device */ icl_write_buffer(a_buf[OLD], CL_TRUE, sizeof(value_type) * array_size, a[OLD], NULL, NULL); icl_write_buffer(a_buf[NEW], CL_TRUE, sizeof(value_type) * array_size, a[NEW], NULL, NULL); /* set the kernel execution type - data parallel */ // 
cluSetKernelNDRange(clu, kernel, DIMENSIONS, NULL, size, local_workblock_size); /* iterate until maximum difference is less than the given tolerance or number of iterations is too high */ do { /* swap array pointers for next iteration */ SWAP_PTR(a[OLD], a[NEW]); SWAP_BUF(a_buf[OLD], a_buf[NEW]); icl_run_kernel(kernel, DIMENSIONS, size, local_workblock_size, NULL, NULL, 6, (size_t)0,(void *) a_buf[OLD], (size_t)0, (void *) a_buf[NEW], sizeof(value_type) * tile_delta_size, NULL, sizeof(value_type) * tile_cache_size, NULL, (size_t)0, (void *) delta_buf, sizeof(cl_uint), (void *) &ystride); /* while the kernel is running, calculate the reduction for the previous iteration */ max_diff = ocl_jacobi_reduce(delta, delta_size); /* enqueue a synchronous copy of the delta. This will not occur until the kernel * has finished. The deltas for each workgroup is a much smaller array to process */ icl_read_buffer(delta_buf, CL_TRUE, sizeof(value_type) * delta_buffer_size, delta, NULL, NULL); // clEnqueueReadBuffer(queue, a_buf[NEW], CL_TRUE, 0, sizeof(value_type) * array_size, a[NEW], 0, NULL, NULL)); /* output status for user, overwrite the same line */ if ((0 == iter % 100)) { printf("Iteration=%5d, max difference=%0.7f, target=%0.7f\r", iter, max_diff, tolerance); fflush(stdout); } /* increment the iteration counter */ iter++; } while (max_diff > tolerance && max_iter >= iter); /* do loop */ /* read back the final result */ icl_read_buffer(a_buf[NEW], CL_TRUE, sizeof(value_type) * array_size, a[NEW], NULL, NULL); /* output final iteration count and maximum difference value */ printf("Iteration=%5d, max difference=%0.7f, execution time=%.3f seconds\n", iter-1, max_diff, timer); fflush(stdout); /* finish usage of OpenCL device */ icl_release_buffers(3, a_buf[OLD], a_buf[NEW], delta_buf); icl_release_kernel(kernel); free(delta); }