/*
 * Frees all per-device scratch data owned by the simulator handle.
 *
 * For each device slot, makes the owning GPU current (for slots backed by a
 * GPU) before releasing timers, visibility blocks, scratch arrays, sky
 * chunks, the telescope copy and Jones matrices, then zeroes the slot so it
 * can be safely re-initialised later.
 *
 * Fix: removed the dead `if (!d) continue;` check — `d` is the address of an
 * array element (`&(h->d[i])`) and can never be NULL.
 */
static void free_device_data(oskar_Simulator* h, int* status)
{
    int i;
    if (!h->d) return;
    for (i = 0; i < h->num_devices; ++i)
    {
        DeviceData* d = &(h->d[i]);

        /* Make the device that owns the memory current before freeing. */
        if (i < h->num_gpus)
            oskar_device_set(h->gpu_ids[i], status);

        /* Timers. */
        oskar_timer_free(d->tmr_compute);
        oskar_timer_free(d->tmr_copy);
        oskar_timer_free(d->tmr_clip);
        oskar_timer_free(d->tmr_E);
        oskar_timer_free(d->tmr_K);
        oskar_timer_free(d->tmr_join);
        oskar_timer_free(d->tmr_correlate);

        /* Visibility blocks (double-buffered host copies + device block). */
        oskar_vis_block_free(d->vis_block_cpu[0], status);
        oskar_vis_block_free(d->vis_block_cpu[1], status);
        oskar_vis_block_free(d->vis_block, status);

        /* Baseline coordinate scratch arrays. */
        oskar_mem_free(d->u, status);
        oskar_mem_free(d->v, status);
        oskar_mem_free(d->w, status);

        /* Sky model chunks and telescope copy. */
        oskar_sky_free(d->chunk, status);
        oskar_sky_free(d->chunk_clip, status);
        oskar_telescope_free(d->tel, status);
        oskar_station_work_free(d->station_work, status);

        /* Jones matrices. */
        oskar_jones_free(d->J, status);
        oskar_jones_free(d->E, status);
        oskar_jones_free(d->K, status);
        oskar_jones_free(d->R, status);

        /* Zero the slot so a later set-up pass re-allocates everything. */
        memset(d, 0, sizeof(DeviceData));
    }
}
/*
 * Destroys an interferometer handle and everything it owns.
 *
 * Safe to call with a NULL handle. Per-device caches are released first
 * (via the reset-cache call), each GPU that was in use is reset, and the
 * host-side model data, timers and synchronisation primitives are then
 * freed before the handle itself.
 */
void oskar_interferometer_free(oskar_Interferometer* h, int* status)
{
    int idx;
    if (!h) return;

    /* Release device caches, then reset every GPU that was selected. */
    oskar_interferometer_reset_cache(h, status);
    for (idx = 0; idx < h->num_gpus; ++idx)
    {
        oskar_device_set(h->gpu_ids[idx], status);
        oskar_device_reset();
    }

    /* Host-side sky model chunks. */
    for (idx = 0; idx < h->num_sky_chunks; ++idx)
        oskar_sky_free(h->sky_chunks[idx], status);

    /* Telescope model, scratch memory and timers. */
    oskar_telescope_free(h->tel, status);
    oskar_mem_free(h->temp, status);
    oskar_timer_free(h->tmr_sim);
    oskar_timer_free(h->tmr_write);

    /* Thread-synchronisation primitives. */
    oskar_mutex_free(h->mutex);
    oskar_barrier_free(h->barrier);

    /* Plain heap arrays and strings, then the handle itself. */
    free(h->sky_chunks);
    free(h->gpu_ids);
    free(h->vis_name);
    free(h->ms_name);
    free(h->settings_path);
    free(h->d);
    free(h);
}
/*
 * Selects which GPUs the simulator will use.
 *
 * num < 0  : use all devices reported by oskar_device_count().
 * num > 0  : use the 'num' device IDs given in 'ids' (must not exceed the
 *            number of devices available).
 * num == 0 : release the GPU list and run on CPU only.
 *
 * Any existing per-device data is freed first, and each selected device is
 * probed with oskar_device_set() so invalid IDs are reported via 'status'.
 *
 * Fix: the original assigned realloc() directly to h->gpu_ids, which leaks
 * the old array and dereferences NULL on allocation failure; a temporary
 * pointer is now used and failure sets an error status.
 */
void oskar_simulator_set_gpus(oskar_Simulator* h, int num, int* ids,
        int* status)
{
    int i, num_gpus_avail;
    int* tmp;
    if (*status) return;

    /* Existing device data is tied to the old device list: free it. */
    free_device_data(h, status);
    num_gpus_avail = oskar_device_count(status);
    if (*status) return;

    if (num < 0)
    {
        /* Use all available devices, IDs 0..n-1. */
        tmp = (int*) realloc(h->gpu_ids, num_gpus_avail * sizeof(int));
        if (!tmp && num_gpus_avail > 0)
        {
            *status = OSKAR_ERR_MEMORY_ALLOC_FAILURE;
            return;
        }
        h->gpu_ids = tmp;
        h->num_gpus = num_gpus_avail;
        for (i = 0; i < h->num_gpus; ++i) h->gpu_ids[i] = i;
    }
    else if (num > 0)
    {
        /* Use the explicit list of device IDs supplied by the caller. */
        if (num > num_gpus_avail)
        {
            oskar_log_error(h->log, "More GPUs were requested than found.");
            *status = OSKAR_ERR_COMPUTE_DEVICES;
            return;
        }
        tmp = (int*) realloc(h->gpu_ids, num * sizeof(int));
        if (!tmp)
        {
            *status = OSKAR_ERR_MEMORY_ALLOC_FAILURE;
            return;
        }
        h->gpu_ids = tmp;
        h->num_gpus = num;
        for (i = 0; i < h->num_gpus; ++i) h->gpu_ids[i] = ids[i];
    }
    else /* num == 0 */
    {
        /* CPU-only mode: drop the GPU list entirely. */
        free(h->gpu_ids);
        h->gpu_ids = 0;
        h->num_gpus = 0;
    }

    /* Probe each selected device so bad IDs fail early. */
    for (i = 0; i < h->num_gpus; ++i)
    {
        oskar_device_set(h->gpu_ids[i], status);
        if (*status) return;
    }
}
/*
 * Selects which GPUs the imager will use.
 *
 * num < 0  : use all devices reported by oskar_device_count().
 * num > 0  : use the 'num' device IDs given in 'ids' (must not exceed the
 *            number of devices available).
 * num == 0 : release the GPU list and run on CPU only.
 *
 * Existing per-device imager data is freed first, and each selected device
 * is probed with oskar_device_set() so invalid IDs are reported via 'status'.
 *
 * Fix: the original assigned realloc() directly to h->gpu_ids, which leaks
 * the old array and dereferences NULL on allocation failure; a temporary
 * pointer is now used and failure sets an error status.
 */
void oskar_imager_set_gpus(oskar_Imager* h, int num, const int* ids,
        int* status)
{
    int i, num_gpus_avail;
    int* tmp;
    if (*status) return;

    /* Existing device data is tied to the old device list: free it. */
    oskar_imager_free_device_data(h, status);
    num_gpus_avail = oskar_device_count(status);
    if (*status) return;

    if (num < 0)
    {
        /* Use all available devices, IDs 0..n-1. */
        tmp = (int*) realloc(h->gpu_ids, num_gpus_avail * sizeof(int));
        if (!tmp && num_gpus_avail > 0)
        {
            *status = OSKAR_ERR_MEMORY_ALLOC_FAILURE;
            return;
        }
        h->gpu_ids = tmp;
        h->num_gpus = num_gpus_avail;
        for (i = 0; i < h->num_gpus; ++i) h->gpu_ids[i] = i;
    }
    else if (num > 0)
    {
        /* Use the explicit list of device IDs supplied by the caller. */
        if (num > num_gpus_avail)
        {
            *status = OSKAR_ERR_COMPUTE_DEVICES;
            return;
        }
        tmp = (int*) realloc(h->gpu_ids, num * sizeof(int));
        if (!tmp)
        {
            *status = OSKAR_ERR_MEMORY_ALLOC_FAILURE;
            return;
        }
        h->gpu_ids = tmp;
        h->num_gpus = num;
        for (i = 0; i < h->num_gpus; ++i) h->gpu_ids[i] = ids[i];
    }
    else /* num == 0 */
    {
        /* CPU-only mode: drop the GPU list entirely. */
        free(h->gpu_ids);
        h->gpu_ids = 0;
        h->num_gpus = 0;
    }

    /* Probe each selected device so bad IDs fail early. */
    for (i = 0; i < h->num_gpus; ++i)
    {
        oskar_device_set(h->gpu_ids[i], status);
        if (*status) return;
    }
}
/*
 * Lazily allocates per-device work buffers for beam pattern evaluation.
 *
 * Each device slot gets Jones data, source direction cosines (x, y, z),
 * a private copy of the telescope model and station work arrays, plus
 * host-side double buffers for raw voltage output and, where enabled,
 * auto-/cross-power accumulation arrays per Stokes parameter.
 * Allocation is idempotent: buffers already present are reused, and
 * existing power accumulators are cleared on every call.
 *
 * NOTE(review): 'max_src' (sources per chunk) sizes the cross-power
 * arrays, while 'max_size' (= stations * max_src) sizes the per-station
 * products — presumably cross-power is summed over stations; confirm
 * against the evaluation code.
 */
static void set_up_device_data(oskar_BeamPattern* h, int* status)
{
    int i, beam_type, max_src, max_size, auto_power, cross_power, raw_data;
    if (*status) return;

    /* Get local variables. */
    max_src = h->max_chunk_size;
    max_size = h->num_active_stations * max_src;
    /* Beam values are complex; full polarisation mode needs 2x2 matrices. */
    beam_type = h->prec | OSKAR_COMPLEX;
    if (h->pol_mode == OSKAR_POL_MODE_FULL) beam_type |= OSKAR_MATRIX;
    /* Raw per-station data is only kept if any such output is requested. */
    raw_data = h->ixr_txt || h->ixr_fits || h->voltage_raw_txt ||
            h->voltage_amp_txt || h->voltage_phase_txt ||
            h->voltage_amp_fits || h->voltage_phase_fits;
    auto_power = h->auto_power_fits || h->auto_power_txt;
    cross_power = h->cross_power_raw_txt || h->cross_power_amp_fits ||
            h->cross_power_phase_fits || h->cross_power_amp_txt ||
            h->cross_power_phase_txt;

    /* Expand the number of devices to the number of selected GPUs,
     * if required. */
    if (h->num_devices < h->num_gpus)
        oskar_beam_pattern_set_num_devices(h, h->num_gpus);

    for (i = 0; i < h->num_devices; ++i)
    {
        int dev_loc, i_stokes;
        DeviceData* d = &h->d[i];
        if (*status) break;

        /* Select the device: first num_gpus slots are GPU-backed,
         * the rest run on the CPU. */
        if (i < h->num_gpus)
        {
            oskar_device_set(h->gpu_ids[i], status);
            dev_loc = OSKAR_GPU;
        }
        else
        {
            dev_loc = OSKAR_CPU;
        }

        /* Device memory. Force the first chunk to be (re)copied. */
        d->previous_chunk_index = -1;
        if (!d->tel)
        {
            d->jones_data = oskar_mem_create(beam_type, dev_loc, max_size,
                    status);
            /* +1 guards against zero-length allocations when max_src == 0
             * — TODO confirm. */
            d->x = oskar_mem_create(h->prec, dev_loc, 1 + max_src, status);
            d->y = oskar_mem_create(h->prec, dev_loc, 1 + max_src, status);
            d->z = oskar_mem_create(h->prec, dev_loc, 1 + max_src, status);
            d->tel = oskar_telescope_create_copy(h->tel, dev_loc, status);
            d->work = oskar_station_work_create(h->prec, dev_loc, status);
        }

        /* Host memory: double buffer for raw Jones data, only if some raw
         * output product was requested. */
        if (!d->jones_data_cpu[0] && raw_data)
        {
            d->jones_data_cpu[0] = oskar_mem_create(beam_type, OSKAR_CPU,
                    max_size, status);
            d->jones_data_cpu[1] = oskar_mem_create(beam_type, OSKAR_CPU,
                    max_size, status);
        }

        /* Auto-correlation beam output arrays. */
        for (i_stokes = 0; i_stokes < 4; ++i_stokes)
        {
            /* Skip Stokes parameters not selected for output. */
            if (!h->stokes[i_stokes]) continue;

            if (!d->auto_power[i_stokes] && auto_power)
            {
                /* Device memory. */
                d->auto_power[i_stokes] = oskar_mem_create(beam_type,
                        dev_loc, max_size, status);
                /* Host memory. */
                d->auto_power_cpu[i_stokes][0] = oskar_mem_create(
                        beam_type, OSKAR_CPU, max_size, status);
                d->auto_power_cpu[i_stokes][1] = oskar_mem_create(
                        beam_type, OSKAR_CPU, max_size, status);
                /* Optional averaged products, selected by output mode. */
                if (h->average_single_axis == 'T')
                    d->auto_power_time_avg[i_stokes] = oskar_mem_create(
                            beam_type, OSKAR_CPU, max_size, status);
                if (h->average_single_axis == 'C')
                    d->auto_power_channel_avg[i_stokes] = oskar_mem_create(
                            beam_type, OSKAR_CPU, max_size, status);
                if (h->average_time_and_channel)
                    d->auto_power_channel_and_time_avg[i_stokes] =
                            oskar_mem_create(beam_type, OSKAR_CPU,
                                    max_size, status);
            }

            /* Cross-correlation beam output arrays. */
            if (!d->cross_power[i_stokes] && cross_power)
            {
                /* Cross power needs at least one baseline. */
                if (h->num_active_stations < 2)
                {
                    oskar_log_error(h->log, "Cannot create cross-power beam "
                            "using less than two active stations.");
                    *status = OSKAR_ERR_INVALID_ARGUMENT;
                    break;
                }
                /* Device memory. */
                d->cross_power[i_stokes] = oskar_mem_create(
                        beam_type, dev_loc, max_src, status);
                /* Host memory. */
                d->cross_power_cpu[i_stokes][0] = oskar_mem_create(
                        beam_type, OSKAR_CPU, max_src, status);
                d->cross_power_cpu[i_stokes][1] = oskar_mem_create(
                        beam_type, OSKAR_CPU, max_src, status);
                if (h->average_single_axis == 'T')
                    d->cross_power_time_avg[i_stokes] = oskar_mem_create(
                            beam_type, OSKAR_CPU, max_src, status);
                if (h->average_single_axis == 'C')
                    d->cross_power_channel_avg[i_stokes] = oskar_mem_create(
                            beam_type, OSKAR_CPU, max_src, status);
                if (h->average_time_and_channel)
                    d->cross_power_channel_and_time_avg[i_stokes] =
                            oskar_mem_create(beam_type, OSKAR_CPU,
                                    max_src, status);
            }

            /* Reset accumulators for the new run (covers the reuse case). */
            if (d->auto_power[i_stokes])
                oskar_mem_clear_contents(d->auto_power[i_stokes], status);
            if (d->cross_power[i_stokes])
                oskar_mem_clear_contents(d->cross_power[i_stokes], status);
        }

        /* Timers. */
        if (!d->tmr_compute)
            d->tmr_compute = oskar_timer_create(OSKAR_TIMER_NATIVE);
    }
}
/*
 * Simulates one visibility block on one device.
 *
 * Intended to be called concurrently from several threads, one per device:
 * work units (one sky chunk at one time step) are handed out through a
 * mutex-guarded shared counter (h->work_unit_index), so all threads drain
 * the same block cooperatively. On completion the device visibility block
 * is copied into the active host buffer (double-buffered on block parity).
 *
 * Requires oskar_simulator_check_init() to have been called first, since
 * initialisation cannot happen here once threads are running.
 *
 * Fix: corrected the spelling of "initialised" in the error message.
 */
void oskar_simulator_run_block(oskar_Simulator* h, int block_index,
        int device_id, int* status)
{
    double obs_start_mjd, dt_dump_days;
    int i_active, time_index_start, time_index_end;
    int num_channels, num_times_block, total_chunks, total_times;
    DeviceData* d;
    if (*status) return;

    /* Check that initialisation has happened. We can't initialise here,
     * as we're already multi-threaded at this point. */
    if (!h->header)
    {
        *status = OSKAR_ERR_MEMORY_NOT_ALLOCATED;
        oskar_log_error(h->log, "Simulator not initialised. "
                "Call oskar_simulator_check_init() first.");
        return;
    }

#ifdef _OPENMP
    if (!h->coords_only)
    {
        /* Disable any nested parallelism. */
        omp_set_num_threads(1);
        omp_set_nested(0);
    }
#endif

    /* Set the GPU to use. (Supposed to be a very low-overhead call.) */
    if (device_id >= 0 && device_id < h->num_gpus)
        oskar_device_set(h->gpu_ids[device_id], status);

    /* Clear the visibility block. */
    i_active = block_index % 2; /* Index of the active buffer. */
    d = &(h->d[device_id]);
    oskar_timer_resume(d->tmr_compute);
    oskar_vis_block_clear(d->vis_block, status);

    /* Set the visibility block meta-data. */
    total_chunks = h->num_sky_chunks;
    num_channels = h->num_channels;
    total_times = h->num_time_steps;
    obs_start_mjd = h->time_start_mjd_utc;
    dt_dump_days = h->time_inc_sec / 86400.0;
    time_index_start = block_index * h->max_times_per_block;
    time_index_end = time_index_start + h->max_times_per_block - 1;
    /* The final block may be shorter than max_times_per_block. */
    if (time_index_end >= total_times)
        time_index_end = total_times - 1;
    num_times_block = 1 + time_index_end - time_index_start;

    /* Set the number of active times in the block. */
    oskar_vis_block_set_num_times(d->vis_block, num_times_block, status);
    oskar_vis_block_set_start_time_index(d->vis_block, time_index_start);

    /* Go through all possible work units in the block. A work unit is
     * defined as the simulation for one time and one sky chunk. */
    while (!h->coords_only)
    {
        oskar_Sky* sky;
        int i_work_unit, i_chunk, i_time, i_channel, sim_time_idx;

        /* Claim the next work unit under the shared mutex. */
        oskar_mutex_lock(h->mutex);
        i_work_unit = (h->work_unit_index)++;
        oskar_mutex_unlock(h->mutex);
        if ((i_work_unit >= num_times_block * total_chunks) || *status) break;

        /* Convert slice index to chunk/time index. */
        i_chunk = i_work_unit / num_times_block;
        i_time = i_work_unit - i_chunk * num_times_block;
        sim_time_idx = time_index_start + i_time;

        /* Copy sky chunk to device only if different from the previous one. */
        if (i_chunk != d->previous_chunk_index)
        {
            oskar_timer_resume(d->tmr_copy);
            oskar_sky_copy(d->chunk, h->sky_chunks[i_chunk], status);
            oskar_timer_pause(d->tmr_copy);
        }
        sky = h->apply_horizon_clip ? d->chunk_clip : d->chunk;

        /* Apply horizon clip if required. */
        if (h->apply_horizon_clip)
        {
            double gast, mjd;
            /* Evaluate sidereal time at the middle of the dump interval. */
            mjd = obs_start_mjd + dt_dump_days * (sim_time_idx + 0.5);
            gast = oskar_convert_mjd_to_gast_fast(mjd);
            oskar_timer_resume(d->tmr_clip);
            oskar_sky_horizon_clip(d->chunk_clip, d->chunk, d->tel, gast,
                    d->station_work, status);
            oskar_timer_pause(d->tmr_clip);
        }

        /* Simulate all baselines for all channels for this time and chunk. */
        for (i_channel = 0; i_channel < num_channels; ++i_channel)
        {
            if (*status) break;
            if (h->log)
            {
                /* Log under the mutex to keep multi-thread output readable. */
                oskar_mutex_lock(h->mutex);
                oskar_log_message(h->log, 'S', 1, "Time %*i/%i, "
                        "Chunk %*i/%i, Channel %*i/%i [Device %i, %i sources]",
                        disp_width(total_times), sim_time_idx + 1, total_times,
                        disp_width(total_chunks), i_chunk + 1, total_chunks,
                        disp_width(num_channels), i_channel + 1, num_channels,
                        device_id, oskar_sky_num_sources(sky));
                oskar_mutex_unlock(h->mutex);
            }
            sim_baselines(h, d, sky, i_channel, i_time, sim_time_idx, status);
        }
        d->previous_chunk_index = i_chunk;
    }

    /* Copy the visibility block to host memory. */
    oskar_timer_resume(d->tmr_copy);
    oskar_vis_block_copy(d->vis_block_cpu[i_active], d->vis_block, status);
    oskar_timer_pause(d->tmr_copy);
    oskar_timer_pause(d->tmr_compute);
}
/*
 * Lazily allocates per-device work data for the simulator.
 *
 * Each device slot gets its own timers, visibility blocks (one on the
 * device plus a double-buffered pair on the host), baseline coordinate
 * scratch arrays, sky chunk buffers, a private telescope model copy and
 * the Jones matrices used in the measurement equation. Allocation is
 * idempotent: anything already present is reused, and visibility blocks
 * are cleared on every call.
 */
static void set_up_device_data(oskar_Simulator* h, int* status)
{
    int i, dev_loc, complx, vistype, num_stations, num_src;
    if (*status) return;

    /* Get local variables. */
    num_stations = oskar_telescope_num_stations(h->tel);
    num_src = h->max_sources_per_chunk;
    /* Visibilities are complex; full polarisation needs 2x2 matrices. */
    complx = (h->prec) | OSKAR_COMPLEX;
    vistype = complx;
    if (oskar_telescope_pol_mode(h->tel) == OSKAR_POL_MODE_FULL)
        vistype |= OSKAR_MATRIX;

    /* Expand the number of devices to the number of selected GPUs,
     * if required. */
    if (h->num_devices < h->num_gpus)
        oskar_simulator_set_num_devices(h, h->num_gpus);

    for (i = 0; i < h->num_devices; ++i)
    {
        DeviceData* d = &h->d[i];
        /* Force the first sky chunk to be (re)copied to the device. */
        d->previous_chunk_index = -1;

        /* Select the device: first num_gpus slots are GPU-backed,
         * the rest run on the CPU. */
        if (i < h->num_gpus)
        {
            oskar_device_set(h->gpu_ids[i], status);
            dev_loc = OSKAR_GPU;
        }
        else
        {
            dev_loc = OSKAR_CPU;
        }

        /* Timers. Created as a group, keyed on tmr_compute. */
        if (!d->tmr_compute)
        {
            d->tmr_compute = oskar_timer_create(OSKAR_TIMER_NATIVE);
            d->tmr_copy = oskar_timer_create(OSKAR_TIMER_NATIVE);
            d->tmr_clip = oskar_timer_create(OSKAR_TIMER_NATIVE);
            d->tmr_E = oskar_timer_create(OSKAR_TIMER_NATIVE);
            d->tmr_K = oskar_timer_create(OSKAR_TIMER_NATIVE);
            d->tmr_join = oskar_timer_create(OSKAR_TIMER_NATIVE);
            d->tmr_correlate = oskar_timer_create(OSKAR_TIMER_NATIVE);
        }

        /* Visibility blocks: one on the device, a double-buffered pair on
         * the host so compute and write-out can overlap. */
        if (!d->vis_block)
        {
            d->vis_block = oskar_vis_block_create_from_header(dev_loc,
                    h->header, status);
            d->vis_block_cpu[0] = oskar_vis_block_create_from_header(OSKAR_CPU,
                    h->header, status);
            d->vis_block_cpu[1] = oskar_vis_block_create_from_header(OSKAR_CPU,
                    h->header, status);
        }
        /* Always start a run with cleared blocks, even when reusing them. */
        oskar_vis_block_clear(d->vis_block, status);
        oskar_vis_block_clear(d->vis_block_cpu[0], status);
        oskar_vis_block_clear(d->vis_block_cpu[1], status);

        /* Device scratch memory, keyed on the telescope model copy. */
        if (!d->tel)
        {
            /* Station (u,v,w) coordinates. */
            d->u = oskar_mem_create(h->prec, dev_loc, num_stations, status);
            d->v = oskar_mem_create(h->prec, dev_loc, num_stations, status);
            d->w = oskar_mem_create(h->prec, dev_loc, num_stations, status);
            /* Sky chunk buffers (raw and horizon-clipped). */
            d->chunk = oskar_sky_create(h->prec, dev_loc, num_src, status);
            d->chunk_clip = oskar_sky_create(h->prec, dev_loc, num_src,
                    status);
            d->tel = oskar_telescope_create_copy(h->tel, dev_loc, status);
            /* Jones matrices: J = joined product, E = station beam,
             * K = interferometer phase (always scalar complex),
             * R = parallactic angle rotation (polarised mode only). */
            d->J = oskar_jones_create(vistype, dev_loc, num_stations, num_src,
                    status);
            d->R = oskar_type_is_matrix(vistype) ? oskar_jones_create(vistype,
                    dev_loc, num_stations, num_src, status) : 0;
            d->E = oskar_jones_create(vistype, dev_loc, num_stations, num_src,
                    status);
            d->K = oskar_jones_create(complx, dev_loc, num_stations, num_src,
                    status);
            d->Z = 0;
            d->station_work = oskar_station_work_create(h->prec, dev_loc,
                    status);
        }
    }
}
/*
 * Finalises one image plane: normalises it and, for FFT-based algorithms,
 * transforms the gridded visibilities into a (grid-corrected) image.
 *
 * For DFT algorithms the plane is already an image, so only normalisation
 * is applied. Otherwise the plane must hold complex gridded visibilities;
 * it is phase-shifted, FFT'd (on GPU via cuFFT if configured, else on the
 * CPU via FFTPACK), phase-shifted back, and grid-corrected in place.
 *
 * NOTE(review): the cufftExec* return codes are ignored — a cuFFT failure
 * would go unreported here; consider checking against CUFFT_SUCCESS.
 * NOTE(review): h->corr_func is read as double in both precision branches,
 * so it is presumably always stored in double precision — confirm where
 * it is created.
 */
void oskar_imager_finalise_plane(oskar_Imager* h, oskar_Mem* plane,
        double plane_norm, int* status)
{
    int size, num_cells;
    DeviceData* d;
    if (*status) return;

    /* Apply normalisation. (Double inequality avoids a float-equality
     * comparison with zero.) */
    if (plane_norm > 0.0 || plane_norm < 0.0)
        oskar_mem_scale_real(plane, 1.0 / plane_norm, status);

    /* DFT planes are already images: nothing more to do. */
    if (h->algorithm == OSKAR_ALGORITHM_DFT_2D ||
            h->algorithm == OSKAR_ALGORITHM_DFT_3D)
        return;

    /* Check plane is complex type, as plane must be gridded visibilities. */
    if (!oskar_mem_is_complex(plane))
    {
        *status = OSKAR_ERR_TYPE_MISMATCH;
        return;
    }

    /* Make image using FFT and apply grid correction. */
    size = h->grid_size;
    num_cells = size * size;
    d = &h->d[0];
    if (oskar_mem_precision(plane) == OSKAR_DOUBLE)
    {
        /* Pre-FFT phase shift (moves the image centre to the origin). */
        oskar_fftphase_cd(size, size, oskar_mem_double(plane, status));
        if (h->fft_on_gpu)
        {
#ifdef OSKAR_HAVE_CUDA
            /* Round-trip through device memory for the cuFFT transform. */
            oskar_device_set(h->cuda_device_ids[0], status);
            oskar_mem_copy(d->plane_gpu, plane, status);
            cufftExecZ2Z(h->cufft_plan, oskar_mem_void(d->plane_gpu),
                    oskar_mem_void(d->plane_gpu), CUFFT_FORWARD);
            oskar_mem_copy(plane, d->plane_gpu, status);
#else
            *status = OSKAR_ERR_CUDA_NOT_AVAILABLE;
#endif
        }
        else
        {
            oskar_fftpack_cfft2f(size, size, size,
                    oskar_mem_double(plane, status),
                    oskar_mem_double(h->fftpack_wsave, status),
                    oskar_mem_double(h->fftpack_work, status));
            /* FFTPACK scales by 1/num_cells; undo to match the cuFFT path. */
            oskar_mem_scale_real(plane, (double)num_cells, status);
        }
        /* Post-FFT phase shift and grid correction. */
        oskar_fftphase_cd(size, size, oskar_mem_double(plane, status));
        oskar_grid_correction_d(size, oskar_mem_double(h->corr_func, status),
                oskar_mem_double(plane, status));
    }
    else
    {
        /* Single-precision variant of the same pipeline. */
        oskar_fftphase_cf(size, size, oskar_mem_float(plane, status));
        if (h->fft_on_gpu)
        {
#ifdef OSKAR_HAVE_CUDA
            oskar_device_set(h->cuda_device_ids[0], status);
            oskar_mem_copy(d->plane_gpu, plane, status);
            cufftExecC2C(h->cufft_plan, oskar_mem_void(d->plane_gpu),
                    oskar_mem_void(d->plane_gpu), CUFFT_FORWARD);
            oskar_mem_copy(plane, d->plane_gpu, status);
#else
            *status = OSKAR_ERR_CUDA_NOT_AVAILABLE;
#endif
        }
        else
        {
            oskar_fftpack_cfft2f_f(size, size, size,
                    oskar_mem_float(plane, status),
                    oskar_mem_float(h->fftpack_wsave, status),
                    oskar_mem_float(h->fftpack_work, status));
            oskar_mem_scale_real(plane, (double)num_cells, status);
        }
        oskar_fftphase_cf(size, size, oskar_mem_float(plane, status));
        /* Correction function stays double precision even for float
         * images. */
        oskar_grid_correction_f(size, oskar_mem_double(h->corr_func, status),
                oskar_mem_float(plane, status));
    }
}
/*
 * Thread worker for the DFT imager: computes an image plane in pixel
 * blocks.
 *
 * Each thread copies its share of the visibility data to its assigned
 * device (threads with id < num_gpus run on a GPU; the rest on the CPU),
 * then repeatedly claims the next pixel block from a mutex-guarded shared
 * counter (h->i_block), runs a direct Fourier transform for that block and
 * accumulates the result into the shared output plane.
 *
 * NOTE(review): 'ww' stays NULL for the 2-D algorithm — oskar_dft_c2r
 * presumably accepts a null w array in that case; confirm.
 * NOTE(review): all threads call oskar_mem_add on the same 'plane' at
 * disjoint offsets (block_start), which presumably makes the concurrent
 * accumulation safe — confirm oskar_mem_add writes only [dst_offset,
 * dst_offset + num) elements.
 */
static void* run_blocks(void* arg)
{
    oskar_Imager* h;
    oskar_Mem *plane, *uu, *vv, *ww = 0, *amp, *weight, *block, *l, *m, *n;
    size_t max_size;
    /* Pixel block size is clamped to [smallest, largest] and rounded to a
     * multiple of 'smallest'. */
    const size_t smallest = 1024, largest = 65536;
    int dev_loc = OSKAR_CPU, *status;

    /* Get thread function arguments. */
    h = ((ThreadArgs*)arg)->h;
    const int thread_id = ((ThreadArgs*)arg)->thread_id;
    const int num_vis = ((ThreadArgs*)arg)->num_vis;
    plane = ((ThreadArgs*)arg)->plane;
    /* Status is shared across threads via the imager handle. */
    status = &(h->status);

    /* Set the device used by the thread. */
    if (thread_id < h->num_gpus)
    {
        dev_loc = h->dev_loc;
        oskar_device_set(h->dev_loc, h->gpu_ids[thread_id], status);
    }

    /* Copy visibility data to device. */
    uu = oskar_mem_create_copy(((ThreadArgs*)arg)->uu, dev_loc, status);
    vv = oskar_mem_create_copy(((ThreadArgs*)arg)->vv, dev_loc, status);
    amp = oskar_mem_create_copy(((ThreadArgs*)arg)->amp, dev_loc, status);
    weight = oskar_mem_create_copy(((ThreadArgs*)arg)->weight, dev_loc,
            status);
    /* The w coordinate is only needed by the 3-D DFT. */
    if (h->algorithm == OSKAR_ALGORITHM_DFT_3D)
        ww = oskar_mem_create_copy(((ThreadArgs*)arg)->ww, dev_loc, status);

#ifdef _OPENMP
    /* Disable nested parallelism. */
    omp_set_nested(0);
    omp_set_num_threads(1);
#endif

    /* Calculate the maximum pixel block size, and number of blocks. */
    const size_t num_pixels = (size_t)h->image_size * (size_t)h->image_size;
    max_size = num_pixels / h->num_devices;
    max_size = ((max_size + smallest - 1) / smallest) * smallest;
    if (max_size > largest) max_size = largest;
    if (max_size < smallest) max_size = smallest;
    const int num_blocks = (int) ((num_pixels + max_size - 1) / max_size);

    /* Allocate device memory for pixel block data. */
    block = oskar_mem_create(h->imager_prec, dev_loc, 0, status);
    l = oskar_mem_create(h->imager_prec, dev_loc, max_size, status);
    m = oskar_mem_create(h->imager_prec, dev_loc, max_size, status);
    n = oskar_mem_create(h->imager_prec, dev_loc, max_size, status);

    /* Loop until all blocks are done. */
    for (;;)
    {
        size_t block_size;

        /* Get a unique block index (shared work counter, mutex-guarded). */
        oskar_mutex_lock(h->mutex);
        const int i_block = (h->i_block)++;
        oskar_mutex_unlock(h->mutex);
        if ((i_block >= num_blocks) || *status) break;

        /* Calculate the block size (last block may be short). */
        const size_t block_start = i_block * max_size;
        block_size = num_pixels - block_start;
        if (block_size > max_size) block_size = max_size;

        /* Copy the (l,m,n) positions for the block. */
        oskar_mem_copy_contents(l, h->l, 0, block_start, block_size, status);
        oskar_mem_copy_contents(m, h->m, 0, block_start, block_size, status);
        if (h->algorithm == OSKAR_ALGORITHM_DFT_3D)
            oskar_mem_copy_contents(n, h->n, 0, block_start, block_size,
                    status);

        /* Run DFT for the block. */
        oskar_dft_c2r(num_vis, 2.0 * M_PI, uu, vv, ww, amp, weight,
                (int) block_size, l, m, n, block, status);

        /* Add data to existing pixels. */
        oskar_mem_add(plane, plane, block, block_start, block_start, 0,
                block_size, status);
    }

    /* Free memory. */
    oskar_mem_free(uu, status);
    oskar_mem_free(vv, status);
    oskar_mem_free(ww, status);
    oskar_mem_free(amp, status);
    oskar_mem_free(weight, status);
    oskar_mem_free(block, status);
    oskar_mem_free(l, status);
    oskar_mem_free(m, status);
    oskar_mem_free(n, status);
    return 0;
}