// Copies the ptr/into fields of the descriptors supplied in `vars` into the
// locally held m_vars entries and traces every local descriptor.
//
//   vars        descriptors whose ptr and into values are copied
//   vars2       optional (may be NULL) names that are used only for tracing
//   vars_total  number of entries supplied in vars
//
// The process exits when more descriptors are supplied than are held
// locally, or when the type bits of a supplied/local pair differ.
void OffloadDescriptor::merge_var_descs(
    VarDesc *vars,
    VarDesc2 *vars2,
    int vars_total
)
{
    // the supplied descriptors may not outnumber the local ones
    if (vars_total > m_vars_total) {
        LIBOFFLOAD_ERROR(c_merge_var_descs1);
        exit(1);
    }

    for (int idx = 0; idx < m_vars_total; ++idx) {
        VarDesc &local = m_vars[idx];

        if (idx < vars_total) {
            VarDesc &given = vars[idx];

            // variable type must match
            if (local.type.bits != given.type.bits) {
                LIBOFFLOAD_ERROR(c_merge_var_descs2);
                exit(1);
            }

            local.ptr = given.ptr;
            local.into = given.into;

            // fall back to an empty name when none is available
            const char *var_sname =
                (vars2 != NULL && vars2[idx].sname != NULL) ?
                    vars2[idx].sname : "";

            OFFLOAD_DEBUG_TRACE_1(2, get_offload_number(), c_offload_var,
                " VarDesc %d, var=%s, %s, %s\n",
                idx, var_sname,
                vardesc_direction_as_string[local.direction.bits],
                vardesc_type_as_string[local.type.src]);

            if (vars2 != NULL && vars2[idx].dname != NULL) {
                OFFLOAD_TRACE(2, " into=%s, %s\n", vars2[idx].dname,
                    vardesc_type_as_string[local.type.dst]);
            }
        }

        // full dump of the local descriptor, with or without a counterpart
        OFFLOAD_TRACE(2,
            " type_src=%d, type_dstn=%d, direction=%d, "
            "alloc_if=%d, free_if=%d, align=%d, mic_offset=%d, flags=0x%x, "
            "offset=%lld, size=%lld, count/disp=%lld, ptr=%p into=%p\n",
            local.type.src,
            local.type.dst,
            local.direction.bits,
            local.alloc_if,
            local.free_if,
            local.align,
            local.mic_offset,
            local.flags.bits,
            local.offset,
            local.size,
            local.count,
            local.ptr,
            local.into);
    }
}
// Traces the call, invokes m_host_fptr_table_register with the same
// arguments and passes its result to CheckResult.
void HostFptrTableRegister(void *table, int num_entries, int ordered) const
{
    OFFLOAD_DEBUG_TRACE_1(4, 0, c_offload_myoregister,
                          "%s(%p, %d, %d)\n",
                          __func__, table, num_entries, ordered);

    CheckResult(__func__,
                m_host_fptr_table_register(table, num_entries, ordered));
}
// Traces the call, invokes m_arena_release for the given arena and passes
// its result to CheckResult.
void ArenaRelease(MyoArena arena) const
{
    OFFLOAD_DEBUG_TRACE_1(4, 0, c_offload_myoarenarelease,
                          "%s()\n", __func__);

    CheckResult(__func__, m_arena_release(arena));
}
// Traces the call and forwards it to m_arena_aligned_free.
// Returns whatever m_arena_aligned_free returns; the result is not checked.
void* SharedAlignedArenaFree(MyoArena arena, void* ptr) const
{
    OFFLOAD_DEBUG_TRACE_1(4, 0, c_offload_myosharedalignedarenafree,
                          "%s(%u, %p)\n", __func__, arena, ptr);

    void *result = m_arena_aligned_free(arena, ptr);
    return result;
}
// Traces the call and forwards it to m_arena_aligned_malloc.
// Returns whatever m_arena_aligned_malloc returns; the result is not checked.
void* SharedAlignedArenaMalloc(MyoArena arena, size_t size, size_t align) const
{
    // NOTE(review): "%lld" is paired with size_t arguments here; confirm the
    // trace macro tolerates that before relying on the printed values.
    OFFLOAD_DEBUG_TRACE_1(4, 0, c_offload_myosharedalignedarenamalloc,
                          "%s(%u, %lld, %lld)\n",
                          __func__, arena, size, align);

    void *block = m_arena_aligned_malloc(arena, size, align);
    return block;
}
pid_t Engine::init_device(void) { struct init_data { int device_index; int devices_total; int console_level; int offload_report_level; } data; COIRESULT res; COIEVENT event; pid_t pid; OFFLOAD_DEBUG_TRACE_1(2, 0, c_offload_init, "Initializing device with logical index %d " "and physical index %d\n", m_index, m_physical_index); // setup misc data data.device_index = m_index; data.devices_total = mic_engines_total; data.console_level = console_enabled; data.offload_report_level = offload_report_level; res = COI::PipelineRunFunction(get_pipeline(), m_funcs[c_func_init], 0, 0, 0, 0, 0, &data, sizeof(data), &pid, sizeof(pid), &event); check_result(res, c_pipeline_run_func, m_index, res); res = COI::EventWait(1, &event, -1, 1, 0, 0); check_result(res, c_event_wait, res); OFFLOAD_DEBUG_TRACE(2, "Device process pid is %d\n", pid); return pid; }
// Counts the entries of a shared variable table (the table ends at the first
// entry whose varName is 0), tracing each one, and registers the table with
// myoiMicVarTableRegister when it holds at least one entry.
static void __offload_myo_shared_table_register(SharedTableEntry *entry)
{
    OFFLOAD_DEBUG_TRACE(3, "%s(%p)\n", __func__, entry);

    int count = 0;
    for (SharedTableEntry *cur = entry; cur->varName != 0; ++cur, ++count) {
        OFFLOAD_DEBUG_TRACE_1(4, 0, c_offload_mic_myo_shared,
                              "myo shared entry name = \"%s\" addr = %p\n",
                              cur->varName, cur->sharedAddr);
    }

    // nothing to register for an empty table
    if (count <= 0) {
        return;
    }

    OFFLOAD_DEBUG_TRACE(3, "myoiMicVarTableRegister(%p, %d)\n",
                        entry, count);
    CheckResult("myoiMicVarTableRegister",
                myoiMicVarTableRegister(entry, count));
}
void OffloadDescriptor::gather_copyout_data() { OFFLOAD_TIMER_STOP(c_offload_target_compute); OFFLOAD_TIMER_START(c_offload_target_gather_outputs); for (int i = 0; i < m_vars_total; i++) { bool src_is_for_mic = (m_vars[i].direction.out || m_vars[i].into == NULL); switch (m_vars[i].type.src) { case c_data_ptr_array: break; case c_data: case c_void_ptr: case c_cean_var: if (m_vars[i].direction.out && !m_vars[i].flags.is_static) { m_out.send_data( static_cast<char*>(m_vars[i].ptr) + m_vars[i].disp, m_vars[i].size); } break; case c_dv: break; case c_string_ptr: case c_data_ptr: case c_cean_var_ptr: case c_dv_ptr: if (m_vars[i].free_if && src_is_for_mic && !m_vars[i].flags.is_static) { void *buf = *static_cast<char**>(m_vars[i].ptr) - m_vars[i].mic_offset - (m_vars[i].flags.is_stack_buf? 0 : m_vars[i].offset); if (buf == NULL) { break; } // decrement buffer reference count OFFLOAD_TIMER_START(c_offload_target_release_buffer_refs); BufReleaseRef(buf); OFFLOAD_TIMER_STOP(c_offload_target_release_buffer_refs); } break; case c_func_ptr: if (m_vars[i].direction.out) { m_out.send_func_ptr(*((void**) m_vars[i].ptr)); } break; case c_dv_data: case c_dv_ptr_data: case c_dv_data_slice: case c_dv_ptr_data_slice: if (src_is_for_mic && m_vars[i].free_if && !m_vars[i].flags.is_static) { ArrDesc *dvp = (m_vars[i].type.src == c_dv_data || m_vars[i].type.src == c_dv_data_slice) ? 
static_cast<ArrDesc*>(m_vars[i].ptr) : *static_cast<ArrDesc**>(m_vars[i].ptr); void *buf = reinterpret_cast<char*>(dvp->Base) - m_vars[i].mic_offset - m_vars[i].offset; if (buf == NULL) { break; } // decrement buffer reference count OFFLOAD_TIMER_START(c_offload_target_release_buffer_refs); BufReleaseRef(buf); OFFLOAD_TIMER_STOP(c_offload_target_release_buffer_refs); } break; default: LIBOFFLOAD_ERROR(c_unknown_var_type, m_vars[i].type.dst); abort(); } if (m_vars[i].into) { switch (m_vars[i].type.dst) { case c_data_ptr_array: break; case c_data: case c_void_ptr: case c_cean_var: case c_dv: break; case c_string_ptr: case c_data_ptr: case c_cean_var_ptr: case c_dv_ptr: if (m_vars[i].direction.in && m_vars[i].free_if && !m_vars[i].flags.is_static_dstn) { void *buf = *static_cast<char**>(m_vars[i].into) - m_vars[i].mic_offset - (m_vars[i].flags.is_stack_buf? 0 : m_vars[i].offset); if (buf == NULL) { break; } // decrement buffer reference count OFFLOAD_TIMER_START( c_offload_target_release_buffer_refs); BufReleaseRef(buf); OFFLOAD_TIMER_STOP( c_offload_target_release_buffer_refs); } break; case c_func_ptr: break; case c_dv_data: case c_dv_ptr_data: case c_dv_data_slice: case c_dv_ptr_data_slice: if (m_vars[i].free_if && m_vars[i].direction.in && !m_vars[i].flags.is_static_dstn) { ArrDesc *dvp = (m_vars[i].type.dst == c_dv_data_slice || m_vars[i].type.dst == c_dv_data) ? 
static_cast<ArrDesc*>(m_vars[i].into) : *static_cast<ArrDesc**>(m_vars[i].into); void *buf = reinterpret_cast<char*>(dvp->Base) - m_vars[i].mic_offset - m_vars[i].offset; if (buf == NULL) { break; } // decrement buffer reference count OFFLOAD_TIMER_START( c_offload_target_release_buffer_refs); BufReleaseRef(buf); OFFLOAD_TIMER_STOP( c_offload_target_release_buffer_refs); } break; default: LIBOFFLOAD_ERROR(c_unknown_var_type, m_vars[i].type.dst); abort(); } } } OFFLOAD_DEBUG_TRACE(2, "OUT buffer @ p %p size %lld\n", m_out.get_buffer_start(), m_out.get_buffer_size()); OFFLOAD_DEBUG_DUMP_BYTES(2, m_out.get_buffer_start(), m_out.get_buffer_size()); OFFLOAD_DEBUG_TRACE_1(1, get_offload_number(), c_offload_copyout_data, "Total copyout data sent to host: [%lld] bytes\n", m_out.get_tfr_size()); OFFLOAD_TIMER_STOP(c_offload_target_gather_outputs); }
void OffloadDescriptor::offload( uint32_t buffer_count, void** buffers, void* misc_data, uint16_t misc_data_len, void* return_data, uint16_t return_data_len ) { FunctionDescriptor *func = (FunctionDescriptor*) misc_data; const char *name = func->data; OffloadDescriptor ofld; char *in_data = 0; char *out_data = 0; char *timer_data = 0; console_enabled = func->console_enabled; timer_enabled = func->timer_enabled; offload_report_level = func->offload_report_level; offload_number = func->offload_number; ofld.set_offload_number(func->offload_number); #ifdef SEP_SUPPORT if (sep_monitor) { if (__sync_fetch_and_add(&sep_counter, 1) == 0) { OFFLOAD_DEBUG_TRACE(2, "VTResumeSampling\n"); VTResumeSampling(); } } #endif // SEP_SUPPORT OFFLOAD_DEBUG_TRACE_1(2, ofld.get_offload_number(), c_offload_start_target_func, "Offload \"%s\" started\n", name); // initialize timer data OFFLOAD_TIMER_INIT(); OFFLOAD_TIMER_START(c_offload_target_total_time); OFFLOAD_TIMER_START(c_offload_target_descriptor_setup); // get input/output buffer addresses if (func->in_datalen > 0 || func->out_datalen > 0) { if (func->data_offset != 0) { in_data = (char*) misc_data + func->data_offset; out_data = (char*) return_data; } else { char *inout_buf = (char*) buffers[--buffer_count]; in_data = inout_buf; out_data = inout_buf; } } // assign variable descriptors ofld.m_vars_total = func->vars_num; if (ofld.m_vars_total > 0) { uint64_t var_data_len = ofld.m_vars_total * sizeof(VarDesc); ofld.m_vars = (VarDesc*) malloc(var_data_len); if (ofld.m_vars == NULL) LIBOFFLOAD_ERROR(c_malloc); memcpy(ofld.m_vars, in_data, var_data_len); in_data += var_data_len; func->in_datalen -= var_data_len; } // timer data if (func->timer_enabled) { uint64_t timer_data_len = OFFLOAD_TIMER_DATALEN(); timer_data = out_data; out_data += timer_data_len; func->out_datalen -= timer_data_len; } // init Marshallers ofld.m_in.init_buffer(in_data, func->in_datalen); ofld.m_out.init_buffer(out_data, func->out_datalen); // copy buffers to 
offload descriptor std::copy(buffers, buffers + buffer_count, std::back_inserter(ofld.m_buffers)); OFFLOAD_TIMER_STOP(c_offload_target_descriptor_setup); // find offload entry address OFFLOAD_TIMER_START(c_offload_target_func_lookup); offload_func_with_parms entry = (offload_func_with_parms) __offload_entries.find_addr(name); if (entry == NULL) { #if OFFLOAD_DEBUG > 0 if (console_enabled > 2) { __offload_entries.dump(); } #endif LIBOFFLOAD_ERROR(c_offload_descriptor_offload, name); exit(1); } OFFLOAD_TIMER_STOP(c_offload_target_func_lookup); OFFLOAD_TIMER_START(c_offload_target_func_time); // execute offload entry entry(&ofld); OFFLOAD_TIMER_STOP(c_offload_target_func_time); OFFLOAD_TIMER_STOP(c_offload_target_total_time); // copy timer data to the buffer OFFLOAD_TIMER_TARGET_DATA(timer_data); OFFLOAD_DEBUG_TRACE(2, "Offload \"%s\" finished\n", name); #ifdef SEP_SUPPORT if (sep_monitor) { if (__sync_sub_and_fetch(&sep_counter, 1) == 0) { OFFLOAD_DEBUG_TRACE(2, "VTPauseSampling\n"); VTPauseSampling(); } } #endif // SEP_SUPPORT }
// Acquires an offload target and creates the descriptor for one offload.
//
//   target_type    TARGET_HOST or TARGET_MIC; other values are not treated
//                  specially
//   target_number  device number for TARGET_MIC; -1 selects device 0, other
//                  non-negative values are taken modulo mic_engines_total,
//                  values below -1 are rejected with an error and exit
//   is_optional    non-zero when the offload may be skipped if no device is
//                  available
//   status         optional status block; reset to "unavailable" on entry
//   file, line     passed to OFFLOAD_TIMER_INIT
//
// Returns a new OffloadDescriptor, or NULL when the target could not be
// acquired. For a mandatory offload without a status block an unavailable
// device is reported and the process exits instead.
extern "C" OFFLOAD OFFLOAD_TARGET_ACQUIRE(
    TARGET_TYPE      target_type,
    int              target_number,
    int              is_optional,
    _Offload_status* status,
    const char*      file,
    uint64_t         line
)
{
    // initialize status
    if (status != 0) {
        status->result = OFFLOAD_UNAVAILABLE;
        status->device_number = -1;
        status->data_sent = 0;
        status->data_received = 0;
    }

    // make sure library is initialized
    bool available = __offload_init_library();

    // OFFLOAD_TIMER_INIT must follow call to __offload_init_library
    OffloadHostTimerData * timer_data = OFFLOAD_TIMER_INIT(file, line);

    OFFLOAD_TIMER_START(timer_data, c_offload_host_total_offload);

    OFFLOAD_TIMER_START(timer_data, c_offload_host_initialize);

    // initialize all devices if init_type is on_offload_all
    if (available && __offload_init_type == c_init_on_offload_all) {
        for (int engine = 0; engine < mic_engines_total; ++engine) {
            mic_engines[engine].init();
        }
    }
    OFFLOAD_TIMER_STOP(timer_data, c_offload_host_initialize);

    OFFLOAD_TIMER_START(timer_data, c_offload_host_target_acquire);

    if (target_type == TARGET_HOST) {
        // Host always available
        available = true;
    }
    else if (target_type == TARGET_MIC) {
        // anything below -1 is not a valid request
        if (target_number < -1) {
            LIBOFFLOAD_ERROR(c_invalid_device_number);
            exit(1);
        }

        if (!available) {
            // fallback to CPU
            target_number = -1;
        }
        else {
            // a user-provided number wraps around the engine count;
            // -1 means "use device 0"
            target_number = (target_number >= 0) ?
                target_number % mic_engines_total : 0;

            // reserve device in ORSL
            bool reserved;
            if (is_optional) {
                reserved = ORSL::try_reserve(target_number);
            }
            else {
                reserved = ORSL::reserve(target_number);
            }
            if (!reserved) {
                target_number = -1;
            }

            // initialize device
            if (target_number >= 0 &&
                __offload_init_type == c_init_on_offload) {
                OFFLOAD_TIMER_START(timer_data, c_offload_host_initialize);
                mic_engines[target_number].init();
                OFFLOAD_TIMER_STOP(timer_data, c_offload_host_initialize);
            }
        }

        if (!available || target_number < 0) {
            // a mandatory offload with nowhere to report the failure
            if (status == 0 && !is_optional) {
                LIBOFFLOAD_ERROR(c_device_is_not_available);
                exit(1);
            }

            available = false;
        }
    }

    if (!available) {
        OFFLOAD_TIMER_STOP(timer_data, c_offload_host_target_acquire);
        OFFLOAD_TIMER_STOP(timer_data, c_offload_host_total_offload);
        offload_report_free_data(timer_data);
        return NULL;
    }

    OFFLOAD descriptor = new OffloadDescriptor(target_number, status,
                                               !is_optional, false,
                                               timer_data);
    OFFLOAD_TIMER_HOST_MIC_NUM(timer_data, target_number);
    Offload_Report_Prolog(timer_data);
    OFFLOAD_DEBUG_TRACE_1(2, timer_data->offload_number, c_offload_start,
                          "Starting offload: target_type = %d, "
                          "number = %d, is_optional = %d\n",
                          target_type, target_number, is_optional);

    OFFLOAD_TIMER_STOP(timer_data, c_offload_host_target_acquire);

    return descriptor;
}
// Acquires a device for an OpenMP offload and creates its descriptor.
//
//   device_num  optional device number; when 0 (null) __omp_device_num is
//               used instead
//   file, line  passed to OFFLOAD_TIMER_INIT
//
// Every failure (library initialization, negative device number, failed
// ORSL reservation) is reported and ends the process, so the function only
// returns with a newly created OffloadDescriptor.
extern "C" OFFLOAD OFFLOAD_TARGET_ACQUIRE1(
    const int* device_num,
    const char* file,
    uint64_t line
)
{
    // make sure library is initialized and at least one device is available
    if (!__offload_init_library()) {
        LIBOFFLOAD_ERROR(c_device_is_not_available);
        exit(1);
    }

    // OFFLOAD_TIMER_INIT must follow call to __offload_init_library
    OffloadHostTimerData * timer_data = OFFLOAD_TIMER_INIT(file, line);

    OFFLOAD_TIMER_START(timer_data, c_offload_host_total_offload);

    OFFLOAD_TIMER_START(timer_data, c_offload_host_initialize);

    if (__offload_init_type == c_init_on_offload_all) {
        for (int engine = 0; engine < mic_engines_total; ++engine) {
            mic_engines[engine].init();
        }
    }

    OFFLOAD_TIMER_STOP(timer_data, c_offload_host_initialize);

    OFFLOAD_TIMER_START(timer_data, c_offload_host_target_acquire);

    // use default device number if it is not provided
    int target_number = (device_num != 0) ? *device_num : __omp_device_num;

    // device number should be a non-negative integer value
    if (target_number < 0) {
        LIBOFFLOAD_ERROR(c_omp_invalid_device_num);
        exit(1);
    }

    // should we do this for OpenMP?
    target_number %= mic_engines_total;

    // reserve device in ORSL
    if (!ORSL::reserve(target_number)) {
        LIBOFFLOAD_ERROR(c_device_is_not_available);
        exit(1);
    }

    // initialize device(s)
    OFFLOAD_TIMER_START(timer_data, c_offload_host_initialize);

    if (__offload_init_type == c_init_on_offload) {
        mic_engines[target_number].init();
    }

    OFFLOAD_TIMER_STOP(timer_data, c_offload_host_initialize);

    OFFLOAD descriptor =
        new OffloadDescriptor(target_number, 0, true, true, timer_data);

    OFFLOAD_TIMER_HOST_MIC_NUM(timer_data, target_number);

    Offload_Report_Prolog(timer_data);

    OFFLOAD_DEBUG_TRACE_1(2, timer_data->offload_number, c_offload_start,
                          "Starting OpenMP offload, device = %d\n",
                          target_number);

    OFFLOAD_TIMER_STOP(timer_data, c_offload_host_target_acquire);

    return descriptor;
}
// Traces the call and forwards it to m_shared_aligned_malloc.
// Returns whatever m_shared_aligned_malloc returns; the result is not
// checked.
void* SharedAlignedMalloc(size_t size, size_t align) const
{
    // NOTE(review): "%lld" is paired with size_t arguments here; confirm the
    // trace macro tolerates that before relying on the printed values.
    OFFLOAD_DEBUG_TRACE_1(4, 0, c_offload_myosharedalignedmalloc,
                          "%s(%lld, %lld)\n", __func__, size, align);

    void *block = m_shared_aligned_malloc(size, align);
    return block;
}
// Traces the call, invokes m_acquire and passes its result to CheckResult.
void Acquire(void) const
{
    OFFLOAD_DEBUG_TRACE_1(4, 0, c_offload_myoacquire,
                          "%s()\n", __func__);

    CheckResult(__func__, m_acquire());
}
// Traces the call and invokes m_lib_fini; nothing is checked or returned.
void LibFini(void) const
{
    OFFLOAD_DEBUG_TRACE_1(4, 0, c_offload_myofini,
                          "%s()\n", __func__);

    m_lib_fini();
}
// Wrappers for MYO client functions void LibInit(void *arg, void *func) const { OFFLOAD_DEBUG_TRACE_1(4, 0, c_offload_myoinit, "%s(%p, %p)\n", __func__, arg, func); CheckResult(__func__, m_lib_init(arg, func)); }
// Traces the call, invokes m_release and passes its result to CheckResult.
void Release(void) const
{
    OFFLOAD_DEBUG_TRACE_1(4, 0, c_offload_myorelease,
                          "%s()\n", __func__);

    CheckResult(__func__, m_release());
}
// Traces the call and invokes m_shared_free on ptr; nothing is checked or
// returned.
void SharedFree(void *ptr) const
{
    OFFLOAD_DEBUG_TRACE_1(4, 0, c_offload_myosharedfree,
                          "%s(%p)\n", __func__, ptr);

    m_shared_free(ptr);
}
void OffloadDescriptor::merge_var_descs( VarDesc *vars, VarDesc2 *vars2, int vars_total ) { // number of variable descriptors received from host and generated // locally should match if (m_vars_total < vars_total) { LIBOFFLOAD_ERROR(c_merge_var_descs1); exit(1); } for (int i = 0; i < m_vars_total; i++) { // instead of m_vars[i].type.src we will use m_vars_extra[i].type_src if (i < vars_total) { // variable type must match if (m_vars[i].type.bits != vars[i].type.bits) { OFFLOAD_TRACE(2, "m_vars[%d].type.bits=%08x, vars[%d].type.bits=%08x\n", i, m_vars[i].type.bits, i, vars[i].type.bits); LIBOFFLOAD_ERROR(c_merge_var_descs2); exit(1); } if (m_vars[i].type.src == c_extended_type) { VarDescExtendedType *etype = reinterpret_cast<VarDescExtendedType*>(vars[i].ptr); m_vars_extra[i].type_src = etype->extended_type; m_vars[i].ptr = etype->ptr; } else { m_vars_extra[i].type_src = m_vars[i].type.src; if (!(m_vars[i].flags.use_device_ptr && m_vars[i].type.src == c_dv)) { m_vars[i].ptr = vars[i].ptr; } } // instead of m_vars[i].type.dst we will use m_vars_extra[i].type_dst if (m_vars[i].type.dst == c_extended_type && i < vars_total) { VarDescExtendedType *etype = reinterpret_cast<VarDescExtendedType*>(vars[i].into); m_vars_extra[i].type_dst = etype->extended_type; m_vars[i].into = etype->ptr; } else { m_vars_extra[i].type_dst = m_vars[i].type.dst; m_vars[i].into = vars[i].into; } const char *var_sname = ""; if (vars2 != NULL) { if (vars2[i].sname != NULL) { var_sname = vars2[i].sname; } } OFFLOAD_DEBUG_TRACE_1(2, get_offload_number(), c_offload_var, " VarDesc %d, var=%s, %s, %s\n", i, var_sname, vardesc_direction_as_string[m_vars[i].direction.bits], vardesc_type_as_string[m_vars_extra[i].type_src]); if (vars2 != NULL && vars2[i].dname != NULL) { OFFLOAD_TRACE(2, " into=%s, %s\n", vars2[i].dname, vardesc_type_as_string[m_vars_extra[i].type_dst]); } } else { m_vars_extra[i].type_src = m_vars[i].type.src; m_vars_extra[i].type_dst = m_vars[i].type.dst; } OFFLOAD_TRACE(2, " 
type_src=%d, type_dstn=%d, direction=%d, " "alloc_if=%d, free_if=%d, align=%d, mic_offset=%d, flags=0x%x, " "offset=%lld, size=%lld, count/disp=%lld, ptr=%p into=%p\n", m_vars_extra[i].type_src, m_vars_extra[i].type_dst, m_vars[i].direction.bits, m_vars[i].alloc_if, m_vars[i].free_if, m_vars[i].align, m_vars[i].mic_offset, m_vars[i].flags.bits, m_vars[i].offset, m_vars[i].size, m_vars[i].count, m_vars[i].ptr, m_vars[i].into); } }