static cycles_t send_caps(void)
{
    errval_t err;
    cycles_t time_taken = 0;

    srand(bench_tsc()); // random starting seed

    for (int i = 0; i < CAPS_PER_CORE; i++) {
        // pick a random destination core other than our own
        coreid_t to_core;
        do {
            to_core = rand() % num_cores;
        } while (to_core == my_coreid);

        do {
            cycles_t start = bench_tsc();
            err = bindings[to_core]->tx_vtbl.send_cap(bindings[to_core],
                                                      NOP_CONT, my_caps[i]);
            if (i >= 20 && i < (CAPS_PER_CORE - 20)) { // skip warmup / cooldown samples
                time_taken += (bench_tsc() - start);
            }
        } while (redo_message(err));

        if (err_is_fail(err)) {
            DEBUG_ERR(err, "xcorecap: cap send failed\n");
            abort();
        }
    }

    // average over the CAPS_PER_CORE - 40 measured iterations
    return time_taken / (CAPS_PER_CORE - 40);
}
void experiment(coreid_t idx)
{
    timestamps = malloc(sizeof(struct timestamps) * MAX_COUNT);
    assert(timestamps != NULL);

    struct bench_ump_binding *bu = (struct bench_ump_binding *) array[idx];
    struct flounder_ump_state *fus = &bu->ump_state;
    struct ump_chan *chan = &fus->chan;

    struct ump_chan_state *send = &chan->send_chan;
    struct ump_chan_state *recv = &chan->endpoint.chan;

    printf("Running latency between core %"PRIuCOREID" and core %"PRIuCOREID"\n",
           my_core_id, idx);

    /* Run experiment */
    for (int i = 0; i < MAX_COUNT; i++) {
        volatile struct ump_message *msg;
        struct ump_control ctrl;

        timestamps[i].time0 = bench_tsc();
        msg = ump_impl_get_next(send, &ctrl);
        msg->header.control = ctrl;
        while (!ump_impl_recv(recv));
        timestamps[i].time1 = bench_tsc();
    }

    /* Print results */
    for (int i = MAX_COUNT / 10; i < MAX_COUNT; i++) {
        if (timestamps[i].time1 > timestamps[i].time0) {
            printf("page %d took %"PRIuCYCLES"\n", i,
                   timestamps[i].time1 - bench_tscoverhead() - timestamps[i].time0);
        }
    }
}
void timing_sync_bench(void)
{
    static cycles_t timestamp[ITERATIONS];

    for (int i = 0; i < ITERATIONS; i++) {
        cycles_t start = bench_tsc();
        errval_t err = timing_sync_timer();
        assert(err_is_ok(err));
        cycles_t end = bench_tsc();
        timestamp[i] = end - start;
    }

    for (int i = 0; i < ITERATIONS; i++) {
        printf("duration %d: %" PRIuCYCLES "\n", i, timestamp[i]);
    }

    int nthreads = 0;
    for (int i = 0; i <= MAX_COREID; i++) {
        struct intermon_binding *b = NULL;
        errval_t err = intermon_binding_get(i, &b);
        if (err_is_ok(err) && b != NULL) {
            nthreads++;
        }
    }
    printf("number of threads: %d\n", nthreads);
    printf("client done.\n");
}
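/*
 * Hedged sketch (not part of the original sources): one way to summarize the
 * per-iteration durations collected by timing_sync_bench() above, using plain
 * C only and no aggregate helpers from the bench library. It operates on the
 * same cycles_t samples and count as the caller.
 */
static void print_sync_summary(const cycles_t *samples, int n)
{
    cycles_t min = samples[0], max = samples[0], sum = 0;
    for (int i = 0; i < n; i++) {
        if (samples[i] < min) { min = samples[i]; }
        if (samples[i] > max) { max = samples[i]; }
        sum += samples[i];
    }
    printf("sync: min %" PRIuCYCLES " avg %" PRIuCYCLES " max %" PRIuCYCLES "\n",
           min, sum / n, max);
}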
static void run_experiment(void)
{
    for (int i = 0; i < MAX_COUNT; i++) {
        timestamps[i].time0 = bench_tsc();
        clock_get_timestamp();
        timestamps[i].time1 = bench_tsc();
    }
}
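/*
 * Hedged sketch (not in the original sources): printing the samples gathered
 * by run_experiment() in the same style as the other benchmarks in this
 * collection, i.e. skipping the first 10% of samples as warm-up and
 * subtracting bench_tscoverhead(). Assumes the same global timestamps[] array
 * and MAX_COUNT constant used above.
 */
static void print_results(void)
{
    for (int i = MAX_COUNT / 10; i < MAX_COUNT; i++) {
        if (timestamps[i].time1 > timestamps[i].time0) {
            printf("clock_get_timestamp %d took %" PRIuCYCLES "\n", i,
                   timestamps[i].time1 - bench_tscoverhead() - timestamps[i].time0);
        }
    }
}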
static void prepare_bomp(void)
{
    debug_printf("prepare_bomp\n");
    cycles_t tsc_start = bench_tsc();
    bomp_bomp_init(nthreads);
    cycles_t tsc_end = bench_tsc();
    timer_xompinit = bench_time_diff(tsc_start, tsc_end);
}
/**
 * \brief Measure overhead of taking a timestamp
 */
static void measure_tsc(void)
{
    uint64_t begin;
    uint64_t end;

    begin = bench_tsc();
    for (int i = 0; i < 1000; i++) {
        end = bench_tsc();
    }
    // 1000 bench_tsc() calls happen between begin and the final end,
    // so the difference divided by 1000 approximates the per-call cost
    tsc_overhead = (end - begin) / 1000;
}
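/*
 * Hedged usage sketch (not in the original sources): tsc_overhead, as computed
 * by measure_tsc() above, approximates the cost of one bench_tsc() call (it
 * also includes the loop overhead, assumed negligible here). A measurement
 * would typically subtract it from the raw delta, for example:
 */
static cycles_t measured_minus_overhead(cycles_t start, cycles_t end)
{
    cycles_t delta = end - start;
    // guard against the (unlikely) case where the delta is below the overhead
    return (delta > tsc_overhead) ? delta - tsc_overhead : 0;
}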
int main(int argc, char *argv[])
{
    bench_init();

    int k = 300;
    while (k--) {
        uint64_t start = bench_tsc();
        for (volatile int i = 0; i < ITERATIONS; i++);
        uint64_t end = bench_tsc();
        printf("%"PRIu64"\n", end - start);
    }

    return EXIT_SUCCESS;
}
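/*
 * Hedged variant (not in the original sources): reporting the same measurement
 * additionally as milliseconds via bench_tsc_to_ms(), which the other
 * benchmarks in this collection already use for cycle-to-millisecond
 * conversion.
 */
static void report_iteration(uint64_t start, uint64_t end)
{
    uint64_t cycles = end - start;
    printf("%" PRIu64 " cycles (%" PRIu64 " ms)\n", cycles, bench_tsc_to_ms(cycles));
}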
int main(int argc, char *argv[])
{
    volatile uint64_t workcnt = 0;
    int nthreads;

    debug_printf("bomptest started.\n");
    bench_init();

#if CONFIG_TRACE
    errval_t err = trace_control(TRACE_EVENT(TRACE_SUBSYS_ROUTE,
                                             TRACE_EVENT_ROUTE_BENCH_START, 0),
                                 TRACE_EVENT(TRACE_SUBSYS_ROUTE,
                                             TRACE_EVENT_ROUTE_BENCH_STOP, 0), 0);
    assert(err_is_ok(err));
#endif

    if (argc == 2) {
        nthreads = atoi(argv[1]);
        backend_span_domain(nthreads, STACK_SIZE);
        bomp_custom_init(NULL);
        omp_set_num_threads(nthreads);
    } else {
        assert(!"Specify number of threads");
    }

    trace_event(TRACE_SUBSYS_ROUTE, TRACE_EVENT_ROUTE_BENCH_START, 0);
    uint64_t start = bench_tsc();

#pragma omp parallel
    while (rdtsc() < start + 805000000ULL) {
        workcnt++;
    }

    uint64_t end = bench_tsc();
    trace_event(TRACE_SUBSYS_ROUTE, TRACE_EVENT_ROUTE_BENCH_STOP, 0);

    printf("done. time taken: %" PRIu64 " cycles.\n", end - start);

#if CONFIG_TRACE
    char *buf = malloc(4096 * 4096);
    trace_dump(buf, 4096 * 4096, NULL);
    printf("%s\n", buf);
#endif

    for (;;);
    return 0;
}
static void single_run(int32_t chunksize, int32_t repetitions)
{
    errval_t err;
    vfs_handle_t handle;

    // create file
    err = vfs_create(FILENAME, &handle);
    assert(err_is_ok(err));

    // create chunk containing arbitrary data
    uint8_t *chunk = malloc(chunksize);
    assert(chunk != NULL);

    // start time
    printf("Start run with chunksize: %" PRId32 ", repetitions: %" PRId32 "\n",
           chunksize, repetitions);
    cycles_t start_cycles = bench_tsc();

    size_t written = 0;
    for (int32_t i = 0; i < repetitions; i++) {
        err = vfs_write(handle, chunk, chunksize, &written);
        assert(err_is_ok(err));
        assert(written == chunksize);
    }

    err = vfs_close(handle);
    assert(err_is_ok(err));

    // end time
    cycles_t end_cycles = bench_tsc();

    // evaluation
    cycles_t cycles = end_cycles - start_cycles;
    uint64_t ms = bench_tsc_to_ms(cycles);
    double sec = (double) ms / 1000.0;
    // widen before multiplying to avoid 32-bit overflow
    int64_t bytes_written = (int64_t) chunksize * repetitions;
    double kibibytes_written = (double) bytes_written / 1024.0;
    double mebibytes_written = (double) bytes_written / (1024.0 * 1024.0);
    double kibps = kibibytes_written / sec;

    printf("%" PRId64 " bytes (%.1f KiB, %.1f MiB) written in %" PRIuCYCLES
           " cycles (%" PRIu64 " ms, %.1f s) -> %.1f KiB/s\n",
           bytes_written, kibibytes_written, mebibytes_written, cycles, ms, sec,
           kibps);

    // cleanup
    free(chunk);
    err = vfs_remove(FILENAME);
    assert(err_is_ok(err));
}
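/*
 * Hedged driver sketch (not in the original sources): how single_run() might
 * be swept over a range of chunk sizes while keeping the total amount of data
 * written per run constant. TOTAL_BYTES is a hypothetical constant introduced
 * here for illustration only.
 */
#define TOTAL_BYTES (16 * 1024 * 1024)

static void sweep_chunk_sizes(void)
{
    for (int32_t chunksize = 512; chunksize <= 64 * 1024; chunksize *= 2) {
        single_run(chunksize, TOTAL_BYTES / chunksize);
    }
}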
void mp_barrier(cycles_t *measurement)
{
    coreid_t tid = get_core_id();

#ifdef QRM_DBG_ENABLED
    ++_num_barrier;
    uint32_t _num_barrier_recv = _num_barrier;
#endif

    debug_printfff(DBG__REDUCE, "barrier enter #%d\n", _num_barrier);

    // Reduction
    // --------------------------------------------------
#ifdef QRM_DBG_ENABLED
    uint32_t _tmp =
#endif
        mp_reduce(_num_barrier);

#ifdef QRM_DBG_ENABLED
    // Sanity check
    if (tid == get_sequentializer()) {
        assert(_tmp == get_num_threads() * _num_barrier);
    }
    if (measurement) {
        *measurement = bench_tsc();
    }
#endif

    // Broadcast
    // --------------------------------------------------
    if (tid == get_sequentializer()) {
        mp_send_ab(_num_barrier);
    } else {
#ifdef QRM_DBG_ENABLED
        _num_barrier_recv =
#endif
            mp_receive_forward(0);
    }

#ifdef QRM_DBG_ENABLED
    if (_num_barrier_recv != _num_barrier) {
        debug_printf("ASSERTION fail %d != %d\n", _num_barrier_recv, _num_barrier);
    }
    assert(_num_barrier_recv == _num_barrier);

    // Add a shared memory barrier to absolutely make sure that everybody
    // finished the barrier before leaving - this simplifies debugging, as the
    // program will get stuck if barriers are broken, rather than some threads
    // (wrongly) continuing and causing problems somewhere else
#if 0 // Enable separately
    debug_printfff(DBG__REDUCE, "finished barrier .. waiting for others\n");
    shl_barrier_shm(get_num_threads());
#endif
#endif

    debug_printfff(DBG__REDUCE, "barrier complete #%d\n", _num_barrier);
}
static cycles_t retype_caps(void)
{
    errval_t err;
    cycles_t time_taken = 0;

    for (int i = 0; i < CAPS_PER_CORE; i++) {
        cycles_t start = bench_tsc();
        err = cap_retype(retyped_caps[i], my_caps[i], ObjType_Frame, CHILD_BITS);
        if (i >= 20 && i < (CAPS_PER_CORE - 20)) { // skip warmup / cooldown samples
            time_taken += (bench_tsc() - start);
        }
        if (err_is_fail(err)) {
            DEBUG_ERR(err, "xcorecap: Retype to frame failed\n");
        }
    }

    // average over the CAPS_PER_CORE - 40 measured iterations
    return time_taken / (CAPS_PER_CORE - 40);
}
void record_packet_transmit_to_bf(void)
{
    the_stats.last_packet_transmit_to_bf_ts = bench_tsc();
    //printf("TOBF %"PRIu64" - %"PRIu64" = %"PRIu64"\n",
    //       the_stats.last_packet_transmit_to_bf_ts,
    //       the_stats.last_packet_receive_net_ts,
    //       the_stats.last_packet_transmit_to_bf_ts - the_stats.last_packet_receive_net_ts);
    if (the_stats.last_packet_transmit_to_bf_ts > the_stats.last_packet_receive_net_ts) {
        push_net_to_bf_diff(the_stats.last_packet_transmit_to_bf_ts
                            - the_stats.last_packet_receive_net_ts);
    } else {
        //printf("TOBF Skipped packet because of wrong ts order\n");
    }
}
// called when a message is received
static inline void message_received(void)
{
    errval_t err;

    // save timestamp ('i' is the global message counter maintained by the
    // experiment driver)
    timestamps[i].time1 = bench_tsc();

    // trace receive event
    err = trace_event(TRACE_SUBSYS_MULTIHOP, TRACE_EVENT_MULTIHOP_MESSAGE_RECEIVE, 0);
    if (err_is_fail(err)) {
        USER_PANIC_ERR(err, "trace_event failed");
    }

    reply_received = true;
}
void record_packet_receive_from_net(void)
{
    the_stats.last_packet_receive_net_ts = bench_tsc();
}
// continue experiment
static void experiment_cont(void *arg)
{
    errval_t err;
    static bool flag = false;
    static int message_type = 0;

    // Experiment finished (with this message type)
    if (i == MAX_COUNT - 1) {

#if CONFIG_TRACE
#else
        // print measured times
        for (int j = MAX_COUNT / 10; j < MAX_COUNT; j++) {
            printf("page %d took %"PRIuCYCLES"\n", j,
                   timestamps[j].time1 - bench_tscoverhead() - timestamps[j].time0);
        }
#endif

        // go to next message type
        message_type++;
        flag = false;
        i = 0;

        if (message_type > 13) {
            // stop tracing
            err = trace_event(TRACE_SUBSYS_MULTIHOP, TRACE_EVENT_MULTIHOP_BENCH_STOP, 0);
            if (err_is_fail(err)) {
                USER_PANIC_ERR(err, "trace_event failed");
            }

#if CONFIG_TRACE
            // dump trace
            char *buf = malloc(50 * 4096 * 4096);
            size_t length = trace_dump(buf, 20 * 4096 * 4096, NULL);
            printf("%s\n", buf);
            printf("length of buffer %lu\n", length);
#endif
            printf("client done!\n");
            return;
        }
    }

    if (!flag) {
        // Start experiment
#if CONFIG_TRACE
#else
        printf("Running latency test for message %s...\n",
               get_message_name(message_type));
#endif
        flag = true;
        timestamps[i].time0 = bench_tsc();
    } else {
        // Continue experiment
        i++;
        timestamps[i].time0 = bench_tsc();
    }

    // trace send event
    err = trace_event(TRACE_SUBSYS_MULTIHOP, TRACE_EVENT_MULTIHOP_MESSAGE_SEND,
                      message_type);
    if (err_is_fail(err)) {
        USER_PANIC_ERR(err, "trace_event failed");
    }

    // send next message
    switch (message_type) {
    case 0:
        err = binding->tx_vtbl.fsb_empty_request(binding, NOP_CONT);
        break;
    case 1:
        err = binding->tx_vtbl.fsb_payload32_1_request(binding, NOP_CONT, 1);
        break;
    case 2:
        err = binding->tx_vtbl.fsb_payload32_2_request(binding, NOP_CONT, 1, 2);
        break;
    case 3:
        err = binding->tx_vtbl.fsb_payload32_4_request(binding, NOP_CONT, 1, 2, 3, 4);
        break;
    case 4:
        err = binding->tx_vtbl.fsb_payload32_8_request(binding, NOP_CONT,
                                                       1, 2, 3, 4, 5, 6, 7, 8);
        break;
    case 5:
        err = binding->tx_vtbl.fsb_payload32_16_request(binding, NOP_CONT,
                                                        1, 2, 3, 4, 5, 6, 7, 8,
                                                        9, 10, 11, 12, 13, 14, 15, 16);
        break;
    case 6:
        err = binding->tx_vtbl.fsb_payload64_1_request(binding, NOP_CONT, 1);
        break;
    case 7:
        err = binding->tx_vtbl.fsb_payload64_2_request(binding, NOP_CONT, 1, 2);
        break;
    case 8:
        err = binding->tx_vtbl.fsb_payload64_4_request(binding, NOP_CONT, 1, 2, 3, 4);
        break;
    case 9:
        err = binding->tx_vtbl.fsb_payload64_8_request(binding, NOP_CONT,
                                                       1, 2, 3, 4, 5, 6, 7, 8);
        break;
    case 10:
        err = binding->tx_vtbl.fsb_payload64_16_request(binding, NOP_CONT,
                                                        1, 2, 3, 4, 5, 6, 7, 8,
                                                        9, 10, 11, 12, 13, 14, 15, 16);
        break;
    case 11:
        err = binding->tx_vtbl.fsb_buffer_request(binding, NOP_CONT, &buffer, 1);
        break;
    case 12:
        err = binding->tx_vtbl.fsb_buffer_request(binding, NOP_CONT, buffer2, 100);
        break;
    case 13:
        err = binding->tx_vtbl.fsb_buffer_request(binding, NOP_CONT, buffer3, 1000);
        break;
    default:
        printf("unknown message type\n");
        abort();
        break;
    }

    // make sure send was successful
    if (err_is_fail(err)) {
        USER_PANIC_ERR(err, "while running experiment\n");
    }

    // receive reply (by dispatching events from the
    // waitset we use for the benchmark)
    while (reply_received == false) {
        event_dispatch(&signal_waitset);
    }

    experiment();
}
static void gw_req_memory_call_rx(struct xomp_binding *b, uint64_t addr,
                                  uint8_t type)
{
    XWI_DEBUG("gw_req_memory_call_rx: addr:%lx, type: %u\n", addr, type);

#if XOMP_BENCH_WORKER_EN
    cycles_t mem_timer = bench_tsc();
#endif

    struct txq_msg_st *msg_st = txq_msg_st_alloc(&txq);
    assert(msg_st != NULL);

    struct capref frame;

    if (type == XOMP_FRAME_TYPE_REPL_RW) {
        type = XOMP_FRAME_TYPE_SHARED_RW;
    }

    assert(!(worker_id & XOMP_WID_GATEWAY_FLAG));

    msg_st->send = gw_req_memory_response_tx;
    msg_st->cleanup = NULL;

    XWR_DEBUG("Requesting frame from gateway: [%016lx]\n", addr);

    msg_st->err = xomp_gateway_get_memory(addr, &frame);
    if (err_is_fail(msg_st->err)) {
        txq_send(msg_st);
        return;
    }

    vregion_flags_t map_flags;
    switch ((xomp_frame_type_t) type) {
    case XOMP_FRAME_TYPE_MSG:
        map_flags = VREGION_FLAGS_READ_WRITE;
        break;
    case XOMP_FRAME_TYPE_SHARED_RW:
    case XOMP_FRAME_TYPE_REPL_RW:
        map_flags = VREGION_FLAGS_READ_WRITE;
        break;
    case XOMP_FRAME_TYPE_SHARED_RO:
        map_flags = VREGION_FLAGS_READ;
        break;
    default:
        USER_PANIC("unknown type: %u", type);
        break;
    }

    struct frame_identity id;
    msg_st->err = invoke_frame_identify(frame, &id);
    if (err_is_fail(msg_st->err)) {
        txq_send(msg_st);
        return;
    }

    if (addr) {
        msg_st->err = vspace_map_one_frame_fixed_attr(addr, (1UL << id.bits), frame,
                                                      map_flags, NULL, NULL);
    } else {
        void *map_addr;
        msg_st->err = vspace_map_one_frame_attr(&map_addr, (1UL << id.bits), frame,
                                                map_flags, NULL, NULL);
    }

#if XOMP_BENCH_WORKER_EN
    mem_timer = bench_tsc() - mem_timer;
    debug_printf("%lx mem request %016lx took %lu cycles, %lu ms\n", worker_id,
                 addr, mem_timer, bench_tsc_to_ms(mem_timer));
#endif

    txq_send(msg_st);
}
errval_t spawn_xcore_monitor(coreid_t coreid, int hwid, enum cpu_type cpu_type,
                             const char *cmdline,
                             struct frame_identity urpc_frame_id,
                             struct capref kcb)
{
    uint64_t start = 0;
    const char *monitorname = NULL, *cpuname = NULL;
    genpaddr_t arch_page_size;
    errval_t err;

    err = get_architecture_config(cpu_type, &arch_page_size, &monitorname, &cpuname);
    assert(err_is_ok(err));

    DEBUG("loading kernel: %s\n", cpuname);
    DEBUG("loading 1st app: %s\n", monitorname);

    // compute size of frame needed and allocate it
    DEBUG("%s:%s:%d: urpc_frame_id.base=%"PRIxGENPADDR"\n",
          __FILE__, __FUNCTION__, __LINE__, urpc_frame_id.base);
    DEBUG("%s:%s:%d: urpc_frame_id.bits=%d\n",
          __FILE__, __FUNCTION__, __LINE__, urpc_frame_id.bits);

    if (benchmark_flag) {
        start = bench_tsc();
    }

    static size_t cpu_binary_size;
    static lvaddr_t cpu_binary = 0;
    static genpaddr_t cpu_binary_phys;
    static const char *cached_cpuname = NULL;
    if (cpu_binary == 0) {
        cached_cpuname = cpuname;
        // XXX: Caching these for now, until we have unmap
        err = lookup_module(cpuname, &cpu_binary, &cpu_binary_phys, &cpu_binary_size);
        if (err_is_fail(err)) {
            DEBUG_ERR(err, "Can not lookup module");
            return err;
        }
    }
    // Ensure caching actually works and we're always loading the same binary.
    // If this starts to fail, get rid of caching.
    assert(strcmp(cached_cpuname, cpuname) == 0);

    static size_t monitor_binary_size;
    static lvaddr_t monitor_binary = 0;
    static genpaddr_t monitor_binary_phys;
    static const char *cached_monitorname = NULL;
    if (monitor_binary == 0) {
        cached_monitorname = monitorname;
        // XXX: Caching these for now, until we have unmap
        err = lookup_module(monitorname, &monitor_binary, &monitor_binary_phys,
                            &monitor_binary_size);
        if (err_is_fail(err)) {
            DEBUG_ERR(err, "Can not lookup module");
            return err;
        }
    }
    // Again, ensure caching actually worked (see above)
    assert(strcmp(cached_monitorname, monitorname) == 0);

    if (benchmark_flag) {
        bench_data->load = bench_tsc() - start;
        start = bench_tsc();
    }

    struct capref cpu_memory_cap;
    struct frame_identity frameid;
    size_t cpu_memory;
    err = allocate_kernel_memory(cpu_binary, arch_page_size, &cpu_memory_cap,
                                 &cpu_memory, &frameid);
    if (err_is_fail(err)) {
        DEBUG_ERR(err, "Can not allocate space for new app kernel.");
        return err;
    }

    err = cap_mark_remote(cpu_memory_cap);
    if (err_is_fail(err)) {
        DEBUG_ERR(err, "Can not mark cap remote.");
        return err;
    }

    void *cpu_buf_memory;
    err = vspace_map_one_frame(&cpu_buf_memory, cpu_memory, cpu_memory_cap,
                               NULL, NULL);
    if (err_is_fail(err)) {
        return err_push(err, LIB_ERR_VSPACE_MAP);
    }

    if (benchmark_flag) {
        bench_data->alloc_cpu = bench_tsc() - start;
        start = bench_tsc();
    }

    /* Chunk of memory to load monitor on the app core */
    struct capref spawn_memory_cap;
    struct frame_identity spawn_memory_identity;
    err = frame_alloc_identify(&spawn_memory_cap,
                               X86_CORE_DATA_PAGES * arch_page_size,
                               NULL, &spawn_memory_identity);
    if (err_is_fail(err)) {
        return err_push(err, LIB_ERR_FRAME_ALLOC);
    }

    err = cap_mark_remote(spawn_memory_cap);
    if (err_is_fail(err)) {
        DEBUG_ERR(err, "Can not mark cap remote.");
        return err;
    }

    if (benchmark_flag) {
        bench_data->alloc_mon = bench_tsc() - start;
        start = bench_tsc();
    }

    /* Load cpu */
    struct elf_allocate_state state;
    state.vbase = (char *) cpu_buf_memory + arch_page_size;
    assert(sizeof(struct x86_core_data) <= arch_page_size);
    state.elfbase = elf_virtual_base(cpu_binary);

    struct Elf64_Ehdr *cpu_head = (struct Elf64_Ehdr *) cpu_binary;
    genvaddr_t cpu_entry;

    err = elf_load(cpu_head->e_machine, elfload_allocate, &state, cpu_binary,
                   cpu_binary_size, &cpu_entry);
    if (err_is_fail(err)) {
        return err;
    }

    if (benchmark_flag) {
        bench_data->elf_load = bench_tsc() - start;
        start = bench_tsc();
    }

    err = relocate_cpu_binary(cpu_binary, cpu_head, state, frameid, arch_page_size);
    if (err_is_fail(err)) {
        DEBUG_ERR(err, "Can not relocate new kernel.");
        return err;
    }

    if (benchmark_flag) {
        bench_data->elf_reloc = bench_tsc() - start;
    }

    genvaddr_t cpu_reloc_entry = cpu_entry - state.elfbase
                                 + frameid.base + arch_page_size;

    /* Compute entry point in the foreign address space */
    forvaddr_t foreign_cpu_reloc_entry = (forvaddr_t) cpu_reloc_entry;

    /* Setup the core_data struct in the new kernel */
    struct x86_core_data *core_data = (struct x86_core_data *) cpu_buf_memory;
    switch (cpu_head->e_machine) {
    case EM_X86_64:
    case EM_K1OM:
        core_data->elf.size = sizeof(struct Elf64_Shdr);
        core_data->elf.addr = cpu_binary_phys + (uintptr_t) cpu_head->e_shoff;
        core_data->elf.num  = cpu_head->e_shnum;
        break;
    case EM_386:
        core_data->elf.size = sizeof(struct Elf32_Shdr);
        struct Elf32_Ehdr *head32 = (struct Elf32_Ehdr *) cpu_binary;
        core_data->elf.addr = cpu_binary_phys + (uintptr_t) head32->e_shoff;
        core_data->elf.num  = head32->e_shnum;
        break;
    default:
        return SPAWN_ERR_UNKNOWN_TARGET_ARCH;
    }

    core_data->module_start = cpu_binary_phys;
    core_data->module_end = cpu_binary_phys + cpu_binary_size;
    core_data->urpc_frame_base = urpc_frame_id.base;
    core_data->urpc_frame_bits = urpc_frame_id.bits;
    core_data->monitor_binary = monitor_binary_phys;
    core_data->monitor_binary_size = monitor_binary_size;
    core_data->memory_base_start = spawn_memory_identity.base;
    core_data->memory_bits = spawn_memory_identity.bits;
    core_data->src_core_id = disp_get_core_id();
    core_data->src_arch_id = my_arch_id;
    core_data->dst_core_id = coreid;

    struct frame_identity fid;
    err = invoke_frame_identify(kcb, &fid);
    if (err_is_fail(err)) {
        USER_PANIC_ERR(err, "Invoke frame identity for KCB failed. "
                       "Did you add the syscall handler for that architecture?");
    }
    DEBUG("%s:%s:%d: fid.base is 0x%"PRIxGENPADDR"\n",
          __FILE__, __FUNCTION__, __LINE__, fid.base);
    core_data->kcb = (genpaddr_t) fid.base;

#ifdef CONFIG_FLOUNDER_BACKEND_UMP_IPI
    core_data->chan_id = chanid;
#endif

    if (cmdline != NULL) {
        // copy as much of command line as will fit
        snprintf(core_data->kernel_cmdline, sizeof(core_data->kernel_cmdline),
                 "%s %s", cpuname, cmdline);
        // ensure termination
        core_data->kernel_cmdline[sizeof(core_data->kernel_cmdline) - 1] = '\0';

        DEBUG("%s:%s:%d: %s\n",
              __FILE__, __FUNCTION__, __LINE__, core_data->kernel_cmdline);
    }

    /* Invoke kernel capability to boot new core */
    if (cpu_type == CPU_X86_64 || cpu_type == CPU_K1OM) {
        start_aps_x86_64_start(hwid, foreign_cpu_reloc_entry);
    }
#ifndef __k1om__
    else if (cpu_type == CPU_X86_32) {
        start_aps_x86_32_start(hwid, foreign_cpu_reloc_entry);
    }
#endif

    /* Clean up */
    // XXX: Should not delete the remote caps?
    err = cap_destroy(spawn_memory_cap);
    if (err_is_fail(err)) {
        USER_PANIC_ERR(err, "cap_destroy failed");
    }
    err = vspace_unmap(cpu_buf_memory);
    if (err_is_fail(err)) {
        USER_PANIC_ERR(err, "vspace unmap CPU driver memory failed");
    }
    err = cap_destroy(cpu_memory_cap);
    if (err_is_fail(err)) {
        USER_PANIC_ERR(err, "cap_destroy failed");
    }

    return SYS_ERR_OK;
}
static errval_t replicate_frame(lvaddr_t addr, struct capref *frame)
{
    errval_t err;

#if XOMP_BENCH_WORKER_EN
    cycles_t repl_timer = bench_tsc();
#endif

    struct frame_identity id;
    err = invoke_frame_identify(*frame, &id);
    if (err_is_fail(err)) {
        return err;
    }

    XWR_DEBUG("Replicating frame: [%016lx]\n", id.base);

    struct capref replicate;
    err = frame_alloc(&replicate, (1UL << id.bits), NULL);
    if (err_is_fail(err)) {
        return err;
    }

    XWR_DEBUG("registering memory with DMA service\n");

#if XOMP_BENCH_WORKER_EN
    cycles_t register_timer = bench_tsc();
#endif

    err = dma_register_memory((struct dma_device *) dma_dev, *frame);
    if (err_is_fail(err)) {
        return err;
    }

    err = dma_register_memory((struct dma_device *) dma_dev, replicate);
    if (err_is_fail(err)) {
        return err;
    }

#if XOMP_BENCH_WORKER_EN
    cycles_t register_timer_end = bench_tsc();
#endif

    struct dma_req_setup setup = {
        .done_cb = dma_replication_cb,
        .cb_arg = NULL,
        .args = {
            .memcpy = {
                .src = id.base,
                .bytes = (1UL << id.bits)
            }
        }
    };

    err = invoke_frame_identify(replicate, &id);
    if (err_is_fail(err)) {
        return err;
    }
    setup.args.memcpy.dst = id.base;

    dma_replication_done = 0x0;

    XWR_DEBUG("DMA request for replication\n");

    err = dma_request_memcpy((struct dma_device *) dma_dev, &setup, NULL);
    if (err_is_fail(err)) {
        return err;
    }

    while (!dma_replication_done) {
        messages_wait_and_handle_next();
    }

    XWR_DEBUG("Replication done.\n");

    *frame = replicate;

#if XOMP_BENCH_WORKER_EN
    cycles_t timer_end = bench_tsc();
    debug_printf("%lx replication took %lu cycles, %lu ms\n", worker_id,
                 timer_end - repl_timer, bench_tsc_to_ms(timer_end - repl_timer));
    debug_printf("%lx register mem took %lu cycles, %lu ms\n", worker_id,
                 register_timer_end - register_timer,
                 bench_tsc_to_ms(register_timer_end - register_timer));
#endif

    return SYS_ERR_OK;
}
static void add_memory_call_rx(struct xomp_binding *b, struct capref frame,
                               uint64_t addr, uint8_t type)
{
    XWI_DEBUG("add_memory_call_rx: addr:%lx, type: %u\n", addr, type);

    struct txq_msg_st *msg_st = txq_msg_st_alloc(&txq);
    assert(msg_st != NULL);

    msg_st->send = add_memory_response_tx;
    msg_st->cleanup = NULL;

    uint32_t map_flags = 0x0;

    switch ((xomp_frame_type_t) type) {
    case XOMP_FRAME_TYPE_MSG:
        map_flags = VREGION_FLAGS_READ_WRITE;
        break;
    case XOMP_FRAME_TYPE_SHARED_RW:
        map_flags = VREGION_FLAGS_READ_WRITE;
        break;
    case XOMP_FRAME_TYPE_SHARED_RO:
        map_flags = VREGION_FLAGS_READ;
        break;
    default:
        USER_PANIC("unknown type: %u", type);
        break;
    }

    struct frame_identity id;
    msg_st->err = invoke_frame_identify(frame, &id);
    if (err_is_fail(msg_st->err)) {
        txq_send(msg_st);
        return;
    }

#if XOMP_WORKER_ENABLE_DMA
    if (0) {
        // todo: replicate frame on the same node if needed..
        replicate_frame(addr, &frame);
    }
#endif

#if XOMP_BENCH_WORKER_EN
    cycles_t map_start = bench_tsc();
#endif

    if (addr) {
        msg_st->err = vspace_map_one_frame_fixed_attr(addr, (1UL << id.bits), frame,
                                                      map_flags, NULL, NULL);
    } else {
        void *map_addr;
        msg_st->err = vspace_map_one_frame_attr(&map_addr, (1UL << id.bits), frame,
                                                map_flags, NULL, NULL);
    }

#if XOMP_BENCH_WORKER_EN
    cycles_t timer_end = bench_tsc();
    debug_printf("%lx mem map %016lx took %lu cycles, %lu ms\n", worker_id, addr,
                 timer_end - map_start, bench_tsc_to_ms(timer_end - map_start));
#endif

    txq_send(msg_st);
}
static void do_work_rx(struct xomp_binding *b, uint64_t fn, uint64_t arg,
                       uint64_t id, uint64_t flags)
{
    errval_t err;

    XWP_DEBUG("do_work_rx: fn:%lx, id:%lx\n", fn, id);

#if XOMP_BENCH_WORKER_EN
    cycles_t work_timer = bench_tsc();
#endif

    struct txq_msg_st *msg_st = txq_msg_st_alloc(&txq);
    assert(msg_st != NULL);

    msg_st->err = SYS_ERR_OK;

    struct bomp_work *work = tls;

    XWP_DEBUG("do_work_rx: threadid = %u, nthreads = %u\n", work->thread_id,
              work->num_threads);

    g_bomp_state->num_threads = work->num_threads;

    struct xomp_msg_st *st = (struct xomp_msg_st *) msg_st;
    st->args.done_notify.id = id;

    if (arg) {
        msg_st->send = done_with_arg_tx;
        st->args.done_notify.arg = arg;
    } else {
        msg_st->send = done_notify_tx;
    }

    if (fn & XOMP_FN_INDEX_FLAG) {
        uint32_t idx = fn & ~XOMP_FN_INDEX_FLAG;
        char *fn_name;
        err = spawn_symval_lookup_idx(idx, &fn_name, &fn);
        if (err_is_fail(err)) {
            msg_st->err = err;
            txq_send(msg_st);
            return;
        }
        XWP_DEBUG("do_work_rx: function index %u -> %s\n", idx, fn_name);
    }

    xomp_worker_fn_t fnct = (xomp_worker_fn_t) fn;
    XWP_DEBUG("do_work_rx: calling fnct %p with argument %p\n", fnct, work->data);

    for (uint32_t i = 0; i < work->num_threads; ++i) {
        fnct(work->data);
        work->thread_id++;
    }

#if XOMP_BENCH_WORKER_EN
    work_timer = bench_tsc() - work_timer;
    debug_printf("%lx work took %lu cycles, %lu ms\n", worker_id, work_timer,
                 bench_tsc_to_ms(work_timer));
#endif

    txq_send(msg_st);
}
static int prepare_xomp(int argc, char *argv[])
{
    errval_t err;

    xomp_wloc_t location = XOMP_WORKER_LOC_MIXED;
    for (int i = 3; i < argc; ++i) {
        if (!strncmp(argv[i], "--location=", 11)) {
            char *p = strchr(argv[i], '=');
            p++;
            if (!strcmp(p, "local")) {
                location = XOMP_WORKER_LOC_LOCAL;
            }
        }
    }

    if (location == XOMP_WORKER_LOC_MIXED) {
        debug_printf("waiting for xeon phi to be ready\n");
        err = xeon_phi_domain_blocking_lookup("xeon_phi.0.ready", NULL);
        EXPECT_SUCCESS(err, "nameservice_blocking_lookup");
        err = xeon_phi_domain_blocking_lookup("xeon_phi.1.ready", NULL);
        EXPECT_SUCCESS(err, "nameservice_blocking_lookup");
#if XOMP_BENCH_ENABLED
        xomp_master_bench_enable(BENCH_RUN_COUNT, nthreads,
                                 XOMP_MASTER_BENCH_MEM_ADD);
#endif
    }

    struct xomp_spawn local_info = {
        .argc = argc,
        .argv = argv,
#ifdef __k1om__
        .path = "/k1om/sbin/benchmarks/bomp_mm",
#else
        .path = "/x86_64/sbin/benchmarks/bomp_mm",
#endif
    };

    struct xomp_spawn remote_info = {
        .argc = argc,
        .argv = argv,
        .path = "/k1om/sbin/benchmarks/bomp_mm",
    };

    struct xomp_args xomp_arg = {
        .type = XOMP_ARG_TYPE_DISTINCT,
        .core_stride = 0, // use default
        .args = {
            .distinct = {
                .nthreads = nthreads,
                .worker_loc = location,
                .nphi = 2,
                .local = local_info,
                .remote = remote_info
            }
        }
    };

    cycles_t tsc_start = bench_tsc();
    if (bomp_xomp_init(&xomp_arg)) {
        debug_printf("bomp init failed!\n");
        exit(1);
    }
    cycles_t tsc_end = bench_tsc();
    timer_xompinit = bench_time_diff(tsc_start, tsc_end);

    return (location == XOMP_WORKER_LOC_LOCAL);
}

int main(int argc, char *argv[])
{
    errval_t err;
    xomp_wid_t wid;

    bench_init();

    err = xomp_worker_parse_cmdline(argc, argv, &wid);
    if (err_is_ok(err)) {
        struct xomp_args xw_arg = {
            .type = XOMP_ARG_TYPE_WORKER,
            .args = {
                .worker = {
                    .id = wid
                }
            }
        };
        bomp_xomp_init(&xw_arg);
    }

    if (argc < 4) {
        debug_printf("Usage: %s <size> <numthreads>\n", argv[0]);
        exit(1);
    }

    nthreads = strtoul(argv[1], NULL, 10);
    if (nthreads == 0) {
        debug_printf("num threads must be >0\n");
        exit(1);
    }

    DEBUG("\n");
    DEBUG("======================================================\n");
    debug_printf("Num Threads: %u\n", nthreads);

    uint8_t is_shared = 0;
    for (int i = 2; i < argc; ++i) {
        if (!strcmp(argv[i], "bomp")) {
            prepare_bomp();
            is_shared = 1;
        } else if (!strcmp(argv[i], "xomp")) {
            is_shared = prepare_xomp(argc, argv);
        } else {
            debug_printf("ignoring argument {%s}\n", argv[i]);
        }
    }

    debug_printf("-------------------------------------\n");
    debug_printf("init time: %lu\n", timer_xompinit);
    debug_printf("-------------------------------------\n");

#if XOMP_BENCH_ENABLED
    xomp_master_bench_print_results();
#endif

    while (1)
        ;
}
int start_aps_x86_32_start(uint8_t core_id, genvaddr_t entry)
{
    DEBUG("%s:%d: start_aps_x86_32_start\n", __FILE__, __LINE__);

    // Copy the startup code to the real-mode address
    uint8_t *real_src = (uint8_t *) &x86_32_start_ap;
    uint8_t *real_end = (uint8_t *) &x86_32_start_ap_end;

    struct capref bootcap;
    struct acpi_rpc_client *acl = get_acpi_rpc_client();
    errval_t error_code;
    errval_t err = acl->vtbl.mm_realloc_range_proxy(acl, 16, 0x0,
                                                    &bootcap, &error_code);
    if (err_is_fail(err)) {
        USER_PANIC_ERR(err, "mm_realloc_range_proxy failed.");
    }
    if (err_is_fail(error_code)) {
        USER_PANIC_ERR(error_code, "mm_realloc_range_proxy return failed.");
    }

    void *real_base;
    err = vspace_map_one_frame(&real_base, 1 << 16, bootcap, NULL, NULL);
    uint8_t *real_dest = (uint8_t *) real_base + X86_32_REAL_MODE_LINEAR_OFFSET;

    memcpy(real_dest, real_src, real_end - real_src);

    /* Pointer to the entry point called from init_ap.S */
    volatile uint64_t *absolute_entry_ptr = (volatile uint64_t *)
        (((lpaddr_t) &x86_32_init_ap_absolute_entry - (lpaddr_t) &x86_32_start_ap)
         + real_dest);

    // copy the address of the function start (in boot.S) to the long-mode
    // assembler code to be able to perform an absolute jump
    *absolute_entry_ptr = entry;

    // pointer to the shared global variable amongst all kernels
    volatile uint64_t *ap_global = (volatile uint64_t *)
        (((lpaddr_t) &x86_32_init_ap_global - (lpaddr_t) &x86_32_start_ap)
         + real_dest);

    genpaddr_t global;
    struct monitor_blocking_rpc_client *mc = get_monitor_blocking_rpc_client();
    err = mc->vtbl.get_global_paddr(mc, &global);
    if (err_is_fail(err)) {
        DEBUG_ERR(err, "invoke spawn core");
        return err_push(err, MON_ERR_SPAWN_CORE);
    }
    *ap_global = (uint64_t) (genpaddr_t) global;

    // pointer to the pseudo-lock used to detect boot up of new core
    volatile uint32_t *ap_wait = (volatile uint32_t *)
        ((lpaddr_t) &x86_32_init_ap_wait - ((lpaddr_t) &x86_32_start_ap) + real_dest);

    // Pointer to the lock variable in the realmode code
    volatile uint8_t *ap_lock = (volatile uint8_t *)
        ((lpaddr_t) &x86_32_init_ap_lock - ((lpaddr_t) &x86_32_start_ap) + real_dest);

    *ap_wait = AP_STARTING_UP;

    end = bench_tsc();

    err = invoke_send_init_ipi(ipi_cap, core_id);
    if (err_is_fail(err)) {
        DEBUG_ERR(err, "invoke send init ipi");
        return err;
    }

    err = invoke_send_start_ipi(ipi_cap, core_id, entry);
    if (err_is_fail(err)) {
        DEBUG_ERR(err, "invoke sipi");
        return err;
    }

    // give the new core a bit of time to start up and set the lock
    for (uint64_t i = 0; i < STARTUP_TIMEOUT; i++) {
        if (*ap_lock != 0) {
            break;
        }
    }

    // If the lock is set, the core has been started; otherwise assume that
    // a core with this APIC ID doesn't exist.
    if (*ap_lock != 0) {
        while (*ap_wait != AP_STARTED);
        trace_event(TRACE_SUBSYS_KERNEL, TRACE_EVENT_KERNEL_CORE_START_REQUEST_ACK,
                    core_id);
        *ap_lock = 0;
        return 0;
    }

    assert(!"badness");
    return -1;
}