/* Populate the McPAT input structures for an out-of-order (DPM) core and,
   if the core owns a private L2 data cache, for that cache as well.
   Common fields are delegated to the base class; only DPM-specific knobs
   are overridden here.

   core_params - McPAT per-core parameter block (written)
   L2_params   - McPAT private-L2 parameter block; must be non-NULL
                 whenever this core has a DL2. */
void core_power_DPM_t::translate_params(system_core *core_params, system_L2 *L2_params) {
    core_power_t::translate_params(core_params, L2_params);

    struct core_knobs_t *knobs = core->knobs;

    /* Static description of the OoO pipeline. */
    core_params->machine_type = 0; /* 0 = out-of-order */
    core_params->number_hardware_threads = 2;
    core_params->number_instruction_fetch_ports = 2;
    core_params->fp_issue_width = 2;
    core_params->prediction_width = 1;
    core_params->pipelines_per_core[0] = 1;
    core_params->pipelines_per_core[1] = 1;
    core_params->pipeline_depth[0] = 15;
    core_params->pipeline_depth[1] = 15;

    /* Scheduler: 1 = RS-based data-capture window (0 would be PHYREG). */
    core_params->instruction_window_scheme = 1;
    core_params->instruction_window_size = knobs->exec.RS_size;

    /* Register-file geometry and rename configuration. */
    core_params->archi_Regs_IRF_size = 16;
    core_params->archi_Regs_FRF_size = 32;
    core_params->phy_Regs_IRF_size = 256;
    core_params->phy_Regs_FRF_size = 256;
    core_params->rename_scheme = 0; /* RAM-based rename table */
    core_params->register_windows_size = 0;

    /* Memory pipeline. */
    strcpy(core_params->LSU_order, "inorder");
    core_params->memory_ports = 2;

    /* Private L2 data cache, when present. */
    if (core->memory.DL2) {
        zesto_assert(L2_params != NULL, (void)0);
        auto *dl2 = core->memory.DL2;

        L2_params->L2_config[0] = dl2->sets * dl2->assoc * dl2->linesize; /* capacity */
        L2_params->L2_config[1] = dl2->linesize;
        L2_params->L2_config[2] = dl2->assoc;
        L2_params->L2_config[3] = dl2->banks;
        L2_params->L2_config[5] = dl2->latency;
        /* See LLC comment in base class for setting throughput == latency. */
        L2_params->L2_config[4] = L2_params->L2_config[5];
        L2_params->L2_config[6] = dl2->bank_width;
        L2_params->L2_config[7] = (dl2->write_policy == WRITE_THROUGH) ? 0 : 1;

        L2_params->device_type = XML->sys.device_type;
        L2_params->ports[0] = 0;
        L2_params->ports[1] = 0;
        L2_params->ports[2] = 1;

        L2_params->buffer_sizes[0] = dl2->MSHR_size;    /* # MSHRs */
        L2_params->buffer_sizes[1] = dl2->heap_size;    /* # fill buffers */
        L2_params->buffer_sizes[2] = dl2->PFF_size;     /* # PF buffers */
        L2_params->buffer_sizes[3] = dl2->MSHR_WB_size; /* # WB buffers */
    }
}
void core_power_DPM_t::translate_stats(xiosim::stats::StatsDatabase* sdb, system_core* core_stats, system_L2* L2_stats) { core_power_t::translate_stats(sdb, core_stats, L2_stats); xiosim::stats::Statistic<counter_t>* stat = nullptr; int coreID = core->id; stat = stat_find_core_stat<counter_t>(sdb, coreID, "fetch_uops"); core_stats->int_instructions = stat->get_final_val(); core_stats->fp_instructions = 0; stat = stat_find_core_stat<counter_t>(sdb, coreID, "commit_uops"); core_stats->committed_int_instructions = stat->get_final_val(); core_stats->committed_fp_instructions = 0; stat = stat_find_core_stat<counter_t>(sdb, coreID, "commit_uops"); core_stats->ROB_reads = stat->get_final_val(); stat = stat_find_core_stat<counter_t>(sdb, coreID, "ROB_writes"); core_stats->ROB_writes = stat->get_final_val(); stat = stat_find_core_stat<counter_t>(sdb, coreID, "regfile_reads"); core_stats->rename_reads = stat->get_final_val(); stat = stat_find_core_stat<counter_t>(sdb, coreID, "regfile_writes"); core_stats->rename_writes = stat->get_final_val(); stat = stat_find_core_stat<counter_t>(sdb, coreID, "fp_regfile_reads"); core_stats->fp_rename_reads = stat->get_final_val(); stat = stat_find_core_stat<counter_t>(sdb, coreID, "fp_regfile_writes"); core_stats->fp_rename_writes = stat->get_final_val(); stat = stat_find_core_stat<counter_t>(sdb, coreID, "alloc_uops"); core_stats->inst_window_reads = stat->get_final_val(); stat = stat_find_core_stat<counter_t>(sdb, coreID, "alloc_uops"); core_stats->inst_window_writes = stat->get_final_val(); core_stats->inst_window_wakeup_accesses = 0; core_stats->fp_inst_window_reads = 0; core_stats->fp_inst_window_writes = 0; core_stats->fp_inst_window_wakeup_accesses = 0; stat = stat_find_core_stat<counter_t>(sdb, coreID, "oracle_total_calls"); core_stats->context_switches = stat->get_final_val(); if (core->memory.DL2) { zesto_assert(L2_stats != NULL, (void)0); stat = stat_find_core_stat<counter_t>(sdb, coreID, "DL2.load_lookups"); 
L2_stats->read_accesses = stat->get_final_val(); stat = stat_find_core_stat<counter_t>(sdb, coreID, "DL2.load_misses"); L2_stats->read_misses = stat->get_final_val(); stat = stat_find_core_stat<counter_t>(sdb, coreID, "DL2.store_lookups"); L2_stats->write_accesses = stat->get_final_val(); stat = stat_find_core_stat<counter_t>(sdb, coreID, "DL2.store_misses"); L2_stats->write_misses = stat->get_final_val(); } }
/* In-order instruction commit. Individual uops cannot commit until it is
   guaranteed that the entire Mop's worth of uops will commit. */
/* One cycle of the in-order (IO) DPM commit stage: first deallocates at
   most one senior-STQ store, then retires up to commit.width uops from the
   ROB head, recording the first stall reason encountered in `stall_reason`.
   NOTE(review): this block is truncated in the visible chunk (it ends
   inside the fusion-body else branch); the remainder lives outside this
   view. */
void core_commit_IO_DPM_t::IO_step(void)
{
  struct core_knobs_t * knobs = core->knobs;
  int commit_count = 0;
  stall_reason = CSTALL_NONE;
  int branches_committed = 0;

  /* This is just a deadlock watchdog. If something got messed up in the
     pipeline and no forward progress is being made, this code will
     eventually detect it. A global watchdog will check if any core is
     making progress and accordingly if not.*/
  if(core->current_thread->active && ((core->sim_cycle - core->exec->last_completed) > deadlock_threshold))
  {
    deadlocked = true;
#ifdef ZTRACE
    ztrace_print(core->id, "Possible deadlock detected.");
#endif
    return;
  }

  /* deallocate at most one store from the (senior) STQ per cycle */
  core->exec->STQ_deallocate_senior();

  /* MAIN COMMIT LOOP */
  for(commit_count=0;commit_count<knobs->commit.width;commit_count++)
  {
    if(ROB_num <= 0) /* nothing to commit */
    {
      stall_reason = CSTALL_EMPTY;
      break;
    }

    struct Mop_t * Mop = ROB[ROB_head]->Mop;

    /* For branches, don't commit until the corresponding jeclear (if any)
       has been processed by the front-end. */
    if(Mop->commit.jeclear_in_flight)
    {
      stall_reason = CSTALL_JECLEAR_INFLIGHT;
      break;
    }

    /* Optional per-cycle cap on committed branches. */
    if(Mop->decode.is_ctrl && knobs->commit.branch_limit && (branches_committed >= knobs->commit.branch_limit))
    {
      stall_reason = CSTALL_MAX_BRANCHES;
      break;
    }

    /* The oldest instruction must be on the correct path by construction. */
    if(Mop->oracle.spec_mode)
      // zesto_fatal("oldest instruction in processor is on wrong-path",(void)0);
      zesto_assert(false, (void)0);

    /* Are all uops in the Mop completed? */
    if(Mop->commit.complete_index != -1) /* still some outstanding insts */
    {
      struct uop_t * uop = &Mop->uop[Mop->commit.complete_index];
      /* STA/STD uops are walked through here regardless of completion,
         since stores are handed to the STQ at commit time. */
      while(uop->timing.when_completed <= core->sim_cycle || uop->decode.is_sta || uop->decode.is_std)
      {
        /* stores get added to the STQ at commit */
        if(uop->decode.is_sta)
        {
          if(!core->exec->exec_fused_ST(uop))
          {
            stall_reason = CSTALL_STQ;
            break;
          }
        }

        zesto_assert(uop->timing.when_completed <= core->sim_cycle, (void)0);

        /* Immediate-carrying uops occupy 3 flow slots, others 1. */
        Mop->commit.complete_index += uop->decode.has_imm ? 3 : 1;
        /* NOTE(review): unlike the STM commit path, flow_length is not
           cast to (int) here -- confirm signedness of the comparison. */
        if(Mop->commit.complete_index >= Mop->decode.flow_length)
        {
          Mop->commit.complete_index = -1; /* Mark this Mop as all done */
#ifdef ZTRACE
          ztrace_print(Mop,"c|complete|all uops completed execution");
#endif
          /* Train the branch predictor once the whole Mop has completed,
             then return the prediction state to its cache. */
          if(Mop->fetch.bpred_update)
          {
            core->fetch->bpred->update(Mop->fetch.bpred_update, Mop->decode.opflags, Mop->fetch.PC, Mop->fetch.ftPC, Mop->decode.targetPC, Mop->oracle.NextPC, Mop->oracle.taken_branch);
            core->fetch->bpred->return_state_cache(Mop->fetch.bpred_update);
            Mop->fetch.bpred_update = NULL;
          }
          break;
        }
        uop = &Mop->uop[Mop->commit.complete_index];
      }
    }

    /* Any stall detected while completing uops ends this cycle's commit. */
    if(stall_reason != CSTALL_NONE)
      break;

    if(Mop->commit.complete_index == -1) /* commit the uops if the Mop is done */
    {
      struct uop_t * uop = ROB[ROB_head];
      zesto_assert(uop->timing.when_completed <= core->sim_cycle,(void)0);
      zesto_assert(uop->alloc.ROB_index == ROB_head,(void)0);
      zesto_assert(uop == &Mop->uop[Mop->commit.commit_index],(void)0);

      /* Time-stamp the start of commit on the Mop's first uop. */
      if(uop->decode.BOM && (uop->Mop->timing.when_commit_started == TICK_T_MAX))
        uop->Mop->timing.when_commit_started = core->sim_cycle;

      //SK - load deallocation moved to end of payload pipe

      if(uop->decode.is_sta)
        core->exec->STQ_deallocate_sta();
      if(uop->decode.is_std) /* we alloc on STA, dealloc on STD */
      {
        if(!core->exec->STQ_deallocate_std(uop))
        {
          stall_reason = CSTALL_STQ;
          break;
        }
      }

      /* any remaining transactions in-flight (only for loads) should now be
         ignored - such load requests may exist, for example as a result of a
         load that completes early due to a hit in the STQ while the cache
         request is still making its way through the memory hierarchy. */
      if(uop->decode.is_load)
        uop->exec.action_id = core->new_action_id();

#ifdef ZTRACE
      ztrace_print(uop,"c|commit|uop committed");
#endif

      /* Time-stamp the end of commit on the Mop's last uop. */
      if(uop->decode.EOM)
        uop->Mop->timing.when_commit_finished = core->sim_cycle;

      /* remove uop from ROB */
      if((!uop->decode.in_fusion) || (uop->decode.fusion_next == NULL)) /* fusion dealloc's on fusion-tail */
      {
        ROB[ROB_head] = NULL;
        ROB_num --;
        ROB_eff_num --;
        ROB_head = modinc(ROB_head,knobs->commit.ROB_size); //(ROB_head+1) % knobs->commit.ROB_size;
        if(uop->decode.in_fusion)
        {
          ZESTO_STAT(core->stat.commit_fusions++;)
        }
      }
      else /* fusion body doesn't count toward commit width */
      {
/* In-order instruction commit. Individual uops cannot commit until it is
   guaranteed that the entire Mop's worth of uops will commit. */
/* One cycle of the simple-timing-model (STM) commit stage: retires up to
   commit.width uops from the ROB head once their whole Mop has completed.
   NOTE(review): this block is truncated in the visible chunk (it ends
   inside the commit-statistics update); the remainder lives outside this
   view. */
void core_commit_STM_t::step(void)
{
  struct core_knobs_t * knobs = core->knobs;
  int commit_count = 0;
  enum commit_stall_t stall_reason = CSTALL_NONE;

  /* This is just a deadlock watchdog. If something got messed up in the
     pipeline and no forward progress is being made, this code will
     eventually detect it. A global watchdog will check if any core is
     making progress and accordingly if not.*/
  if(core->active && ((core->sim_cycle - core->exec->last_completed) > deadlock_threshold))
  {
    deadlocked = true;
#ifdef ZTRACE
    ztrace_print(core->id, "Possible deadlock detected.");
#endif
    return;
  }

  /* MAIN COMMIT LOOP */
  for(commit_count=0;commit_count<knobs->commit.width;commit_count++)
  {
    if(ROB_num <= 0) /* nothing to commit */
    {
      /* Only report EMPTY if nothing at all committed this cycle. */
      stall_reason = commit_count?CSTALL_NONE:CSTALL_EMPTY;
      break;
    }

    struct Mop_t * Mop = ROB[ROB_head]->Mop;

    /* The oldest instruction must be on the correct path by construction. */
    if(Mop->oracle.spec_mode)
      fatal("oldest instruction in processor is on wrong-path");

    /* Are all uops in the Mop completed? */
    if(Mop->commit.complete_index != -1) /* still some outstanding insts */
    {
      while(Mop->uop[Mop->commit.complete_index].timing.when_completed <= core->sim_cycle)
      {
        struct uop_t * uop = &Mop->uop[Mop->commit.complete_index];
        /* Immediate-carrying uops occupy 3 flow slots, others 1. */
        Mop->commit.complete_index += uop->decode.has_imm ? 3 : 1;
        if(Mop->commit.complete_index >= (int) Mop->decode.flow_length)
        {
          Mop->commit.complete_index = -1; /* Mark this Mop as all done */
          /* Train the branch predictor once the whole Mop has completed,
             then return the prediction state to its cache. */
          if(Mop->fetch.bpred_update)
          {
            core->fetch->bpred->update(Mop->fetch.bpred_update, Mop->decode.opflags, Mop->fetch.PC, Mop->fetch.ftPC, Mop->decode.targetPC, Mop->oracle.NextPC, Mop->oracle.taken_branch);
            core->fetch->bpred->return_state_cache(Mop->fetch.bpred_update);
            Mop->fetch.bpred_update = NULL;
          }
          break;
        }
      }
    }

    if(Mop->commit.complete_index == -1) /* commit the uops if the Mop is done */
    {
      struct uop_t * uop = ROB[ROB_head];
      zesto_assert(uop->timing.when_completed <= core->sim_cycle,(void)0);
      zesto_assert(uop->alloc.ROB_index == ROB_head,(void)0);
      zesto_assert(uop == &Mop->uop[Mop->commit.commit_index],(void)0);

      /* Time-stamp the start of commit on the Mop's first uop. */
      if(uop->decode.BOM && (uop->Mop->timing.when_commit_started == TICK_T_MAX))
        uop->Mop->timing.when_commit_started = core->sim_cycle;

      if(uop->decode.is_load)
        core->exec->LDQ_deallocate(uop);
      else if(uop->decode.is_sta)
        core->exec->STQ_deallocate_sta();
      else if(uop->decode.is_std) /* we alloc on STA, dealloc on STD */
      {
        if(!core->exec->STQ_deallocate_std(uop))
          break;
      }

      /* any remaining transactions in-flight (only for loads) should now be
         ignored - such load requests may exist, for example as a result of a
         load that completes early due to a hit in the STQ while the cache
         request is still making its way through the memory hierarchy. */
      if(uop->decode.is_load)
        uop->exec.action_id = core->new_action_id();

      /* Time-stamp the end of commit on the Mop's last uop. */
      if(uop->decode.EOM)
        uop->Mop->timing.when_commit_finished = core->sim_cycle;

      /* remove uop from ROB */
      ROB[ROB_head] = NULL;
      ROB_num --;
      ROB_head = modinc(ROB_head,knobs->commit.ROB_size); //(ROB_head+1) % knobs->commit.ROB_size;
      uop->alloc.ROB_index = -1;

      /* this cleans up idep/odep ptrs, register mappings, and commit
         stores to the real (non-spec) memory system */
      core->oracle->commit_uop(uop);

      /* mark uop as committed in Mop */
      Mop->commit.commit_index += uop->decode.has_imm ? 3 : 1;
      if(Mop->commit.commit_index >= (int) Mop->decode.flow_length)
      {
        Mop->commit.commit_index = -1; /* The entire Mop has been committed */

        /* Update stats */
        if(Mop->uop[Mop->decode.last_uop_index].decode.EOM)
        {
          ZESTO_STAT(core->stat.commit_insn++;)
        }
        ZESTO_STAT(core->stat.commit_uops += Mop->stat.num_uops;)
        ZESTO_STAT(core->stat.commit_refs += Mop->stat.num_refs;)
void core_alloc_DPM_t::step(void) { struct core_knobs_t * knobs = core->knobs; int stage, i; enum alloc_stall_t stall_reason = ASTALL_NONE; /*========================================================================*/ /*== Dispatch insts if ROB, RS, and LQ/SQ entries available (as needed) ==*/ stage = knobs->alloc.depth-1; if(occupancy[stage]) /* are there uops in the last stage of the alloc pipe? */ { for(i=0; i < knobs->alloc.width; i++) /* if so, scan all slots (width) of this stage */ { struct uop_t * uop = pipe[stage][i]; int abort_alloc = false; /* if using drain flush: */ /* is the back-end still draining from a misprediction recovery? */ if(knobs->alloc.drain_flush && drain_in_progress) { if(!core->commit->ROB_empty()) { stall_reason = ASTALL_DRAIN; break; } else drain_in_progress = false; } if(uop) { while(uop) /* this while loop is to handle multiple uops fused together into the same slot */ { if(uop->timing.when_allocated == TICK_T_MAX) { /* is the ROB full? */ if((!uop->decode.in_fusion||uop->decode.fusion_head) && !core->commit->ROB_available()) { stall_reason = ASTALL_ROB; abort_alloc = true; break; } /* for loads, is the LDQ full? */ if((uop->decode.is_load || uop->decode.is_lfence) && !core->exec->LDQ_available()) { stall_reason = ASTALL_LDQ; abort_alloc = true; break; } /* for stores, allocate STQ entry on STA. NOTE: This is different from Bob Colwell's description in Shen&Lipasti Chap 7 where he describes allocation on STD. We emit STA uops first since the oracle needs to use the STA result to feed the following STD uop. */ if((uop->decode.is_sta || uop->decode.is_sfence) && !core->exec->STQ_available()) { stall_reason = ASTALL_STQ; abort_alloc = true; break; } /* is the RS full? 
-- don't need to alloc for NOPs,fences, signals */ if(!core->exec->RS_available() && !uop->decode.is_nop && !uop->decode.is_lfence && !uop->decode.is_sfence && !is_uop_helix_signal(uop)) { stall_reason = ASTALL_RS; abort_alloc = true; break; } /* ALL ALLOC STALL CONDITIONS PASSED */ /* place in ROB */ if((!uop->decode.in_fusion) || uop->decode.is_fusion_head) core->commit->ROB_insert(uop); else /* fusion body doesn't occupy additional ROB entries */ core->commit->ROB_fuse_insert(uop); /* place in LDQ/STQ if needed */ if(uop->decode.is_load || uop->decode.is_lfence) core->exec->LDQ_insert(uop); else if(uop->decode.is_sta || uop->decode.is_sfence) core->exec->STQ_insert_sta(uop); else if(uop->decode.is_std) core->exec->STQ_insert_std(uop); /* port bindings */ if(!uop->decode.is_nop && !uop->Mop->decode.is_trap && !uop->decode.is_lfence && !uop->decode.is_sfence && !is_uop_helix_signal(uop)) { /* port-binding is trivial when there's only one valid port */ if(knobs->exec.port_binding[uop->decode.FU_class].num_FUs == 1) { uop->alloc.port_assignment = knobs->exec.port_binding[uop->decode.FU_class].ports[0]; } else /* else assign uop to least loaded port */ { int min_load = INT_MAX; int index = -1; for(int j=0;j<knobs->exec.port_binding[uop->decode.FU_class].num_FUs;j++) { int port = knobs->exec.port_binding[uop->decode.FU_class].ports[j]; if(port_loading[port] < min_load) { min_load = port_loading[port]; index = port; } } uop->alloc.port_assignment = index; } port_loading[uop->alloc.port_assignment]++; /* only allocate for non-fused or fusion-head */ if((!uop->decode.in_fusion) || uop->decode.is_fusion_head) core->exec->RS_insert(uop); else core->exec->RS_fuse_insert(uop); /* Get input mappings - this is a proxy for explicit register numbers, which you can always get from idep_uop->alloc.ROB_index */ for(size_t j=0;j<MAX_IDEPS;j++) { /* This use of oracle info is valid: at this point the processor would be looking up this information in the RAT, but this saves us having 
to explicitly store/track the RAT state. */ uop->exec.idep_uop[j] = uop->oracle.idep_uop[j]; /* Add self onto parent's output list. This output list doesn't have a real microarchitectural counter part, but it makes the simulation faster by not having to perform a whole mess of associative searches each time any sort of broadcast is needed. The parent's odep list only points to uops which have dispatched into the OOO core (i.e. has left the alloc pipe). */ if(uop->exec.idep_uop[j]) { struct odep_t * odep = core->get_odep_link(); odep->next = uop->exec.idep_uop[j]->exec.odep_uop; uop->exec.idep_uop[j]->exec.odep_uop = odep; odep->uop = uop; //odep->aflags = (uop->decode.idep_name[j] == DCREG(MD_REG_AFLAGS)); odep->op_num = j; } } /* Update read stats */ for(size_t j=0;j<MAX_IDEPS;j++) { if(x86::is_ireg(uop->decode.idep_name[j])) core->stat.regfile_reads++; else if(x86::is_freg(uop->decode.idep_name[j])) core->stat.fp_regfile_reads++; } /* check "scoreboard" for operand readiness (we're not actually explicitly implementing a scoreboard); if value is ready, read it into data-capture window or payload RAM. 
*/ tick_t when_ready = 0; for(size_t j=0;j<MAX_IDEPS;j++) /* for possible input argument */ { if(uop->exec.idep_uop[j]) /* if the parent uop exists (i.e., still in the processor) */ { uop->timing.when_itag_ready[j] = uop->exec.idep_uop[j]->timing.when_otag_ready; if(uop->exec.idep_uop[j]->exec.ovalue_valid) { uop->timing.when_ival_ready[j] = uop->exec.idep_uop[j]->timing.when_completed; uop->exec.ivalue_valid[j] = true; } } else /* read from ARF */ { uop->timing.when_itag_ready[j] = core->sim_cycle; uop->timing.when_ival_ready[j] = core->sim_cycle; uop->exec.ivalue_valid[j] = true; /* applies to invalid (DNA) inputs as well */ } if(when_ready < uop->timing.when_itag_ready[j]) when_ready = uop->timing.when_itag_ready[j]; } uop->timing.when_ready = when_ready; if(when_ready < TICK_T_MAX) /* add to readyQ if appropriate */ core->exec->insert_ready_uop(uop); } else /* is_nop || is_trap || is_lfence || is_sfence */ { /* NOP's don't go through exec pipeline; they go straight to the ROB and are immediately marked as completed (they still take up space in the ROB though). */ /* Since traps/interrupts aren't really properly modeled in SimpleScalar, we just let it go through without doing anything. 
*/ /* Similarly fences don't need to go through RS and go straight to the LDQ */ uop->timing.when_ready = core->sim_cycle; uop->timing.when_issued = core->sim_cycle; if (!uop->decode.is_lfence) uop->timing.when_completed = core->sim_cycle; if (uop->decode.is_lfence || uop->decode.is_sfence || is_uop_helix_signal(uop) || uop->decode.is_nop) uop->timing.when_exec = core->sim_cycle; if (is_uop_helix_signal(uop) && uop->decode.is_sta) core->exec->STQ_set_addr(uop); if (is_uop_helix_signal(uop) && uop->decode.is_std) core->exec->STQ_set_data(uop); zesto_assert(!uop->decode.is_load, (void)0); } uop->timing.when_allocated = core->sim_cycle; #ifdef ZTRACE ztrace_print_start(uop,"a|alloc:ROB=%d,",uop->alloc.ROB_index); if(uop->alloc.RS_index == -1) // nop ztrace_print_cont(core->id, "RS=."); else ztrace_print_cont(core->id, "RS=%d",uop->alloc.RS_index); if(uop->decode.in_fusion && !uop->decode.is_fusion_head) ztrace_print_cont(core->id, "f"); if(uop->alloc.LDQ_index == -1) ztrace_print_cont(core->id, ":LDQ=."); else ztrace_print_cont(core->id, ":LDQ=%d",uop->alloc.LDQ_index); if(uop->alloc.STQ_index == -1) ztrace_print_cont(core->id, ":STQ=."); else ztrace_print_cont(core->id, ":STQ=%d",uop->alloc.STQ_index); ztrace_print_cont(core->id, ":pb=%d",uop->alloc.port_assignment); ztrace_print_finish(core->id, "|uop alloc'd and dispatched"); #endif } if(uop->decode.in_fusion) uop = uop->decode.fusion_next; else uop = NULL; } if(abort_alloc) break; if((!pipe[stage][i]->decode.in_fusion) || !uop) /* either not fused, or complete fused uops alloc'd */ { uop = pipe[stage][i]; /* may be NULL if we just finished a fused set */ /* update stats */ if(uop->decode.EOM) ZESTO_STAT(core->stat.alloc_insn++;) ZESTO_STAT(core->stat.alloc_uops++;) if(uop->decode.in_fusion) ZESTO_STAT(core->stat.alloc_eff_uops += uop->decode.fusion_size;) else ZESTO_STAT(core->stat.alloc_eff_uops++;)