void X86CpuDumpSummary(Timing *self, FILE *f)
{
	X86Cpu *cpu = asX86Cpu(self);

	double inst_per_cycle;
	double uinst_per_cycle;
	double branch_acc;

	/* Calculate statistics */
	inst_per_cycle = asTiming(cpu)->cycle ?
			(double) cpu->num_committed_inst / asTiming(cpu)->cycle : 0.0;
	uinst_per_cycle = asTiming(cpu)->cycle ?
			(double) cpu->num_committed_uinst / asTiming(cpu)->cycle : 0.0;
	branch_acc = cpu->num_branch_uinst ?
			(double) (cpu->num_branch_uinst - cpu->num_mispred_branch_uinst) /
			cpu->num_branch_uinst : 0.0;

	/* Print statistics */
	fprintf(f, "FastForwardInstructions = %lld\n", cpu->num_fast_forward_inst);
	fprintf(f, "CommittedInstructions = %lld\n", cpu->num_committed_inst);
	fprintf(f, "CommittedInstructionsPerCycle = %.4g\n", inst_per_cycle);
	fprintf(f, "CommittedMicroInstructions = %lld\n", cpu->num_committed_uinst);
	fprintf(f, "CommittedMicroInstructionsPerCycle = %.4g\n", uinst_per_cycle);
	fprintf(f, "BranchPredictionAccuracy = %.4g\n", branch_acc);

	/* Call parent */
	TimingDumpSummary(self, f);
}
void ARMCpuCreate(ARMCpu *self)
{
	/* Parent */
	TimingCreate(asTiming(self));

	/* Virtual functions */
	asObject(self)->Dump = ARMCpuDump;
	asTiming(self)->DumpSummary = ARMCpuDumpSummary;
	asTiming(self)->Run = ARMCpuRun;
}
void frm_lds_decode(struct frm_lds_t *lds)
{
	struct frm_uop_t *uop;
	int instructions_processed = 0;
	int list_entries;
	int list_index = 0;
	int i;

	list_entries = list_count(lds->issue_buffer);

	/* Sanity check the issue buffer */
	assert(list_entries <= frm_gpu_lds_issue_buffer_size);

	for (i = 0; i < list_entries; i++)
	{
		uop = list_get(lds->issue_buffer, list_index);
		assert(uop);

		instructions_processed++;

		/* Uop not ready yet */
		if (asTiming(frm_gpu)->cycle < uop->issue_ready)
		{
			list_index++;
			continue;
		}

		/* Stall if the issue width has been reached */
		if (instructions_processed > frm_gpu_lds_width)
		{
			frm_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld "
				"stg=\"s\"\n", uop->id_in_sm, lds->sm->id,
				uop->warp->id, uop->id_in_warp);
			list_index++;
			continue;
		}

		/* Sanity check the decode buffer */
		assert(list_count(lds->decode_buffer) <=
			frm_gpu_lds_decode_buffer_size);

		/* Stall if the decode buffer is full */
		if (list_count(lds->decode_buffer) ==
			frm_gpu_lds_decode_buffer_size)
		{
			frm_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld "
				"stg=\"s\"\n", uop->id_in_sm, lds->sm->id,
				uop->warp->id, uop->id_in_warp);
			list_index++;
			continue;
		}

		uop->decode_ready = asTiming(frm_gpu)->cycle +
			frm_gpu_lds_decode_latency;
		list_remove(lds->issue_buffer, uop);
		list_enqueue(lds->decode_buffer, uop);

		frm_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld "
			"stg=\"lds-d\"\n", uop->id_in_sm, lds->sm->id,
			uop->warp->id, uop->id_in_warp);
	}
}
void EvgGpuDumpSummary(Timing *self, FILE *f)
{
	double inst_per_cycle;

	/* Call parent */
	TimingDumpSummary(asTiming(self), f);

	/* Additional statistics */
	inst_per_cycle = asTiming(evg_gpu)->cycle ?
			(double) asEmu(evg_emu)->instructions / asTiming(evg_gpu)->cycle : 0.0;
	fprintf(f, "IPC = %.4g\n", inst_per_cycle);
}
void EvgGpuCreate(EvgGpu *self)
{
	struct evg_compute_unit_t *compute_unit;
	int compute_unit_id;

	/* Parent */
	TimingCreate(asTiming(self));

	/* Frequency */
	asTiming(self)->frequency = evg_gpu_frequency;
	asTiming(self)->frequency_domain = esim_new_domain(evg_gpu_frequency);

	/* Initialize */
	self->trash_uop_list = linked_list_create();
	self->compute_units = xcalloc(evg_gpu_num_compute_units, sizeof(void *));
	EVG_GPU_FOREACH_COMPUTE_UNIT(compute_unit_id)
	{
		self->compute_units[compute_unit_id] = evg_compute_unit_create();
		compute_unit = self->compute_units[compute_unit_id];
		compute_unit->id = compute_unit_id;
		DOUBLE_LINKED_LIST_INSERT_TAIL(self, ready, compute_unit);
	}

	/* Virtual functions */
	asObject(self)->Dump = EvgGpuDump;
	asTiming(self)->DumpSummary = EvgGpuDumpSummary;
	asTiming(self)->Run = EvgGpuRun;
	asTiming(self)->MemConfigCheck = EvgGpuMemConfigCheck;
	asTiming(self)->MemConfigDefault = EvgGpuMemConfigDefault;
	asTiming(self)->MemConfigParseEntry = EvgGpuMemConfigParseEntry;
}
void frm_sm_update_fetch_visualization(struct frm_sm_t *sm, int non_active_fb)
{
	struct frm_uop_t *uop;
	int list_entries;
	int i;

	/* Update visualization states for all instructions not issued */
	list_entries = list_count(sm->fetch_buffers[non_active_fb]);
	for (i = 0; i < list_entries; i++)
	{
		uop = list_get(sm->fetch_buffers[non_active_fb], i);
		assert(uop);

		/* Skip all uops that have not yet completed the fetch */
		if (asTiming(frm_gpu)->cycle < uop->fetch_ready)
			continue;

		frm_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld stg=\"s\"\n",
			uop->id_in_sm, sm->id, uop->warp->id, uop->id_in_warp);
	}
}
static int X86ThreadIssueSQ(X86Thread *self, int quantum)
{
	X86Cpu *cpu = self->cpu;
	X86Core *core = self->core;

	struct x86_uop_t *store;
	struct linked_list_t *sq = self->sq;
	struct mod_client_info_t *client_info;

	/* Process SQ */
	linked_list_head(sq);
	while (!linked_list_is_end(sq) && quantum)
	{
		/* Get store */
		store = linked_list_get(sq);
		assert(store->uinst->opcode == x86_uinst_store);

		/* Only committed stores issue */
		if (store->in_rob)
			break;

		/* Check that memory system entry is ready */
		if (!mod_can_access(self->data_mod, store->phy_addr))
			break;

		/* Remove store from store queue */
		X86ThreadRemoveFromSQ(self);

		/* Create and fill the mod_client_info_t object */
		client_info = mod_client_info_create(self->data_mod);
		client_info->prefetcher_eip = store->eip;

		/* Issue store */
		mod_access(self->data_mod, mod_access_store,
			store->phy_addr, NULL, core->event_queue, store, client_info);

		/* The cache system will place the store at the head of the
		 * event queue when it is ready. For now, mark "in_event_queue" to
		 * prevent the uop from being freed. */
		store->in_event_queue = 1;
		store->issued = 1;
		store->issue_when = asTiming(cpu)->cycle;

		/* Statistics */
		core->num_issued_uinst_array[store->uinst->opcode]++;
		core->lsq_reads++;
		core->reg_file_int_reads += store->ph_int_idep_count;
		core->reg_file_fp_reads += store->ph_fp_idep_count;
		self->num_issued_uinst_array[store->uinst->opcode]++;
		self->lsq_reads++;
		self->reg_file_int_reads += store->ph_int_idep_count;
		self->reg_file_fp_reads += store->ph_fp_idep_count;
		cpu->num_issued_uinst_array[store->uinst->opcode]++;
		if (store->trace_cache)
			self->trace_cache->num_issued_uinst++;

		/* One more instruction issued, update quantum. */
		quantum--;

		/* MMU statistics */
		if (*mmu_report_file_name)
			mmu_access_page(store->phy_addr, mmu_access_write);
	}
	return quantum;
}
static int X86ThreadCanFetch(X86Thread *self)
{
	X86Cpu *cpu = self->cpu;
	X86Context *ctx = self->ctx;

	unsigned int phy_addr;
	unsigned int block;

	/* Context must be running */
	if (!ctx || !X86ContextGetState(ctx, X86ContextRunning))
		return 0;

	/* Fetch stalled or context evict signal activated */
	if (self->fetch_stall_until >= asTiming(cpu)->cycle || ctx->evict_signal)
		return 0;

	/* Fetch queue must have not exceeded the limit of stored bytes
	 * to be able to store new macro-instructions. */
	if (self->fetchq_occ >= x86_fetch_queue_size)
		return 0;

	/* If the next fetch address belongs to a new block, cache system
	 * must be accessible to read it. */
	block = self->fetch_neip & ~(self->inst_mod->block_size - 1);
	if (block != self->fetch_block)
	{
		phy_addr = mmu_translate(self->ctx->address_space_index,
			self->fetch_neip);
		if (!mod_can_access(self->inst_mod, phy_addr))
			return 0;
	}

	/* We can fetch */
	return 1;
}
void frm_lds_write(struct frm_lds_t *lds)
{
	struct frm_uop_t *uop;
	int instructions_processed = 0;
	int list_entries;
	int list_index = 0;
	int i;

	list_entries = list_count(lds->mem_buffer);

	/* Sanity check the mem buffer */
	assert(list_entries <= frm_gpu_lds_max_inflight_mem_accesses);

	for (i = 0; i < list_entries; i++)
	{
		uop = list_get(lds->mem_buffer, list_index);
		assert(uop);

		/* Count each candidate once against the per-cycle width */
		instructions_processed++;

		/* Uop is not ready yet */
		if (uop->lds_witness)
		{
			list_index++;
			continue;
		}

		/* Stall if the width has been reached */
		if (instructions_processed > frm_gpu_lds_width)
		{
			frm_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld "
				"stg=\"s\"\n", uop->id_in_sm, lds->sm->id,
				uop->warp->id, uop->id_in_warp);
			list_index++;
			continue;
		}

		/* Sanity check the write buffer */
		assert(list_count(lds->write_buffer) <=
			frm_gpu_lds_write_buffer_size);

		/* Stall if the write buffer is full */
		if (list_count(lds->write_buffer) >=
			frm_gpu_lds_write_buffer_size)
		{
			frm_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld "
				"stg=\"s\"\n", uop->id_in_sm, lds->sm->id,
				uop->warp->id, uop->id_in_warp);
			list_index++;
			continue;
		}

		/* Access complete, remove the uop from the queue */
		uop->write_ready = asTiming(frm_gpu)->cycle +
			frm_gpu_lds_write_latency;
		list_remove(lds->mem_buffer, uop);
		list_enqueue(lds->write_buffer, uop);

		frm_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld "
			"stg=\"lds-w\"\n", uop->id_in_sm, lds->sm->id,
			uop->warp->id, uop->id_in_warp);
	}
}
void X86CpuDumpUopReport(X86Cpu *self, FILE *f, long long *uop_stats,
	char *prefix, int peak_ipc)
{
	long long uinst_int_count = 0;
	long long uinst_logic_count = 0;
	long long uinst_fp_count = 0;
	long long uinst_mem_count = 0;
	long long uinst_ctrl_count = 0;
	long long uinst_total = 0;

	char *name;
	enum x86_uinst_flag_t flags;
	int i;

	for (i = 0; i < x86_uinst_opcode_count; i++)
	{
		name = x86_uinst_info[i].name;
		flags = x86_uinst_info[i].flags;

		fprintf(f, "%s.Uop.%s = %lld\n", prefix, name, uop_stats[i]);
		if (flags & X86_UINST_INT)
			uinst_int_count += uop_stats[i];
		if (flags & X86_UINST_LOGIC)
			uinst_logic_count += uop_stats[i];
		if (flags & X86_UINST_FP)
			uinst_fp_count += uop_stats[i];
		if (flags & X86_UINST_MEM)
			uinst_mem_count += uop_stats[i];
		if (flags & X86_UINST_CTRL)
			uinst_ctrl_count += uop_stats[i];
		uinst_total += uop_stats[i];
	}

	fprintf(f, "%s.Integer = %lld\n", prefix, uinst_int_count);
	fprintf(f, "%s.Logic = %lld\n", prefix, uinst_logic_count);
	fprintf(f, "%s.FloatingPoint = %lld\n", prefix, uinst_fp_count);
	fprintf(f, "%s.Memory = %lld\n", prefix, uinst_mem_count);
	fprintf(f, "%s.Ctrl = %lld\n", prefix, uinst_ctrl_count);
	fprintf(f, "%s.WndSwitch = %lld\n", prefix,
		uop_stats[x86_uinst_call] + uop_stats[x86_uinst_ret]);
	fprintf(f, "%s.Total = %lld\n", prefix, uinst_total);
	fprintf(f, "%s.IPC = %.4g\n", prefix, asTiming(self)->cycle ?
		(double) uinst_total / asTiming(self)->cycle : 0.0);
	fprintf(f, "%s.DutyCycle = %.4g\n", prefix,
		asTiming(self)->cycle && peak_ipc ?
		(double) uinst_total / asTiming(self)->cycle / peak_ipc : 0.0);
	fprintf(f, "\n");
}
void X86CpuCreate(X86Cpu *self, X86Emu *emu)
{
	X86Core *core;
	X86Thread *thread;

	char name[MAX_STRING_SIZE];

	int i;
	int j;

	/* Parent */
	TimingCreate(asTiming(self));

	/* Frequency */
	asTiming(self)->frequency = x86_cpu_frequency;
	asTiming(self)->frequency_domain = esim_new_domain(x86_cpu_frequency);

	/* Initialize */
	self->emu = emu;
	self->uop_trace_list = linked_list_create();

	/* Create cores */
	self->cores = xcalloc(x86_cpu_num_cores, sizeof(X86Core *));
	for (i = 0; i < x86_cpu_num_cores; i++)
		self->cores[i] = new(X86Core, self);

	/* Assign names and IDs to cores and threads */
	for (i = 0; i < x86_cpu_num_cores; i++)
	{
		core = self->cores[i];
		snprintf(name, sizeof name, "c%d", i);
		X86CoreSetName(core, name);
		core->id = i;
		for (j = 0; j < x86_cpu_num_threads; j++)
		{
			thread = core->threads[j];
			snprintf(name, sizeof name, "c%dt%d", i, j);
			X86ThreadSetName(thread, name);
			thread->id_in_core = j;
			thread->id_in_cpu = i * x86_cpu_num_threads + j;
		}
	}

	/* Virtual functions */
	asObject(self)->Dump = X86CpuDump;
	asTiming(self)->DumpSummary = X86CpuDumpSummary;
	asTiming(self)->Run = X86CpuRun;
	asTiming(self)->MemConfigCheck = X86CpuMemConfigCheck;
	asTiming(self)->MemConfigDefault = X86CpuMemConfigDefault;
	asTiming(self)->MemConfigParseEntry = X86CpuMemConfigParseEntry;

	/* Trace */
	x86_trace_header("x86.init version=\"%d.%d\" num_cores=%d num_threads=%d\n",
		X86_TRACE_VERSION_MAJOR, X86_TRACE_VERSION_MINOR,
		x86_cpu_num_cores, x86_cpu_num_threads);
}
void frm_vector_mem_complete(struct frm_vector_mem_unit_t *vector_mem)
{
	struct frm_uop_t *uop = NULL;
	int list_entries;
	int i;
	int list_index = 0;

	/* Process completed memory instructions */
	list_entries = list_count(vector_mem->write_buffer);

	/* Sanity check the write buffer */
	assert(list_entries <= frm_gpu_vector_mem_width);

	for (i = 0; i < list_entries; i++)
	{
		uop = list_get(vector_mem->write_buffer, list_index);
		assert(uop);

		/* Uop is not ready */
		if (asTiming(frm_gpu)->cycle < uop->write_ready)
		{
			list_index++;
			continue;
		}

		/* Access complete, remove the uop from the queue */
		list_remove(vector_mem->write_buffer, uop);

		assert(uop->warp_inst_queue_entry->lgkm_cnt > 0);
		uop->warp_inst_queue_entry->lgkm_cnt--;

		frm_trace("si.end_inst id=%lld cu=%d\n", uop->id_in_sm,
			uop->sm->id);

		/* Free uop */
		frm_uop_free(uop);

		/* Statistics */
		vector_mem->inst_count++;
		frm_gpu->last_complete_cycle = asTiming(frm_gpu)->cycle;
	}
}
void frm_sm_spatial_report_dump(struct frm_sm_t *sm)
{
	FILE *f = spatial_report_file;

	fprintf(f, "CU,%d,MemAcc,%lld,MappedWGs,%lld,Cycles,%lld\n",
		sm->id,
		sm->vector_mem_unit.inflight_mem_accesses,
		sm->interval_mapped_thread_blocks,
		asTiming(frm_gpu)->cycle);
}
void evg_cu_spatial_report_dump(struct evg_compute_unit_t *compute_unit)
{
	FILE *f = spatial_report_file;

	fprintf(f, "CU,%d,CFInst,%lld,MemAcc,%lld,TEXInstn,%lld,"
		"ALUInstn,%lld,Cycles,%lld \n",
		compute_unit->id,
		compute_unit->cf_engine.interval_inst_count,
		compute_unit->inflight_mem_accesses,
		compute_unit->tex_engine.interval_inst_count,
		compute_unit->alu_engine.interval_inst_count,
		asTiming(evg_gpu)->cycle);
}
void si_simd_complete(struct si_simd_t *simd)
{
	struct si_uop_t *uop;
	int list_entries;
	int list_index = 0;
	int i;

	list_entries = list_count(simd->exec_buffer);
	assert(list_entries <= si_gpu_simd_exec_buffer_size);

	for (i = 0; i < list_entries; i++)
	{
		uop = list_get(simd->exec_buffer, list_index);
		assert(uop);

		if (asTiming(si_gpu)->cycle < uop->execute_ready)
		{
			list_index++;
			continue;
		}

		/* Access complete, remove the uop from the queue */
		list_remove(simd->exec_buffer, uop);

		si_trace("si.end_inst id=%lld cu=%d\n", uop->id_in_compute_unit,
			uop->compute_unit->id);

		/* Free uop */
		si_uop_free(uop);

		/* Statistics */
		simd->inst_count++;
		si_gpu->last_complete_cycle = asTiming(si_gpu)->cycle;
	}
}
void evg_cu_interval_update(struct evg_compute_unit_t *compute_unit)
{
	/* At the end of each profiling interval, dump the spatial report
	 * and reset the counters in all the engines */
	compute_unit->interval_cycle++;
	if (!(asTiming(evg_gpu)->cycle % spatial_profiling_interval))
	{
		evg_cu_spatial_report_dump(compute_unit);

		compute_unit->cf_engine.interval_inst_count = 0;
		compute_unit->alu_engine.interval_inst_count = 0;
		compute_unit->tex_engine.interval_inst_count = 0;

		/* This counter is not reset since memory accesses could still
		 * be in flight in the hierarchy:
		 * compute_unit->inflight_mem_accesses = 0; */

		compute_unit->interval_cycle = 0;
	}
}
void si_cu_spatial_report_dump(struct si_compute_unit_t *compute_unit)
{
	FILE *f = spatial_report_file;

	fprintf(f, "CU,%d,MemAcc,%lld,MappedWGs,%lld,UnmappedWGs,%lld,"
		"ALUIssued,%lld,LDSIssued,%lld,Cycles,%lld\n",
		compute_unit->id,
		compute_unit->vector_mem_unit.inflight_mem_accesses,
		compute_unit->interval_mapped_work_groups,
		compute_unit->interval_unmapped_work_groups,
		compute_unit->interval_alu_issued,
		compute_unit->interval_lds_issued,
		asTiming(si_gpu)->cycle);
}
void frm_sm_interval_update(struct frm_sm_t *sm)
{
	/* At the end of each profiling interval, dump the spatial report
	 * and reset the counters */
	sm->interval_cycle++;
	if (!(asTiming(frm_gpu)->cycle % spatial_profiling_interval))
	{
		frm_sm_spatial_report_dump(sm);

		/* This counter is not reset since memory accesses could still
		 * be in flight in the hierarchy:
		 * sm->inflight_mem_accesses = 0; */

		sm->interval_cycle = 0;
		sm->interval_mapped_thread_blocks = 0;
	}
}
int X86ThreadCacheMissInEventQueue(X86Thread *self)
{
	X86Cpu *cpu = self->cpu;
	X86Core *core = self->core;

	struct linked_list_t *event_queue = core->event_queue;
	struct x86_uop_t *uop;

	LINKED_LIST_FOR_EACH(event_queue)
	{
		/* Only consider loads belonging to this thread */
		uop = linked_list_get(event_queue);
		if (uop->thread != self || uop->uinst->opcode != x86_uinst_load)
			continue;

		/* A load outstanding for more than 5 cycles is assumed to
		 * have missed in the cache */
		if (asTiming(cpu)->cycle - uop->issue_when > 5)
			return 1;
	}
	return 0;
}
int X86ThreadLongLatencyInEventQueue(X86Thread *self)
{
	X86Cpu *cpu = self->cpu;
	X86Core *core = self->core;

	struct linked_list_t *event_queue = core->event_queue;
	struct x86_uop_t *uop;

	LINKED_LIST_FOR_EACH(event_queue)
	{
		/* Only consider uops belonging to this thread */
		uop = linked_list_get(event_queue);
		if (uop->thread != self)
			continue;

		/* A uop outstanding for more than 20 cycles is considered
		 * a long-latency operation */
		if (asTiming(cpu)->cycle - uop->issue_when > 20)
			return 1;
	}
	return 0;
}
void si_cu_interval_update(struct si_compute_unit_t *compute_unit)
{
	/* At the end of each profiling interval, dump the spatial report
	 * and reset the counters */
	compute_unit->interval_cycle++;
	if (!(asTiming(si_gpu)->cycle % spatial_profiling_interval))
	{
		si_cu_spatial_report_dump(compute_unit);

		/* This counter is not reset since memory accesses could still
		 * be in flight in the hierarchy:
		 * compute_unit->inflight_mem_accesses = 0; */

		compute_unit->interval_cycle = 0;
		compute_unit->interval_mapped_work_groups = 0;
		compute_unit->interval_unmapped_work_groups = 0;
		compute_unit->interval_alu_issued = 0;
		compute_unit->interval_lds_issued = 0;
	}
}
static struct evg_wavefront_t *evg_schedule_greedy(
	struct evg_compute_unit_t *compute_unit)
{
	struct evg_wavefront_t *wavefront, *temp_wavefront;
	struct linked_list_t *wavefront_pool = compute_unit->wavefront_pool;

	/* Check all candidates */
	temp_wavefront = NULL;
	LINKED_LIST_FOR_EACH(wavefront_pool)
	{
		/* Get wavefront from list */
		wavefront = linked_list_get(wavefront_pool);

		/* Wavefront must be running, and the corresponding slot in
		 * fetch buffer must be free. */
		assert(wavefront->id_in_compute_unit < evg_gpu->wavefronts_per_compute_unit);
		if (!DOUBLE_LINKED_LIST_MEMBER(wavefront->work_group, running, wavefront) ||
			compute_unit->cf_engine.fetch_buffer[wavefront->id_in_compute_unit])
			continue;

		/* Select current wavefront temporarily */
		if (!temp_wavefront || temp_wavefront->sched_when < wavefront->sched_when)
			temp_wavefront = wavefront;
	}

	/* No wavefront found */
	wavefront = NULL;
	if (!temp_wavefront)
		return NULL;

	/* Wavefront found, remove from pool and return. */
	assert(temp_wavefront->clause_kind == EVG_CLAUSE_CF);
	linked_list_find(wavefront_pool, temp_wavefront);
	assert(!wavefront_pool->error_code);
	linked_list_remove(wavefront_pool);
	temp_wavefront->sched_when = asTiming(evg_gpu)->cycle;

	return temp_wavefront;
}
void frm_vector_mem_mem(struct frm_vector_mem_unit_t *vector_mem)
{
	struct frm_uop_t *uop;
	struct frm_thread_uop_t *thread_uop;
	struct frm_thread_t *thread;
	int thread_id;
	int instructions_processed = 0;
	int list_entries;
	int i;
	enum mod_access_kind_t access_kind;
	int list_index = 0;

	list_entries = list_count(vector_mem->read_buffer);

	/* Sanity check the read buffer */
	assert(list_entries <= frm_gpu_vector_mem_read_buffer_size);

	for (i = 0; i < list_entries; i++)
	{
		uop = list_get(vector_mem->read_buffer, list_index);
		assert(uop);

		instructions_processed++;

		/* Uop is not ready yet */
		if (asTiming(frm_gpu)->cycle < uop->read_ready)
		{
			list_index++;
			continue;
		}

		/* Stall if the width has been reached */
		if (instructions_processed > frm_gpu_vector_mem_width)
		{
			frm_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld "
				"stg=\"s\"\n", uop->id_in_sm,
				vector_mem->sm->id, uop->warp->id,
				uop->id_in_warp);
			list_index++;
			continue;
		}

		/* Sanity check the mem buffer */
		assert(list_count(vector_mem->mem_buffer) <=
			frm_gpu_vector_mem_max_inflight_mem_accesses);

		/* Stall if there is no room in the memory buffer */
		if (list_count(vector_mem->mem_buffer) ==
			frm_gpu_vector_mem_max_inflight_mem_accesses)
		{
			frm_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld "
				"stg=\"s\"\n", uop->id_in_sm,
				vector_mem->sm->id, uop->warp->id,
				uop->id_in_warp);
			list_index++;
			continue;
		}

		/* Set the access type */
		if (uop->vector_mem_write && !uop->glc)
			access_kind = mod_access_nc_store;
		else if (uop->vector_mem_write && uop->glc)
			access_kind = mod_access_store;
		else if (uop->vector_mem_read)
			access_kind = mod_access_load;
		else
			fatal("%s: invalid access kind", __FUNCTION__);

		/* Access global memory */
		assert(!uop->global_mem_witness);
		for (thread_id = uop->warp->threads[0]->id_in_warp;
			thread_id < uop->warp->thread_count; thread_id++)
		{
			thread = uop->warp->threads[thread_id];
			thread_uop = &uop->thread_uop[thread->id_in_warp];

			mod_access(vector_mem->sm->global_memory, access_kind,
				thread_uop->global_mem_access_addr,
				&uop->global_mem_witness, NULL, NULL, NULL);
			uop->global_mem_witness--;
		}

		if (frm_spatial_report_active)
		{
			if (uop->vector_mem_write)
			{
				uop->num_global_mem_write +=
					uop->global_mem_witness;
				frm_report_global_mem_inflight(uop->sm,
					uop->num_global_mem_write);
			}
			else if (uop->vector_mem_read)
			{
				uop->num_global_mem_read +=
					uop->global_mem_witness;
				frm_report_global_mem_inflight(uop->sm,
					uop->num_global_mem_read);
			}
			else
				fatal("%s: invalid access kind", __FUNCTION__);
		}

		/* Increment outstanding memory access count */
		uop->warp_inst_queue_entry->lgkm_cnt++;

		/* Transfer the uop to the mem buffer */
		list_remove(vector_mem->read_buffer, uop);
		list_enqueue(vector_mem->mem_buffer, uop);

		frm_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld "
			"stg=\"mem-m\"\n", uop->id_in_sm,
			vector_mem->sm->id, uop->warp->id, uop->id_in_warp);
	}
}
void frm_lds_mem(struct frm_lds_t *lds)
{
	struct frm_uop_t *uop;
	struct frm_thread_uop_t *thread_uop;
	struct frm_thread_t *thread;
	int thread_id;
	int instructions_processed = 0;
	int list_entries;
	int i, j;
	enum mod_access_kind_t access_type;
	int list_index = 0;

	list_entries = list_count(lds->read_buffer);

	/* Sanity check the read buffer */
	assert(list_entries <= frm_gpu_lds_read_buffer_size);

	for (i = 0; i < list_entries; i++)
	{
		uop = list_get(lds->read_buffer, list_index);
		assert(uop);

		instructions_processed++;

		/* Uop is not ready yet */
		if (asTiming(frm_gpu)->cycle < uop->read_ready)
		{
			list_index++;
			continue;
		}

		/* Stall if the width has been reached */
		if (instructions_processed > frm_gpu_lds_width)
		{
			frm_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld "
				"stg=\"s\"\n", uop->id_in_sm, lds->sm->id,
				uop->warp->id, uop->id_in_warp);
			list_index++;
			continue;
		}

		/* Sanity check uop */
		assert(uop->lds_read || uop->lds_write);

		/* Sanity check mem buffer */
		assert(list_count(lds->mem_buffer) <=
			frm_gpu_lds_max_inflight_mem_accesses);

		/* Stall if there is no room in the memory buffer */
		if (list_count(lds->mem_buffer) ==
			frm_gpu_lds_max_inflight_mem_accesses)
		{
			frm_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld "
				"stg=\"s\"\n", uop->id_in_sm, lds->sm->id,
				uop->warp->id, uop->id_in_warp);
			list_index++;
			continue;
		}

		/* Access local memory */
		for (thread_id = uop->warp->threads[0]->id_in_warp;
			thread_id < uop->warp->thread_count; thread_id++)
		{
			thread = uop->warp->threads[thread_id];
			thread_uop = &uop->thread_uop[thread->id_in_warp];

			for (j = 0; j < thread_uop->lds_access_count; j++)
			{
				if (thread->lds_access_type[j] == 1)
					access_type = mod_access_load;
				else if (thread->lds_access_type[j] == 2)
					access_type = mod_access_store;
				else
					fatal("%s: invalid lds access type",
						__FUNCTION__);

				mod_access(lds->sm->lds_module, access_type,
					thread_uop->lds_access_addr[j],
					&uop->lds_witness, NULL, NULL, NULL);
				uop->lds_witness--;
			}
		}

		/* Increment outstanding memory access count */
		uop->warp_inst_queue_entry->lgkm_cnt++;

		/* Transfer the uop to the mem buffer */
		list_remove(lds->read_buffer, uop);
		list_enqueue(lds->mem_buffer, uop);

		frm_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld "
			"stg=\"lds-m\"\n", uop->id_in_sm, lds->sm->id,
			uop->warp->id, uop->id_in_warp);
	}
}
void si_simd_execute(struct si_simd_t *simd)
{
	struct si_uop_t *uop;
	int list_entries;
	int list_index = 0;
	int instructions_processed = 0;
	int i;

	list_entries = list_count(simd->decode_buffer);

	/* Sanity check the decode buffer */
	assert(list_entries <= si_gpu_simd_decode_buffer_size);

	for (i = 0; i < list_entries; i++)
	{
		uop = list_get(simd->decode_buffer, list_index);
		assert(uop);

		instructions_processed++;

		/* Uop is not ready yet */
		if (asTiming(si_gpu)->cycle < uop->decode_ready)
		{
			list_index++;
			continue;
		}

		/* Stall if the width has been reached */
		if (instructions_processed > si_gpu_simd_width)
		{
			si_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld "
				"stg=\"s\"\n", uop->id_in_compute_unit,
				simd->compute_unit->id, uop->wavefront->id,
				uop->id_in_wavefront);
			list_index++;
			continue;
		}

		/* Sanity check exec buffer */
		assert(list_count(simd->exec_buffer) <=
			si_gpu_simd_exec_buffer_size);

		/* Stall if SIMD unit is full */
		if (list_count(simd->exec_buffer) == si_gpu_simd_exec_buffer_size)
		{
			si_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld "
				"stg=\"s\"\n", uop->id_in_compute_unit,
				simd->compute_unit->id, uop->wavefront->id,
				uop->id_in_wavefront);
			list_index++;
			continue;
		}

		/* Includes time for pipelined read-exec-write of
		 * all subwavefronts */
		uop->execute_ready = asTiming(si_gpu)->cycle +
			si_gpu_simd_exec_latency;

		/* Transfer the uop to the outstanding execution buffer */
		list_remove(simd->decode_buffer, uop);
		list_enqueue(simd->exec_buffer, uop);

		uop->wavefront_pool_entry->ready_next_cycle = 1;

		si_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld "
			"stg=\"simd-e\"\n", uop->id_in_compute_unit,
			simd->compute_unit->id, uop->wavefront->id,
			uop->id_in_wavefront);
	}
}
void frm_vector_mem_write(struct frm_vector_mem_unit_t *vector_mem)
{
	struct frm_uop_t *uop;
	int instructions_processed = 0;
	int list_entries;
	int list_index = 0;
	int i;

	list_entries = list_count(vector_mem->mem_buffer);

	/* Sanity check the mem buffer */
	assert(list_entries <= frm_gpu_vector_mem_max_inflight_mem_accesses);

	for (i = 0; i < list_entries; i++)
	{
		uop = list_get(vector_mem->mem_buffer, list_index);
		assert(uop);

		instructions_processed++;

		/* Uop is not ready yet */
		if (uop->global_mem_witness)
		{
			list_index++;
			continue;
		}

		/* Stall if the width has been reached */
		if (instructions_processed > frm_gpu_vector_mem_width)
		{
			frm_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld "
				"stg=\"s\"\n", uop->id_in_sm,
				vector_mem->sm->id, uop->warp->id,
				uop->id_in_warp);
			list_index++;
			continue;
		}

		/* Sanity check the write buffer */
		assert(list_count(vector_mem->write_buffer) <=
			frm_gpu_vector_mem_write_buffer_size);

		/* Stall if the write buffer is full */
		if (list_count(vector_mem->write_buffer) ==
			frm_gpu_vector_mem_write_buffer_size)
		{
			frm_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld "
				"stg=\"s\"\n", uop->id_in_sm,
				vector_mem->sm->id, uop->warp->id,
				uop->id_in_warp);
			list_index++;
			continue;
		}

		/* Access complete, remove the uop from the queue. Here
		 * "access" means every mod_access call issued for this uop
		 * in frm_vector_mem_mem, i.e., all of its in-flight memory
		 * accesses have finished. */
		uop->write_ready = asTiming(frm_gpu)->cycle +
			frm_gpu_vector_mem_write_latency;

		if (frm_spatial_report_active)
		{
			if (uop->vector_mem_write)
			{
				frm_report_global_mem_finish(uop->sm,
					uop->num_global_mem_write);
			}
			else if (uop->vector_mem_read)
			{
				frm_report_global_mem_finish(uop->sm,
					uop->num_global_mem_read);
			}
			else
			{
				fatal("%s: invalid access kind", __FUNCTION__);
			}
		}

		list_remove(vector_mem->mem_buffer, uop);
		list_enqueue(vector_mem->write_buffer, uop);

		frm_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld "
			"stg=\"mem-w\"\n", uop->id_in_sm,
			vector_mem->sm->id, uop->warp->id, uop->id_in_warp);
	}
}
void si_simd_decode(struct si_simd_t *simd)
{
	struct si_uop_t *uop;
	int instructions_processed = 0;
	int list_entries;
	int list_index = 0;
	int i;

	list_entries = list_count(simd->issue_buffer);

	/* Sanity check the issue buffer */
	assert(list_entries <= si_gpu_simd_issue_buffer_size);

	for (i = 0; i < list_entries; i++)
	{
		uop = list_get(simd->issue_buffer, list_index);
		assert(uop);

		instructions_processed++;

		/* Uop not ready yet */
		if (asTiming(si_gpu)->cycle < uop->issue_ready)
		{
			list_index++;
			continue;
		}

		/* Stall if the issue width has been reached */
		if (instructions_processed > si_gpu_simd_width)
		{
			si_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld "
				"stg=\"s\"\n", uop->id_in_compute_unit,
				simd->compute_unit->id, uop->wavefront->id,
				uop->id_in_wavefront);
			list_index++;
			continue;
		}

		/* Sanity check the decode buffer */
		assert(list_count(simd->decode_buffer) <=
			si_gpu_simd_decode_buffer_size);

		/* Stall if the decode buffer is full */
		if (list_count(simd->decode_buffer) ==
			si_gpu_simd_decode_buffer_size)
		{
			si_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld "
				"stg=\"s\"\n", uop->id_in_compute_unit,
				simd->compute_unit->id, uop->wavefront->id,
				uop->id_in_wavefront);
			list_index++;
			continue;
		}

		uop->decode_ready = asTiming(si_gpu)->cycle +
			si_gpu_simd_decode_latency;
		list_remove(simd->issue_buffer, uop);
		list_enqueue(simd->decode_buffer, uop);

		if (si_spatial_report_active)
			si_alu_report_new_inst(simd->compute_unit);

		si_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld "
			"stg=\"simd-d\"\n", uop->id_in_compute_unit,
			simd->compute_unit->id, uop->wavefront->id,
			uop->id_in_wavefront);
	}
}
static int X86ThreadIssueLQ(X86Thread *self, int quant)
{
	X86Core *core = self->core;
	X86Cpu *cpu = self->cpu;

	struct linked_list_t *lq = self->lq;
	struct x86_uop_t *load;
	struct mod_client_info_t *client_info;

	/* Process lq */
	linked_list_head(lq);
	while (!linked_list_is_end(lq) && quant)
	{
		/* Get element from load queue. If it is not ready, go to the next one */
		load = linked_list_get(lq);
		if (!load->ready && !X86ThreadIsUopReady(self, load))
		{
			linked_list_next(lq);
			continue;
		}
		load->ready = 1;

		/* Check that memory system is accessible */
		if (!mod_can_access(self->data_mod, load->phy_addr))
		{
			linked_list_next(lq);
			continue;
		}

		/* Remove from load queue */
		assert(load->uinst->opcode == x86_uinst_load);
		X86ThreadRemoveFromLQ(self);

		/* Create and fill the mod_client_info_t object */
		client_info = mod_client_info_create(self->data_mod);
		client_info->prefetcher_eip = load->eip;

		/* Access memory system */
		mod_access(self->data_mod, mod_access_load,
			load->phy_addr, NULL, core->event_queue, load, client_info);

		/* The cache system will place the load at the head of the
		 * event queue when it is ready. For now, mark "in_event_queue" to
		 * prevent the uop from being freed. */
		load->in_event_queue = 1;
		load->issued = 1;
		load->issue_when = asTiming(cpu)->cycle;

		/* Statistics */
		core->num_issued_uinst_array[load->uinst->opcode]++;
		core->lsq_reads++;
		core->reg_file_int_reads += load->ph_int_idep_count;
		core->reg_file_fp_reads += load->ph_fp_idep_count;
		self->num_issued_uinst_array[load->uinst->opcode]++;
		self->lsq_reads++;
		self->reg_file_int_reads += load->ph_int_idep_count;
		self->reg_file_fp_reads += load->ph_fp_idep_count;
		cpu->num_issued_uinst_array[load->uinst->opcode]++;
		if (load->trace_cache)
			self->trace_cache->num_issued_uinst++;

		/* One more instruction issued, update quantum. */
		quant--;

		/* MMU statistics */
		MMUAccessPage(cpu->mmu, load->phy_addr, mmu_access_read);

		/* Trace */
		x86_trace("x86.inst id=%lld core=%d stg=\"i\"\n",
			load->id_in_core, core->id);
	}
	return quant;
}
static int X86ThreadIssuePreQ(X86Thread *self, int quantum)
{
	X86Core *core = self->core;
	X86Cpu *cpu = self->cpu;

	struct linked_list_t *preq = self->preq;
	struct x86_uop_t *prefetch;

	/* Process preq */
	linked_list_head(preq);
	while (!linked_list_is_end(preq) && quantum)
	{
		/* Get element from prefetch queue. If it is not ready, go to the next one */
		prefetch = linked_list_get(preq);
		if (!prefetch->ready && !X86ThreadIsUopReady(self, prefetch))
		{
			linked_list_next(preq);
			continue;
		}

		/* Make sure the address has not been prefetched recently. This is
		 * just to avoid unnecessary memory traffic. Even though the cache
		 * will see a "hit" on redundant prefetches, it is still helpful to
		 * avoid going to the memory (cache). */
		if (prefetch_history_is_redundant(core->prefetch_history,
			self->data_mod, prefetch->phy_addr))
		{
			/* Remove from queue. Do not prefetch. */
			assert(prefetch->uinst->opcode == x86_uinst_prefetch);
			X86ThreadRemovePreQ(self);
			prefetch->completed = 1;
			x86_uop_free_if_not_queued(prefetch);
			continue;
		}

		prefetch->ready = 1;

		/* Check that memory system is accessible */
		if (!mod_can_access(self->data_mod, prefetch->phy_addr))
		{
			linked_list_next(preq);
			continue;
		}

		/* Remove from prefetch queue */
		assert(prefetch->uinst->opcode == x86_uinst_prefetch);
		X86ThreadRemovePreQ(self);

		/* Access memory system */
		mod_access(self->data_mod, mod_access_prefetch,
			prefetch->phy_addr, NULL, core->event_queue, prefetch, NULL);

		/* Record prefetched address */
		prefetch_history_record(core->prefetch_history, prefetch->phy_addr);

		/* The cache system will place the prefetch at the head of the
		 * event queue when it is ready. For now, mark "in_event_queue" to
		 * prevent the uop from being freed. */
		prefetch->in_event_queue = 1;
		prefetch->issued = 1;
		prefetch->issue_when = asTiming(cpu)->cycle;

		/* Statistics */
		core->num_issued_uinst_array[prefetch->uinst->opcode]++;
		core->lsq_reads++;
		core->reg_file_int_reads += prefetch->ph_int_idep_count;
		core->reg_file_fp_reads += prefetch->ph_fp_idep_count;
		self->num_issued_uinst_array[prefetch->uinst->opcode]++;
		self->lsq_reads++;
		self->reg_file_int_reads += prefetch->ph_int_idep_count;
		self->reg_file_fp_reads += prefetch->ph_fp_idep_count;
		cpu->num_issued_uinst_array[prefetch->uinst->opcode]++;
		if (prefetch->trace_cache)
			self->trace_cache->num_issued_uinst++;

		/* One more instruction issued, update quantum. */
		quantum--;

		/* MMU statistics */
		MMUAccessPage(cpu->mmu, prefetch->phy_addr, mmu_access_read);

		/* Trace */
		x86_trace("x86.inst id=%lld core=%d stg=\"i\"\n",
			prefetch->id_in_core, core->id);
	}
	return quantum;
}
static int X86ThreadIssueIQ(X86Thread *self, int quant)
{
	X86Cpu *cpu = self->cpu;
	X86Core *core = self->core;

	struct linked_list_t *iq = self->iq;
	struct x86_uop_t *uop;
	int lat;

	/* Find instruction to issue */
	linked_list_head(iq);
	while (!linked_list_is_end(iq) && quant)
	{
		/* Get element from IQ */
		uop = linked_list_get(iq);
		assert(x86_uop_exists(uop));
		assert(!(uop->flags & X86_UINST_MEM));
		if (!uop->ready && !X86ThreadIsUopReady(self, uop))
		{
			linked_list_next(iq);
			continue;
		}
		uop->ready = 1;  /* avoid next call to 'X86ThreadIsUopReady' */

		/* Run the instruction in its corresponding functional unit.
		 * If the instruction does not require a functional unit,
		 * 'X86CoreReserveFunctionalUnit' returns 1 cycle latency. If there
		 * is no functional unit available, 'X86CoreReserveFunctionalUnit'
		 * returns 0. */
		lat = X86CoreReserveFunctionalUnit(core, uop);
		if (!lat)
		{
			linked_list_next(iq);
			continue;
		}

		/* Instruction was issued to the corresponding fu.
		 * Remove it from IQ */
		X86ThreadRemoveFromIQ(self);

		/* Schedule inst in Event Queue */
		assert(!uop->in_event_queue);
		assert(lat > 0);
		uop->issued = 1;
		uop->issue_when = asTiming(cpu)->cycle;
		uop->when = asTiming(cpu)->cycle + lat;
		X86CoreInsertInEventQueue(core, uop);

		/* Statistics */
		core->num_issued_uinst_array[uop->uinst->opcode]++;
		core->iq_reads++;
		core->reg_file_int_reads += uop->ph_int_idep_count;
		core->reg_file_fp_reads += uop->ph_fp_idep_count;
		self->num_issued_uinst_array[uop->uinst->opcode]++;
		self->iq_reads++;
		self->reg_file_int_reads += uop->ph_int_idep_count;
		self->reg_file_fp_reads += uop->ph_fp_idep_count;
		cpu->num_issued_uinst_array[uop->uinst->opcode]++;
		if (uop->trace_cache)
			self->trace_cache->num_issued_uinst++;

		/* One more instruction issued, update quantum. */
		quant--;

		/* Trace */
		x86_trace("x86.inst id=%lld core=%d stg=\"i\"\n",
			uop->id_in_core, core->id);
	}
	return quant;
}