Example #1
File: cpu.c Project: xianggong/multi2sim42
void X86CpuDumpSummary(Timing *self, FILE *f) {
  X86Cpu *cpu = asX86Cpu(self);

  double inst_per_cycle;
  double uinst_per_cycle;
  double branch_acc;

  /* Calculate statistics */
  inst_per_cycle = asTiming(cpu)->cycle
                       ? (double)cpu->num_committed_inst / asTiming(cpu)->cycle
                       : 0.0;
  uinst_per_cycle =
      asTiming(cpu)->cycle
          ? (double)cpu->num_committed_uinst / asTiming(cpu)->cycle
          : 0.0;
  branch_acc =
      cpu->num_branch_uinst
          ? (double)(cpu->num_branch_uinst - cpu->num_mispred_branch_uinst) /
                cpu->num_branch_uinst
          : 0.0;

  /* Print statistics */
  fprintf(f, "FastForwardInstructions = %lld\n", cpu->num_fast_forward_inst);
  fprintf(f, "CommittedInstructions = %lld\n", cpu->num_committed_inst);
  fprintf(f, "CommittedInstructionsPerCycle = %.4g\n", inst_per_cycle);
  fprintf(f, "CommittedMicroInstructions = %lld\n", cpu->num_committed_uinst);
  fprintf(f, "CommittedMicroInstructionsPerCycle = %.4g\n", uinst_per_cycle);
  fprintf(f, "BranchPredictionAccuracy = %.4g\n", branch_acc);

  /* Call parent */
  TimingDumpSummary(self, f);
}
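Example #1 leans on Multi2Sim's C-style class system: asX86Cpu(), asTiming() and asObject() convert between a subclass pointer and its parents, and DumpSummary/Run are function pointers stored in the parent. The following is a minimal sketch of that idiom, assuming the parent struct is embedded as the first member of the child; DemoTiming, DemoCpu and asDemoTiming are illustrative names, not the actual Multi2Sim classes or macros.

#include <stdio.h>

/* Hypothetical sketch, not Multi2Sim code: embedding the parent as the
 * first member makes the upcast a plain pointer cast, and the "virtual"
 * DumpSummary is just a function pointer living in the parent. */
typedef struct DemoTiming DemoTiming;
struct DemoTiming
{
	long long cycle;
	void (*DumpSummary)(DemoTiming *self, FILE *f);
};

typedef struct
{
	DemoTiming timing;              /* parent must come first */
	long long num_committed_inst;
} DemoCpu;

#define asDemoTiming(p)  ((DemoTiming *) (p))

static void DemoCpuDumpSummary(DemoTiming *self, FILE *f)
{
	/* Downcast, playing the role of asX86Cpu() in example #1 */
	DemoCpu *cpu = (DemoCpu *) self;

	fprintf(f, "CommittedInstructions = %lld\n", cpu->num_committed_inst);
	fprintf(f, "Cycles = %lld\n", asDemoTiming(cpu)->cycle);
}

Wiring the handler, as the Create functions further down do, would then be asDemoTiming(cpu)->DumpSummary = DemoCpuDumpSummary;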
Example #2
File: cpu.c Project: xianggong/multi2sim42
void ARMCpuCreate(ARMCpu *self) {
    /* Parent */
    TimingCreate(asTiming(self));

    /* Virtual functions */
    asObject(self)->Dump = ARMCpuDump;
    asTiming(self)->DumpSummary = ARMCpuDumpSummary;
    asTiming(self)->Run = ARMCpuRun;
}
Example #3
void frm_lds_decode(struct frm_lds_t *lds) {
  struct frm_uop_t *uop;
  int instructions_processed = 0;
  int list_entries;
  int list_index = 0;
  int i;

  list_entries = list_count(lds->issue_buffer);

  /* Sanity check the issue buffer */
  assert(list_entries <= frm_gpu_lds_issue_buffer_size);

  for (i = 0; i < list_entries; i++) {
    uop = list_get(lds->issue_buffer, list_index);
    assert(uop);

    instructions_processed++;

    /* Uop not ready yet */
    if (asTiming(frm_gpu)->cycle < uop->issue_ready) {
      list_index++;
      continue;
    }

    /* Stall if the issue width has been reached. */
    if (instructions_processed > frm_gpu_lds_width) {
      frm_trace(
          "si.inst id=%lld cu=%d wf=%d uop_id=%lld "
          "stg=\"s\"\n",
          uop->id_in_sm, lds->sm->id, uop->warp->id, uop->id_in_warp);
      list_index++;
      continue;
    }

    /* Sanity check the decode buffer */
    assert(list_count(lds->decode_buffer) <= frm_gpu_lds_decode_buffer_size);

    /* Stall if the decode buffer is full. */
    if (list_count(lds->decode_buffer) == frm_gpu_lds_decode_buffer_size) {
      frm_trace(
          "si.inst id=%lld cu=%d wf=%d uop_id=%lld "
          "stg=\"s\"\n",
          uop->id_in_sm, lds->sm->id, uop->warp->id, uop->id_in_warp);
      list_index++;
      continue;
    }

    uop->decode_ready = asTiming(frm_gpu)->cycle + frm_gpu_lds_decode_latency;
    list_remove(lds->issue_buffer, uop);
    list_enqueue(lds->decode_buffer, uop);

    frm_trace(
        "si.inst id=%lld cu=%d wf=%d uop_id=%lld "
        "stg=\"lds-d\"\n",
        uop->id_in_sm, lds->sm->id, uop->warp->id, uop->id_in_warp);
  }
}
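frm_lds_decode, like the other decode/execute/mem stage functions further down, follows one loop shape: walk the input buffer, leave uops that are not ready or cannot advance, and move the rest to the next buffer with an updated ready cycle. The sketch below is a distilled, hypothetical version of that loop; it assumes Multi2Sim's generic list API (list_count, list_get, list_remove, list_enqueue), and demo_uop_t with its single ready field is a made-up stand-in for the real frm_uop_t.

/* Hypothetical distilled stage loop, not actual Multi2Sim code.
 * Assumes struct list_t and the list_*() functions used throughout
 * these examples. */
struct demo_uop_t
{
	long long ready;   /* cycle at which the uop may leave this stage */
};

void demo_stage_advance(struct list_t *stage_in, struct list_t *stage_out,
	int width, int out_buffer_size, int latency, long long now)
{
	struct demo_uop_t *uop;
	int list_entries = list_count(stage_in);
	int list_index = 0;
	int processed = 0;
	int i;

	for (i = 0; i < list_entries; i++)
	{
		uop = list_get(stage_in, list_index);
		processed++;

		/* Uop not ready yet: keep it in place, look at the next one */
		if (now < uop->ready)
		{
			list_index++;
			continue;
		}

		/* Stall on the issue width or a full output buffer */
		if (processed > width ||
			list_count(stage_out) == out_buffer_size)
		{
			list_index++;
			continue;
		}

		/* Advance the uop to the next stage */
		uop->ready = now + latency;
		list_remove(stage_in, uop);
		list_enqueue(stage_out, uop);
	}
}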
Example #4
void EvgGpuDumpSummary(Timing *self, FILE *f)
{
	double inst_per_cycle;

	/* Call parent */
	TimingDumpSummary(asTiming(self), f);

	/* Additional statistics */
	inst_per_cycle = asTiming(evg_gpu)->cycle ?
			(double) asEmu(evg_emu)->instructions
			/ asTiming(evg_gpu)->cycle : 0.0;
	fprintf(f, "IPC = %.4g\n", inst_per_cycle);
}
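Like the CPU summary in example #1, this GPU summary guards the per-cycle ratio against a zero cycle count. A one-line hypothetical helper (safe_per_cycle is not a Multi2Sim function) captures the pattern:

#include <stdio.h>

/* Hypothetical helper: return 0.0 instead of dividing by zero when the
 * simulation has not advanced a single cycle yet. */
static double safe_per_cycle(long long count, long long cycles)
{
	return cycles ? (double) count / cycles : 0.0;
}

/* Usage, mirroring the fprintf above:
 *   fprintf(f, "IPC = %.4g\n", safe_per_cycle(instructions, cycle)); */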
Example #5
void EvgGpuCreate(EvgGpu *self)
{
	struct evg_compute_unit_t *compute_unit;
	int compute_unit_id;

	/* Parent */
	TimingCreate(asTiming(self));

	/* Frequency */
	asTiming(self)->frequency = evg_gpu_frequency;
	asTiming(self)->frequency_domain = esim_new_domain(evg_gpu_frequency);

	/* Initialize */
	self->trash_uop_list = linked_list_create();
	self->compute_units = xcalloc(evg_gpu_num_compute_units, sizeof(void *));
	EVG_GPU_FOREACH_COMPUTE_UNIT(compute_unit_id)
	{
		self->compute_units[compute_unit_id] = evg_compute_unit_create();
		compute_unit = self->compute_units[compute_unit_id];
		compute_unit->id = compute_unit_id;
		DOUBLE_LINKED_LIST_INSERT_TAIL(self, ready, compute_unit);
	}

	/* Virtual functions */
	asObject(self)->Dump = EvgGpuDump;
	asTiming(self)->DumpSummary = EvgGpuDumpSummary;
	asTiming(self)->Run = EvgGpuRun;
	asTiming(self)->MemConfigCheck = EvgGpuMemConfigCheck;
	asTiming(self)->MemConfigDefault = EvgGpuMemConfigDefault;
	asTiming(self)->MemConfigParseEntry = EvgGpuMemConfigParseEntry;
}
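EvgGpuCreate fills the parent's function-pointer slots (DumpSummary, Run, the MemConfig* hooks) so that generic driver code never needs to know which timing model it is talking to. A hypothetical call site illustrates the payoff; the DemoTiming layout matches the sketch after example #1 and is an assumption, not the real Timing class.

#include <stdio.h>

/* Hypothetical driver, not Multi2Sim code: once each Create function has
 * installed DumpSummary, the caller can dispatch through the parent
 * pointer alone. */
typedef struct DemoTiming DemoTiming;
struct DemoTiming
{
	void (*DumpSummary)(DemoTiming *self, FILE *f);
};

static void demo_dump_all_summaries(DemoTiming **models, int count, FILE *f)
{
	int i;

	for (i = 0; i < count; i++)
		models[i]->DumpSummary(models[i], f);
}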
Example #6
void frm_sm_update_fetch_visualization(
		struct frm_sm_t *sm, int non_active_fb)
{
	struct frm_uop_t *uop;
	int list_entries;
	int i;

	/* Update visualization states for all instructions not issued */
	list_entries = list_count(sm->fetch_buffers[non_active_fb]);
	for (i = 0; i < list_entries; i++)
	{
		uop = list_get(sm->fetch_buffers[non_active_fb], i);
		assert(uop);

		/* Skip all uops that have not yet completed the fetch */
		if (asTiming(frm_gpu)->cycle < uop->fetch_ready)
		{
			continue;
		}

		frm_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld stg=\"s\"\n", 
				uop->id_in_sm, sm->id, 
				uop->warp->id, uop->id_in_warp);
	}
}
Example #7
static int X86ThreadIssueSQ(X86Thread *self, int quantum) {
  X86Cpu *cpu = self->cpu;
  X86Core *core = self->core;

  struct x86_uop_t *store;
  struct linked_list_t *sq = self->sq;
  struct mod_client_info_t *client_info;

  /* Process SQ */
  linked_list_head(sq);
  while (!linked_list_is_end(sq) && quantum) {
    /* Get store */
    store = linked_list_get(sq);
    assert(store->uinst->opcode == x86_uinst_store);

    /* Only committed stores issue */
    if (store->in_rob) break;

    /* Check that memory system entry is ready */
    if (!mod_can_access(self->data_mod, store->phy_addr)) break;

    /* Remove store from store queue */
    X86ThreadRemoveFromSQ(self);

    /* create and fill the mod_client_info_t object */
    client_info = mod_client_info_create(self->data_mod);
    client_info->prefetcher_eip = store->eip;

    /* Issue store */
    mod_access(self->data_mod, mod_access_store, store->phy_addr, NULL,
               core->event_queue, store, client_info);

    /* The cache system will place the store at the head of the
     * event queue when it is ready. For now, mark "in_event_queue" to
     * prevent the uop from being freed. */
    store->in_event_queue = 1;
    store->issued = 1;
    store->issue_when = asTiming(cpu)->cycle;

    /* Statistics */
    core->num_issued_uinst_array[store->uinst->opcode]++;
    core->lsq_reads++;
    core->reg_file_int_reads += store->ph_int_idep_count;
    core->reg_file_fp_reads += store->ph_fp_idep_count;
    self->num_issued_uinst_array[store->uinst->opcode]++;
    self->lsq_reads++;
    self->reg_file_int_reads += store->ph_int_idep_count;
    self->reg_file_fp_reads += store->ph_fp_idep_count;
    cpu->num_issued_uinst_array[store->uinst->opcode]++;
    if (store->trace_cache) self->trace_cache->num_issued_uinst++;

    /* One more instruction, update quantum. */
    quantum--;

    /* MMU statistics */
    if (*mmu_report_file_name)
      mmu_access_page(store->phy_addr, mmu_access_write);
  }
  return quantum;
}
Example #8
static int X86ThreadCanFetch(X86Thread *self) {
	X86Cpu *cpu = self->cpu;
	X86Context *ctx = self->ctx;

	unsigned int phy_addr;
	unsigned int block;

	/* Context must be running */
	if (!ctx || !X86ContextGetState(ctx, X86ContextRunning))
		return 0;

	/* Fetch stalled or context evict signal activated */
	if (self->fetch_stall_until >= asTiming(cpu)->cycle || ctx->evict_signal)
		return 0;

	/* Fetch queue must have not exceeded the limit of stored bytes
	 * to be able to store new macro-instructions. */
	if (self->fetchq_occ >= x86_fetch_queue_size)
		return 0;

	/* If the next fetch address belongs to a new block, cache system
	 * must be accessible to read it. */
	block = self->fetch_neip & ~(self->inst_mod->block_size - 1);

	if (block != self->fetch_block) {
		phy_addr = mmu_translate(self->ctx->address_space_index,
				self->fetch_neip);
		if (!mod_can_access(self->inst_mod, phy_addr))
			return 0;
	}

	/* We can fetch */
	return 1;
}
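The mask in X86ThreadCanFetch, fetch_neip & ~(block_size - 1), rounds the next fetch address down to the base of its cache block; the trick requires the block size to be a power of two. A small, self-contained illustration (block_base is a made-up name):

/* Hypothetical helper, not Multi2Sim code: align an address down to its
 * cache-block base. Only valid when block_size is a power of two. */
static unsigned int block_base(unsigned int addr, unsigned int block_size)
{
	return addr & ~(block_size - 1);
}

/* Example: block_base(0x1234, 64) == 0x1200, since the low 6 bits are
 * cleared. A different block base therefore means a potential new
 * instruction-cache access. */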
Example #9
void frm_lds_write(struct frm_lds_t *lds) {
  struct frm_uop_t *uop;
  int instructions_processed = 0;
  int list_entries;
  int list_index = 0;
  int i;

  list_entries = list_count(lds->mem_buffer);

  /* Sanity check the mem buffer */
  assert(list_entries <= frm_gpu_lds_max_inflight_mem_accesses);

  for (i = 0; i < list_entries; i++) {
    uop = list_get(lds->mem_buffer, list_index);
    assert(uop);

    instructions_processed++;

    /* Uop is not ready yet */
    if (uop->lds_witness) {
      list_index++;
      continue;
    }

    /* Stall if the width has been reached */
    if (instructions_processed > frm_gpu_lds_width) {
      frm_trace(
          "si.inst id=%lld cu=%d wf=%d uop_id=%lld "
          "stg=\"s\"\n",
          uop->id_in_sm, lds->sm->id, uop->warp->id, uop->id_in_warp);
      list_index++;
      continue;
    }

    /* Sanity check the write buffer */
    assert(list_count(lds->write_buffer) <= frm_gpu_lds_write_buffer_size);

    /* Stop if the write buffer is full */
    if (list_count(lds->write_buffer) >= frm_gpu_lds_write_buffer_size) {
      frm_trace(
          "si.inst id=%lld cu=%d wf=%d uop_id=%lld "
          "stg=\"s\"\n",
          uop->id_in_sm, lds->sm->id, uop->warp->id, uop->id_in_warp);
      list_index++;
      continue;
    }

    /* Access complete, remove the uop from the queue */
    uop->write_ready = asTiming(frm_gpu)->cycle + frm_gpu_lds_write_latency;
    list_remove(lds->mem_buffer, uop);
    list_enqueue(lds->write_buffer, uop);

    instructions_processed++;

    frm_trace(
        "si.inst id=%lld cu=%d wf=%d uop_id=%lld "
        "stg=\"lds-w\"\n",
        uop->id_in_sm, lds->sm->id, uop->warp->id, uop->id_in_warp);
  }
}
Example #10
File: cpu.c Project: xianggong/multi2sim42
void X86CpuDumpUopReport(X86Cpu *self, FILE *f, long long *uop_stats,
                         char *prefix, int peak_ipc) {
  long long uinst_int_count = 0;
  long long uinst_logic_count = 0;
  long long uinst_fp_count = 0;
  long long uinst_mem_count = 0;
  long long uinst_ctrl_count = 0;
  long long uinst_total = 0;

  char *name;
  enum x86_uinst_flag_t flags;
  int i;

  for (i = 0; i < x86_uinst_opcode_count; i++) {
    name = x86_uinst_info[i].name;
    flags = x86_uinst_info[i].flags;

    fprintf(f, "%s.Uop.%s = %lld\n", prefix, name, uop_stats[i]);
    if (flags & X86_UINST_INT) uinst_int_count += uop_stats[i];
    if (flags & X86_UINST_LOGIC) uinst_logic_count += uop_stats[i];
    if (flags & X86_UINST_FP) uinst_fp_count += uop_stats[i];
    if (flags & X86_UINST_MEM) uinst_mem_count += uop_stats[i];
    if (flags & X86_UINST_CTRL) uinst_ctrl_count += uop_stats[i];
    uinst_total += uop_stats[i];
  }
  fprintf(f, "%s.Integer = %lld\n", prefix, uinst_int_count);
  fprintf(f, "%s.Logic = %lld\n", prefix, uinst_logic_count);
  fprintf(f, "%s.FloatingPoint = %lld\n", prefix, uinst_fp_count);
  fprintf(f, "%s.Memory = %lld\n", prefix, uinst_mem_count);
  fprintf(f, "%s.Ctrl = %lld\n", prefix, uinst_ctrl_count);
  fprintf(f, "%s.WndSwitch = %lld\n", prefix,
          uop_stats[x86_uinst_call] + uop_stats[x86_uinst_ret]);
  fprintf(f, "%s.Total = %lld\n", prefix, uinst_total);
  fprintf(f, "%s.IPC = %.4g\n", prefix,
          asTiming(self)->cycle ? (double)uinst_total / asTiming(self)->cycle
                                : 0.0);
  fprintf(f, "%s.DutyCycle = %.4g\n", prefix,
          asTiming(self)->cycle && peak_ipc
              ? (double)uinst_total / asTiming(self)->cycle / peak_ipc
              : 0.0);
  fprintf(f, "\n");
}
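As a hypothetical sanity check of the last two statistics: a run that committed 8,000,000 micro-instructions over 4,000,000 cycles with peak_ipc = 4 would report Total = 8000000, IPC = 2, and DutyCycle = 0.5, matching the two guarded divisions in the fprintf calls above.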
Example #11
File: cpu.c Project: xianggong/multi2sim42
void X86CpuCreate(X86Cpu *self, X86Emu *emu) {
  X86Core *core;
  X86Thread *thread;

  char name[MAX_STRING_SIZE];

  int i;
  int j;

  /* Parent */
  TimingCreate(asTiming(self));

  /* Frequency */
  asTiming(self)->frequency = x86_cpu_frequency;
  asTiming(self)->frequency_domain = esim_new_domain(x86_cpu_frequency);

  /* Initialize */
  self->emu = emu;
  self->uop_trace_list = linked_list_create();

  /* Create cores */
  self->cores = xcalloc(x86_cpu_num_cores, sizeof(X86Core *));
  for (i = 0; i < x86_cpu_num_cores; i++) self->cores[i] = new (X86Core, self);

  /* Assign names and IDs to cores and threads */
  for (i = 0; i < x86_cpu_num_cores; i++) {
    core = self->cores[i];
    snprintf(name, sizeof name, "c%d", i);
    X86CoreSetName(core, name);
    core->id = i;
    for (j = 0; j < x86_cpu_num_threads; j++) {
      thread = core->threads[j];
      snprintf(name, sizeof name, "c%dt%d", i, j);
      X86ThreadSetName(thread, name);
      thread->id_in_core = j;
      thread->id_in_cpu = i * x86_cpu_num_threads + j;
    }
  }

  /* Virtual functions */
  asObject(self)->Dump = X86CpuDump;
  asTiming(self)->DumpSummary = X86CpuDumpSummary;
  asTiming(self)->Run = X86CpuRun;
  asTiming(self)->MemConfigCheck = X86CpuMemConfigCheck;
  asTiming(self)->MemConfigDefault = X86CpuMemConfigDefault;
  asTiming(self)->MemConfigParseEntry = X86CpuMemConfigParseEntry;

  /* Trace */
  x86_trace_header("x86.init version=\"%d.%d\" num_cores=%d num_threads=%d\n",
                   X86_TRACE_VERSION_MAJOR, X86_TRACE_VERSION_MINOR,
                   x86_cpu_num_cores, x86_cpu_num_threads);
}
Example #12
void frm_vector_mem_complete(struct frm_vector_mem_unit_t *vector_mem)
{
	struct frm_uop_t *uop = NULL;
	int list_entries;
	int i;
	int list_index = 0;

	/* Process completed memory instructions */
	list_entries = list_count(vector_mem->write_buffer);

	/* Sanity check the write buffer */
	assert(list_entries <= frm_gpu_vector_mem_width);

	for (i = 0; i < list_entries; i++)
	{
		uop = list_get(vector_mem->write_buffer, list_index);
		assert(uop);

		/* Uop is not ready */
		if (asTiming(frm_gpu)->cycle < uop->write_ready)
		{
			list_index++;
			continue;
		}

		/* Access complete, remove the uop from the queue */
		list_remove(vector_mem->write_buffer, uop);

		assert(uop->warp_inst_queue_entry->lgkm_cnt > 0);
		uop->warp_inst_queue_entry->lgkm_cnt--;

		frm_trace("si.end_inst id=%lld cu=%d\n", uop->id_in_sm,
			uop->sm->id);

		/* Free uop */
		frm_uop_free(uop);

		/* Statistics */
		vector_mem->inst_count++;
		frm_gpu->last_complete_cycle = asTiming(frm_gpu)->cycle;
	}
}
Example #13
void frm_sm_spatial_report_dump(struct frm_sm_t *sm)
{
	FILE *f = spatial_report_file;

	fprintf(f,"CU,%d,MemAcc,%lld,MappedWGs,%lld,Cycles,%lld\n",
			sm->id,
			sm->vector_mem_unit.inflight_mem_accesses,
			sm->interval_mapped_thread_blocks,
			asTiming(frm_gpu)->cycle);

}
Example #14
void evg_cu_spatial_report_dump(struct evg_compute_unit_t *compute_unit) {
  FILE *f = spatial_report_file;

  fprintf(f,
          "CU,%d,CFInst,%lld,MemAcc,%lld,TEXInstn,%lld,ALUInstn,%lld,Cycles,%"
          "lld \n",
          compute_unit->id, compute_unit->cf_engine.interval_inst_count,
          compute_unit->inflight_mem_accesses,
          compute_unit->tex_engine.interval_inst_count,
          compute_unit->alu_engine.interval_inst_count,
          asTiming(evg_gpu)->cycle);
}
Example #15
void si_simd_complete(struct si_simd_t *simd)
{
	struct si_uop_t *uop;
	int list_entries;
	int list_index = 0;
	int i;

	list_entries = list_count(simd->exec_buffer);

	assert(list_entries <= si_gpu_simd_exec_buffer_size);

	for (i = 0; i < list_entries; i++)
	{
		uop = list_get(simd->exec_buffer, list_index);
		assert(uop);

		if (asTiming(si_gpu)->cycle < uop->execute_ready)
		{
			list_index++;
			continue;
		}

		/* Access complete, remove the uop from the queue */
		list_remove(simd->exec_buffer, uop);

		si_trace("si.end_inst id=%lld cu=%d\n", uop->id_in_compute_unit,
			uop->compute_unit->id);

		/* Free uop */
		si_uop_free(uop);

		/* Statistics */
		simd->inst_count++;
		si_gpu->last_complete_cycle = asTiming(si_gpu)->cycle;
	}
}
Example #16
void evg_cu_interval_update(struct evg_compute_unit_t *compute_unit) {
  /* If interval - reset the counters in all the engines */
  compute_unit->interval_cycle++;
  if (!(asTiming(evg_gpu)->cycle % spatial_profiling_interval)) {
    evg_cu_spatial_report_dump(compute_unit);

    compute_unit->cf_engine.interval_inst_count = 0;
    compute_unit->alu_engine.interval_inst_count = 0;
    compute_unit->tex_engine.interval_inst_count = 0;
    /* This counter is not reset since memory accesses could still be in flight
     * in the hierarchy*/
    /* compute_unit->inflight_mem_accesses = 0; */
    compute_unit->interval_cycle = 0;
  }
}
Example #17
void si_cu_spatial_report_dump(struct si_compute_unit_t *compute_unit)
{
	FILE *f = spatial_report_file;

	fprintf(f,
		"CU,%d,MemAcc,%lld,MappedWGs,%lld,UnmappedWGs,%lld,ALUIssued,%lld,LDSIssued,%lld,Cycles,%lld\n",
		compute_unit->id,
		compute_unit->vector_mem_unit.inflight_mem_accesses,
		compute_unit->interval_mapped_work_groups,
		compute_unit->interval_unmapped_work_groups,
		compute_unit->interval_alu_issued,
		compute_unit->interval_lds_issued,
		asTiming(si_gpu)->cycle);


}
Example #18
void frm_sm_interval_update(struct frm_sm_t *sm)
{
	/* If interval - reset the counters in all the engines */
	sm->interval_cycle ++;
	if (!(asTiming(frm_gpu)->cycle % spatial_profiling_interval))
	{
		frm_sm_spatial_report_dump(sm);

		/*
		 * This counter is not reset since memory accesses could still
		 * be in flight in the hierarchy
		 * sm->inflight_mem_accesses = 0;
		 */
		sm->interval_cycle = 0;
		sm->interval_mapped_thread_blocks = 0;
	}
}
Example #19
int X86ThreadCacheMissInEventQueue(X86Thread *self)
{
	X86Cpu *cpu = self->cpu;
	X86Core *core = self->core;

	struct linked_list_t *event_queue = core->event_queue;
	struct x86_uop_t *uop;

	LINKED_LIST_FOR_EACH(event_queue)
	{
		uop = linked_list_get(event_queue);
		if (uop->thread != self || uop->uinst->opcode != x86_uinst_load)
			continue;
		if (asTiming(cpu)->cycle - uop->issue_when > 5)
			return 1;
	}
	return 0;
}
Example #20
int X86ThreadLongLatencyInEventQueue(X86Thread *self)
{
	X86Cpu *cpu = self->cpu;
	X86Core *core = self->core;

	struct linked_list_t *event_queue = core->event_queue;
	struct x86_uop_t *uop;
	
	LINKED_LIST_FOR_EACH(event_queue)
	{
		uop = linked_list_get(event_queue);
		if (uop->thread != self)
			continue;
		if (asTiming(cpu)->cycle - uop->issue_when > 20)
			return 1;
	}
	return 0;
}
Example #21
void si_cu_interval_update(struct si_compute_unit_t *compute_unit)
{
	/* If interval - reset the counters in all the engines */
	compute_unit->interval_cycle ++;

	if (!(asTiming(si_gpu)->cycle % spatial_profiling_interval))
	{
		si_cu_spatial_report_dump(compute_unit);

		/*
		 * This counter is not reset since memory accesses could still
		 * be in flight in the hierarchy
		 * compute_unit->inflight_mem_accesses = 0;
		 */
		compute_unit->interval_cycle = 0;
		compute_unit->interval_mapped_work_groups = 0;
		compute_unit->interval_unmapped_work_groups = 0;
		compute_unit->interval_alu_issued = 0;
		compute_unit->interval_lds_issued = 0;
	}
}
Example #22
static struct evg_wavefront_t *evg_schedule_greedy(
    struct evg_compute_unit_t *compute_unit) {
  struct evg_wavefront_t *wavefront, *temp_wavefront;
  struct linked_list_t *wavefront_pool = compute_unit->wavefront_pool;

  /* Check all candidates */
  temp_wavefront = NULL;
  LINKED_LIST_FOR_EACH(wavefront_pool) {
    /* Get wavefront from list */
    wavefront = linked_list_get(wavefront_pool);

    /* Wavefront must be running,
     * and the corresponding slot in fetch buffer must be free. */
    assert(wavefront->id_in_compute_unit <
           evg_gpu->wavefronts_per_compute_unit);
    if (!DOUBLE_LINKED_LIST_MEMBER(wavefront->work_group, running, wavefront) ||
        compute_unit->cf_engine.fetch_buffer[wavefront->id_in_compute_unit])
      continue;

    /* Select current wavefront temporarily */
    if (!temp_wavefront || temp_wavefront->sched_when < wavefront->sched_when)
      temp_wavefront = wavefront;
  }

  /* No wavefront found */
  wavefront = NULL;
  if (!temp_wavefront) return NULL;

  /* Wavefront found, remove from pool and return. */
  assert(temp_wavefront->clause_kind == EVG_CLAUSE_CF);
  linked_list_find(wavefront_pool, temp_wavefront);
  assert(!wavefront_pool->error_code);
  linked_list_remove(wavefront_pool);
  temp_wavefront->sched_when = asTiming(evg_gpu)->cycle;
  return temp_wavefront;
}
Example #23
void frm_vector_mem_mem(struct frm_vector_mem_unit_t *vector_mem)
{
	struct frm_uop_t *uop;
	struct frm_thread_uop_t *thread_uop;
	struct frm_thread_t *thread;
	int thread_id;
	int instructions_processed = 0;
	int list_entries;
	int i;
	enum mod_access_kind_t access_kind;
	int list_index = 0;

	list_entries = list_count(vector_mem->read_buffer);
	
	/* Sanity check the read buffer */
	assert(list_entries <= frm_gpu_vector_mem_read_buffer_size);

	for (i = 0; i < list_entries; i++)
	{
		uop = list_get(vector_mem->read_buffer, list_index);
		assert(uop);

		instructions_processed++;

		/* Uop is not ready yet */
		if (asTiming(frm_gpu)->cycle < uop->read_ready)
		{
			list_index++;
			continue;
		}

		/* Stall if the width has been reached. */
		if (instructions_processed > frm_gpu_vector_mem_width)
		{
			frm_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld "
				"stg=\"s\"\n", uop->id_in_sm, 
				vector_mem->sm->id, 
				uop->warp->id, uop->id_in_warp);
			list_index++;
			continue;
		}

		/* Sanity check mem buffer */
		assert(list_count(vector_mem->mem_buffer) <= 
			frm_gpu_vector_mem_max_inflight_mem_accesses);

		/* Stall if there is not room in the memory buffer */
		if (list_count(vector_mem->mem_buffer) == 
			frm_gpu_vector_mem_max_inflight_mem_accesses)
		{
			frm_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld "
				"stg=\"s\"\n", uop->id_in_sm, 
				vector_mem->sm->id, 
				uop->warp->id, uop->id_in_warp);
			list_index++;
			continue;
		}

		/* Set the access type */
		if (uop->vector_mem_write && !uop->glc)
			access_kind = mod_access_nc_store;
		else if (uop->vector_mem_write && uop->glc)
			access_kind = mod_access_store;
		else if (uop->vector_mem_read)
			access_kind = mod_access_load;
		else 
			fatal("%s: invalid access kind", __FUNCTION__);

		/* Access global memory */
		assert(!uop->global_mem_witness);
		for (thread_id = uop->warp->threads[0]->id_in_warp; 
				thread_id < uop->warp->thread_count; 
				thread_id++)
		{
			thread = uop->warp->threads[thread_id];
			thread_uop = 
				&uop->thread_uop[thread->id_in_warp];

			mod_access(vector_mem->sm->global_memory, 
				access_kind, 
				thread_uop->global_mem_access_addr,
				&uop->global_mem_witness, NULL, NULL, NULL);
			uop->global_mem_witness--;
		}

		if(frm_spatial_report_active)
		{
			if (uop->vector_mem_write)
			{
				uop->num_global_mem_write += 
					uop->global_mem_witness;
				frm_report_global_mem_inflight(uop->sm,
						uop->num_global_mem_write);
			}
			else if (uop->vector_mem_read)
			{
				uop->num_global_mem_read += 
					uop->global_mem_witness;
				frm_report_global_mem_inflight(uop->sm,
						uop->num_global_mem_read);
			}
			else
				fatal("%s: invalid access kind", __FUNCTION__);
		}

		/* Increment outstanding memory access count */
		uop->warp_inst_queue_entry->lgkm_cnt++;

		/* Transfer the uop to the mem buffer */
		list_remove(vector_mem->read_buffer, uop);
		list_enqueue(vector_mem->mem_buffer, uop);

		frm_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld "
			"stg=\"mem-m\"\n", uop->id_in_sm, 
			vector_mem->sm->id, uop->warp->id, 
			uop->id_in_warp);
	}
}
Example #24
void frm_lds_mem(struct frm_lds_t *lds) {
  struct frm_uop_t *uop;
  struct frm_thread_uop_t *thread_uop;
  struct frm_thread_t *thread;
  int thread_id;
  int instructions_processed = 0;
  int list_entries;
  int i, j;
  enum mod_access_kind_t access_type;
  int list_index = 0;

  list_entries = list_count(lds->read_buffer);

  /* Sanity check the read buffer */
  assert(list_entries <= frm_gpu_lds_read_buffer_size);

  for (i = 0; i < list_entries; i++) {
    uop = list_get(lds->read_buffer, list_index);
    assert(uop);

    instructions_processed++;

    /* Uop is not ready yet */
    if (asTiming(frm_gpu)->cycle < uop->read_ready) {
      list_index++;
      continue;
    }

    /* Stall if the width has been reached. */
    if (instructions_processed > frm_gpu_lds_width) {
      frm_trace(
          "si.inst id=%lld cu=%d wf=%d uop_id=%lld "
          "stg=\"s\"\n",
          uop->id_in_sm, lds->sm->id, uop->warp->id, uop->id_in_warp);
      list_index++;
      continue;
    }

    /* Sanity check uop */
    assert(uop->lds_read || uop->lds_write);

    /* Sanity check mem buffer */
    assert(list_count(lds->mem_buffer) <=
           frm_gpu_lds_max_inflight_mem_accesses);

    /* Stall if there is no room in the memory buffer */
    if (list_count(lds->mem_buffer) == frm_gpu_lds_max_inflight_mem_accesses) {
      frm_trace(
          "si.inst id=%lld cu=%d wf=%d uop_id=%lld "
          "stg=\"s\"\n",
          uop->id_in_sm, lds->sm->id, uop->warp->id, uop->id_in_warp);
      list_index++;
      continue;
    }

    /* Access local memory */
    for (thread_id = uop->warp->threads[0]->id_in_warp;
         thread_id < uop->warp->thread_count; thread_id++) {
      thread = uop->warp->threads[thread_id];
      thread_uop = &uop->thread_uop[thread->id_in_warp];

      for (j = 0; j < thread_uop->lds_access_count; j++) {
        if (thread->lds_access_type[j] == 1) {
          access_type = mod_access_load;
        } else if (thread->lds_access_type[j] == 2) {
          access_type = mod_access_store;
        } else {
          fatal("%s: invalid lds access type", __FUNCTION__);
        }

        mod_access(lds->sm->lds_module, access_type,
                   thread_uop->lds_access_addr[j], &uop->lds_witness, NULL,
                   NULL, NULL);
        uop->lds_witness--;
      }
    }

    /* Increment outstanding memory access count */
    uop->warp_inst_queue_entry->lgkm_cnt++;

    /* Transfer the uop to the mem buffer */
    list_remove(lds->read_buffer, uop);
    list_enqueue(lds->mem_buffer, uop);

    frm_trace(
        "si.inst id=%lld cu=%d wf=%d uop_id=%lld "
        "stg=\"lds-m\"\n",
        uop->id_in_sm, lds->sm->id, uop->warp->id, uop->id_in_warp);
  }
}
Example #25
void si_simd_execute(struct si_simd_t *simd)
{
	struct si_uop_t *uop;
	int list_entries;
	int list_index = 0;
	int instructions_processed = 0;
	int i;

	list_entries = list_count(simd->decode_buffer);

	/* Sanity check the decode buffer */
	assert(list_entries <= si_gpu_simd_decode_buffer_size);

	for (i = 0; i < list_entries; i++)
	{
		uop = list_get(simd->decode_buffer, list_index);
		assert(uop);

		instructions_processed++;

		/* Uop is not ready yet */
		if (asTiming(si_gpu)->cycle < uop->decode_ready)
		{
			list_index++;
			continue;
		}

		/* Stall if the width has been reached */
		if (instructions_processed > si_gpu_simd_width)
		{
			si_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld "
				"stg=\"s\"\n", uop->id_in_compute_unit, 
				simd->compute_unit->id, uop->wavefront->id, 
				uop->id_in_wavefront);
			list_index++;
			continue;
		}

		/* Sanity check exec buffer */
		assert(list_count(simd->exec_buffer) <= 
			si_gpu_simd_exec_buffer_size);

		/* Stall if SIMD unit is full */
		if (list_count(simd->exec_buffer) == 
			si_gpu_simd_exec_buffer_size)
		{
			si_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld "
				"stg=\"s\"\n", uop->id_in_compute_unit, 
				simd->compute_unit->id, uop->wavefront->id, 
				uop->id_in_wavefront);
			list_index++;
			continue;
		}

		/* Includes time for pipelined read-exec-write of 
		 * all subwavefronts */
		uop->execute_ready = asTiming(si_gpu)->cycle + 
			si_gpu_simd_exec_latency;

		/* Transfer the uop to the outstanding execution buffer */
		list_remove(simd->decode_buffer, uop);
		list_enqueue(simd->exec_buffer, uop);

		uop->wavefront_pool_entry->ready_next_cycle = 1;

		si_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld "
			"stg=\"simd-e\"\n", uop->id_in_compute_unit, 
			simd->compute_unit->id, uop->wavefront->id, 
			uop->id_in_wavefront);
	}
}
Example #26
void frm_vector_mem_write(struct frm_vector_mem_unit_t *vector_mem)
{
	struct frm_uop_t *uop;
	int instructions_processed = 0;
	int list_entries;
	int list_index = 0;
	int i;

	list_entries = list_count(vector_mem->mem_buffer);

	/* Sanity check the mem buffer */
	assert(list_entries <= frm_gpu_vector_mem_max_inflight_mem_accesses);

	for (i = 0; i < list_entries; i++)
	{
		uop = list_get(vector_mem->mem_buffer, list_index);
		assert(uop);

		instructions_processed++;

		/* Uop is not ready yet */
		if (uop->global_mem_witness)
		{
			list_index++;
			continue;
		}

		/* Stall if the width has been reached. */
		if (instructions_processed > frm_gpu_vector_mem_width)
		{
			frm_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld "
				"stg=\"s\"\n", uop->id_in_sm, 
				vector_mem->sm->id, 
				uop->warp->id, uop->id_in_warp);
			list_index++;
			continue;
		}

		/* Sanity check write buffer */
		assert(list_count(vector_mem->write_buffer) <= 
				frm_gpu_vector_mem_write_buffer_size);

		/* Stop if the write buffer is full. */
		if (list_count(vector_mem->write_buffer) == 
				frm_gpu_vector_mem_write_buffer_size)
		{
			frm_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld "
				"stg=\"s\"\n", uop->id_in_sm, 
				vector_mem->sm->id, 
				uop->warp->id, uop->id_in_warp);
			list_index++;
			continue;
		}

		/* Access complete, remove the uop from the queue */
		uop->write_ready = asTiming(frm_gpu)->cycle + 
			frm_gpu_vector_mem_write_latency;

		/* Here "access complete" means that every mod_access
		 * issued for this uop in frm_vector_mem_mem has finished,
		 * i.e. all of its in-flight accesses are done. */
		if(frm_spatial_report_active)
		{
			if (uop->vector_mem_write)
			{
				frm_report_global_mem_finish(uop->sm,
						uop->num_global_mem_write);
			}
			else if (uop->vector_mem_read)
			{
				frm_report_global_mem_finish(uop->sm,
						uop->num_global_mem_read);
			}
			else
			{
				fatal("%s: invalid access kind", __FUNCTION__);
			}
		}

		list_remove(vector_mem->mem_buffer, uop);
		list_enqueue(vector_mem->write_buffer, uop);

		frm_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld "
			"stg=\"mem-w\"\n", uop->id_in_sm, 
			vector_mem->sm->id, uop->warp->id, 
			uop->id_in_warp);
	}
}
Example #27
void si_simd_decode(struct si_simd_t *simd)
{
	struct si_uop_t *uop;
	int instructions_processed = 0;
	int list_entries;
	int list_index = 0;
	int i;

	list_entries = list_count(simd->issue_buffer);

	/* Sanity check the issue buffer */
	assert(list_entries <= si_gpu_simd_issue_buffer_size);

	for (i = 0; i < list_entries; i++)
	{
		uop = list_get(simd->issue_buffer, list_index);
		assert(uop);

		instructions_processed++;

		/* Uop not ready yet */
		if (asTiming(si_gpu)->cycle < uop->issue_ready)
		{
			list_index++;
			continue;
		}

		/* Stall if the issue width has been reached. */
		if (instructions_processed > si_gpu_simd_width)
		{
			si_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld "
				"stg=\"s\"\n", uop->id_in_compute_unit, 
				simd->compute_unit->id, uop->wavefront->id, 
				uop->id_in_wavefront);
			list_index++;
			continue;
		}

		/* Sanity check the decode buffer */
		assert(list_count(simd->decode_buffer) <= 
				si_gpu_simd_decode_buffer_size);

		/* Stall if the decode buffer is full. */
		if (list_count(simd->decode_buffer) == 
			si_gpu_simd_decode_buffer_size)
		{
			si_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld "
				"stg=\"s\"\n", uop->id_in_compute_unit, 
				simd->compute_unit->id, uop->wavefront->id, 
				uop->id_in_wavefront);
			list_index++;
			continue;
		}

		uop->decode_ready = asTiming(si_gpu)->cycle + si_gpu_simd_decode_latency;
		list_remove(simd->issue_buffer, uop);
		list_enqueue(simd->decode_buffer, uop);

		if (si_spatial_report_active)
			si_alu_report_new_inst(simd->compute_unit);

		si_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld "
			"stg=\"simd-d\"\n", uop->id_in_compute_unit, 
			simd->compute_unit->id, uop->wavefront->id, 
			uop->id_in_wavefront);
	}
}
Example #28
static int X86ThreadIssueLQ(X86Thread *self, int quant)
{
	X86Core *core = self->core;
	X86Cpu *cpu = self->cpu;

	struct linked_list_t *lq = self->lq;
	struct x86_uop_t *load;
	struct mod_client_info_t *client_info;

	/* Process lq */
	linked_list_head(lq);
	while (!linked_list_is_end(lq) && quant)
	{
		/* Get element from load queue. If it is not ready, go to the next one */
		load = linked_list_get(lq);
		if (!load->ready && !X86ThreadIsUopReady(self, load))
		{
			linked_list_next(lq);
			continue;
		}
		load->ready = 1;

		/* Check that memory system is accessible */
		if (!mod_can_access(self->data_mod, load->phy_addr))
		{
			linked_list_next(lq);
			continue;
		}

		/* Remove from load queue */
		assert(load->uinst->opcode == x86_uinst_load);
		X86ThreadRemoveFromLQ(self);

		/* create and fill the mod_client_info_t object */
		client_info = mod_client_info_create(self->data_mod);
		client_info->prefetcher_eip = load->eip;

		/* Access memory system */
		mod_access(self->data_mod, mod_access_load,
			load->phy_addr, NULL, core->event_queue, load, client_info);

		/* The cache system will place the load at the head of the
		 * event queue when it is ready. For now, mark "in_event_queue" to
		 * prevent the uop from being freed. */
		load->in_event_queue = 1;
		load->issued = 1;
		load->issue_when = asTiming(cpu)->cycle;
		
		/* Statistics */
		core->num_issued_uinst_array[load->uinst->opcode]++;
		core->lsq_reads++;
		core->reg_file_int_reads += load->ph_int_idep_count;
		core->reg_file_fp_reads += load->ph_fp_idep_count;
		self->num_issued_uinst_array[load->uinst->opcode]++;
		self->lsq_reads++;
		self->reg_file_int_reads += load->ph_int_idep_count;
		self->reg_file_fp_reads += load->ph_fp_idep_count;
		cpu->num_issued_uinst_array[load->uinst->opcode]++;
		if (load->trace_cache)
			self->trace_cache->num_issued_uinst++;

		/* One more instruction issued, update quantum. */
		quant--;
		
		/* MMU statistics */
		MMUAccessPage(cpu->mmu, load->phy_addr, mmu_access_read);

		/* Trace */
		x86_trace("x86.inst id=%lld core=%d stg=\"i\"\n",
			load->id_in_core, core->id);
	}
	
	return quant;
}
Example #29
static int X86ThreadIssuePreQ(X86Thread *self, int quantum)
{
	X86Core *core = self->core;
	X86Cpu *cpu = self->cpu;

	struct linked_list_t *preq = self->preq;
	struct x86_uop_t *prefetch;

	/* Process preq */
	linked_list_head(preq);
	while (!linked_list_is_end(preq) && quantum)
	{
		/* Get element from prefetch queue. If it is not ready, go to the next one */
		prefetch = linked_list_get(preq);
		if (!prefetch->ready && !X86ThreadIsUopReady(self, prefetch))
		{
			linked_list_next(preq);
			continue;
		}

		/* 
		 * Make sure its not been prefetched recently. This is just to avoid unnecessary
		 * memory traffic. Even though the cache will realise a "hit" on redundant 
		 * prefetches, its still helpful to avoid going to the memory (cache). 
		 */
		if (prefetch_history_is_redundant(core->prefetch_history,
							   self->data_mod, prefetch->phy_addr))
		{
			/* remove from queue. do not prefetch. */
			assert(prefetch->uinst->opcode == x86_uinst_prefetch);
			X86ThreadRemovePreQ(self);
			prefetch->completed = 1;
			x86_uop_free_if_not_queued(prefetch);
			continue;
		}

		prefetch->ready = 1;

		/* Check that memory system is accessible */
		if (!mod_can_access(self->data_mod, prefetch->phy_addr))
		{
			linked_list_next(preq);
			continue;
		}

		/* Remove from prefetch queue */
		assert(prefetch->uinst->opcode == x86_uinst_prefetch);
		X86ThreadRemovePreQ(self);

		/* Access memory system */
		mod_access(self->data_mod, mod_access_prefetch,
			prefetch->phy_addr, NULL, core->event_queue, prefetch, NULL);

		/* Record prefetched address */
		prefetch_history_record(core->prefetch_history, prefetch->phy_addr);

		/* The cache system will place the prefetch at the head of the
		 * event queue when it is ready. For now, mark "in_event_queue" to
		 * prevent the uop from being freed. */
		prefetch->in_event_queue = 1;
		prefetch->issued = 1;
		prefetch->issue_when = asTiming(cpu)->cycle;
		
		/* Statistics */
		core->num_issued_uinst_array[prefetch->uinst->opcode]++;
		core->lsq_reads++;
		core->reg_file_int_reads += prefetch->ph_int_idep_count;
		core->reg_file_fp_reads += prefetch->ph_fp_idep_count;
		self->num_issued_uinst_array[prefetch->uinst->opcode]++;
		self->lsq_reads++;
		self->reg_file_int_reads += prefetch->ph_int_idep_count;
		self->reg_file_fp_reads += prefetch->ph_fp_idep_count;
		cpu->num_issued_uinst_array[prefetch->uinst->opcode]++;
		if (prefetch->trace_cache)
			self->trace_cache->num_issued_uinst++;

		/* One more instruction issued, update quantum. */
		quantum--;
		
		/* MMU statistics */
		MMUAccessPage(cpu->mmu, prefetch->phy_addr, mmu_access_read);

		/* Trace */
		x86_trace("x86.inst id=%lld core=%d stg=\"i\"\n",
			prefetch->id_in_core, core->id);
	}
	
	return quantum;
}
Example #30
static int X86ThreadIssueIQ(X86Thread *self, int quant)
{
	X86Cpu *cpu = self->cpu;
	X86Core *core = self->core;

	struct linked_list_t *iq = self->iq;
	struct x86_uop_t *uop;
	int lat;

	/* Find instruction to issue */
	linked_list_head(iq);
	while (!linked_list_is_end(iq) && quant)
	{
		/* Get element from IQ */
		uop = linked_list_get(iq);
		assert(x86_uop_exists(uop));
		assert(!(uop->flags & X86_UINST_MEM));
		if (!uop->ready && !X86ThreadIsUopReady(self, uop))
		{
			linked_list_next(iq);
			continue;
		}
		uop->ready = 1;  /* avoid next call to 'X86ThreadIsUopReady' */
		
		/* Run the instruction in its corresponding functional unit.
		 * If the instruction does not require a functional unit, 'X86CoreReserveFunctionalUnit'
		 * returns 1 cycle latency. If there is no functional unit available,
		 * 'X86CoreReserveFunctionalUnit' returns 0. */
		lat = X86CoreReserveFunctionalUnit(core, uop);
		if (!lat)
		{
			linked_list_next(iq);
			continue;
		}
		
		/* Instruction was issued to the corresponding fu.
		 * Remove it from IQ */
		X86ThreadRemoveFromIQ(self);
		
		/* Schedule inst in Event Queue */
		assert(!uop->in_event_queue);
		assert(lat > 0);
		uop->issued = 1;
		uop->issue_when = asTiming(cpu)->cycle;
		uop->when = asTiming(cpu)->cycle + lat;
		X86CoreInsertInEventQueue(core, uop);
		
		/* Statistics */
		core->num_issued_uinst_array[uop->uinst->opcode]++;
		core->iq_reads++;
		core->reg_file_int_reads += uop->ph_int_idep_count;
		core->reg_file_fp_reads += uop->ph_fp_idep_count;
		self->num_issued_uinst_array[uop->uinst->opcode]++;
		self->iq_reads++;
		self->reg_file_int_reads += uop->ph_int_idep_count;
		self->reg_file_fp_reads += uop->ph_fp_idep_count;
		cpu->num_issued_uinst_array[uop->uinst->opcode]++;
		if (uop->trace_cache)
			self->trace_cache->num_issued_uinst++;

		/* One more instruction issued, update quantum. */
		quant--;

		/* Trace */
		x86_trace("x86.inst id=%lld core=%d stg=\"i\"\n",
			uop->id_in_core, core->id);
	}
	
	return quant;
}