void si_gpu_uop_trash_empty(void) { struct si_uop_t *uop; while (si_gpu->trash_uop_list->count) { linked_list_head(si_gpu->trash_uop_list); uop = linked_list_get(si_gpu->trash_uop_list); linked_list_remove(si_gpu->trash_uop_list); si_trace("si.end_inst id=%lld cu=%d\n", uop->id_in_compute_unit, uop->compute_unit->id); si_uop_free(uop); } }
void si_simd_complete(struct si_simd_t *simd) { struct si_uop_t *uop; int list_entries; int list_index = 0; int i; list_entries = list_count(simd->exec_buffer); assert(list_entries <= si_gpu_simd_exec_buffer_size); for (i = 0; i < list_entries; i++) { uop = list_get(simd->exec_buffer, list_index); assert(uop); if (asTiming(si_gpu)->cycle < uop->execute_ready) { list_index++; continue; } /* Access complete, remove the uop from the queue */ list_remove(simd->exec_buffer, uop); si_trace("si.end_inst id=%lld cu=%d\n", uop->id_in_compute_unit, uop->compute_unit->id); /* Free uop */ si_uop_free(uop); /* Statistics */ simd->inst_count++; si_gpu->last_complete_cycle = asTiming(si_gpu)->cycle; } }
/* Execute stage of the SIMD pipeline: move up to si_gpu_simd_width
 * decoded uops per cycle from the decode buffer into the execution
 * buffer, stamping each with its execute-complete cycle. Uops stall
 * (emitting an "s" trace event) when the issue width is exhausted or
 * the execution buffer is full. */
void si_simd_execute(struct si_simd_t *simd)
{
	struct si_uop_t *uop;
	int list_entries;
	int list_index = 0;
	int instructions_processed = 0;
	int i;

	list_entries = list_count(simd->decode_buffer);

	/* Sanity check the decode buffer */
	assert(list_entries <= si_gpu_simd_decode_buffer_size);

	for (i = 0; i < list_entries; i++)
	{
		uop = list_get(simd->decode_buffer, list_index);
		assert(uop);

		/* Counted before the ready check, so not-yet-decoded uops
		 * still consume width slots for this cycle. */
		instructions_processed++;

		/* Uop is not ready yet */
		if (asTiming(si_gpu)->cycle < uop->decode_ready)
		{
			list_index++;
			continue;
		}

		/* Stall if the width has been reached */
		if (instructions_processed > si_gpu_simd_width)
		{
			si_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld "
				"stg=\"s\"\n", uop->id_in_compute_unit,
				simd->compute_unit->id, uop->wavefront->id,
				uop->id_in_wavefront);
			list_index++;
			continue;
		}

		/* Sanity check exec buffer */
		assert(list_count(simd->exec_buffer) <=
			si_gpu_simd_exec_buffer_size);

		/* Stall if SIMD unit is full */
		if (list_count(simd->exec_buffer) ==
			si_gpu_simd_exec_buffer_size)
		{
			si_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld "
				"stg=\"s\"\n", uop->id_in_compute_unit,
				simd->compute_unit->id, uop->wavefront->id,
				uop->id_in_wavefront);
			list_index++;
			continue;
		}

		/* Includes time for pipelined read-exec-write of
		 * all subwavefronts */
		uop->execute_ready = asTiming(si_gpu)->cycle +
			si_gpu_simd_exec_latency;

		/* Transfer the uop to the outstanding execution buffer.
		 * list_index is not advanced: removal shifts the next
		 * entry into the current position. */
		list_remove(simd->decode_buffer, uop);
		list_enqueue(simd->exec_buffer, uop);

		/* Let the wavefront pool entry issue again next cycle */
		uop->wavefront_pool_entry->ready_next_cycle = 1;

		si_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld "
			"stg=\"simd-e\"\n", uop->id_in_compute_unit,
			simd->compute_unit->id, uop->wavefront->id,
			uop->id_in_wavefront);
	}
}
/* Decode stage of the SIMD pipeline: move up to si_gpu_simd_width
 * issued uops per cycle from the issue buffer into the decode buffer,
 * stamping each with its decode-complete cycle. Uops stall (emitting
 * an "s" trace event) when the issue width is exhausted or the decode
 * buffer is full. */
void si_simd_decode(struct si_simd_t *simd)
{
	struct si_uop_t *uop;
	int instructions_processed = 0;
	int list_entries;
	int list_index = 0;
	int i;

	list_entries = list_count(simd->issue_buffer);

	/* Sanity check the issue buffer */
	assert(list_entries <= si_gpu_simd_issue_buffer_size);

	for (i = 0; i < list_entries; i++)
	{
		uop = list_get(simd->issue_buffer, list_index);
		assert(uop);

		/* Counted before the ready check, so not-yet-issued uops
		 * still consume width slots for this cycle. */
		instructions_processed++;

		/* Uop not ready yet */
		if (asTiming(si_gpu)->cycle < uop->issue_ready)
		{
			list_index++;
			continue;
		}

		/* Stall if the issue width has been reached. */
		if (instructions_processed > si_gpu_simd_width)
		{
			si_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld "
				"stg=\"s\"\n", uop->id_in_compute_unit,
				simd->compute_unit->id, uop->wavefront->id,
				uop->id_in_wavefront);
			list_index++;
			continue;
		}

		/* Sanity check the decode buffer */
		assert(list_count(simd->decode_buffer) <=
			si_gpu_simd_decode_buffer_size);

		/* Stall if the decode buffer is full. */
		if (list_count(simd->decode_buffer) ==
			si_gpu_simd_decode_buffer_size)
		{
			si_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld "
				"stg=\"s\"\n", uop->id_in_compute_unit,
				simd->compute_unit->id, uop->wavefront->id,
				uop->id_in_wavefront);
			list_index++;
			continue;
		}

		uop->decode_ready = asTiming(si_gpu)->cycle +
			si_gpu_simd_decode_latency;

		/* Transfer the uop to the decode buffer. list_index is not
		 * advanced: removal shifts the next entry into place. */
		list_remove(simd->issue_buffer, uop);
		list_enqueue(simd->decode_buffer, uop);

		/* Optional spatial-report instrumentation hook */
		if (si_spatial_report_active)
			si_alu_report_new_inst(simd->compute_unit);

		si_trace("si.inst id=%lld cu=%d wf=%d uop_id=%lld "
			"stg=\"simd-d\"\n", uop->id_in_compute_unit,
			simd->compute_unit->id, uop->wavefront->id,
			uop->id_in_wavefront);
	}
}
void si_scalar_unit_writeback(struct si_scalar_unit_t *scalar_unit) { struct si_uop_t *uop = NULL; struct si_wavefront_t *wavefront; struct si_work_group_t *work_group; struct si_ndrange_t *ndrange; int i; int list_count; int wavefront_id; /* Process completed memory instructions */ list_count = linked_list_count(scalar_unit->mem_out_buffer); linked_list_head(scalar_unit->mem_out_buffer); for (i = 0; i < list_count; i++) { uop = linked_list_get(scalar_unit->mem_out_buffer); assert(uop); if (!uop->global_mem_witness) { /* Access complete, remove the uop from the queue */ linked_list_remove(scalar_unit->mem_out_buffer); si_trace("si.inst id=%lld cu=%d stg=\"su-w\"\n", uop->id_in_compute_unit, scalar_unit->compute_unit->id); /* Make the wavefront active again */ wavefront = uop->wavefront; wavefront->ready = 1; /* Free uop */ if (si_tracing()) si_gpu_uop_trash_add(uop); else si_uop_free(uop); } else { linked_list_next(scalar_unit->mem_out_buffer); } } /* Process completed ALU instructions */ list_count = linked_list_count(scalar_unit->alu_out_buffer); linked_list_head(scalar_unit->alu_out_buffer); for (i = 0; i < list_count; i++) { uop = linked_list_get(scalar_unit->alu_out_buffer); assert(uop); if (uop->execute_ready <= si_gpu->cycle) { /* Access complete, remove the uop from the queue */ linked_list_remove(scalar_unit->alu_out_buffer); si_trace("si.inst id=%lld cu=%d stg=\"su-w\"\n", uop->id_in_compute_unit, scalar_unit->compute_unit->id); /* Make the wavefront active again */ wavefront = uop->wavefront; work_group = wavefront->work_group; ndrange = work_group->ndrange; if (wavefront->finished) { work_group->compute_unit_finished_count++; } else if (wavefront->barrier) { if (wavefront->barrier_cleared) { /* All wavefronts have hit barrier */ wavefront->barrier_cleared = 0; SI_FOREACH_WAVEFRONT_IN_WORK_GROUP(work_group, wavefront_id) { wavefront = ndrange->wavefronts[wavefront_id]; wavefront->barrier = 0; wavefront->ready = 1; } } else { /* Wavefront is waiting at 
barrier */ } } else {
/* Run one iteration of the Southern Islands GPU timing simulation loop.
 * Returns 0 when there is no work (no ND-Ranges exist), 1 otherwise.
 * Each call: maps pending ND-Ranges onto the GPU, allocates work-groups
 * to ready compute units, advances the cycle counter, enforces the
 * max-cycle/max-instruction stop conditions, runs one cycle on every
 * busy compute unit, and finalizes the ND-Range once all compute units
 * go idle. */
int si_gpu_run(void)
{
	struct si_ndrange_t *ndrange;

	struct si_compute_unit_t *compute_unit;
	struct si_compute_unit_t *compute_unit_next;

	/* For efficiency when no Southern Islands emulation is selected,
	 * exit here if the list of existing ND-Ranges is empty. */
	if (!si_emu->ndrange_list_count)
		return 0;

	/* Start one ND-Range in state 'pending' */
	while ((ndrange = si_emu->pending_ndrange_list_head))
	{
		/* Currently not supported for more than 1 ND-Range */
		if (si_gpu->ndrange)
			fatal("%s: Southern Islands GPU timing simulation not supported "
				"for multiple ND-Ranges", __FUNCTION__);

		/* Set ND-Range status to 'running' */
		si_ndrange_clear_status(ndrange, si_ndrange_pending);
		si_ndrange_set_status(ndrange, si_ndrange_running);

		/* Trace */
		si_trace("si.new_ndrange id=%d wg_first=%d wg_count=%d\n",
			ndrange->id, ndrange->work_group_id_first,
			ndrange->work_group_count);

		/* Map ND-Range to GPU */
		si_gpu_map_ndrange(ndrange);
		si_calc_plot();
	}

	/* Mapped ND-Range */
	ndrange = si_gpu->ndrange;
	assert(ndrange);

	/* Allocate work-groups to compute units */
	while (si_gpu->compute_unit_ready_list_head &&
		ndrange->pending_list_head)
		si_compute_unit_map_work_group(
			si_gpu->compute_unit_ready_list_head,
			ndrange->pending_list_head);

	/* One more cycle */
	si_gpu->cycle++;

	/* Stop if maximum number of GPU cycles exceeded */
	if (si_emu_max_cycles && si_gpu->cycle >= si_emu_max_cycles)
		esim_finish = esim_finish_si_max_cycles;

	/* Stop if maximum number of GPU instructions exceeded */
	if (si_emu_max_inst && si_emu->inst_count >= si_emu_max_inst)
		esim_finish = esim_finish_si_max_inst;

	/* Stop if any reason met */
	if (esim_finish)
		return 1;

	/* Free instructions in trash */
	si_gpu_uop_trash_empty();

	/* Run one loop iteration on each busy compute unit */
	for (compute_unit = si_gpu->compute_unit_busy_list_head; compute_unit;
		compute_unit = compute_unit_next)
	{
		/* Store next busy compute unit, since this can change
		 * during the compute unit simulation loop iteration. */
		compute_unit_next = compute_unit->compute_unit_busy_list_next;

		/* Run one cycle */
		si_compute_unit_run(compute_unit);
	}

	/* If ND-Range finished execution in all compute units, free it. */
	if (!si_gpu->compute_unit_busy_list_count)
	{
		/* Dump ND-Range report */
		si_ndrange_dump(ndrange, si_emu_report_file);

		/* Stop if maximum number of kernels reached */
		if (si_emu_max_kernels &&
			si_emu->ndrange_count >= si_emu_max_kernels)
			esim_finish = esim_finish_si_max_kernels;

		/* Finalize and free ND-Range */
		assert(si_ndrange_get_status(ndrange, si_ndrange_finished));
		si_gpu_uop_trash_empty();
		si_gpu_unmap_ndrange();
		si_ndrange_free(ndrange);
	}

	/* Return true */
	return 1;
}