Beispiel #1
0
/*
 * Map a work-group onto a compute unit.
 *
 * Stores the work-group in the first empty entry of
 * compute_unit->work_groups, updates the 'ready' and 'busy' lists kept in
 * 'evg_gpu', and gives every wavefront of the work-group an identifier that
 * is relative to the compute unit.
 *
 * compute_unit: compute unit receiving the work-group. It is asserted to hold
 *               fewer than evg_gpu->work_groups_per_compute_unit work-groups
 *               and to be a member of the 'ready' list.
 * work_group:   work-group being mapped. Its id_in_compute_unit is asserted
 *               to be 0 on entry and holds the chosen slot index afterwards.
 */
void evg_compute_unit_map_work_group(struct evg_compute_unit_t *compute_unit, struct evg_work_group_t *work_group)
{
	struct evg_ndrange_t *ndrange = work_group->ndrange;
	struct evg_wavefront_t *wavefront;
	int wavefront_id;

	/* Map work-group: the compute unit must have room left, and the slot
	 * search below starts from id_in_compute_unit, which must still be 0. */
	assert(compute_unit->work_group_count < evg_gpu->work_groups_per_compute_unit);
	assert(!work_group->id_in_compute_unit);

	/* Advance to the first empty entry of compute_unit->work_groups */
	while (work_group->id_in_compute_unit < evg_gpu->work_groups_per_compute_unit
		&& compute_unit->work_groups[work_group->id_in_compute_unit])
		work_group->id_in_compute_unit++;

	/* A free entry must have been found; occupy it */
	assert(work_group->id_in_compute_unit < evg_gpu->work_groups_per_compute_unit);
	compute_unit->work_groups[work_group->id_in_compute_unit] = work_group;
	compute_unit->work_group_count++;

	/* If compute unit reached its maximum load, remove it from 'ready' list.
	 * Otherwise, move it to the end of the 'ready' list. Both cases start
	 * with the removal; only the second one re-inserts at the tail. */
	assert(DOUBLE_LINKED_LIST_MEMBER(evg_gpu, ready, compute_unit));
	DOUBLE_LINKED_LIST_REMOVE(evg_gpu, ready, compute_unit);
	if (compute_unit->work_group_count < evg_gpu->work_groups_per_compute_unit)
		DOUBLE_LINKED_LIST_INSERT_TAIL(evg_gpu, ready, compute_unit);
	
	/* If this is the first scheduled work-group, insert to 'busy' list.
	 * A compute unit that is already in the list is left where it is. */
	if (!DOUBLE_LINKED_LIST_MEMBER(evg_gpu, busy, compute_unit))
		DOUBLE_LINKED_LIST_INSERT_TAIL(evg_gpu, busy, compute_unit);

	/* Assign wavefronts identifiers in compute unit: slot index of the
	 * work-group times wavefronts per work-group, plus the position of the
	 * wavefront inside its work-group. */
	EVG_FOREACH_WAVEFRONT_IN_WORK_GROUP(work_group, wavefront_id)
	{
		wavefront = ndrange->wavefronts[wavefront_id];
		wavefront->id_in_compute_unit = work_group->id_in_compute_unit *
			ndrange->wavefronts_per_work_group + wavefront->id_in_work_group;
	}
Beispiel #2
0
/*
 * Map a work-group onto a Southern Islands compute unit.
 *
 * Stores the work-group in the first empty entry of
 * compute_unit->work_groups, updates the 'compute_unit_ready' and
 * 'compute_unit_busy' lists kept in 'si_gpu', and gives every wavefront of
 * the work-group an identifier that is relative to the compute unit.
 *
 * compute_unit: compute unit receiving the work-group. It is asserted to hold
 *               fewer than si_gpu->work_groups_per_compute_unit work-groups
 *               and to be a member of the 'compute_unit_ready' list.
 * work_group:   work-group being mapped. Its id_in_compute_unit is asserted
 *               to be 0 on entry and holds the chosen slot index afterwards.
 */
void si_compute_unit_map_work_group(struct si_compute_unit_t *compute_unit, 
	struct si_work_group_t *work_group)
{
	struct si_ndrange_t *ndrange = work_group->ndrange;
	struct si_wavefront_t *wavefront;
	int wavefront_id;
	int ib_id;

	/* The compute unit must have room left, and the slot search below
	 * starts from id_in_compute_unit, which must still be 0. */
	assert(compute_unit->work_group_count < 
		si_gpu->work_groups_per_compute_unit);
	assert(!work_group->id_in_compute_unit);

	/* Find an available slot: advance to the first empty entry of
	 * compute_unit->work_groups. */
	while (work_group->id_in_compute_unit < 
		si_gpu->work_groups_per_compute_unit &&
		compute_unit->work_groups[work_group->id_in_compute_unit])
	{
		work_group->id_in_compute_unit++;
	}

	/* A free entry must have been found; occupy it */
	assert(work_group->id_in_compute_unit < 
		si_gpu->work_groups_per_compute_unit);
	compute_unit->work_groups[work_group->id_in_compute_unit] = work_group;
	compute_unit->work_group_count++;

	/* If compute unit reached its maximum load, remove it from 
	 * 'compute_unit_ready' list.  Otherwise, move it to the end of 
	 * the 'compute_unit_ready' list. Both cases start with the removal;
	 * only the second one re-inserts at the tail. */
	assert(DOUBLE_LINKED_LIST_MEMBER(si_gpu, compute_unit_ready, 
		compute_unit));
	DOUBLE_LINKED_LIST_REMOVE(si_gpu, compute_unit_ready, compute_unit);
	if (compute_unit->work_group_count < 
		si_gpu->work_groups_per_compute_unit)
	{
		DOUBLE_LINKED_LIST_INSERT_TAIL(si_gpu, compute_unit_ready, 
			compute_unit);
	}
	
	/* If this is the first scheduled work-group, insert to 
	 * 'compute_unit_busy' list. A compute unit that is already in the
	 * list is left where it is. */
	if (!DOUBLE_LINKED_LIST_MEMBER(si_gpu, compute_unit_busy, compute_unit))
	{
		DOUBLE_LINKED_LIST_INSERT_TAIL(si_gpu, compute_unit_busy, 
			compute_unit);
	}

	/* Assign wavefronts identifiers in compute unit: slot index of the
	 * work-group times wavefronts per work-group, plus the position of the
	 * wavefront inside its work-group. */
	SI_FOREACH_WAVEFRONT_IN_WORK_GROUP(work_group, wavefront_id)
	{
		wavefront = ndrange->wavefronts[wavefront_id];
		wavefront->id_in_compute_unit = work_group->id_in_compute_unit *
			ndrange->wavefronts_per_work_group + 
			wavefront->id_in_work_group;
	}
Beispiel #3
0
/*
 * Constructor of the Evergreen GPU timing object.
 *
 * Runs the parent Timing constructor, sets the frequency and its simulation
 * domain, creates every compute unit (queuing each one at the tail of the
 * 'ready' list of 'self'), and installs the virtual functions.
 */
void EvgGpuCreate(EvgGpu *self)
{
	struct evg_compute_unit_t *compute_unit;
	int compute_unit_id;

	/* The parent class is constructed before anything else */
	TimingCreate(asTiming(self));

	/* Frequency and the simulation domain derived from it */
	asTiming(self)->frequency = evg_gpu_frequency;
	asTiming(self)->frequency_domain = esim_new_domain(evg_gpu_frequency);

	/* Trash uop list */
	self->trash_uop_list = linked_list_create();

	/* Table with one pointer per compute unit, filled right below */
	self->compute_units = xcalloc(evg_gpu_num_compute_units, sizeof(void *));
	EVG_GPU_FOREACH_COMPUTE_UNIT(compute_unit_id)
	{
		compute_unit = evg_compute_unit_create();
		compute_unit->id = compute_unit_id;
		self->compute_units[compute_unit_id] = compute_unit;
		DOUBLE_LINKED_LIST_INSERT_TAIL(self, ready, compute_unit);
	}

	/* Virtual functions inherited from Timing */
	asTiming(self)->Run = EvgGpuRun;
	asTiming(self)->DumpSummary = EvgGpuDumpSummary;
	asTiming(self)->MemConfigCheck = EvgGpuMemConfigCheck;
	asTiming(self)->MemConfigDefault = EvgGpuMemConfigDefault;
	asTiming(self)->MemConfigParseEntry = EvgGpuMemConfigParseEntry;

	/* Virtual function inherited from Object */
	asObject(self)->Dump = EvgGpuDump;
}
Beispiel #4
0
/*
 * Create an ND-Range for 'kernel' and register it with the Evergreen
 * emulator.
 *
 * The new object is zero-initialized, appended to the 'ndrange' list of
 * 'evg_emu', named after the kernel (the name is duplicated), and given the
 * next ND-Range identifier. An instruction histogram is allocated only when
 * 'evg_emu_report_file' is set. Any allocation failure ends in fatal().
 *
 * Returns the new ND-Range.
 */
struct evg_ndrange_t *evg_ndrange_create(struct evg_opencl_kernel_t *kernel)
{
	struct evg_ndrange_t *ndrange = calloc(1, sizeof *ndrange);

	if (ndrange == NULL)
		fatal("%s: out of memory", __FUNCTION__);

	/* Register the ND-Range in the emulator-wide list */
	DOUBLE_LINKED_LIST_INSERT_TAIL(evg_emu, ndrange, ndrange);

	/* Private copy of the kernel name */
	ndrange->name = strdup(kernel->name);
	if (ndrange->name == NULL)
		fatal("%s: out of memory", __FUNCTION__);

	/* Fields taken from the kernel, plus a fresh identifier */
	ndrange->id = evg_emu->ndrange_count++;
	ndrange->local_mem_top = kernel->func_mem_local;
	ndrange->kernel = kernel;

	/* The histogram is only needed when a report was requested */
	if (evg_emu_report_file)
	{
		ndrange->inst_histogram = calloc(EVG_INST_COUNT, sizeof(unsigned int));
		if (ndrange->inst_histogram == NULL)
			fatal("%s: out of memory", __FUNCTION__);
	}

	return ndrange;
}
Beispiel #5
0
/*
 * Allocate the global 'si_gpu' device, create its compute units, queue each
 * of them at the tail of the 'compute_unit_ready' list, and emit the trace
 * header. Any allocation failure ends in fatal().
 */
static void si_gpu_device_init()
{
    struct si_compute_unit_t *compute_unit;
    int compute_unit_id;

    /* Zero-initialized device object */
    si_gpu = calloc(1, sizeof(struct si_gpu_t));
    if (si_gpu == NULL)
        fatal("%s: out of memory", __FUNCTION__);

    /* Trash uop list */
    si_gpu->trash_uop_list = linked_list_create();

    /* Table with one pointer per compute unit */
    si_gpu->compute_units = calloc(si_gpu_num_compute_units, sizeof(void *));
    if (si_gpu->compute_units == NULL)
        fatal("%s: out of memory", __FUNCTION__);

    /* Create, number and enqueue every compute unit */
    SI_GPU_FOREACH_COMPUTE_UNIT(compute_unit_id)
    {
        compute_unit = si_compute_unit_create();
        compute_unit->id = compute_unit_id;
        si_gpu->compute_units[compute_unit_id] = compute_unit;
        DOUBLE_LINKED_LIST_INSERT_TAIL(si_gpu, compute_unit_ready, compute_unit);
    }

    /* Trace header with format version and device size */
    si_trace_header("si.init version=\"%d.%d\" num_compute_units=%d\n",
                    SI_TRACE_VERSION_MAJOR, SI_TRACE_VERSION_MINOR,
                    si_gpu_num_compute_units);
}
Beispiel #6
0
void frm_grid_set_status(struct frm_grid_t *grid, enum frm_grid_status_t status)
{
        /* Get only the new bits */
        status &= ~grid->status;

        /* Add ND-Range to lists */
        if (status & frm_grid_pending)
                DOUBLE_LINKED_LIST_INSERT_TAIL(frm_emu, pending_grid, grid);
        if (status & frm_grid_running)
                DOUBLE_LINKED_LIST_INSERT_TAIL(frm_emu, running_grid, grid);
        if (status & frm_grid_finished)
                DOUBLE_LINKED_LIST_INSERT_TAIL(frm_emu, finished_grid, grid);

        /* Update it */
        grid->status |= status;
}
Beispiel #7
0
/*
 * Create an ND-Range for 'kernel' and register it with the Southern Islands
 * emulator.
 *
 * The new object is zero-initialized, appended to the 'ndrange' list of
 * 'si_emu', named after the kernel (the name is duplicated), and given the
 * next ND-Range identifier. Register usage is read from the Southern Islands
 * dictionary entry of the kernel binary. An instruction histogram is
 * allocated only when 'si_emu_report_file' is set.
 *
 * Returns the new ND-Range.
 */
struct si_ndrange_t *si_ndrange_create(struct si_opencl_kernel_t *kernel)
{
	struct si_ndrange_t *ndrange = xcalloc(1, sizeof *ndrange);

	/* Register the ND-Range in the emulator-wide list */
	DOUBLE_LINKED_LIST_INSERT_TAIL(si_emu, ndrange, ndrange);

	/* Identity */
	ndrange->name = xstrdup(kernel->name);
	ndrange->id = si_emu->ndrange_count++;

	/* Fields taken from the kernel */
	ndrange->kernel = kernel;
	ndrange->local_mem_top = kernel->func_mem_local;

	/* Vector and scalar register usage recorded in the kernel binary */
	ndrange->num_vgprs =
		kernel->bin_file->enc_dict_entry_southern_islands->num_vgpr_used;
	ndrange->num_sgprs =
		kernel->bin_file->enc_dict_entry_southern_islands->num_sgpr_used;

	/* UAV-to-physical-address lookup list */
	ndrange->uav_list = list_create();

	/* The histogram is only needed when a report was requested */
	if (si_emu_report_file)
	{
		ndrange->inst_histogram = xcalloc(SI_INST_COUNT,
			sizeof(unsigned int));
	}

	return ndrange;
}
Beispiel #8
0
/*
 * Put an access at the tail of the 'waiting' list of a module.
 *
 * stack: access to enqueue; it must belong to 'mod' and must not already be
 *        in the list (both conditions are asserted).
 * mod:   module that owns the wait list.
 * event: value stored in stack->waiting_list_event.
 */
void mod_stack_wait_in_mod(struct mod_stack_t *stack, struct mod_t *mod, int event)
{
	/* Preconditions */
	assert(mod == stack->mod);
	assert(!DOUBLE_LINKED_LIST_MEMBER(mod, waiting, stack));

	/* Enqueue, then record the event that goes with the wait */
	DOUBLE_LINKED_LIST_INSERT_TAIL(mod, waiting, stack);
	stack->waiting_list_event = event;
}
Beispiel #9
0
/*
 * Put an access at the tail of the 'waiting' list of a port.
 *
 * stack: access to enqueue; it must belong to 'port' and must not already be
 *        in the list (both conditions are asserted).
 * port:  port that owns the wait list.
 * event: value stored in stack->waiting_list_event.
 */
void mod_stack_wait_in_port(struct mod_stack_t *stack, struct mod_port_t *port, int event)
{
	/* Preconditions */
	assert(port == stack->port);
	assert(!DOUBLE_LINKED_LIST_MEMBER(port, waiting, stack));

	/* Enqueue, then record the event that goes with the wait */
	DOUBLE_LINKED_LIST_INSERT_TAIL(port, waiting, stack);
	stack->waiting_list_event = event;
}
Beispiel #10
0
void si_work_group_set_status(struct si_work_group_t *work_group, enum si_work_group_status_t status)
{
	struct si_ndrange_t *ndrange = work_group->ndrange;

	/* Get only the new bits */
	status &= ~work_group->status;

	/* Add work-group to lists */
	if (status & si_work_group_pending)
		DOUBLE_LINKED_LIST_INSERT_TAIL(ndrange, pending, work_group);
	if (status & si_work_group_running)
		DOUBLE_LINKED_LIST_INSERT_TAIL(ndrange, running, work_group);
	if (status & si_work_group_finished)
		DOUBLE_LINKED_LIST_INSERT_TAIL(ndrange, finished, work_group);

	/* Update it */
	work_group->status |= status;
}
Beispiel #11
0
/*
 * Put an access at the tail of the 'waiting' list of another access.
 *
 * stack:        access to enqueue; it must differ from 'master_stack' and
 *               must not already be in the list (both conditions are
 *               asserted).
 * master_stack: access that owns the wait list.
 * event:        value stored in stack->waiting_list_event.
 */
void mod_stack_wait_in_stack(struct mod_stack_t *stack, struct mod_stack_t *master_stack, int event)
{
	/* Preconditions */
	assert(master_stack != stack);
	assert(!DOUBLE_LINKED_LIST_MEMBER(master_stack, waiting, stack));

	/* Enqueue, then record the event that goes with the wait */
	DOUBLE_LINKED_LIST_INSERT_TAIL(master_stack, waiting, stack);
	stack->waiting_list_event = event;
}
Beispiel #12
0
/*
 * Enqueue an access at the tail of the 'waiting' list of another access.
 *
 * stack:        access to enqueue; it must differ from 'master_stack' and
 *               must not already be in the list (both conditions are
 *               asserted).
 * master_stack: access that owns the wait list.
 * event:        value stored in stack->waiting_list_event.
 */
void mod_stack_wait_in_stack(struct mod_stack_t *stack,
	struct mod_stack_t *master_stack, int event)
{
	assert(master_stack != stack);
	assert(!DOUBLE_LINKED_LIST_MEMBER(master_stack, waiting, stack));

	stack->waiting_list_event = event;
	DOUBLE_LINKED_LIST_INSERT_TAIL(master_stack, waiting, stack);
}
Beispiel #13
0
/*
 * Create a grid for a CUDA function and register it with the Fermi emulator.
 *
 * The new object is zero-initialized, appended to the 'grid' list of
 * 'frm_emu', and named after the function. At most MAX_STRING_SIZE - 1
 * characters of the function name are kept, and the stored name is always
 * terminated.
 *
 * Returns the new grid.
 */
struct frm_grid_t *frm_grid_create(struct frm_cuda_function_t *function)
{
	struct frm_grid_t *grid;

	/* Initialize */
	grid = xcalloc(1, sizeof(struct frm_grid_t));
	DOUBLE_LINKED_LIST_INSERT_TAIL(frm_emu, grid, grid);
	grid->id = 0;
	grid->function = function;

	/* strncpy() does not write a terminator when the source fills the
	 * whole count, so copy one character less and terminate explicitly. */
	strncpy(grid->name, function->name, MAX_STRING_SIZE - 1);
	grid->name[MAX_STRING_SIZE - 1] = '\0';

	/* Return */
	return grid;
}
Beispiel #14
0
/*
 * Append a context to the tail of one of the context lists of 'x86_emu'.
 *
 * list: selects the target list.
 * ctx:  context to append; asserted not to be a member of that list yet.
 *
 * A value of 'list' that matches none of the cases leaves every list
 * untouched.
 */
void x86_emu_list_insert_tail(enum x86_emu_list_kind_t list, struct x86_ctx_t *ctx)
{
	assert(!x86_emu_list_member(list, ctx));

	switch (list)
	{
	case x86_emu_list_context:
		DOUBLE_LINKED_LIST_INSERT_TAIL(x86_emu, context, ctx);
		break;

	case x86_emu_list_running:
		DOUBLE_LINKED_LIST_INSERT_TAIL(x86_emu, running, ctx);
		break;

	case x86_emu_list_finished:
		DOUBLE_LINKED_LIST_INSERT_TAIL(x86_emu, finished, ctx);
		break;

	case x86_emu_list_zombie:
		DOUBLE_LINKED_LIST_INSERT_TAIL(x86_emu, zombie, ctx);
		break;

	case x86_emu_list_suspended:
		DOUBLE_LINKED_LIST_INSERT_TAIL(x86_emu, suspended, ctx);
		break;

	case x86_emu_list_alloc:
		DOUBLE_LINKED_LIST_INSERT_TAIL(x86_emu, alloc, ctx);
		break;
	}
}
Beispiel #15
0
/*
 * Create a grid for a CUDA function and register it with the Fermi emulator.
 *
 * The new object is zero-initialized, appended to the 'grid' list of
 * 'frm_emu', and named after the function. At most MAX_STRING_SIZE - 1
 * characters of the function name are kept, and the stored name is always
 * terminated. An allocation failure ends in fatal().
 *
 * Returns the new grid.
 */
struct frm_grid_t *frm_grid_create(struct frm_cuda_function_t *function)
{
	struct frm_grid_t *grid;

	/* Allocate */
	grid = calloc(1, sizeof(struct frm_grid_t));
	if (!grid)
		fatal("%s: out of memory", __FUNCTION__);

	/* Initialize */
	DOUBLE_LINKED_LIST_INSERT_TAIL(frm_emu, grid, grid);
	grid->id = 0;
	grid->function = function;

	/* strncpy() does not write a terminator when the source fills the
	 * whole count, so copy one character less and terminate explicitly. */
	strncpy(grid->name, function->name, MAX_STRING_SIZE - 1);
	grid->name[MAX_STRING_SIZE - 1] = '\0';

	/* Return */
	return grid;
}
Beispiel #16
0
/*
 * Append a context to the tail of one of the context lists of a MIPS
 * emulator.
 *
 * self: emulator that owns the lists.
 * list: selects the target list.
 * ctx:  context to append; asserted not to be a member of that list yet.
 *
 * A value of 'list' that matches none of the cases leaves every list
 * untouched.
 */
void MIPSEmuListInsertTail(MIPSEmu *self, enum mips_emu_list_kind_t list,
		struct mips_ctx_t *ctx)
{
	assert(!MIPSEmuListMember(self, list, ctx));

	switch (list)
	{
	case mips_emu_list_context:
		DOUBLE_LINKED_LIST_INSERT_TAIL(self, context, ctx);
		break;

	case mips_emu_list_running:
		DOUBLE_LINKED_LIST_INSERT_TAIL(self, running, ctx);
		break;

	case mips_emu_list_finished:
		DOUBLE_LINKED_LIST_INSERT_TAIL(self, finished, ctx);
		break;

	case mips_emu_list_zombie:
		DOUBLE_LINKED_LIST_INSERT_TAIL(self, zombie, ctx);
		break;

	case mips_emu_list_suspended:
		DOUBLE_LINKED_LIST_INSERT_TAIL(self, suspended, ctx);
		break;

	case mips_emu_list_alloc:
		DOUBLE_LINKED_LIST_INSERT_TAIL(self, alloc, ctx);
		break;
	}
}
Beispiel #17
0
/*
 * Create an ND-Range and register it with the Southern Islands emulator.
 *
 * The new object is zero-initialized, given the next ND-Range identifier and
 * appended to the 'ndrange' list of 'si_emu'. An instruction histogram is
 * allocated only when 'si_emu_report_file' is set.
 *
 * name: not used by this function.
 *
 * Returns the new ND-Range.
 */
struct si_ndrange_t *si_ndrange_create(char *name)
{
	struct si_ndrange_t *ndrange = xcalloc(1, sizeof *ndrange);

	/* Fresh identifier, taken from the emulator-wide counter */
	ndrange->id = si_emu->ndrange_count++;

	/* Register the ND-Range in the emulator-wide list */
	DOUBLE_LINKED_LIST_INSERT_TAIL(si_emu, ndrange, ndrange);

	/* The histogram is only needed when a report was requested */
	if (si_emu_report_file)
	{
		ndrange->inst_histogram = xcalloc(SI_INST_COUNT, sizeof(unsigned int));
	}

	return ndrange;
}
Beispiel #18
0
struct si_ndrange_t *si_ndrange_create(struct si_opencl_kernel_t *kernel)
{
	struct si_ndrange_t *ndrange;

	/* Allocate */
	ndrange = calloc(1, sizeof(struct si_ndrange_t));
	if (!ndrange)
		fatal("%s: out of memory", __FUNCTION__);

	/* Insert in ND-Range list of SouthernIslands emulator */
	DOUBLE_LINKED_LIST_INSERT_TAIL(si_emu, ndrange, ndrange);

	/* Name */
	ndrange->name = strdup(kernel->name);
	if (!ndrange->name)
		fatal("%s: out of memory", __FUNCTION__);

	/* Initialize */
	ndrange->kernel = kernel;
	ndrange->local_mem_top = kernel->func_mem_local;
	ndrange->id = si_emu->ndrange_count++;

	/* Create the UAV-to-physical-address lookup lists */
	ndrange->uav_list = list_create();

	/* Instruction histogram */
	if (evg_emu_report_file)
	{
		ndrange->inst_histogram = calloc(EVG_INST_COUNT, sizeof(unsigned int));
		if (!ndrange->inst_histogram)
			fatal("%s: out of memory", __FUNCTION__);
	}

	/* Return */
	return ndrange;
}
Beispiel #19
0
/*
 * Create the threadblocks, warps and threads of a grid and link them
 * together.
 *
 * All sizes come from grid->function: 'group_count' threadblocks,
 * 'local_size' threads per threadblock (rounded up to whole warps of
 * 'frm_emu_warp_size' threads), and 'global_size' threads in total. Threads
 * are numbered with x as the fastest-varying dimension, first inside a
 * threadblock and then across threadblocks. Every first/last identifier
 * stored here is inclusive.
 *
 * Allocation failures end in fatal(). A table describing every thread is
 * printed to standard output at the end.
 */
void frm_grid_setup_threads(struct frm_grid_t *grid)
{
	struct frm_cuda_function_t *function = grid->function;

	struct frm_threadblock_t *threadblock;
	struct frm_warp_t *warp;
	struct frm_thread_t *thread;

	int bidx, bidy, bidz;  /* 3D threadblock ID iterators */
	int lidx, lidy, lidz;  /* 3D thread local ID iterators */

	int tid;  /* Global ID iterator */
	int bid;  /* Threadblock ID iterator */
	int wid;  /* Warp ID iterator */
	int lid;  /* Local ID iterator */

	/* Array of threadblocks */
	grid->threadblock_count = function->group_count;
	grid->threadblock_id_first = 0;
	grid->threadblock_id_last = grid->threadblock_count - 1;
	grid->threadblocks = calloc(grid->threadblock_count, sizeof(void *));
	if (!grid->threadblocks && grid->threadblock_count > 0)
		fatal("%s: out of memory", __FUNCTION__);
	for (bid = 0; bid < grid->threadblock_count; bid++)
		grid->threadblocks[bid] = frm_threadblock_create();

	/* Array of warps. The number of warps per threadblock is the local
	 * size divided by the warp size, rounded up. */
	grid->warps_per_threadblock = (function->local_size + frm_emu_warp_size - 1) / frm_emu_warp_size;
	grid->warp_count = grid->warps_per_threadblock * grid->threadblock_count;
	grid->warp_id_first = 0;
	grid->warp_id_last = grid->warp_count - 1;
	assert(grid->warps_per_threadblock > 0 && grid->warp_count > 0);
	grid->warps = calloc(grid->warp_count, sizeof(void *));
	if (!grid->warps && grid->warp_count > 0)
		fatal("%s: out of memory", __FUNCTION__);
	for (wid = 0; wid < grid->warp_count; wid++)
	{
		/* Consecutive warps belong to the same threadblock */
		bid = wid / grid->warps_per_threadblock;
		grid->warps[wid] = frm_warp_create();
		warp = grid->warps[wid];
		threadblock = grid->threadblocks[bid];

		warp->id = wid;
		warp->id_in_threadblock = wid % grid->warps_per_threadblock;
		warp->grid = grid;
		warp->threadblock = threadblock;
		DOUBLE_LINKED_LIST_INSERT_TAIL(threadblock, running, warp);
	}

	/* Array of threads */
	grid->thread_count = function->global_size;
	grid->thread_id_first = 0;
	grid->thread_id_last = grid->thread_count - 1;
	grid->threads = calloc(grid->thread_count, sizeof(void *));
	if (!grid->threads && grid->thread_count > 0)
		fatal("%s: out of memory", __FUNCTION__);
	tid = 0;
	bid = 0;
	for (bidz = 0; bidz < function->group_count3[2]; bidz++)
	{
		for (bidy = 0; bidy < function->group_count3[1]; bidy++)
		{
			for (bidx = 0; bidx < function->group_count3[0]; bidx++)
			{
				/* Assign threadblock ID */
				threadblock = grid->threadblocks[bid];
				threadblock->grid = grid;
				threadblock->id_3d[0] = bidx;
				threadblock->id_3d[1] = bidy;
				threadblock->id_3d[2] = bidz;
				threadblock->id = bid;
				frm_threadblock_set_status(threadblock, frm_threadblock_pending);

				/* First, last, and number of threads in threadblock.
				 * 'threads' points into the grid-wide array. */
				threadblock->thread_id_first = tid;
				threadblock->thread_id_last = tid + function->local_size - 1;
				threadblock->thread_count = function->local_size;
				threadblock->threads = &grid->threads[tid];
				snprintf(threadblock->name, sizeof(threadblock->name), "threadblock[i%d-i%d]",
					threadblock->thread_id_first, threadblock->thread_id_last);

				/* First, last, and number of warps in threadblock.
				 * 'warps' points into the grid-wide array. */
				threadblock->warp_id_first = bid * grid->warps_per_threadblock;
				threadblock->warp_id_last = threadblock->warp_id_first + grid->warps_per_threadblock - 1;
				threadblock->warp_count = grid->warps_per_threadblock;
				threadblock->warps = &grid->warps[threadblock->warp_id_first];

				/* Iterate through threads */
				lid = 0;
				for (lidz = 0; lidz < function->local_size3[2]; lidz++)
				{
					for (lidy = 0; lidy < function->local_size3[1]; lidy++)
					{
						for (lidx = 0; lidx < function->local_size3[0]; lidx++)
						{
							/* Warp ID */
							wid = bid * grid->warps_per_threadblock +
								lid / frm_emu_warp_size;
							assert(wid < grid->warp_count);
							warp = grid->warps[wid];

							/* Create thread */
							grid->threads[tid] = frm_thread_create();
							thread = grid->threads[tid];
							thread->grid = grid;

							/* Global IDs */
							thread->id_3d[0] = bidx * function->local_size3[0] + lidx;
							thread->id_3d[1] = bidy * function->local_size3[1] + lidy;
							thread->id_3d[2] = bidz * function->local_size3[2] + lidz;
							thread->id = tid;

							/* Local IDs */
							thread->id_in_threadblock_3d[0] = lidx;
							thread->id_in_threadblock_3d[1] = lidy;
							thread->id_in_threadblock_3d[2] = lidz;
							thread->id_in_threadblock = lid;

							/* Other */
							thread->id_in_warp = thread->id_in_threadblock % frm_emu_warp_size;
							thread->threadblock = grid->threadblocks[bid];
							thread->warp = grid->warps[wid];

							/* First, last, and number of threads in warp.
							 * The first thread seen for a warp fixes
							 * where its slice of the array begins. */
							if (!warp->thread_count) {
								warp->thread_id_first = tid;
								warp->threads = &grid->threads[tid];
							}
							warp->thread_count++;
							warp->thread_id_last = tid;
							bit_map_set(warp->active_stack, thread->id_in_warp, 1, 1);

							/* Save local IDs in the thread ID special registers */
							thread->sr[FRM_SR_Tid_X].v.i = lidx;
							thread->sr[FRM_SR_Tid_Y].v.i = lidy;
							thread->sr[FRM_SR_Tid_Z].v.i = lidz;

							/* Save threadblock IDs in the CTA ID special registers */
							thread->sr[FRM_SR_CTAid_X].v.i = bidx;
							thread->sr[FRM_SR_CTAid_Y].v.i = bidy;
							thread->sr[FRM_SR_CTAid_Z].v.i = bidz;

							/* Next thread */
							tid++;
							lid++;
						}
					}
				}

				/* Next threadblock */
				bid++;
			}
		}
	}

	/* Assign names to warps */
	for (wid = 0; wid < grid->warp_count; wid++)
	{
		warp = grid->warps[wid];
		snprintf(warp->name, sizeof(warp->name), "warp[i%d-i%d]",
			warp->thread_id_first, warp->thread_id_last);

		/* Point the warp at the start of the function buffer */
		warp->buf_start = function->function_buffer.ptr;
		warp->buf = warp->buf_start;
		warp->buf_size = function->function_buffer.size;
	}

	/* Debug */
	printf("local_size = %d (%d,%d,%d)\n", function->local_size, function->local_size3[0],
		function->local_size3[1], function->local_size3[2]);
	printf("global_size = %d (%d,%d,%d)\n", function->global_size, function->global_size3[0],
		function->global_size3[1], function->global_size3[2]);
	printf("group_count = %d (%d,%d,%d)\n", function->group_count, function->group_count3[0],
		function->group_count3[1], function->group_count3[2]);
	printf("warp_count = %d\n", grid->warp_count);
	printf("warps_per_threadblock = %d\n", grid->warps_per_threadblock);
	printf(" tid tid2 tid1 tid0   bid bid2 bid1 bid0   lid lid2 lid1 lid0  warp            work-group\n");
	for (tid = 0; tid < grid->thread_count; tid++)
	{
		thread = grid->threads[tid];
		warp = thread->warp;
		threadblock = thread->threadblock;
		printf("%4d %4d %4d %4d  ", thread->id, thread->id_3d[2],
			thread->id_3d[1], thread->id_3d[0]);
		printf("%4d %4d %4d %4d  ", threadblock->id, threadblock->id_3d[2],
			threadblock->id_3d[1], threadblock->id_3d[0]);
		printf("%4d %4d %4d %4d  ", thread->id_in_threadblock, thread->id_in_threadblock_3d[2],
			thread->id_in_threadblock_3d[1], thread->id_in_threadblock_3d[0]);
		printf("%20s.%-4d  ", warp->name, thread->id_in_warp);
		printf("%20s.%-4d\n", threadblock->name, thread->id_in_threadblock);
	}

}
Beispiel #20
0
static void si_ndrange_setup_arrays(struct si_ndrange_t *ndrange)
{
	struct si_work_group_t *work_group;
	struct si_wavefront_t *wavefront;
	struct si_work_item_t *work_item;

	int gidx, gidy, gidz;  /* 3D work-group ID iterators */
	int lidx, lidy, lidz;  /* 3D work-item local ID iterators */

	int tid;  /* Global ID iterator */
	int gid;  /* Group ID iterator */
	int wid;  /* Wavefront ID iterator */
	int lid;  /* Local ID iterator */

	/* Array of work-groups */
	ndrange->work_group_count = ndrange->group_count;
	ndrange->work_group_id_first = 0;
	ndrange->work_group_id_last = ndrange->work_group_count - 1;
	ndrange->work_groups = xcalloc(ndrange->work_group_count, sizeof(void *));
	for (gid = 0; gid < ndrange->group_count; gid++)
	{
		ndrange->work_groups[gid] = si_work_group_create();
		work_group = ndrange->work_groups[gid];
	}

	/* Array of wavefronts */
	ndrange->wavefronts_per_work_group = 
		(ndrange->local_size + si_emu_wavefront_size - 1) /
		si_emu_wavefront_size;
	ndrange->wavefront_count = ndrange->wavefronts_per_work_group * 
		ndrange->work_group_count;
	ndrange->wavefront_id_first = 0;
	ndrange->wavefront_id_last = ndrange->wavefront_count - 1;
	assert(ndrange->wavefronts_per_work_group > 0 && 
		ndrange->wavefront_count > 0);
	ndrange->wavefronts = xcalloc(ndrange->wavefront_count, sizeof(void *));
	ndrange->scalar_work_items = xcalloc(ndrange->wavefront_count, 
		sizeof(void *));

	for (wid = 0; wid < ndrange->wavefront_count; wid++)
	{
		gid = wid / ndrange->wavefronts_per_work_group;
		ndrange->wavefronts[wid] = si_wavefront_create();
		wavefront = ndrange->wavefronts[wid];
		work_group = ndrange->work_groups[gid];

		wavefront->id = wid;
		wavefront->id_in_work_group = wid % 
			ndrange->wavefronts_per_work_group;
		wavefront->ndrange = ndrange;
		wavefront->work_group = work_group;
		DOUBLE_LINKED_LIST_INSERT_TAIL(work_group, running, wavefront);

		/* Initialize the scalar work item */
		ndrange->scalar_work_items[wid] = si_work_item_create();
		wavefront->scalar_work_item = ndrange->scalar_work_items[wid];
		ndrange->scalar_work_items[wid]->wavefront = wavefront;
		ndrange->scalar_work_items[wid]->work_group = work_group;
		ndrange->scalar_work_items[wid]->ndrange = ndrange;
	}

	/* Array of work-items */
	ndrange->work_item_count = ndrange->global_size;
	ndrange->work_item_id_first = 0;
	ndrange->work_item_id_last = ndrange->work_item_count - 1;
	ndrange->work_items = xcalloc(ndrange->work_item_count, sizeof(void *));
	tid = 0;
	gid = 0;
	for (gidz = 0; gidz < ndrange->group_count3[2]; gidz++)
	{
		for (gidy = 0; gidy < ndrange->group_count3[1]; gidy++)
		{
			for (gidx = 0; gidx < ndrange->group_count3[0]; gidx++)
			{
				/* Assign work-group ID */
				work_group = ndrange->work_groups[gid];
				work_group->ndrange = ndrange;
				work_group->id_3d[0] = gidx;
				work_group->id_3d[1] = gidy;
				work_group->id_3d[2] = gidz;
				work_group->id = gid;
				si_work_group_set_status(work_group, si_work_group_pending);

				/* First, last, and number of work-items in work-group */
				work_group->work_item_id_first = tid;
				work_group->work_item_id_last = tid + ndrange->local_size;
				work_group->work_item_count = ndrange->local_size;
				work_group->work_items = &ndrange->work_items[tid];
				snprintf(work_group->name, sizeof(work_group->name), "work-group[i%d-i%d]",
					work_group->work_item_id_first, work_group->work_item_id_last);

				/* First ,last, and number of wavefronts in work-group */
				work_group->wavefront_id_first = gid * ndrange->wavefronts_per_work_group;
				work_group->wavefront_id_last = work_group->wavefront_id_first + ndrange->wavefronts_per_work_group - 1;
				work_group->wavefront_count = ndrange->wavefronts_per_work_group;
				work_group->wavefronts = &ndrange->wavefronts[work_group->wavefront_id_first];
				/* Iterate through work-items */
				lid = 0;
				for (lidz = 0; lidz < ndrange->local_size3[2]; lidz++)
				{
					for (lidy = 0; lidy < ndrange->local_size3[1]; lidy++)
					{
						for (lidx = 0; lidx < ndrange->local_size3[0]; lidx++)
						{
							/* Wavefront ID */
							wid = gid * ndrange->wavefronts_per_work_group +
								lid / si_emu_wavefront_size;
							assert(wid < ndrange->wavefront_count);
							wavefront = ndrange->wavefronts[wid];
							
							/* Create work-item */
							ndrange->work_items[tid] = si_work_item_create();
							work_item = ndrange->work_items[tid];
							work_item->ndrange = ndrange;

							/* Global IDs */
							work_item->id_3d[0] = gidx * ndrange->local_size3[0] + lidx;
							work_item->id_3d[1] = gidy * ndrange->local_size3[1] + lidy;
							work_item->id_3d[2] = gidz * ndrange->local_size3[2] + lidz;
							work_item->id = tid;

							/* Local IDs */
							work_item->id_in_work_group_3d[0] = lidx;
							work_item->id_in_work_group_3d[1] = lidy;
							work_item->id_in_work_group_3d[2] = lidz;
							work_item->id_in_work_group = lid;

							/* Other */
							work_item->id_in_wavefront = work_item->id_in_work_group % si_emu_wavefront_size;
							work_item->work_group = ndrange->work_groups[gid];
							work_item->wavefront = ndrange->wavefronts[wid];

							/*MIAOW start*/
							work_item->id = work_item->id_in_wavefront;
							/*MIAOW stop*/

							/* First, last, and number of work-items in wavefront */
							if (!wavefront->work_item_count)
							{
								wavefront->work_item_id_first = tid;
								wavefront->work_items = &ndrange->work_items[tid];
							}
							wavefront->work_item_count++;
							wavefront->work_item_id_last = tid;

							/* Next work-item */
							tid++;
							lid++;
						}
					}
				}

				/* Next work-group */
				gid++;
			}
		}
	}

	/* Initialize the wavefronts */
	for (wid = 0; wid < ndrange->wavefront_count; wid++)
	{
		/* Assign names to wavefronts */
		wavefront = ndrange->wavefronts[wid];
		snprintf(wavefront->name, sizeof(wavefront->name),
			"wavefront[i%d-i%d]",
			wavefront->work_item_id_first,
			wavefront->work_item_id_last);
	}

	/* Debug */
	si_isa_debug("local_size = %d (%d,%d,%d)\n", ndrange->local_size,
		ndrange->local_size3[0], ndrange->local_size3[1],
		ndrange->local_size3[2]);
	si_isa_debug("global_size = %d (%d,%d,%d)\n", ndrange->global_size,
		ndrange->global_size3[0], ndrange->global_size3[1],
		ndrange->global_size3[2]);
	si_isa_debug("group_count = %d (%d,%d,%d)\n", ndrange->group_count,
		ndrange->group_count3[0], ndrange->group_count3[1],
		ndrange->group_count3[2]);
	si_isa_debug("wavefront_count = %d\n", ndrange->wavefront_count);
	si_isa_debug("wavefronts_per_work_group = %d\n",
		ndrange->wavefronts_per_work_group);
	si_isa_debug("\n");
}
Beispiel #21
0
void si_ndrange_setup_work_items(struct si_ndrange_t *ndrange)
{
	struct si_opencl_kernel_t *kernel = ndrange->kernel;

	struct si_work_group_t *work_group;
	struct si_wavefront_t *wavefront;
	struct si_work_item_t *work_item;

	int gidx, gidy, gidz;  /* 3D work-group ID iterators */
	int lidx, lidy, lidz;  /* 3D work-item local ID iterators */

	int tid;  /* Global ID iterator */
	int gid;  /* Group ID iterator */
	int wid;  /* Wavefront ID iterator */
	int lid;  /* Local ID iterator */

	/* Array of work-groups */
	ndrange->work_group_count = kernel->group_count;
	ndrange->work_group_id_first = 0;
	ndrange->work_group_id_last = ndrange->work_group_count - 1;
	ndrange->work_groups = calloc(ndrange->work_group_count, sizeof(void *));
	for (gid = 0; gid < kernel->group_count; gid++)
	{
		ndrange->work_groups[gid] = si_work_group_create();
		work_group = ndrange->work_groups[gid];
	}

	/* Array of wavefronts */
	ndrange->wavefronts_per_work_group = (kernel->local_size + si_emu_wavefront_size - 1) / si_emu_wavefront_size;
	ndrange->wavefront_count = ndrange->wavefronts_per_work_group * ndrange->work_group_count;
	ndrange->wavefront_id_first = 0;
	ndrange->wavefront_id_last = ndrange->wavefront_count - 1;
	assert(ndrange->wavefronts_per_work_group > 0 && ndrange->wavefront_count > 0);
	ndrange->wavefronts = calloc(ndrange->wavefront_count, sizeof(void *));
	ndrange->scalar_work_items = calloc(ndrange->wavefront_count, sizeof(void *));
	for (wid = 0; wid < ndrange->wavefront_count; wid++)
	{
		gid = wid / ndrange->wavefronts_per_work_group;
		ndrange->wavefronts[wid] = si_wavefront_create();
		wavefront = ndrange->wavefronts[wid];
		work_group = ndrange->work_groups[gid];

		wavefront->id = wid;
		wavefront->id_in_work_group = wid % ndrange->wavefronts_per_work_group;
		wavefront->ndrange = ndrange;
		wavefront->work_group = work_group;
		DOUBLE_LINKED_LIST_INSERT_TAIL(work_group, running, wavefront);

		/* Initialize the scalar work item */
		ndrange->scalar_work_items[wid] = si_work_item_create();
		wavefront->scalar_work_item = ndrange->scalar_work_items[wid];
		ndrange->scalar_work_items[wid]->wavefront = wavefront;
		ndrange->scalar_work_items[wid]->work_group = work_group;
		ndrange->scalar_work_items[wid]->ndrange = ndrange;
	}

	/* Array of work-items */
	ndrange->work_item_count = kernel->global_size;
	ndrange->work_item_id_first = 0;
	ndrange->work_item_id_last = ndrange->work_item_count - 1;
	ndrange->work_items = calloc(ndrange->work_item_count, sizeof(void *));
	tid = 0;
	gid = 0;
	for (gidz = 0; gidz < kernel->group_count3[2]; gidz++)
	{
		for (gidy = 0; gidy < kernel->group_count3[1]; gidy++)
		{
			for (gidx = 0; gidx < kernel->group_count3[0]; gidx++)
			{
				/* Assign work-group ID */
				work_group = ndrange->work_groups[gid];
				work_group->ndrange = ndrange;
				work_group->id_3d[0] = gidx;
				work_group->id_3d[1] = gidy;
				work_group->id_3d[2] = gidz;
				work_group->id = gid;
				si_work_group_set_status(work_group, si_work_group_pending);

				/* First, last, and number of work-items in work-group */
				work_group->work_item_id_first = tid;
				work_group->work_item_id_last = tid + kernel->local_size;
				work_group->work_item_count = kernel->local_size;
				work_group->work_items = &ndrange->work_items[tid];
				snprintf(work_group->name, sizeof(work_group->name), "work-group[i%d-i%d]",
					work_group->work_item_id_first, work_group->work_item_id_last);

				/* First ,last, and number of wavefronts in work-group */
				work_group->wavefront_id_first = gid * ndrange->wavefronts_per_work_group;
				work_group->wavefront_id_last = work_group->wavefront_id_first + ndrange->wavefronts_per_work_group - 1;
				work_group->wavefront_count = ndrange->wavefronts_per_work_group;
				work_group->wavefronts = &ndrange->wavefronts[work_group->wavefront_id_first];
				/* Iterate through work-items */
				lid = 0;
				for (lidz = 0; lidz < kernel->local_size3[2]; lidz++)
				{
					for (lidy = 0; lidy < kernel->local_size3[1]; lidy++)
					{
						for (lidx = 0; lidx < kernel->local_size3[0]; lidx++)
						{
							/* Wavefront ID */
							wid = gid * ndrange->wavefronts_per_work_group +
								lid / si_emu_wavefront_size;
							assert(wid < ndrange->wavefront_count);
							wavefront = ndrange->wavefronts[wid];
							
							/* Create work-item */
							ndrange->work_items[tid] = si_work_item_create();
							work_item = ndrange->work_items[tid];
							work_item->ndrange = ndrange;

							/* Global IDs */
							work_item->id_3d[0] = gidx * kernel->local_size3[0] + lidx;
							work_item->id_3d[1] = gidy * kernel->local_size3[1] + lidy;
							work_item->id_3d[2] = gidz * kernel->local_size3[2] + lidz;
							work_item->id = tid;

							/* Local IDs */
							work_item->id_in_work_group_3d[0] = lidx;
							work_item->id_in_work_group_3d[1] = lidy;
							work_item->id_in_work_group_3d[2] = lidz;
							work_item->id_in_work_group = lid;

							/* Other */
							work_item->id_in_wavefront = work_item->id_in_work_group % si_emu_wavefront_size;
							work_item->work_group = ndrange->work_groups[gid];
							work_item->wavefront = ndrange->wavefronts[wid];

							/* First, last, and number of work-items in wavefront */
							if (!wavefront->work_item_count) {
								wavefront->work_item_id_first = tid;
								wavefront->work_items = &ndrange->work_items[tid];
							}
							wavefront->work_item_count++;
							wavefront->work_item_id_last = tid;

							/* Save local IDs in registers */
							work_item->vreg[0].as_int = lidx;  /* V0 */
							work_item->vreg[1].as_int = lidy;  /* V1 */
							work_item->vreg[2].as_int = lidz;  /* V2 */

							/* Next work-item */
							tid++;
							lid++;
						}
					}
				}

				/* Next work-group */
				gid++;
			}
		}
	}

	/* Initialize the wavefronts */
	for (wid = 0; wid < ndrange->wavefront_count; wid++)
	{
		/* Assign names to wavefronts */
		wavefront = ndrange->wavefronts[wid];
		snprintf(wavefront->name, sizeof(wavefront->name), "wavefront[i%d-i%d]",
			wavefront->work_item_id_first, wavefront->work_item_id_last);

		/* Initialize wavefront program counter */
		if (!kernel->bin_file->enc_dict_entry_southern_islands->sec_text_buffer.size)
			fatal("%s: cannot load kernel code", __FUNCTION__);
		wavefront->inst_buf_start = kernel->bin_file->enc_dict_entry_southern_islands->sec_text_buffer.ptr;
		wavefront->inst_buf = kernel->bin_file->enc_dict_entry_southern_islands->sec_text_buffer.ptr;

		/* Save work-group IDs in registers */
		unsigned int user_sgpr = kernel->bin_file->enc_dict_entry_southern_islands->compute_pgm_rsrc2->user_sgpr;
		wavefront->sreg[user_sgpr].as_int = wavefront->work_group->id_3d[0];
		wavefront->sreg[user_sgpr + 1].as_int = wavefront->work_group->id_3d[1];
		wavefront->sreg[user_sgpr + 2].as_int = wavefront->work_group->id_3d[2];

		/* Initialize Constant Buffers */
		unsigned int userElementCount = kernel->bin_file->enc_dict_entry_southern_islands->userElementCount;
		struct si_bin_enc_user_element_t* userElements = kernel->bin_file->enc_dict_entry_southern_islands->userElements;
		for (int i = 0; i < userElementCount; i++)
		{
			if (userElements[i].dataClass == IMM_CONST_BUFFER)
			{
				si_wavefront_init_sreg_with_cb(wavefront, userElements[i].startUserReg, userElements[i].userRegCount, userElements[i].apiSlot);
			}
			else if (userElements[i].dataClass == IMM_UAV)
			{
				si_wavefront_init_sreg_with_cb(wavefront, userElements[i].startUserReg, userElements[i].userRegCount, userElements[i].apiSlot);
			}
			else if (userElements[i].dataClass == PTR_CONST_BUFFER_TABLE)
			{
				si_wavefront_init_sreg_with_uav_table(wavefront, userElements[i].startUserReg, userElements[i].userRegCount);
			}
			else if (userElements[i].dataClass == PTR_UAV_TABLE)
			{
				si_wavefront_init_sreg_with_uav_table(wavefront, userElements[i].startUserReg, userElements[i].userRegCount);
			}
			else
			{
				fatal("Unimplemented User Element: dataClass:%d", userElements[i].dataClass);
			}
		}

		/* Initialize the execution mask */
		wavefront->sreg[SI_EXEC].as_int = 0xFFFFFFFF;
		wavefront->sreg[SI_EXEC + 1].as_int = 0xFFFFFFFF;
		wavefront->sreg[SI_EXECZ].as_int = 0;
	}

	/* Debug */
	si_isa_debug("local_size = %d (%d,%d,%d)\n", kernel->local_size, kernel->local_size3[0],
		kernel->local_size3[1], kernel->local_size3[2]);
	si_isa_debug("global_size = %d (%d,%d,%d)\n", kernel->global_size, kernel->global_size3[0],
		kernel->global_size3[1], kernel->global_size3[2]);
	si_isa_debug("group_count = %d (%d,%d,%d)\n", kernel->group_count, kernel->group_count3[0],
		kernel->group_count3[1], kernel->group_count3[2]);
	si_isa_debug("wavefront_count = %d\n", ndrange->wavefront_count);
	si_isa_debug("wavefronts_per_work_group = %d\n", ndrange->wavefronts_per_work_group);
	si_isa_debug(" tid tid2 tid1 tid0   gid gid2 gid1 gid0   lid lid2 lid1 lid0  wavefront            work-group\n");
	for (tid = 0; tid < ndrange->work_item_count; tid++)
	{
		work_item = ndrange->work_items[tid];
		wavefront = work_item->wavefront;
		work_group = work_item->work_group;
		si_isa_debug("%4d %4d %4d %4d  ", work_item->id, work_item->id_3d[2],
			work_item->id_3d[1], work_item->id_3d[0]);
		si_isa_debug("%4d %4d %4d %4d  ", work_group->id, work_group->id_3d[2],
			work_group->id_3d[1], work_group->id_3d[0]);
		si_isa_debug("%4d %4d %4d %4d  ", work_item->id_in_work_group, 
			work_item->id_in_work_group_3d[2], work_item->id_in_work_group_3d[1], 
			work_item->id_in_work_group_3d[0]);
		si_isa_debug("%20s.%-4d  ", wavefront->name, work_item->id_in_wavefront);
		si_isa_debug("%20s.%-4d\n", work_group->name, work_item->id_in_work_group);
	}

}
Beispiel #22
0
/*
 * Return the next ';'-separated field of a unit-test configuration line.
 * 'line' follows strtok() conventions: the buffer on the first call, NULL to
 * continue. Calls fatal() naming the missing field instead of handing a NULL
 * token to atoi()/strcpy().
 */
static char *si_miaow_next_field(char *line, const char *field_name)
{
	char *field = strtok(line, ";");

	if (field == NULL)
		fatal("unit_test_config.txt: missing field '%s'", field_name);
	return field;
}

/*
 * Build the work-group / wavefront / work-item hierarchy of an ND-Range
 * (MIAOW variant).
 *
 * On top of populating ndrange->work_groups, ndrange->wavefronts,
 * ndrange->scalar_work_items and ndrange->work_items from the geometry in
 * ndrange->kernel, this variant:
 *   - writes "config_<kernel_config_count>.txt": a "groups;threads;" header
 *     followed by one line per wavefront with its IDs, register/LDS sizes and
 *     the initial VGPR/SGPR values;
 *   - writes "instr_<kernel_config_count>.mem": the kernel text section, one
 *     hex byte per line;
 *   - if "unit_test_config.txt" exists, overrides the kernel geometry from
 *     its first line and the per-wavefront register contents from the
 *     following lines (same layout as the generated config file);
 *   - if "unit_test_instr.mem" exists, overwrites the kernel text section
 *     with its contents.
 *
 * Every "*_id_first" / "*_id_last" pair written here is an inclusive range.
 *
 * Calls fatal() when an output file cannot be created, when the unit-test
 * configuration is malformed, when the kernel has no code, or when a user
 * element has an unsupported data class.
 */
void si_ndrange_setup_work_items(struct si_ndrange_t *ndrange)
{
	struct si_opencl_kernel_t *kernel = ndrange->kernel;

	struct si_work_group_t *work_group;
	struct si_wavefront_t *wavefront;
	struct si_work_item_t *work_item;

	int gidx, gidy, gidz;  /* 3D work-group ID iterators */
	int lidx, lidy, lidz;  /* 3D work-item local ID iterators */

	int tid;  /* Global ID iterator */
	int gid;  /* Group ID iterator */
	int wid;  /* Wavefront ID iterator */
	int lid;  /* Local ID iterator */

	/*MIAOW start */
	char config_str[100];
	snprintf(config_str, sizeof config_str, "config_%d.txt", kernel_config_count);
	FILE *config = fopen(config_str, "w");
	if (config == NULL)
		fatal("%s: cannot open %s for writing", __FUNCTION__, config_str);
	/*MIAOW stop */

	/*MIAOW start*/
	/* Unit test: optional override of geometry and register contents */
	char unit_test_input_buf[150000];
	char *tok = NULL;
	char *config_read_result = NULL;
	char vreg_str[64][2500];  /* One "V:idx=val,..." string per work-item */
	char sreg_str[2500];      /* "S:idx=val,..." string of the wavefront */
	const int max_unit_test_threads = (int) (sizeof vreg_str / sizeof vreg_str[0]);

	FILE *unit_test_config = fopen("unit_test_config.txt", "r");
	if (unit_test_config != NULL)
	{
		/* Unit tests are always one-dimensional */
		kernel->local_size3[2] = 1;
		kernel->local_size3[1] = 1;
		kernel->global_size3[2] = 1;
		kernel->global_size3[1] = 1;

		config_read_result = fgets(unit_test_input_buf, sizeof unit_test_input_buf, unit_test_config);
		if (config_read_result != NULL)
		{
			int num_of_threads;

			tok = si_miaow_next_field(unit_test_input_buf, "work-group count");
			kernel->group_count = atoi(tok);

			tok = si_miaow_next_field(NULL, "thread count");
			num_of_threads = atoi(tok);

			/* The same thread count is used as global and local size.
			 * NOTE(review): group_count3[] is not updated here although
			 * the work-group loops below iterate over it - confirm that
			 * callers keep it consistent with group_count. */
			kernel->global_size = num_of_threads;
			kernel->global_size3[0] = num_of_threads;
			kernel->local_size3[0] = num_of_threads;
			kernel->local_size = num_of_threads;
		}
	}

	/* Work-group count and thread count */
	fprintf(config,"%d;%d;\n", kernel->group_count, kernel->global_size);
#ifdef MIAOW_DEBUG
	fflush(config);
#endif

	/*MIAOW stop*/

	/* Array of work-groups */
	ndrange->work_group_count = kernel->group_count;
	ndrange->work_group_id_first = 0;
	ndrange->work_group_id_last = ndrange->work_group_count - 1;
	ndrange->work_groups = xcalloc(ndrange->work_group_count, sizeof(void *));
	for (gid = 0; gid < kernel->group_count; gid++)
		ndrange->work_groups[gid] = si_work_group_create();

	/* Array of wavefronts. The last wavefront of a work-group may be
	 * partially filled, hence the round-up division. */
	ndrange->wavefronts_per_work_group = (kernel->local_size + si_emu_wavefront_size - 1) / si_emu_wavefront_size;
	ndrange->wavefront_count = ndrange->wavefronts_per_work_group * ndrange->work_group_count;
	ndrange->wavefront_id_first = 0;
	ndrange->wavefront_id_last = ndrange->wavefront_count - 1;
	assert(ndrange->wavefronts_per_work_group > 0 && ndrange->wavefront_count > 0);
	ndrange->wavefronts = xcalloc(ndrange->wavefront_count, sizeof(void *));
	ndrange->scalar_work_items = xcalloc(ndrange->wavefront_count, sizeof(void *));
	for (wid = 0; wid < ndrange->wavefront_count; wid++)
	{
		gid = wid / ndrange->wavefronts_per_work_group;
		ndrange->wavefronts[wid] = si_wavefront_create();
		wavefront = ndrange->wavefronts[wid];
		work_group = ndrange->work_groups[gid];

		wavefront->id = wid;
		wavefront->id_in_work_group = wid % ndrange->wavefronts_per_work_group;
		wavefront->ndrange = ndrange;
		wavefront->work_group = work_group;
		DOUBLE_LINKED_LIST_INSERT_TAIL(work_group, running, wavefront);

		/* Initialize the scalar work item */
		ndrange->scalar_work_items[wid] = si_work_item_create();
		wavefront->scalar_work_item = ndrange->scalar_work_items[wid];
		ndrange->scalar_work_items[wid]->wavefront = wavefront;
		ndrange->scalar_work_items[wid]->work_group = work_group;
		ndrange->scalar_work_items[wid]->ndrange = ndrange;
	}

#ifdef MIAOW_DEBUG
	fprintf(config, "Processing Workitems\n");
	fflush(config);
#endif
	/* Array of work-items */
	ndrange->work_item_count = kernel->global_size;
	ndrange->work_item_id_first = 0;
	ndrange->work_item_id_last = ndrange->work_item_count - 1;
	ndrange->work_items = xcalloc(ndrange->work_item_count, sizeof(void *));
	tid = 0;
	gid = 0;
	for (gidz = 0; gidz < kernel->group_count3[2]; gidz++)
	{
		for (gidy = 0; gidy < kernel->group_count3[1]; gidy++)
		{
			for (gidx = 0; gidx < kernel->group_count3[0]; gidx++)
			{
				/* Assign work-group ID */
				work_group = ndrange->work_groups[gid];
				work_group->ndrange = ndrange;
				work_group->id_3d[0] = gidx;
				work_group->id_3d[1] = gidy;
				work_group->id_3d[2] = gidz;
				work_group->id = gid;
				si_work_group_set_status(work_group, si_work_group_pending);

				/* First, last, and number of work-items in work-group.
				 * 'last' is inclusive, like every other *_id_last
				 * field set in this function. */
				work_group->work_item_id_first = tid;
				work_group->work_item_id_last = tid + kernel->local_size - 1;
				work_group->work_item_count = kernel->local_size;
				work_group->work_items = &ndrange->work_items[tid];
				snprintf(work_group->name, sizeof(work_group->name), "work-group[i%d-i%d]",
					work_group->work_item_id_first, work_group->work_item_id_last);

				/* First, last, and number of wavefronts in work-group */
				work_group->wavefront_id_first = gid * ndrange->wavefronts_per_work_group;
				work_group->wavefront_id_last = work_group->wavefront_id_first + ndrange->wavefronts_per_work_group - 1;
				work_group->wavefront_count = ndrange->wavefronts_per_work_group;
				work_group->wavefronts = &ndrange->wavefronts[work_group->wavefront_id_first];

				/* Iterate through work-items */
				lid = 0;
				for (lidz = 0; lidz < kernel->local_size3[2]; lidz++)
				{
					for (lidy = 0; lidy < kernel->local_size3[1]; lidy++)
					{
						for (lidx = 0; lidx < kernel->local_size3[0]; lidx++)
						{
							/* Wavefront ID */
							wid = gid * ndrange->wavefronts_per_work_group +
								lid / si_emu_wavefront_size;
							assert(wid < ndrange->wavefront_count);
							wavefront = ndrange->wavefronts[wid];

							/* Create work-item */
							ndrange->work_items[tid] = si_work_item_create();
							work_item = ndrange->work_items[tid];
							work_item->ndrange = ndrange;

							/* Global IDs */
							work_item->id_3d[0] = gidx * kernel->local_size3[0] + lidx;
							work_item->id_3d[1] = gidy * kernel->local_size3[1] + lidy;
							work_item->id_3d[2] = gidz * kernel->local_size3[2] + lidz;
							work_item->id = tid;

							/* Local IDs */
							work_item->id_in_work_group_3d[0] = lidx;
							work_item->id_in_work_group_3d[1] = lidy;
							work_item->id_in_work_group_3d[2] = lidz;
							work_item->id_in_work_group = lid;

							/* Other */
							work_item->id_in_wavefront = work_item->id_in_work_group % si_emu_wavefront_size;
							work_item->work_group = ndrange->work_groups[gid];
							work_item->wavefront = ndrange->wavefronts[wid];

							/*MIAOW start*/
							/* MIAOW identifies a work-item by its lane
							 * within the wavefront, not by global ID. */
							work_item->id = work_item->id_in_wavefront;
							/*MIAOW stop*/

							/* First, last, and number of work-items in wavefront */
							if (!wavefront->work_item_count)
							{
								wavefront->work_item_id_first = tid;
								wavefront->work_items = &ndrange->work_items[tid];
							}
							wavefront->work_item_count++;
							wavefront->work_item_id_last = tid;

							/*MIAOW start*/
							/* Zero all vector registers so that the
							 * generated config file does not change
							 * from run to run. */
							for (int vreg_init_index = 0; vreg_init_index < 256; vreg_init_index++)
							{
								work_item->vreg[vreg_init_index].as_int = 0;
							}
							/*MIAOW stop*/

							/* Save local IDs in registers */
							work_item->vreg[0].as_int = lidx;  /* V0 */
							work_item->vreg[1].as_int = lidy;  /* V1 */
							work_item->vreg[2].as_int = lidz;  /* V2 */

							/* Next work-item */
							tid++;
							lid++;
						}
					}
				}

				/* Next work-group */
				gid++;
			}
		}
	}

	/*MIAOW start */
	/* Unit test trace generation: if unit_test_instr.mem is present, its
	 * contents replace the kernel text section. */
	FILE *unit_test_instr = fopen("unit_test_instr.mem", "r");

	if (unit_test_instr != NULL)
	{
		char instr_buf[200];
		int input_instr_count = 0;

		/* NOTE(review): bytes are written without checking the capacity
		 * of the existing text buffer - confirm that unit-test programs
		 * never exceed the size of the original kernel code. */
		unsigned char *buf_ptr = (unsigned char *) kernel->bin_file->enc_dict_entry_southern_islands->sec_text_buffer.ptr;

		/* The first line holds the load address and is skipped */
		if (fgets(instr_buf, sizeof instr_buf, unit_test_instr) != NULL)
		{
			while (fgets(instr_buf, sizeof instr_buf, unit_test_instr) != NULL)
			{
				/* Only the first byte (two hex digits) of each line is used */
				instr_buf[2] = '\0';
				buf_ptr[input_instr_count++] = (unsigned char) strtol(instr_buf, NULL, 16);
			}
		}

		kernel->bin_file->enc_dict_entry_southern_islands->sec_text_buffer.size = input_instr_count;

		fclose(unit_test_instr);
	}
	/*MIAOW stop */

	/* Initialize the wavefronts */
	for (wid = 0; wid < ndrange->wavefront_count; wid++)
	{
		/* Assign names to wavefronts */
		wavefront = ndrange->wavefronts[wid];
		snprintf(wavefront->name, sizeof(wavefront->name), "wavefront[i%d-i%d]",
			wavefront->work_item_id_first, wavefront->work_item_id_last);

		/* Initialize wavefront program counter */
		if (!kernel->bin_file->enc_dict_entry_southern_islands->sec_text_buffer.size)
			fatal("%s: cannot load kernel code", __FUNCTION__);
		wavefront->wavefront_pool_start = kernel->bin_file->enc_dict_entry_southern_islands->sec_text_buffer.ptr;
		wavefront->wavefront_pool = kernel->bin_file->enc_dict_entry_southern_islands->sec_text_buffer.ptr;

		/* Scalar registers are deliberately not zeroed here (the original
		 * zeroing statement was disabled). */

		/* Save work-group IDs in registers */
		unsigned int user_sgpr = kernel->bin_file->
			enc_dict_entry_southern_islands->compute_pgm_rsrc2->user_sgpr;
		wavefront->sreg[user_sgpr].as_int = wavefront->work_group->id_3d[0];
		wavefront->sreg[user_sgpr + 1].as_int = wavefront->work_group->id_3d[1];
		wavefront->sreg[user_sgpr + 2].as_int = wavefront->work_group->id_3d[2];

		/* Initialize Constant Buffers */
		unsigned int userElementCount = kernel->bin_file->enc_dict_entry_southern_islands->userElementCount;
		struct si_bin_enc_user_element_t* userElements = kernel->bin_file->enc_dict_entry_southern_islands->userElements;
		for (unsigned int i = 0; i < userElementCount; i++)
		{
			if (userElements[i].dataClass == IMM_CONST_BUFFER ||
				userElements[i].dataClass == IMM_UAV)
			{
				/* Both immediate classes are loaded the same way */
				si_wavefront_init_sreg_with_cb(wavefront, userElements[i].startUserReg, userElements[i].userRegCount, userElements[i].apiSlot);
			}
			else if (userElements[i].dataClass == PTR_CONST_BUFFER_TABLE ||
				userElements[i].dataClass == PTR_UAV_TABLE)
			{
				/* Both table classes are loaded the same way */
				si_wavefront_init_sreg_with_uav_table(wavefront, userElements[i].startUserReg, userElements[i].userRegCount);
			}
			else
			{
				fatal("Unimplemented User Element: dataClass:%d", userElements[i].dataClass);
			}
		}

		/*MIAOW start*/
		/* Execution mask: one bit per work-item actually present in the
		 * wavefront, so a partially filled wavefront only enables its
		 * populated lanes. Built with integer shifts; a shift by 64
		 * would be undefined, hence the separate full-mask case. */
		unsigned long long mask;
		if (wavefront->work_item_count >= 64)
		{
			mask = 0xFFFFFFFFFFFFFFFFULL;
		}
		else
		{
			mask = (1ULL << wavefront->work_item_count) - 1;
		}

		wavefront->sreg[SI_EXEC].as_uint = (unsigned int) mask;
		wavefront->sreg[SI_EXEC + 1].as_uint = (unsigned int) (mask >> 32);
		wavefront->sreg[SI_EXECZ].as_int = 0;
		/*MIAOW stop*/


		/*MIAOW start*/
		if (config_read_result != NULL)
		{
			int num_of_threads = 0;
			int have_wavefront_line = 0;

			if (fgets(unit_test_input_buf, sizeof unit_test_input_buf, unit_test_config) != NULL)
			{
				int thread_init_count;

				/* Leading fields are present in the file but not used */
				si_miaow_next_field(unit_test_input_buf, "work-group ID");
				si_miaow_next_field(NULL, "wavefront ID");
				si_miaow_next_field(NULL, "wavefront count");

				tok = si_miaow_next_field(NULL, "thread count");
				num_of_threads = atoi(tok);
#ifdef MIAOW_DEBUG
				if (num_of_threads != wavefront->work_item_count)
				{
					fprintf(config, "num_thread MISMATCH %d!=%d\n", num_of_threads, wavefront->work_item_count);
				}
				else
				{
					fprintf(config, "num_thread match %d=%d\n", num_of_threads, wavefront->work_item_count);
				}
				fflush(config);
#endif
				/* vreg_str has one row per thread; reject counts
				 * that would overflow it. */
				if (num_of_threads > max_unit_test_threads)
					fatal("%s: unit_test_config.txt: %d threads in a wavefront (max %d)",
						__FUNCTION__, num_of_threads, max_unit_test_threads);

				tok = si_miaow_next_field(NULL, "VREG size");
				kernel->bin_file->enc_dict_entry_southern_islands->num_vgpr_used = atoi(tok);

				tok = si_miaow_next_field(NULL, "SREG size");
				kernel->bin_file->enc_dict_entry_southern_islands->num_sgpr_used = atoi(tok);

				tok = si_miaow_next_field(NULL, "LDS size");
				kernel->bin_file->enc_dict_entry_southern_islands->lds_size_used = atoi(tok);

				/* Copy out the register strings first: they are
				 * tokenized again below and strtok() keeps a
				 * single global cursor. */
				for (thread_init_count = 0; thread_init_count < num_of_threads; thread_init_count++)
				{
					tok = si_miaow_next_field(NULL, "VREG values");
					if (strlen(tok) >= sizeof vreg_str[thread_init_count])
						fatal("%s: unit_test_config.txt: VREG field too long", __FUNCTION__);
					strcpy(vreg_str[thread_init_count], tok);
					assert(vreg_str[thread_init_count][0] == 'V');
				}

				tok = si_miaow_next_field(NULL, "SREG values");
				if (strlen(tok) >= sizeof sreg_str)
					fatal("%s: unit_test_config.txt: SREG field too long", __FUNCTION__);
				strcpy(sreg_str, tok);
				assert(sreg_str[0] == 'S');

				tok = strtok(NULL, ";"); /* PC: present in the file, not used */

				have_wavefront_line = 1;
			}

#ifdef MIAOW_DEBUG
				fprintf(config, "Initializing VREG \n");
				fflush(config);
#endif
			/* VREG value init. Only done when a line was read for this
			 * wavefront, and only for work-items that have a register
			 * string; otherwise the string buffers hold no valid data. */
			if (have_wavefront_line && wavefront->work_items != NULL)
			{
				int wi_init_count;

				for (wi_init_count = 0;
					wi_init_count < wavefront->work_item_count && wi_init_count < num_of_threads;
					wi_init_count++)
				{
					struct si_work_item_t *wi = wavefront->work_items[wi_init_count];
					char *reg_tok;

					/* Format: "V:<index>=<value>,<index>=<value>,..." */
					strtok(vreg_str[wi_init_count], ":");
					reg_tok = strtok(NULL, "=");
					while (reg_tok != NULL)
					{
						int vreg_index = atoi(reg_tok);

						reg_tok = strtok(NULL, ",");
						if (reg_tok == NULL)
							fatal("%s: unit_test_config.txt: VREG index %d has no value",
								__FUNCTION__, vreg_index);
						wi->vreg[vreg_index].as_int = atoi(reg_tok);
						reg_tok = strtok(NULL, "=");
					}
				}
			}

#ifdef MIAOW_DEBUG
				fprintf(config, "Initializing SREG \n");
				fflush(config);
#endif

#ifdef MIAOW_DEBUG
				fprintf(config, "mask: %lld \n", mask);
				fprintf(config, "MASK HI: %u \n", wavefront->sreg[SI_EXEC + 1].as_uint);
				fprintf(config, "MASK LO: %u \n", wavefront->sreg[SI_EXEC].as_uint);
				fflush(config);
#endif
			/* SREG value init, format "S:<index>=<value>,..." */
			if (have_wavefront_line)
			{
				char *sreg_tok;

				strtok(sreg_str, ":");
				sreg_tok = strtok(NULL, "=");
				while (sreg_tok != NULL)
				{
					int sreg_index = atoi(sreg_tok);

					sreg_tok = strtok(NULL, ",");
					if (sreg_tok == NULL)
						fatal("%s: unit_test_config.txt: SREG index %d has no value",
							__FUNCTION__, sreg_index);
					wavefront->sreg[sreg_index].as_int = atoi(sreg_tok);
					sreg_tok = strtok(NULL, "=");
				}
			}
		}

		/*MIAOW stop*/

		/*MIAOW start*/
		/* Work-group ID */
		fprintf(config,"%d;",wavefront->work_group->id);

		/* Wavefront ID */
		fprintf(config,"%d;",wavefront->id_in_work_group);

		/* Wavefront count */
		fprintf(config,"%d;",wavefront->work_group->wavefront_count);

		/* Thread count */
		fprintf(config,"%d;",wavefront->work_item_count);

		/* VGPR size, SGPR size, LDS size */
		fprintf(config,"%d;",kernel->bin_file->enc_dict_entry_southern_islands->num_vgpr_used);
		fprintf(config,"%d;",kernel->bin_file->enc_dict_entry_southern_islands->num_sgpr_used);
		fprintf(config,"%d;",kernel->bin_file->enc_dict_entry_southern_islands->lds_size_used);

#ifdef MIAOW_DEBUG
		fflush(config);
#endif
		int wi_count = 0;
		for (wi_count = 0; wi_count < wavefront->work_item_count; wi_count++)
		{
			/* VGPR initial values */
			if (wavefront->work_items != NULL)
			{
				struct si_work_item_t *wi = wavefront->work_items[wi_count];

				fprintf(config,"V:");
				int vgpr_count = 0;
				for (vgpr_count = 0; vgpr_count < (kernel->bin_file->enc_dict_entry_southern_islands->num_vgpr_used - 1); vgpr_count++)
				{
					/* All VGPR values except the last */
					fprintf(config,"%d=%d,", vgpr_count, wi->vreg[vgpr_count].as_int);
				}
				/* Last VGPR value, terminated by ';' instead of ',' */
				fprintf(config,"%d=%d;", vgpr_count, wi->vreg[vgpr_count].as_int);
			}
		}

		/* SGPR initial values */
		fprintf(config,"S:");
		int sgpr_count = 0;
		for (sgpr_count = 0; sgpr_count < (kernel->bin_file->enc_dict_entry_southern_islands->num_sgpr_used - 1); sgpr_count++)
		{
			/* All SGPR values except the last */
			fprintf(config,"%d=%d,", sgpr_count, wavefront->sreg[sgpr_count].as_int);
		}
		/* Last SGPR value, terminated by ';' instead of ',' */
		fprintf(config,"%d=%d;", sgpr_count, wavefront->sreg[sgpr_count].as_int);

		/* PC start: always emitted as 0 */
		fprintf(config, "0");
		fprintf(config,"\n");
		/*MIAOW stop*/
	}

	/*MIAOW start */
	if (unit_test_config != NULL)
		fclose(unit_test_config);
	fclose(config);

	/* Dump the kernel text section, one hex byte per line */
	char instr_str[100];
	snprintf(instr_str, sizeof instr_str, "instr_%d.mem", kernel_config_count);
	FILE *instr = fopen(instr_str, "w");
	if (instr == NULL)
		fatal("%s: cannot open %s for writing", __FUNCTION__, instr_str);
	fprintf(instr, "@0\n");
	for (int instr_count = 0; instr_count < kernel->bin_file->enc_dict_entry_southern_islands->sec_text_buffer.size; instr_count++)
	{
		fprintf(instr, "%.2x\n", ((unsigned char*)kernel->bin_file->enc_dict_entry_southern_islands->sec_text_buffer.ptr)[instr_count]);
	}
	fclose(instr);
	/*MIAOW stop */

	/* Debug */
	si_isa_debug("local_size = %d (%d,%d,%d)\n", kernel->local_size, kernel->local_size3[0],
		kernel->local_size3[1], kernel->local_size3[2]);
	si_isa_debug("global_size = %d (%d,%d,%d)\n", kernel->global_size, kernel->global_size3[0],
		kernel->global_size3[1], kernel->global_size3[2]);
	si_isa_debug("group_count = %d (%d,%d,%d)\n", kernel->group_count, kernel->group_count3[0],
		kernel->group_count3[1], kernel->group_count3[2]);
	si_isa_debug("wavefront_count = %d\n", ndrange->wavefront_count);
	si_isa_debug("wavefronts_per_work_group = %d\n", ndrange->wavefronts_per_work_group);
	si_isa_debug(" tid tid2 tid1 tid0   gid gid2 gid1 gid0   lid lid2 lid1 lid0  wavefront            work-group\n");
	for (tid = 0; tid < ndrange->work_item_count; tid++)
	{
		work_item = ndrange->work_items[tid];
		wavefront = work_item->wavefront;
		work_group = work_item->work_group;
		si_isa_debug("%4d %4d %4d %4d  ", work_item->id, work_item->id_3d[2],
			work_item->id_3d[1], work_item->id_3d[0]);
		si_isa_debug("%4d %4d %4d %4d  ", work_group->id, work_group->id_3d[2],
			work_group->id_3d[1], work_group->id_3d[0]);
		si_isa_debug("%4d %4d %4d %4d  ", work_item->id_in_work_group, 
			work_item->id_in_work_group_3d[2], work_item->id_in_work_group_3d[1], 
			work_item->id_in_work_group_3d[0]);
		si_isa_debug("%20s.%-4d  ", wavefront->name, work_item->id_in_wavefront);
		si_isa_debug("%20s.%-4d\n", work_group->name, work_item->id_in_work_group);
	}

}