示例#1
0
/* Advance the Southern Islands GPU emulation by one step.
 *
 * One step consists of: promoting every pending ND-Range (and its pending
 * work-groups) to 'running', executing one instruction of every running
 * wavefront, and finally dumping and freeing every finished ND-Range.
 *
 * Returns 0 (FALSE) when the list of ND-Ranges is empty, i.e., there is
 * nothing to emulate, and 1 (TRUE) otherwise. */
int si_emu_run(void)
{
	struct si_ndrange_t *nd;
	struct si_ndrange_t *nd_following;

	struct si_work_group_t *wg;
	struct si_work_group_t *wg_following;

	struct si_wavefront_t *wf;
	struct si_wavefront_t *wf_following;

	/* Fast exit: with no ND-Range registered there is no Southern Islands
	 * emulation to perform. */
	if (si_emu->ndrange_list_count == 0)
		return 0;

	/* Move every ND-Range found at the head of the 'pending' list to
	 * 'running'. The loops below re-read the list head on each pass, so
	 * they terminate only because changing the status takes the element
	 * out of the pending list. */
	for (;;)
	{
		nd = si_emu->pending_ndrange_list_head;
		if (!nd)
			break;

		/* Pending work-groups of this ND-Range become running */
		for (;;)
		{
			wg = nd->pending_list_head;
			if (!wg)
				break;

			si_work_group_clear_status(wg, si_work_group_pending);
			si_work_group_set_status(wg, si_work_group_running);
		}

		/* The ND-Range itself becomes running */
		si_ndrange_clear_status(nd, si_ndrange_pending);
		si_ndrange_set_status(nd, si_ndrange_running);
	}

	/* Execute one instruction per wavefront, walking running ND-Ranges,
	 * then their running work-groups, then their running wavefronts. At
	 * every level the successor is captured before doing any work, since
	 * executing may change the state of the current element. */
	nd = si_emu->running_ndrange_list_head;
	while (nd)
	{
		nd_following = nd->running_ndrange_list_next;

		wg = nd->running_list_head;
		while (wg)
		{
			wg_following = wg->running_list_next;

			wf = wg->running_list_head;
			while (wf)
			{
				wf_following = wf->running_list_next;

				/* One instruction for this wavefront */
				si_wavefront_execute(wf);

				wf = wf_following;
			}

			wg = wg_following;
		}

		nd = nd_following;
	}

	/* Report and release every finished ND-Range. The head is re-read on
	 * each pass; si_ndrange_free() is expected to unlink the ND-Range from
	 * the finished list. */
	nd = si_emu->finished_ndrange_list_head;
	while (nd)
	{
		/* Write the ND-Range report */
		si_ndrange_dump(nd, si_emu_report_file);

		/* Request the simulation to stop once the kernel limit is hit
		 * (a limit of 0 means no limit) */
		if (si_emu_max_kernels && si_emu->ndrange_count >= si_emu_max_kernels)
			esim_finish = esim_finish_si_max_kernels;

		/* Release the ND-Range */
		si_ndrange_free(nd);

		nd = si_emu->finished_ndrange_list_head;
	}

	/* There was work to do in this step */
	return 1;
}
示例#2
0
static void si_ndrange_setup_arrays(struct si_ndrange_t *ndrange)
{
	struct si_work_group_t *work_group;
	struct si_wavefront_t *wavefront;
	struct si_work_item_t *work_item;

	int gidx, gidy, gidz;  /* 3D work-group ID iterators */
	int lidx, lidy, lidz;  /* 3D work-item local ID iterators */

	int tid;  /* Global ID iterator */
	int gid;  /* Group ID iterator */
	int wid;  /* Wavefront ID iterator */
	int lid;  /* Local ID iterator */

	/* Array of work-groups */
	ndrange->work_group_count = ndrange->group_count;
	ndrange->work_group_id_first = 0;
	ndrange->work_group_id_last = ndrange->work_group_count - 1;
	ndrange->work_groups = xcalloc(ndrange->work_group_count, sizeof(void *));
	for (gid = 0; gid < ndrange->group_count; gid++)
	{
		ndrange->work_groups[gid] = si_work_group_create();
		work_group = ndrange->work_groups[gid];
	}

	/* Array of wavefronts */
	ndrange->wavefronts_per_work_group = 
		(ndrange->local_size + si_emu_wavefront_size - 1) /
		si_emu_wavefront_size;
	ndrange->wavefront_count = ndrange->wavefronts_per_work_group * 
		ndrange->work_group_count;
	ndrange->wavefront_id_first = 0;
	ndrange->wavefront_id_last = ndrange->wavefront_count - 1;
	assert(ndrange->wavefronts_per_work_group > 0 && 
		ndrange->wavefront_count > 0);
	ndrange->wavefronts = xcalloc(ndrange->wavefront_count, sizeof(void *));
	ndrange->scalar_work_items = xcalloc(ndrange->wavefront_count, 
		sizeof(void *));

	for (wid = 0; wid < ndrange->wavefront_count; wid++)
	{
		gid = wid / ndrange->wavefronts_per_work_group;
		ndrange->wavefronts[wid] = si_wavefront_create();
		wavefront = ndrange->wavefronts[wid];
		work_group = ndrange->work_groups[gid];

		wavefront->id = wid;
		wavefront->id_in_work_group = wid % 
			ndrange->wavefronts_per_work_group;
		wavefront->ndrange = ndrange;
		wavefront->work_group = work_group;
		DOUBLE_LINKED_LIST_INSERT_TAIL(work_group, running, wavefront);

		/* Initialize the scalar work item */
		ndrange->scalar_work_items[wid] = si_work_item_create();
		wavefront->scalar_work_item = ndrange->scalar_work_items[wid];
		ndrange->scalar_work_items[wid]->wavefront = wavefront;
		ndrange->scalar_work_items[wid]->work_group = work_group;
		ndrange->scalar_work_items[wid]->ndrange = ndrange;
	}

	/* Array of work-items */
	ndrange->work_item_count = ndrange->global_size;
	ndrange->work_item_id_first = 0;
	ndrange->work_item_id_last = ndrange->work_item_count - 1;
	ndrange->work_items = xcalloc(ndrange->work_item_count, sizeof(void *));
	tid = 0;
	gid = 0;
	for (gidz = 0; gidz < ndrange->group_count3[2]; gidz++)
	{
		for (gidy = 0; gidy < ndrange->group_count3[1]; gidy++)
		{
			for (gidx = 0; gidx < ndrange->group_count3[0]; gidx++)
			{
				/* Assign work-group ID */
				work_group = ndrange->work_groups[gid];
				work_group->ndrange = ndrange;
				work_group->id_3d[0] = gidx;
				work_group->id_3d[1] = gidy;
				work_group->id_3d[2] = gidz;
				work_group->id = gid;
				si_work_group_set_status(work_group, si_work_group_pending);

				/* First, last, and number of work-items in work-group */
				work_group->work_item_id_first = tid;
				work_group->work_item_id_last = tid + ndrange->local_size;
				work_group->work_item_count = ndrange->local_size;
				work_group->work_items = &ndrange->work_items[tid];
				snprintf(work_group->name, sizeof(work_group->name), "work-group[i%d-i%d]",
					work_group->work_item_id_first, work_group->work_item_id_last);

				/* First ,last, and number of wavefronts in work-group */
				work_group->wavefront_id_first = gid * ndrange->wavefronts_per_work_group;
				work_group->wavefront_id_last = work_group->wavefront_id_first + ndrange->wavefronts_per_work_group - 1;
				work_group->wavefront_count = ndrange->wavefronts_per_work_group;
				work_group->wavefronts = &ndrange->wavefronts[work_group->wavefront_id_first];
				/* Iterate through work-items */
				lid = 0;
				for (lidz = 0; lidz < ndrange->local_size3[2]; lidz++)
				{
					for (lidy = 0; lidy < ndrange->local_size3[1]; lidy++)
					{
						for (lidx = 0; lidx < ndrange->local_size3[0]; lidx++)
						{
							/* Wavefront ID */
							wid = gid * ndrange->wavefronts_per_work_group +
								lid / si_emu_wavefront_size;
							assert(wid < ndrange->wavefront_count);
							wavefront = ndrange->wavefronts[wid];
							
							/* Create work-item */
							ndrange->work_items[tid] = si_work_item_create();
							work_item = ndrange->work_items[tid];
							work_item->ndrange = ndrange;

							/* Global IDs */
							work_item->id_3d[0] = gidx * ndrange->local_size3[0] + lidx;
							work_item->id_3d[1] = gidy * ndrange->local_size3[1] + lidy;
							work_item->id_3d[2] = gidz * ndrange->local_size3[2] + lidz;
							work_item->id = tid;

							/* Local IDs */
							work_item->id_in_work_group_3d[0] = lidx;
							work_item->id_in_work_group_3d[1] = lidy;
							work_item->id_in_work_group_3d[2] = lidz;
							work_item->id_in_work_group = lid;

							/* Other */
							work_item->id_in_wavefront = work_item->id_in_work_group % si_emu_wavefront_size;
							work_item->work_group = ndrange->work_groups[gid];
							work_item->wavefront = ndrange->wavefronts[wid];

							/*MIAOW start*/
							work_item->id = work_item->id_in_wavefront;
							/*MIAOW stop*/

							/* First, last, and number of work-items in wavefront */
							if (!wavefront->work_item_count)
							{
								wavefront->work_item_id_first = tid;
								wavefront->work_items = &ndrange->work_items[tid];
							}
							wavefront->work_item_count++;
							wavefront->work_item_id_last = tid;

							/* Next work-item */
							tid++;
							lid++;
						}
					}
				}

				/* Next work-group */
				gid++;
			}
		}
	}

	/* Initialize the wavefronts */
	for (wid = 0; wid < ndrange->wavefront_count; wid++)
	{
		/* Assign names to wavefronts */
		wavefront = ndrange->wavefronts[wid];
		snprintf(wavefront->name, sizeof(wavefront->name),
			"wavefront[i%d-i%d]",
			wavefront->work_item_id_first,
			wavefront->work_item_id_last);
	}

	/* Debug */
	si_isa_debug("local_size = %d (%d,%d,%d)\n", ndrange->local_size,
		ndrange->local_size3[0], ndrange->local_size3[1],
		ndrange->local_size3[2]);
	si_isa_debug("global_size = %d (%d,%d,%d)\n", ndrange->global_size,
		ndrange->global_size3[0], ndrange->global_size3[1],
		ndrange->global_size3[2]);
	si_isa_debug("group_count = %d (%d,%d,%d)\n", ndrange->group_count,
		ndrange->group_count3[0], ndrange->group_count3[1],
		ndrange->group_count3[2]);
	si_isa_debug("wavefront_count = %d\n", ndrange->wavefront_count);
	si_isa_debug("wavefronts_per_work_group = %d\n",
		ndrange->wavefronts_per_work_group);
	si_isa_debug("\n");
}
示例#3
0
void si_ndrange_setup_work_items(struct si_ndrange_t *ndrange)
{
	struct si_opencl_kernel_t *kernel = ndrange->kernel;

	struct si_work_group_t *work_group;
	struct si_wavefront_t *wavefront;
	struct si_work_item_t *work_item;

	int gidx, gidy, gidz;  /* 3D work-group ID iterators */
	int lidx, lidy, lidz;  /* 3D work-item local ID iterators */

	int tid;  /* Global ID iterator */
	int gid;  /* Group ID iterator */
	int wid;  /* Wavefront ID iterator */
	int lid;  /* Local ID iterator */

	/* Array of work-groups */
	ndrange->work_group_count = kernel->group_count;
	ndrange->work_group_id_first = 0;
	ndrange->work_group_id_last = ndrange->work_group_count - 1;
	ndrange->work_groups = calloc(ndrange->work_group_count, sizeof(void *));
	for (gid = 0; gid < kernel->group_count; gid++)
	{
		ndrange->work_groups[gid] = si_work_group_create();
		work_group = ndrange->work_groups[gid];
	}

	/* Array of wavefronts */
	ndrange->wavefronts_per_work_group = (kernel->local_size + si_emu_wavefront_size - 1) / si_emu_wavefront_size;
	ndrange->wavefront_count = ndrange->wavefronts_per_work_group * ndrange->work_group_count;
	ndrange->wavefront_id_first = 0;
	ndrange->wavefront_id_last = ndrange->wavefront_count - 1;
	assert(ndrange->wavefronts_per_work_group > 0 && ndrange->wavefront_count > 0);
	ndrange->wavefronts = calloc(ndrange->wavefront_count, sizeof(void *));
	ndrange->scalar_work_items = calloc(ndrange->wavefront_count, sizeof(void *));
	for (wid = 0; wid < ndrange->wavefront_count; wid++)
	{
		gid = wid / ndrange->wavefronts_per_work_group;
		ndrange->wavefronts[wid] = si_wavefront_create();
		wavefront = ndrange->wavefronts[wid];
		work_group = ndrange->work_groups[gid];

		wavefront->id = wid;
		wavefront->id_in_work_group = wid % ndrange->wavefronts_per_work_group;
		wavefront->ndrange = ndrange;
		wavefront->work_group = work_group;
		DOUBLE_LINKED_LIST_INSERT_TAIL(work_group, running, wavefront);

		/* Initialize the scalar work item */
		ndrange->scalar_work_items[wid] = si_work_item_create();
		wavefront->scalar_work_item = ndrange->scalar_work_items[wid];
		ndrange->scalar_work_items[wid]->wavefront = wavefront;
		ndrange->scalar_work_items[wid]->work_group = work_group;
		ndrange->scalar_work_items[wid]->ndrange = ndrange;
	}

	/* Array of work-items */
	ndrange->work_item_count = kernel->global_size;
	ndrange->work_item_id_first = 0;
	ndrange->work_item_id_last = ndrange->work_item_count - 1;
	ndrange->work_items = calloc(ndrange->work_item_count, sizeof(void *));
	tid = 0;
	gid = 0;
	for (gidz = 0; gidz < kernel->group_count3[2]; gidz++)
	{
		for (gidy = 0; gidy < kernel->group_count3[1]; gidy++)
		{
			for (gidx = 0; gidx < kernel->group_count3[0]; gidx++)
			{
				/* Assign work-group ID */
				work_group = ndrange->work_groups[gid];
				work_group->ndrange = ndrange;
				work_group->id_3d[0] = gidx;
				work_group->id_3d[1] = gidy;
				work_group->id_3d[2] = gidz;
				work_group->id = gid;
				si_work_group_set_status(work_group, si_work_group_pending);

				/* First, last, and number of work-items in work-group */
				work_group->work_item_id_first = tid;
				work_group->work_item_id_last = tid + kernel->local_size;
				work_group->work_item_count = kernel->local_size;
				work_group->work_items = &ndrange->work_items[tid];
				snprintf(work_group->name, sizeof(work_group->name), "work-group[i%d-i%d]",
					work_group->work_item_id_first, work_group->work_item_id_last);

				/* First ,last, and number of wavefronts in work-group */
				work_group->wavefront_id_first = gid * ndrange->wavefronts_per_work_group;
				work_group->wavefront_id_last = work_group->wavefront_id_first + ndrange->wavefronts_per_work_group - 1;
				work_group->wavefront_count = ndrange->wavefronts_per_work_group;
				work_group->wavefronts = &ndrange->wavefronts[work_group->wavefront_id_first];
				/* Iterate through work-items */
				lid = 0;
				for (lidz = 0; lidz < kernel->local_size3[2]; lidz++)
				{
					for (lidy = 0; lidy < kernel->local_size3[1]; lidy++)
					{
						for (lidx = 0; lidx < kernel->local_size3[0]; lidx++)
						{
							/* Wavefront ID */
							wid = gid * ndrange->wavefronts_per_work_group +
								lid / si_emu_wavefront_size;
							assert(wid < ndrange->wavefront_count);
							wavefront = ndrange->wavefronts[wid];
							
							/* Create work-item */
							ndrange->work_items[tid] = si_work_item_create();
							work_item = ndrange->work_items[tid];
							work_item->ndrange = ndrange;

							/* Global IDs */
							work_item->id_3d[0] = gidx * kernel->local_size3[0] + lidx;
							work_item->id_3d[1] = gidy * kernel->local_size3[1] + lidy;
							work_item->id_3d[2] = gidz * kernel->local_size3[2] + lidz;
							work_item->id = tid;

							/* Local IDs */
							work_item->id_in_work_group_3d[0] = lidx;
							work_item->id_in_work_group_3d[1] = lidy;
							work_item->id_in_work_group_3d[2] = lidz;
							work_item->id_in_work_group = lid;

							/* Other */
							work_item->id_in_wavefront = work_item->id_in_work_group % si_emu_wavefront_size;
							work_item->work_group = ndrange->work_groups[gid];
							work_item->wavefront = ndrange->wavefronts[wid];

							/* First, last, and number of work-items in wavefront */
							if (!wavefront->work_item_count) {
								wavefront->work_item_id_first = tid;
								wavefront->work_items = &ndrange->work_items[tid];
							}
							wavefront->work_item_count++;
							wavefront->work_item_id_last = tid;

							/* Save local IDs in registers */
							work_item->vreg[0].as_int = lidx;  /* V0 */
							work_item->vreg[1].as_int = lidy;  /* V1 */
							work_item->vreg[2].as_int = lidz;  /* V2 */

							/* Next work-item */
							tid++;
							lid++;
						}
					}
				}

				/* Next work-group */
				gid++;
			}
		}
	}

	/* Initialize the wavefronts */
	for (wid = 0; wid < ndrange->wavefront_count; wid++)
	{
		/* Assign names to wavefronts */
		wavefront = ndrange->wavefronts[wid];
		snprintf(wavefront->name, sizeof(wavefront->name), "wavefront[i%d-i%d]",
			wavefront->work_item_id_first, wavefront->work_item_id_last);

		/* Initialize wavefront program counter */
		if (!kernel->bin_file->enc_dict_entry_southern_islands->sec_text_buffer.size)
			fatal("%s: cannot load kernel code", __FUNCTION__);
		wavefront->inst_buf_start = kernel->bin_file->enc_dict_entry_southern_islands->sec_text_buffer.ptr;
		wavefront->inst_buf = kernel->bin_file->enc_dict_entry_southern_islands->sec_text_buffer.ptr;

		/* Save work-group IDs in registers */
		unsigned int user_sgpr = kernel->bin_file->enc_dict_entry_southern_islands->compute_pgm_rsrc2->user_sgpr;
		wavefront->sreg[user_sgpr].as_int = wavefront->work_group->id_3d[0];
		wavefront->sreg[user_sgpr + 1].as_int = wavefront->work_group->id_3d[1];
		wavefront->sreg[user_sgpr + 2].as_int = wavefront->work_group->id_3d[2];

		/* Initialize Constant Buffers */
		unsigned int userElementCount = kernel->bin_file->enc_dict_entry_southern_islands->userElementCount;
		struct si_bin_enc_user_element_t* userElements = kernel->bin_file->enc_dict_entry_southern_islands->userElements;
		for (int i = 0; i < userElementCount; i++)
		{
			if (userElements[i].dataClass == IMM_CONST_BUFFER)
			{
				si_wavefront_init_sreg_with_cb(wavefront, userElements[i].startUserReg, userElements[i].userRegCount, userElements[i].apiSlot);
			}
			else if (userElements[i].dataClass == IMM_UAV)
			{
				si_wavefront_init_sreg_with_cb(wavefront, userElements[i].startUserReg, userElements[i].userRegCount, userElements[i].apiSlot);
			}
			else if (userElements[i].dataClass == PTR_CONST_BUFFER_TABLE)
			{
				si_wavefront_init_sreg_with_uav_table(wavefront, userElements[i].startUserReg, userElements[i].userRegCount);
			}
			else if (userElements[i].dataClass == PTR_UAV_TABLE)
			{
				si_wavefront_init_sreg_with_uav_table(wavefront, userElements[i].startUserReg, userElements[i].userRegCount);
			}
			else
			{
				fatal("Unimplemented User Element: dataClass:%d", userElements[i].dataClass);
			}
		}

		/* Initialize the execution mask */
		wavefront->sreg[SI_EXEC].as_int = 0xFFFFFFFF;
		wavefront->sreg[SI_EXEC + 1].as_int = 0xFFFFFFFF;
		wavefront->sreg[SI_EXECZ].as_int = 0;
	}

	/* Debug */
	si_isa_debug("local_size = %d (%d,%d,%d)\n", kernel->local_size, kernel->local_size3[0],
		kernel->local_size3[1], kernel->local_size3[2]);
	si_isa_debug("global_size = %d (%d,%d,%d)\n", kernel->global_size, kernel->global_size3[0],
		kernel->global_size3[1], kernel->global_size3[2]);
	si_isa_debug("group_count = %d (%d,%d,%d)\n", kernel->group_count, kernel->group_count3[0],
		kernel->group_count3[1], kernel->group_count3[2]);
	si_isa_debug("wavefront_count = %d\n", ndrange->wavefront_count);
	si_isa_debug("wavefronts_per_work_group = %d\n", ndrange->wavefronts_per_work_group);
	si_isa_debug(" tid tid2 tid1 tid0   gid gid2 gid1 gid0   lid lid2 lid1 lid0  wavefront            work-group\n");
	for (tid = 0; tid < ndrange->work_item_count; tid++)
	{
		work_item = ndrange->work_items[tid];
		wavefront = work_item->wavefront;
		work_group = work_item->work_group;
		si_isa_debug("%4d %4d %4d %4d  ", work_item->id, work_item->id_3d[2],
			work_item->id_3d[1], work_item->id_3d[0]);
		si_isa_debug("%4d %4d %4d %4d  ", work_group->id, work_group->id_3d[2],
			work_group->id_3d[1], work_group->id_3d[0]);
		si_isa_debug("%4d %4d %4d %4d  ", work_item->id_in_work_group, 
			work_item->id_in_work_group_3d[2], work_item->id_in_work_group_3d[1], 
			work_item->id_in_work_group_3d[0]);
		si_isa_debug("%20s.%-4d  ", wavefront->name, work_item->id_in_wavefront);
		si_isa_debug("%20s.%-4d\n", work_group->name, work_item->id_in_work_group);
	}

}
示例#4
0
文件: ndrange.c 项目: ajithcj/miaow
void si_ndrange_setup_work_items(struct si_ndrange_t *ndrange)
{
	struct si_opencl_kernel_t *kernel = ndrange->kernel;

	struct si_work_group_t *work_group;
	struct si_wavefront_t *wavefront;
	struct si_work_item_t *work_item;

	int gidx, gidy, gidz;  /* 3D work-group ID iterators */
	int lidx, lidy, lidz;  /* 3D work-item local ID iterators */

	int tid;  /* Global ID iterator */
	int gid;  /* Group ID iterator */
	int wid;  /* Wavefront ID iterator */
	int lid;  /* Local ID iterator */

	/*MIAOW start */
	char config_str[100];
	sprintf(config_str, "config_%d.txt", kernel_config_count);
	FILE* config = fopen(config_str, "w");
	/*MIAOW stop */

	/*MIAOW start*/
	//UNIT TEST
	char unit_test_input_buf[150000];
	char *tok = NULL;
	char *config_read_result = NULL;
	char vreg_str[64][2500];
	char sreg_str[2500];

	FILE* unit_test_config = fopen("unit_test_config.txt", "r");
	if (unit_test_config != 0)
	{
		int i;
		int num_of_threads = 0;

		//ndrange->wavefront_count = 1;
		//kernel->group_count = 1;
		kernel->local_size3[2] = 1;
		kernel->local_size3[1] = 1;
		kernel->global_size3[2] = 1;
		kernel->global_size3[1] = 1;
		
		config_read_result = fgets(unit_test_input_buf, 150000, unit_test_config);
		if(config_read_result != NULL)
		{
			tok = strtok(unit_test_input_buf, ";"); //WG count
			kernel->group_count = atoi(tok);

			tok = strtok(NULL, ";"); //total number of threads
			num_of_threads = atoi(tok);

			kernel->global_size = atoi(tok);
			kernel->global_size3[0] = atoi(tok);
			kernel->local_size3[0] = atoi(tok);
			kernel->local_size = atoi(tok);

		}
	}

	//WorkGroup count and thread count
	fprintf(config,"%d;%d;\n", kernel->group_count, kernel->global_size);
#ifdef MIAOW_DEBUG
	fflush(config);
#endif

	/*MIAOW stop*/

	/* Array of work-groups */
	ndrange->work_group_count = kernel->group_count;
	ndrange->work_group_id_first = 0;
	ndrange->work_group_id_last = ndrange->work_group_count - 1;
	ndrange->work_groups = xcalloc(ndrange->work_group_count, sizeof(void *));
	for (gid = 0; gid < kernel->group_count; gid++)
	{
		ndrange->work_groups[gid] = si_work_group_create();
		work_group = ndrange->work_groups[gid];
	}

	/* Array of wavefronts */
	ndrange->wavefronts_per_work_group = (kernel->local_size + si_emu_wavefront_size - 1) / si_emu_wavefront_size;
	ndrange->wavefront_count = ndrange->wavefronts_per_work_group * ndrange->work_group_count;
	ndrange->wavefront_id_first = 0;
	ndrange->wavefront_id_last = ndrange->wavefront_count - 1;
	assert(ndrange->wavefronts_per_work_group > 0 && ndrange->wavefront_count > 0);
	ndrange->wavefronts = xcalloc(ndrange->wavefront_count, sizeof(void *));
	ndrange->scalar_work_items = xcalloc(ndrange->wavefront_count, sizeof(void *));
	for (wid = 0; wid < ndrange->wavefront_count; wid++)
	{
		gid = wid / ndrange->wavefronts_per_work_group;
		ndrange->wavefronts[wid] = si_wavefront_create();
		wavefront = ndrange->wavefronts[wid];
		work_group = ndrange->work_groups[gid];

		wavefront->id = wid;
		wavefront->id_in_work_group = wid % ndrange->wavefronts_per_work_group;
		wavefront->ndrange = ndrange;
		wavefront->work_group = work_group;
		DOUBLE_LINKED_LIST_INSERT_TAIL(work_group, running, wavefront);

		/* Initialize the scalar work item */
		ndrange->scalar_work_items[wid] = si_work_item_create();
		wavefront->scalar_work_item = ndrange->scalar_work_items[wid];
		ndrange->scalar_work_items[wid]->wavefront = wavefront;
		ndrange->scalar_work_items[wid]->work_group = work_group;
		ndrange->scalar_work_items[wid]->ndrange = ndrange;
	}

#ifdef MIAOW_DEBUG
	fprintf(config, "Processing Workitems\n");
	fflush(config);
#endif
	/* Array of work-items */
	ndrange->work_item_count = kernel->global_size;
	ndrange->work_item_id_first = 0;
	ndrange->work_item_id_last = ndrange->work_item_count - 1;
	ndrange->work_items = xcalloc(ndrange->work_item_count, sizeof(void *));
	tid = 0;
	gid = 0;
	for (gidz = 0; gidz < kernel->group_count3[2]; gidz++)
	{
		for (gidy = 0; gidy < kernel->group_count3[1]; gidy++)
		{
			for (gidx = 0; gidx < kernel->group_count3[0]; gidx++)
			{
				/* Assign work-group ID */
				work_group = ndrange->work_groups[gid];
				work_group->ndrange = ndrange;
				work_group->id_3d[0] = gidx;
				work_group->id_3d[1] = gidy;
				work_group->id_3d[2] = gidz;
				work_group->id = gid;
				si_work_group_set_status(work_group, si_work_group_pending);

				/* First, last, and number of work-items in work-group */
				work_group->work_item_id_first = tid;
				work_group->work_item_id_last = tid + kernel->local_size;
				work_group->work_item_count = kernel->local_size;
				work_group->work_items = &ndrange->work_items[tid];
				snprintf(work_group->name, sizeof(work_group->name), "work-group[i%d-i%d]",
					work_group->work_item_id_first, work_group->work_item_id_last);

				/* First ,last, and number of wavefronts in work-group */
				work_group->wavefront_id_first = gid * ndrange->wavefronts_per_work_group;
				work_group->wavefront_id_last = work_group->wavefront_id_first + ndrange->wavefronts_per_work_group - 1;
				work_group->wavefront_count = ndrange->wavefronts_per_work_group;
				work_group->wavefronts = &ndrange->wavefronts[work_group->wavefront_id_first];
				/* Iterate through work-items */
				lid = 0;
				for (lidz = 0; lidz < kernel->local_size3[2]; lidz++)
				{
					for (lidy = 0; lidy < kernel->local_size3[1]; lidy++)
					{
						for (lidx = 0; lidx < kernel->local_size3[0]; lidx++)
						{
							/* Wavefront ID */
							wid = gid * ndrange->wavefronts_per_work_group +
								lid / si_emu_wavefront_size;
							assert(wid < ndrange->wavefront_count);
							wavefront = ndrange->wavefronts[wid];
							
							/* Create work-item */
							ndrange->work_items[tid] = si_work_item_create();
							work_item = ndrange->work_items[tid];
							work_item->ndrange = ndrange;

							/* Global IDs */
							work_item->id_3d[0] = gidx * kernel->local_size3[0] + lidx;
							work_item->id_3d[1] = gidy * kernel->local_size3[1] + lidy;
							work_item->id_3d[2] = gidz * kernel->local_size3[2] + lidz;
							work_item->id = tid;

							/* Local IDs */
							work_item->id_in_work_group_3d[0] = lidx;
							work_item->id_in_work_group_3d[1] = lidy;
							work_item->id_in_work_group_3d[2] = lidz;
							work_item->id_in_work_group = lid;

							/* Other */
							work_item->id_in_wavefront = work_item->id_in_work_group % si_emu_wavefront_size;
							work_item->work_group = ndrange->work_groups[gid];
							work_item->wavefront = ndrange->wavefronts[wid];

							/*MIAOW start*/
							work_item->id = work_item->id_in_wavefront;
							/*MIAOW stop*/

							/* First, last, and number of work-items in wavefront */
							if (!wavefront->work_item_count) {
								wavefront->work_item_id_first = tid;
								wavefront->work_items = &ndrange->work_items[tid];
							}
							wavefront->work_item_count++;
							wavefront->work_item_id_last = tid;

							//Initializing all vreg values to zero, so that config.txt doesnt change with each run
							/*MIAOW start*/
							for (int vreg_init_index; vreg_init_index < 256; vreg_init_index++)
							{
								work_item->vreg[vreg_init_index].as_int = 0;
							}
							/*MIAOW stop*/

							/* Save local IDs in registers */
							work_item->vreg[0].as_int = lidx;  /* V0 */
							work_item->vreg[1].as_int = lidy;  /* V1 */
							work_item->vreg[2].as_int = lidz;  /* V2 */

							/* Next work-item */
							tid++;
							lid++;
						}
					}
				}

				/* Next work-group */
				gid++;
			}
		}
	}

	/*MIAOW start */
	//This part is for unit test trace generation.
	//If the file unit_test_instr.mem is present, the contents will be read and placed in the instruction buffer.
	FILE* unit_test_instr = fopen("unit_test_instr.mem", "r");

	if (unit_test_instr != 0)
	{
		unsigned char instr_buf[200];

		int input_instr_count = 0;

		fgets(instr_buf, 200, unit_test_instr); //address

		unsigned char* buf_ptr = (unsigned char*)kernel->bin_file->enc_dict_entry_southern_islands->sec_text_buffer.ptr;

		while (fgets(instr_buf, 200, unit_test_instr) != NULL)
		{
			instr_buf[2] = '\0'; //interested only in first byte.

			unsigned char cur_instr = (unsigned char)strtol(instr_buf, 0, 16);
			buf_ptr[input_instr_count++] = cur_instr;
		}

		kernel->bin_file->enc_dict_entry_southern_islands->sec_text_buffer.size = input_instr_count;

		fclose(unit_test_instr);
	}
	/*MIAOW stop */

	/* Initialize the wavefronts */
	for (wid = 0; wid < ndrange->wavefront_count; wid++)
	{
		/* Assign names to wavefronts */
		wavefront = ndrange->wavefronts[wid];
		snprintf(wavefront->name, sizeof(wavefront->name), "wavefront[i%d-i%d]",
			wavefront->work_item_id_first, wavefront->work_item_id_last);

		/* Initialize wavefront program counter */
		if (!kernel->bin_file->enc_dict_entry_southern_islands->sec_text_buffer.size)
			fatal("%s: cannot load kernel code", __FUNCTION__);
		wavefront->wavefront_pool_start = kernel->bin_file->enc_dict_entry_southern_islands->sec_text_buffer.ptr;
		wavefront->wavefront_pool = kernel->bin_file->enc_dict_entry_southern_islands->sec_text_buffer.ptr;

		//Initializing all sreg values to zero, so that config.txt doesnt change with each run
		/*MIAOW start*/
		for (int sreg_init_index; sreg_init_index < 256; sreg_init_index++)
		{
			//wavefront->sreg[sreg_init_index].as_int = 0;
		}
		/*MIAOW stop*/

		/* Save work-group IDs in registers */
		unsigned int user_sgpr = kernel->bin_file->
			enc_dict_entry_southern_islands->compute_pgm_rsrc2->user_sgpr;
		wavefront->sreg[user_sgpr].as_int = wavefront->work_group->id_3d[0];
		wavefront->sreg[user_sgpr + 1].as_int = wavefront->work_group->id_3d[1];
		wavefront->sreg[user_sgpr + 2].as_int = wavefront->work_group->id_3d[2];

		/* Initialize Constant Buffers */
		unsigned int userElementCount = kernel->bin_file->enc_dict_entry_southern_islands->userElementCount;
		struct si_bin_enc_user_element_t* userElements = kernel->bin_file->enc_dict_entry_southern_islands->userElements;
		for (int i = 0; i < userElementCount; i++)
		{
			if (userElements[i].dataClass == IMM_CONST_BUFFER)
			{
				si_wavefront_init_sreg_with_cb(wavefront, userElements[i].startUserReg, userElements[i].userRegCount, userElements[i].apiSlot);
			}
			else if (userElements[i].dataClass == IMM_UAV)
			{
				si_wavefront_init_sreg_with_cb(wavefront, userElements[i].startUserReg, userElements[i].userRegCount, userElements[i].apiSlot);
			}
			else if (userElements[i].dataClass == PTR_CONST_BUFFER_TABLE)
			{
				si_wavefront_init_sreg_with_uav_table(wavefront, userElements[i].startUserReg, userElements[i].userRegCount);
			}
			else if (userElements[i].dataClass == PTR_UAV_TABLE)
			{
				si_wavefront_init_sreg_with_uav_table(wavefront, userElements[i].startUserReg, userElements[i].userRegCount);
			}
			else
			{
				fatal("Unimplemented User Element: dataClass:%d", userElements[i].dataClass);
			}
		}

		//MIAOW m2s is not setting exec mask properly
		/* Initialize the execution mask */
		//wavefront->sreg[SI_EXEC].as_int = 0xFFFFFFFF;
		//wavefront->sreg[SI_EXEC + 1].as_int = 0xFFFFFFFF;
		//wavefront->sreg[SI_EXECZ].as_int = 0;

		/*MIAOW start*/
		//EXEC Mask init
		unsigned long long mask;
		if(wavefront->work_item_count == 64)
		{
			mask = 0xFFFFFFFFFFFFFFFF;
		}
		else
		{
			mask = powl(2, wavefront->work_item_count) - 1;
		}

		wavefront->sreg[SI_EXEC].as_uint = (unsigned int)mask;
		wavefront->sreg[SI_EXEC + 1].as_uint = mask>>32;
		wavefront->sreg[SI_EXECZ].as_int = 0;
		/*MIAOW stop*/


		/*MIAOW start*/
		if(config_read_result != NULL)
		{
			if(NULL != fgets(unit_test_input_buf, 150000, unit_test_config))
			{
				int num_of_threads = 0;
				int thread_init_count = 0;

				tok = strtok(unit_test_input_buf, ";"); //WGID
				tok = strtok(NULL, ";"); //WFID
				tok = strtok(NULL, ";"); //WF count
				tok = strtok(NULL, ";"); //thread count

				num_of_threads = atoi(tok);
#ifdef MIAOW_DEBUG
				if (num_of_threads != wavefront->work_item_count)
				{
					fprintf(config, "num_thread MISMATCH %d!=%d\n", num_of_threads, wavefront->work_item_count);
				}
				else
				{
					fprintf(config, "num_thread match %d=%d\n", num_of_threads, wavefront->work_item_count);
				}
				fflush(config);
#endif
				tok = strtok(NULL, ";"); //VREG size
				kernel->bin_file->enc_dict_entry_southern_islands->num_vgpr_used = atoi(tok);

				tok = strtok(NULL, ";"); //SREG size
				kernel->bin_file->enc_dict_entry_southern_islands->num_sgpr_used = atoi(tok);

				tok = strtok(NULL, ";"); //LDS size
				kernel->bin_file->enc_dict_entry_southern_islands->lds_size_used = atoi(tok);

				for(thread_init_count = 0; thread_init_count < num_of_threads; thread_init_count++)
				{
					tok = strtok(NULL, ";");
					strcpy((char*)vreg_str[thread_init_count], tok);
					assert(vreg_str[thread_init_count][0] == 'V');
				}

				tok = strtok(NULL, ";");
				strcpy((char*)sreg_str, tok);
				assert(sreg_str[0] == 'S');

				tok = strtok(NULL, ";"); //PC
			}

#ifdef MIAOW_DEBUG
				fprintf(config, "Initializing VREG \n");
				fflush(config);
#endif
			//VREG value init
			int wi_init_count = 0;
			for (wi_init_count = 0; wi_init_count < wavefront->work_item_count; wi_init_count++)
			{
				if (wavefront->work_items != NULL)
				{
					int vreg_init_count = 0;
					char *reg_tok;
					struct si_work_item_t* wi = wavefront->work_items[wi_init_count];

					reg_tok = strtok(vreg_str[wi_init_count], ":");
					reg_tok = strtok(NULL, "=");

					for(vreg_init_count = 0; reg_tok != NULL; vreg_init_count++)
					{
						int vreg_index = atoi(reg_tok);
						reg_tok = strtok(NULL, ",");
						assert(reg_tok != NULL);
						wi->vreg[vreg_index].as_int = atoi(reg_tok);
						reg_tok = strtok(NULL, "=");
					}

					// make sure that all reg values were read
					assert(reg_tok == NULL);
				}
			}

#ifdef MIAOW_DEBUG
				fprintf(config, "Initializing SREG \n");
				fflush(config);
#endif

#ifdef MIAOW_DEBUG
				fprintf(config, "mask: %lld \n", mask);
				fprintf(config, "MASK HI: %u \n", wavefront->sreg[SI_EXEC + 1].as_uint);
				fprintf(config, "MASK LO: %u \n", wavefront->sreg[SI_EXEC].as_uint);
				fflush(config);
#endif
			//SREG value init
			int sreg_init_count = 0;
			char *sreg_tok;
			sreg_tok = strtok(sreg_str, ":");
			sreg_tok = strtok(NULL, "=");
			for(sreg_init_count=0; sreg_tok != NULL; sreg_init_count++)
			{
				int sreg_index = atoi(sreg_tok);
				sreg_tok = strtok(NULL, ",");
				assert(sreg_tok != NULL);
				wavefront->sreg[sreg_index].as_int = atoi(sreg_tok);
				sreg_tok = strtok(NULL, "=");
			}
			// make sure that all reg values were read
			assert(sreg_tok == NULL);
		}
		
		/*MIAOW stop*/

		/*MIAOW start*/
		//WorkGroup ID
		fprintf(config,"%d;",wavefront->work_group->id);

		//Wavefront ID
		fprintf(config,"%d;",wavefront->id_in_work_group);

		//Wavefront Count
		fprintf(config,"%d;",wavefront->work_group->wavefront_count);

		//Thread count
		fprintf(config,"%d;",wavefront->work_item_count);

		//VGPR size, SGPR size, LDS size
		fprintf(config,"%d;",kernel->bin_file->enc_dict_entry_southern_islands->num_vgpr_used);
		fprintf(config,"%d;",kernel->bin_file->enc_dict_entry_southern_islands->num_sgpr_used);
		fprintf(config,"%d;",kernel->bin_file->enc_dict_entry_southern_islands->lds_size_used);

#ifdef MIAOW_DEBUG
		fflush(config);
#endif
		int wi_count = 0;
		for (wi_count = 0; wi_count < wavefront->work_item_count; wi_count++)
		{
			//VGPR initial values
			if (wavefront->work_items != NULL)
			{
				struct si_work_item_t* wi = wavefront->work_items[wi_count];

				fprintf(config,"V:");
				int vgpr_count = 0;
				for (vgpr_count = 0; vgpr_count < (kernel->bin_file->enc_dict_entry_southern_islands->num_vgpr_used - 1); vgpr_count++)
				{
					//All VGPR values except the last
					fprintf(config,"%d=%d,", vgpr_count, wi->vreg[vgpr_count]);
				}
				//Last VGPR value
				fprintf(config,"%d=%d;", vgpr_count, wi->vreg[vgpr_count]);
			}
		}

		//SGPR initial values
		fprintf(config,"S:");
		int sgpr_count = 0;
		for (sgpr_count = 0; sgpr_count < (kernel->bin_file->enc_dict_entry_southern_islands->num_sgpr_used - 1); sgpr_count++)
		{
			//All SGPR values except the last
			fprintf(config,"%d=%d,", sgpr_count, wavefront->sreg[sgpr_count]);
		}
		//Last SGPR value
		fprintf(config,"%d=%d;", sgpr_count, wavefront->sreg[sgpr_count]);

		//PC start
		//fprintf(config,"%d",wavefront->wavefront_pool_start);
		fprintf(config, "0");
		fprintf(config,"\n");
		/*MIAOW stop*/
	}
	
	/*MIAOW start */
	fclose(config);

	char instr_str[100];
	sprintf(instr_str, "instr_%d.mem", kernel_config_count);
	FILE* instr = fopen(instr_str, "w");
	//fprintf(instr, "@%.8x\n", kernel->bin_file->enc_dict_entry_southern_islands->sec_text_buffer.ptr);
	fprintf(instr, "@0\n");
	for (int instr_count = 0; instr_count < kernel->bin_file->enc_dict_entry_southern_islands->sec_text_buffer.size; instr_count++)
	{
		fprintf(instr, "%.2x\n", ((unsigned char*)kernel->bin_file->enc_dict_entry_southern_islands->sec_text_buffer.ptr)[instr_count]);
	}
	fclose(instr);
	/*MIAOW stop */

	/* Debug */
	si_isa_debug("local_size = %d (%d,%d,%d)\n", kernel->local_size, kernel->local_size3[0],
		kernel->local_size3[1], kernel->local_size3[2]);
	si_isa_debug("global_size = %d (%d,%d,%d)\n", kernel->global_size, kernel->global_size3[0],
		kernel->global_size3[1], kernel->global_size3[2]);
	si_isa_debug("group_count = %d (%d,%d,%d)\n", kernel->group_count, kernel->group_count3[0],
		kernel->group_count3[1], kernel->group_count3[2]);
	si_isa_debug("wavefront_count = %d\n", ndrange->wavefront_count);
	si_isa_debug("wavefronts_per_work_group = %d\n", ndrange->wavefronts_per_work_group);
	si_isa_debug(" tid tid2 tid1 tid0   gid gid2 gid1 gid0   lid lid2 lid1 lid0  wavefront            work-group\n");
	for (tid = 0; tid < ndrange->work_item_count; tid++)
	{
		work_item = ndrange->work_items[tid];
		wavefront = work_item->wavefront;
		work_group = work_item->work_group;
		si_isa_debug("%4d %4d %4d %4d  ", work_item->id, work_item->id_3d[2],
			work_item->id_3d[1], work_item->id_3d[0]);
		si_isa_debug("%4d %4d %4d %4d  ", work_group->id, work_group->id_3d[2],
			work_group->id_3d[1], work_group->id_3d[0]);
		si_isa_debug("%4d %4d %4d %4d  ", work_item->id_in_work_group, 
			work_item->id_in_work_group_3d[2], work_item->id_in_work_group_3d[1], 
			work_item->id_in_work_group_3d[0]);
		si_isa_debug("%20s.%-4d  ", wavefront->name, work_item->id_in_wavefront);
		si_isa_debug("%20s.%-4d\n", work_group->name, work_item->id_in_work_group);
	}

}