void frm_grid_setup_threads(struct frm_grid_t *grid) { struct frm_cuda_function_t *function = grid->function; struct frm_threadblock_t *threadblock; struct frm_warp_t *warp; struct frm_thread_t *thread; int bidx, bidy, bidz; /* 3D threadblock ID iterators */ int lidx, lidy, lidz; /* 3D thread local ID iterators */ int tid; /* Global ID iterator */ int bid; /* Threadblock ID iterator */ int wid; /* Warp ID iterator */ int lid; /* Local ID iterator */ /* Array of threadblocks */ grid->threadblock_count = function->group_count; grid->threadblock_id_first = 0; grid->threadblock_id_last = grid->threadblock_count - 1; grid->threadblocks = calloc(grid->threadblock_count, sizeof(void *)); for (bid = 0; bid < grid->threadblock_count; bid++) grid->threadblocks[bid] = frm_threadblock_create(); /* Array of warps */ grid->warps_per_threadblock = (function->local_size + frm_emu_warp_size - 1) / frm_emu_warp_size; grid->warp_count = grid->warps_per_threadblock * grid->threadblock_count; grid->warp_id_first = 0; grid->warp_id_last = grid->warp_count - 1; assert(grid->warps_per_threadblock > 0 && grid->warp_count > 0); grid->warps = calloc(grid->warp_count, sizeof(void *)); for (wid = 0; wid < grid->warp_count; wid++) { bid = wid / grid->warps_per_threadblock; grid->warps[wid] = frm_warp_create(); warp = grid->warps[wid]; threadblock = grid->threadblocks[bid]; warp->id = wid; warp->id_in_threadblock = wid % grid->warps_per_threadblock; warp->grid = grid; warp->threadblock = threadblock; DOUBLE_LINKED_LIST_INSERT_TAIL(threadblock, running, warp); } /* Array of threads */ grid->thread_count = function->global_size; grid->thread_id_first = 0; grid->thread_id_last = grid->thread_count - 1; grid->threads = calloc(grid->thread_count, sizeof(void *)); tid = 0; bid = 0; for (bidz = 0; bidz < function->group_count3[2]; bidz++) { for (bidy = 0; bidy < function->group_count3[1]; bidy++) { for (bidx = 0; bidx < function->group_count3[0]; bidx++) { /* Assign threadblock ID */ threadblock = grid->threadblocks[bid]; threadblock->grid = grid; threadblock->id_3d[0] = bidx; threadblock->id_3d[1] = bidy; threadblock->id_3d[2] = bidz; threadblock->id = bid; frm_threadblock_set_status(threadblock, frm_threadblock_pending); /* First, last, and number of threads in threadblock */ threadblock->thread_id_first = tid; threadblock->thread_id_last = tid + function->local_size - 1; threadblock->thread_count = function->local_size; threadblock->threads = &grid->threads[tid]; snprintf(threadblock->name, sizeof(threadblock->name), "threadblock[i%d-i%d]", threadblock->thread_id_first, threadblock->thread_id_last); /* First ,last, and number of warps in threadblock */ threadblock->warp_id_first = bid * grid->warps_per_threadblock; threadblock->warp_id_last = threadblock->warp_id_first + grid->warps_per_threadblock - 1; threadblock->warp_count = grid->warps_per_threadblock; threadblock->warps = &grid->warps[threadblock->warp_id_first]; /* Iterate through threads */ lid = 0; for (lidz = 0; lidz < function->local_size3[2]; lidz++) { for (lidy = 0; lidy < function->local_size3[1]; lidy++) { for (lidx = 0; lidx < function->local_size3[0]; lidx++) { /* Warp ID */ wid = bid * grid->warps_per_threadblock + lid / frm_emu_warp_size; assert(wid < grid->warp_count); warp = grid->warps[wid]; /* Create thread */ grid->threads[tid] = frm_thread_create(); thread = grid->threads[tid]; thread->grid = grid; /* Global IDs */ thread->id_3d[0] = bidx * function->local_size3[0] + lidx; thread->id_3d[1] = bidy * function->local_size3[1] + lidy; thread->id_3d[2] = bidz * function->local_size3[2] + lidz; thread->id = tid; /* Local IDs */ thread->id_in_threadblock_3d[0] = lidx; thread->id_in_threadblock_3d[1] = lidy; thread->id_in_threadblock_3d[2] = lidz; thread->id_in_threadblock = lid; /* Other */ thread->id_in_warp = thread->id_in_threadblock % frm_emu_warp_size; thread->threadblock = grid->threadblocks[bid]; thread->warp = grid->warps[wid]; /* First, last, and number of threads in warp */ if (!warp->thread_count) { warp->thread_id_first = tid; warp->threads = &grid->threads[tid]; } warp->thread_count++; warp->thread_id_last = tid; bit_map_set(warp->active_stack, thread->id_in_warp, 1, 1); /* Save local IDs in register R0 */ thread->sr[FRM_SR_Tid_X].v.i = lidx; /* R0.x */ thread->sr[FRM_SR_Tid_Y].v.i = lidy; /* R0.y */ thread->sr[FRM_SR_Tid_Z].v.i = lidz; /* R0.z */ /* Save threadblock IDs in register R1 */ thread->sr[FRM_SR_CTAid_X].v.i = bidx; /* R1.x */ thread->sr[FRM_SR_CTAid_Y].v.i = bidy; /* R1.y */ thread->sr[FRM_SR_CTAid_Z].v.i = bidz; /* R1.z */ /* Next thread */ tid++; lid++; } } } /* Next threadblock */ bid++; } } } /* Assign names to warps */ for (wid = 0; wid < grid->warp_count; wid++) { warp = grid->warps[wid]; snprintf(warp->name, sizeof(warp->name), "warp[i%d-i%d]", warp->thread_id_first, warp->thread_id_last); /* Initialize warp program counter */ warp->buf_start = function->function_buffer.ptr; warp->buf = warp->buf_start; warp->buf_size = function->function_buffer.size; } /* Debug */ printf("local_size = %d (%d,%d,%d)\n", function->local_size, function->local_size3[0], function->local_size3[1], function->local_size3[2]); printf("global_size = %d (%d,%d,%d)\n", function->global_size, function->global_size3[0], function->global_size3[1], function->global_size3[2]); printf("group_count = %d (%d,%d,%d)\n", function->group_count, function->group_count3[0], function->group_count3[1], function->group_count3[2]); printf("warp_count = %d\n", grid->warp_count); printf("warps_per_threadblock = %d\n", grid->warps_per_threadblock); printf(" tid tid2 tid1 tid0 bid bid2 bid1 bid0 lid lid2 lid1 lid0 warp work-group\n"); for (tid = 0; tid < grid->thread_count; tid++) { thread = grid->threads[tid]; warp = thread->warp; threadblock = thread->threadblock; printf("%4d %4d %4d %4d ", thread->id, thread->id_3d[2], thread->id_3d[1], thread->id_3d[0]); printf("%4d %4d %4d %4d ", threadblock->id, threadblock->id_3d[2], threadblock->id_3d[1], threadblock->id_3d[0]); printf("%4d %4d %4d %4d ", thread->id_in_threadblock, thread->id_in_threadblock_3d[2], thread->id_in_threadblock_3d[1], thread->id_in_threadblock_3d[0]); printf("%20s.%-4d ", warp->name, thread->id_in_warp); printf("%20s.%-4d\n", threadblock->name, thread->id_in_threadblock); } }
static void frm_grid_setup_arrays(struct frm_grid_t *grid) { struct frm_thread_block_t *thread_block; struct frm_warp_t *warp; struct frm_thread_t *thread; int bid; /* Thread block ID */ int wid; /* Warp ID iterator */ int tid; /* Thread ID iterator */ /* Create array/lists of thread blocks */ grid->thread_block_count = grid->block_count; grid->thread_blocks = (struct frm_thread_block_t **)xcalloc( grid->block_count, sizeof(struct frm_thread_block_t *)); grid->pending_thread_blocks = list_create(); grid->running_thread_blocks = list_create(); grid->finished_thread_blocks = list_create(); for (bid = 0; bid < grid->block_count; bid++) { /* Create new thread block */ thread_block = frm_thread_block_create(); grid->thread_blocks[bid] = thread_block; /* Initialize thread block */ thread_block->id = bid; snprintf(thread_block->name, sizeof(thread_block->name), "thread-block[g%d-b%d]", grid->id, thread_block->id); thread_block->grid = grid; /* Add to pending list */ list_add(grid->pending_thread_blocks, thread_block); /* Create array/lists of warps */ thread_block->warp_count = (grid->block_size + frm_emu_warp_size - 1) / frm_emu_warp_size; thread_block->warps = (struct frm_warp_t **)xcalloc( thread_block->warp_count, sizeof(struct frm_warp_t *)); thread_block->running_warps = list_create(); thread_block->finished_warps = list_create(); for (wid = 0; wid < thread_block->warp_count; wid++) { /* Create new warp */ warp = frm_warp_create(); thread_block->warps[wid] = warp; /* Initialize warp */ warp->id = wid + bid * thread_block->warp_count; warp->id_in_thread_block = wid; snprintf(warp->name, sizeof(warp->name), "warp[g%d-b%d-w%d]", grid->id, thread_block->id, warp->id_in_thread_block); warp->grid = grid; warp->thread_block = thread_block; warp->inst_buffer = grid->function->inst_buffer; warp->inst_buffer_size = grid->function->inst_buffer_size; if (wid < thread_block->warp_count - 1) warp->thread_count = frm_emu_warp_size; else warp->thread_count = grid->block_size - (thread_block->warp_count - 1) * frm_emu_warp_size; warp->threads = (struct frm_thread_t **)xcalloc( warp->thread_count, sizeof(struct frm_thread_t *)); /* Add to running list */ list_add(thread_block->running_warps, warp); } /* Create array/lists of threads */ thread_block->thread_count = grid->block_size; thread_block->threads = (struct frm_thread_t **)xcalloc( thread_block->thread_count, sizeof(struct frm_thread_t *)); for (tid = 0; tid < thread_block->thread_count; tid++) { /* Create new thread */ thread = frm_thread_create(); thread_block->threads[tid] = thread; /* Initialize thread */ thread->id = tid + bid * thread_block->thread_count; thread->id_in_warp = tid % frm_emu_warp_size; thread->id_in_thread_block = tid; thread->warp = thread_block->warps[tid / frm_emu_warp_size]; thread->thread_block = thread_block; thread->grid = grid; /* Save thread IDs in special register R0 */ thread->sr[FRM_SR_Tid_X].v.i = tid % grid->block_size3[0]; thread->sr[FRM_SR_Tid_Y].v.i = tid / grid->block_size3[0]; thread->sr[FRM_SR_Tid_Z].v.i = tid / (grid->block_size3[0] * grid->block_size3[1]); /* Save thread block IDs in special register R1 */ thread->sr[FRM_SR_CTAid_X].v.i = bid % grid->block_count3[0]; thread->sr[FRM_SR_CTAid_Y].v.i = bid / grid->block_count3[0]; thread->sr[FRM_SR_CTAid_Z].v.i = bid / (grid->block_count3[0] * grid->block_count3[1]); /* Set predicate register #7 to 1 */ thread->pr[7] = 1; /* Link thread with warp */ thread->warp->threads[thread->id_in_warp] = thread; } } }