__kernel void memset_uint4(__global int *mem, const int size, __private int val) { \n\ int tid = get_local_id(0); \n\ int bx = (get_group_id(1)) * (get_num_groups(0)) + get_group_id(0); \n\ int i = tid + (bx) * (get_local_size(0)); \n\ //debug \n\ //if (i == 0) { printf(\"memset size = %i value = %i buffer %i \\n\",size,val,mem[0]); } \n\ if (i < size ) { mem[i]=val; } \n\ }" };
size_t _CL_OVERLOADABLE get_global_id(unsigned int dimindx) { switch(dimindx) { /* TODO: add get_global_offset(X) to these! */ case 0: return get_local_size(0) * get_group_id(0) + get_local_id(0); case 1: return get_local_size(1) * get_group_id(1) + get_local_id(1); case 2: return get_local_size(2) * get_group_id(2) + get_local_id(2); default: return 0; } }
/// \fn _copyKernel /// \brief generate a copy kernel program compute::program _copyKernel(const compute::context& context) { const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE( __kernel void copy_kernel(__global const float *src, __global float *dst) { uint x = get_group_id(0) * TILE_DIM + get_local_id(0); uint y = get_group_id(1) * TILE_DIM + get_local_id(1); uint width = get_num_groups(0) * TILE_DIM; for(uint i = 0 ; i < TILE_DIM ; i+= BLOCK_ROWS) { dst[(y+i)*width +x] = src[(y+i)*width + x]; } }
static void rpc_set_delayed(rpc_t* rpc, void* c) { str group, var; int i, err; char *ch; unsigned int *group_id; if (rpc->scan(c, "SS", &group, &var) < 2) return; if (get_group_id(&group, &group_id)) { rpc->fault(c, 400, "Wrong group syntax. Use either \"group\", or \"group[id]\""); return; } if (rpc->scan(c, "d", &i) == 1) err = cfg_set_delayed_int(ctx, &group, group_id, &var, i); else if (rpc->scan(c, "s", &ch) == 1) err = cfg_set_delayed_string(ctx, &group, group_id, &var, ch); else return; /* error */ if (err) { rpc->fault(c, 400, "Failed to set the variable"); return; } }
void handle_work_checkout_event( awe_server_state * ns, tw_bf * b, awe_msg * m, tw_lp * lp) { tw_event *e; awe_msg *msg; e = codes_event_new(m->src, ns_tw_lookahead, lp); msg = tw_event_data(e); msg->event_type = WORK_CHECKOUT; memset(msg->object_id, 0, sizeof(msg->object_id)); tw_lpid client_id = m->src; // char group_name[MAX_LENGTH_GROUP]; //char lp_type_name[MAX_LENGTH_GROUP]; //int lp_type_id, grp_id, grp_rep_id, offset; // codes_mapping_get_lp_info(client_id, group_name, &grp_id, &lp_type_id, // lp_type_name, &grp_rep_id, &offset); int group_id = 0; group_id = get_group_id(client_id); /*if queue is empty, msg->object_id is "", otherwise msg->object-id is the dequeued workid*/ int got_work = 0; char workid[MAX_LENGTH_ID]; if (!g_queue_is_empty(work_queue)) { if (group_id == 1 && sched_policy>0) { //client from remote site char* work = NULL; if (sched_policy==1) { work = get_first_work_by_stage(5); //checkout task 5 (blat) only for remote site } else if (sched_policy==2) { work = get_first_work_by_greedy(WorkOrder); } if (work) { strcpy(workid, work); got_work = 1; } } else { strcpy(workid, g_queue_pop_head(work_queue)); got_work = 1; } } if (got_work) { //eligible work found, send back to the requesting client fprintf(event_log, "%lf;awe_server;%lu;WC;work=%s client=%lu\n", now_sec(lp), lp->gid, workid, m->src); assert (strlen(workid) > 10); strcpy(msg->object_id, workid); tw_event_send(e); } else { //no eligible work found, put client request to the waiting queue tw_lpid *clientid = NULL; clientid = malloc(sizeof(tw_lpid)); *clientid = m->src; g_queue_push_tail(client_req_queue, clientid); } return; }
void ComputeGradinetsRTLR_SetGradients(local float* tmpGradients, global float* gradients, global float* gradientSums) { Reduce_Sum(tmpGradients); if (get_local_id(0) == 0) { int ijValueIndex = get_group_id(0); if (gradients != null) gradients[ijValueIndex] = tmpGradients[0]; if (gradientSums != null) gradientSums[ijValueIndex] += tmpGradients[0]; } }
__kernel void kernel_scan(__global float* input, __global float* output) { int global_idx = get_global_id(0); int local_idx = get_local_id(0); int block_size = get_local_size(0); int group_id = get_group_id(0); output[global_idx] = input[global_idx]; mem_fence(CLK_GLOBAL_MEM_FENCE); for(int i = 1; i < block_size; i <<= 1) { if(global_idx >= i) output[global_idx] += output[global_idx - i]; mem_fence(CLK_GLOBAL_MEM_FENCE); } }
__kernel void kernel_reduce(__global float* input, __global float* output) { int global_idx = get_global_id(0); int local_idx = get_local_id(0); int block_size = get_local_size(0); int group_id = get_group_id(0); for(int i = block_size/2; i > 0; i >>= 1) { if(local_idx < i) input[global_idx] += input[global_idx + i]; barrier(CLK_GLOBAL_MEM_FENCE); } if(local_idx == 0) output[group_id] = input[global_idx]; }
static void rpc_get(rpc_t* rpc, void* c) { str group, var; void *val; unsigned int val_type; int ret; unsigned int *group_id; if (rpc->scan(c, "SS", &group, &var) < 2) return; if (get_group_id(&group, &group_id)) { rpc->fault(c, 400, "Wrong group syntax. Use either \"group\", or \"group[id]\""); return; } ret = cfg_get_by_name(ctx, &group, group_id, &var, &val, &val_type); if (ret < 0) { rpc->fault(c, 400, "Failed to get the variable"); return; } else if (ret > 0) { rpc->fault(c, 400, "Variable exists, but it is not readable via RPC interface"); return; } switch (val_type) { case CFG_VAR_INT: rpc->add(c, "d", (int)(long)val); break; case CFG_VAR_STRING: rpc->add(c, "s", (char *)val); break; case CFG_VAR_STR: rpc->add(c, "S", (str *)val); break; case CFG_VAR_POINTER: rpc->printf(c, "%p", val); break; } }
int client_match_work(tw_lpid client_id, char* workid) { int match = 1; //char group_name[MAX_LENGTH_GROUP]; //char lp_type_name[MAX_LENGTH_GROUP]; //codes_mapping_get_lp_info(clientid, group_name, grp_id, lp_type_id, lp_type_name, grp_rep_id, offset); int group_id = 0; group_id = get_group_id(client_id); if (group_id == 1) { //remote client gchar **seg = g_strsplit(workid, "_", 3); int taskid = atoi(seg[1]); if (taskid != 5) { match = 0; } } return match; }
static void rpc_del_delayed(rpc_t* rpc, void* c) { str group, var; unsigned int *group_id; if (rpc->scan(c, "SS", &group, &var) < 2) return; if (get_group_id(&group, &group_id) || !group_id) { rpc->fault(c, 400, "Wrong group syntax. Use \"group[id]\""); return; } if (cfg_del_delayed(ctx, &group, group_id, &var)) { rpc->fault(c, 400, "Failed to delete the value"); return; } }
static void rpc_del_group_inst(rpc_t* rpc, void* c) { str group; unsigned int *group_id; if (rpc->scan(c, "S", &group) < 1) return; if (get_group_id(&group, &group_id) || !group_id) { rpc->fault(c, 400, "Wrong group syntax. Use \"group[id]\""); return; } if (cfg_del_group_inst(ctx, &group, *group_id)) { rpc->fault(c, 400, "Failed to delete the group instance"); return; } }
static void rpc_set_now_int(rpc_t* rpc, void* c) { str group, var; int i; unsigned int *group_id; if (rpc->scan(c, "SSd", &group, &var, &i) < 3) return; if (get_group_id(&group, &group_id)) { rpc->fault(c, 400, "Wrong group syntax. Use either \"group\", or \"group[id]\""); return; } if (cfg_set_now_int(ctx, &group, group_id, &var, i)) { rpc->fault(c, 400, "Failed to set the variable"); return; } }
static void rpc_set_delayed_string(rpc_t* rpc, void* c) { str group, var; char *ch; unsigned int *group_id; if (rpc->scan(c, "SSs", &group, &var, &ch) < 3) return; if (get_group_id(&group, &group_id)) { rpc->fault(c, 400, "Wrong group syntax. Use either \"group\", or \"group[id]\""); return; } if (cfg_set_delayed_string(ctx, &group, group_id, &var, ch)) { rpc->fault(c, 400, "Failed to set the variable"); return; } }
inline global float* GetPValuesPtr(global float* pValuesOfWeights, int uLayersCount, int maxULayerSize, int kLayerIndex) { int ijValueIndex = get_group_id(0); return pValuesOfWeights + (ijValueIndex * uLayersCount * maxULayerSize) + (kLayerIndex * maxULayerSize); }
kernel void ComputeGradientsRTLR_V0_CPU( global float* pValuesOfWeights , int uLayersCount , int maxULayerSize , int p_i_j_l_LayerIndex_0_0 , int p_i_j_l_LayerSize_0_0 , global float$* weights_0_0 , int p_i_j_l_LayerIndex_1_0 , int p_i_j_l_LayerSize_1_0 , global float$* weights_1_0 , int p_i_j_l_LayerIndex_2_0 , int p_i_j_l_LayerSize_2_0 , global float$* weights_2_0 , int p_i_j_l_LayerIndex_3_0 , int p_i_j_l_LayerSize_3_0 , global float$* weights_3_0 , int p_i_j_k_LayerSize_0 , global float* netDerivValues_0 , int p_i_j_l_LayerIndex_0_1 , int p_i_j_l_LayerSize_0_1 , global float$* weights_0_1 , int p_i_j_l_LayerIndex_1_1 , int p_i_j_l_LayerSize_1_1 , global float$* weights_1_1 , int p_i_j_l_LayerIndex_2_1 , int p_i_j_l_LayerSize_2_1 , global float$* weights_2_1 , int p_i_j_l_LayerIndex_3_1 , int p_i_j_l_LayerSize_3_1 , global float$* weights_3_1 , int p_i_j_k_LayerSize_1 , global float* netDerivValues_1 , int p_i_j_l_LayerIndex_0_2 , int p_i_j_l_LayerSize_0_2 , global float$* weights_0_2 , int p_i_j_l_LayerIndex_1_2 , int p_i_j_l_LayerSize_1_2 , global float$* weights_1_2 , int p_i_j_l_LayerIndex_2_2 , int p_i_j_l_LayerSize_2_2 , global float$* weights_2_2 , int p_i_j_l_LayerIndex_3_2 , int p_i_j_l_LayerSize_3_2 , global float$* weights_3_2 , int p_i_j_k_LayerSize_2 , global float* netDerivValues_2 , int p_i_j_l_LayerIndex_0_3 , int p_i_j_l_LayerSize_0_3 , global float$* weights_0_3 , int p_i_j_l_LayerIndex_1_3 , int p_i_j_l_LayerSize_1_3 , global float$* weights_1_3 , int p_i_j_l_LayerIndex_2_3 , int p_i_j_l_LayerSize_2_3 , global float$* weights_2_3 , int p_i_j_l_LayerIndex_3_3 , int p_i_j_l_LayerSize_3_3 , global float$* weights_3_3 , int p_i_j_k_LayerSize_3 , global float* netDerivValues_3 , int iLayerIndex , global float* inputs , int inputsSize // + bias (null) = 1, inputs: size , global float* outputs , global float* desiredOutputs , local float* tmpGradients // size = local size , global float* gradients , global float* gradientSums) { int localId = get_local_id(0); int localSize = get_local_size(0); int ijValueIndex = get_group_id(0); int iValueIndex = ijValueIndex / inputsSize; int jValueIndex = ijValueIndex % inputsSize; tmpGradients[localId] = 0.0f; barrier(CLK_LOCAL_MEM_FENCE); // Local size ~ avg uLayerSize for (int kLayerIndex = 0; kLayerIndex < uLayersCount; kLayerIndex++) { int kLayerSize = PickIntValueByLayerIndex(p_i_j_k_LayerSize_0, p_i_j_k_LayerSize_1, p_i_j_k_LayerSize_2, p_i_j_k_LayerSize_3, kLayerIndex); bool computeGradient = (kLayerIndex == uLayersCount - 1) && outputs != null && desiredOutputs != null; int block = kLayerSize / localSize + (kLayerSize % localSize != 0 ? 1 : 0); int kValueIndex = localId * block; int max = kValueIndex + block; if (max > kLayerSize) max = kLayerSize; while (kValueIndex < max) { float sum = (iLayerIndex == kLayerIndex && iValueIndex == kValueIndex) ? (inputs != null ? inputs[jValueIndex] : 1.0f) : 0.0f; int p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_0_0, p_i_j_l_LayerIndex_0_1, p_i_j_l_LayerIndex_0_2, p_i_j_l_LayerIndex_0_3, kLayerIndex); int p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_0_0, p_i_j_l_LayerSize_0_1, p_i_j_l_LayerSize_0_2, p_i_j_l_LayerSize_0_3, kLayerIndex); global float$* weights = PickFPValueByLayerIndex$(weights_0_0, weights_0_1, weights_0_2, weights_0_3, kLayerIndex); sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex); p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_1_0, p_i_j_l_LayerIndex_1_1, p_i_j_l_LayerIndex_1_2, p_i_j_l_LayerIndex_1_3, kLayerIndex); if (p_i_j_l_LayerIndex != -1) { p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_1_0, p_i_j_l_LayerSize_1_1, p_i_j_l_LayerSize_1_2, p_i_j_l_LayerSize_1_3, kLayerIndex); weights = PickFPValueByLayerIndex$(weights_1_0, weights_1_1, weights_1_2, weights_1_3, kLayerIndex); sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex); } p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_2_0, p_i_j_l_LayerIndex_2_1, p_i_j_l_LayerIndex_2_2, p_i_j_l_LayerIndex_2_3, kLayerIndex); if (p_i_j_l_LayerIndex != -1) { p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_2_0, p_i_j_l_LayerSize_2_1, p_i_j_l_LayerSize_2_2, p_i_j_l_LayerSize_2_3, kLayerIndex); weights = PickFPValueByLayerIndex$(weights_2_0, weights_2_1, weights_2_2, weights_2_3, kLayerIndex); sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex); } p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_3_0, p_i_j_l_LayerIndex_3_1, p_i_j_l_LayerIndex_3_2, p_i_j_l_LayerIndex_3_3, kLayerIndex); if (p_i_j_l_LayerIndex != -1) { p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_3_0, p_i_j_l_LayerSize_3_1, p_i_j_l_LayerSize_3_2, p_i_j_l_LayerSize_3_3, kLayerIndex); weights = PickFPValueByLayerIndex$(weights_3_0, weights_3_1, weights_3_2, weights_3_3, kLayerIndex); sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex); } global float* netDerivValues = PickFPValueByLayerIndex(netDerivValues_0, netDerivValues_1, netDerivValues_2, netDerivValues_3, kLayerIndex); float p = netDerivValues[kValueIndex] * sum; GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, kLayerIndex)[kValueIndex] = p; if (computeGradient) tmpGradients[localId] += (desiredOutputs[kValueIndex] - outputs[kValueIndex]) * p; kValueIndex++; } barrier(CLK_LOCAL_MEM_FENCE); } if (gradients != null || gradientSums != null) { ComputeGradinetsRTLR_SetGradients(tmpGradients, gradients, gradientSums); } /*int pValuesOfWeightsSize2 = uLayersCount * maxULayerSize; int block = pValuesOfWeightsSize2 / localSize + (pValuesOfWeightsSize2 % localSize != 0 ? 1 : 0); int kLayerAndValueIndex = localId * block; int max = kLayerAndValueIndex + block; if (max > pValuesOfWeightsSize2) max = pValuesOfWeightsSize2; while (kLayerAndValueIndex < max) { int kLayerIndex = kLayerAndValueIndex / maxULayerSize; int kValueIndex = kLayerAndValueIndex % maxULayerSize; int kLayerSize = PickIntValueByLayerIndex(p_i_j_k_LayerSize_0, p_i_j_k_LayerSize_1, p_i_j_k_LayerSize_2, p_i_j_k_LayerSize_3, kLayerIndex); if (kValueIndex < kLayerSize) { bool computeGradient = (kLayerIndex == uLayersCount - 1) && outputs != null && desiredOutputs != null; float sum = (iLayerIndex == kLayerIndex && iValueIndex == kValueIndex) ? (inputs != null ? inputs[jValueIndex] : 1.0f) : 0.0f; int p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_0_0, p_i_j_l_LayerIndex_0_1, p_i_j_l_LayerIndex_0_2, p_i_j_l_LayerIndex_0_3, kLayerIndex); int p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_0_0, p_i_j_l_LayerSize_0_1, p_i_j_l_LayerSize_0_2, p_i_j_l_LayerSize_0_3, kLayerIndex); global float$* weights = PickFPValueByLayerIndex$(weights_0_0, weights_0_1, weights_0_2, weights_0_3, kLayerIndex); sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex); p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_1_0, p_i_j_l_LayerIndex_1_1, p_i_j_l_LayerIndex_1_2, p_i_j_l_LayerIndex_1_3, kLayerIndex); if (p_i_j_l_LayerIndex != -1) { p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_1_0, p_i_j_l_LayerSize_1_1, p_i_j_l_LayerSize_1_2, p_i_j_l_LayerSize_1_3, kLayerIndex); weights = PickFPValueByLayerIndex$(weights_1_0, weights_1_1, weights_1_2, weights_1_3, kLayerIndex); sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex); } p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_2_0, p_i_j_l_LayerIndex_2_1, p_i_j_l_LayerIndex_2_2, p_i_j_l_LayerIndex_2_3, kLayerIndex); if (p_i_j_l_LayerIndex != -1) { p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_2_0, p_i_j_l_LayerSize_2_1, p_i_j_l_LayerSize_2_2, p_i_j_l_LayerSize_2_3, kLayerIndex); weights = PickFPValueByLayerIndex$(weights_2_0, weights_2_1, weights_2_2, weights_2_3, kLayerIndex); sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex); } p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_3_0, p_i_j_l_LayerIndex_3_1, p_i_j_l_LayerIndex_3_2, p_i_j_l_LayerIndex_3_3, kLayerIndex); if (p_i_j_l_LayerIndex != -1) { p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_3_0, p_i_j_l_LayerSize_3_1, p_i_j_l_LayerSize_3_2, p_i_j_l_LayerSize_3_3, kLayerIndex); weights = PickFPValueByLayerIndex$(weights_3_0, weights_3_1, weights_3_2, weights_3_3, kLayerIndex); sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex); } global float* netDerivValues = PickFPValueByLayerIndex(netDerivValues_0, netDerivValues_1, netDerivValues_2, netDerivValues_3, kLayerIndex); float p = netDerivValues[kValueIndex] * sum; GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, kLayerIndex)[kValueIndex] = p; if (computeGradient) tmpGradients[localId] += (desiredOutputs[kValueIndex] - outputs[kValueIndex]) * p; } kLayerAndValueIndex++; }*/ }
uint na) { int i, j; double Ystx = 0; __local double *y = 0; double switcher; double neg_switcher; // Thread index int tx = get_local_id(0); // Thread index int gx = get_global_id(0); // Block index int bx = get_group_id(0); A = A + offA; __global const double *Aoff = A + bx*lda*BLOCK_SIZE + bx*BLOCK_SIZE; int NumBLperNB = NB / BLOCK_SIZE; d_dinvA += bx / NumBLperNB*NB*NB + (bx % NumBLperNB)*(NB*BLOCK_SIZE + BLOCK_SIZE); __local double Bs[BLOCK_SIZE*BLOCK_SIZE]; __local double workspace[BLOCK_SIZE]; // workspace used to store the current working column // load A _Pragma("unroll") for (i = 0; i < BLOCK_SIZE; i++) { if (tx <= i && i + bx*BLOCK_SIZE < na)
uint offsetB, uint offsetC) { float rC[4][4] = { {(float)0} }; float rA[1][4]; float rB[1][4]; A += offsetA; B += offsetB; C+=offsetC; __local float lA[1056]; __local float lB[1056]; uint gidx = get_group_id(0); uint gidy = get_group_id(1); uint idx = get_local_id(0); uint idy = get_local_id(1); uint idt = 16*idy + idx; uint idxT = idt % 16; uint idyT = idt / 16; A += gidx*64*lda+ idxT + idyT*lda; B += gidy*64*ldb+ idxT + idyT*ldb; uint block_k = K >> 4; do {
Group * ComputeGroupExternally(Group *group) { int i; int size = group->my_corpus->size; int cutoff_freq = group->cutoff_frequency; char temporary_name[TEMP_FILENAME_BUFSIZE]; FILE *fd; FILE *pipe; char sort_call[CL_MAX_LINE_LENGTH]; /* ---------------------------------------------------------------------- */ if ((fd = open_temporary_file(temporary_name)) == NULL) { perror("Error while opening temporary file"); cqpmessage(Warning, "Can't open temporary file"); return group; } for (i = 0; i < size; i++) { fprintf(fd, "%d %d\n", get_group_id(group, i, 0), get_group_id(group, i, 1)); /* (source ID, target ID) */ } fclose(fd); /* construct sort call */ sprintf(sort_call, ExternalGroupingCommand, temporary_name); if (GROUP_DEBUG) Rprintf( "Running grouping sort: \n\t%s\n", sort_call); if ((pipe = popen(sort_call, "r")) == NULL) { perror("Failure opening grouping pipe"); cqpmessage(Warning, "Can't open grouping pipe:\n%s\n" "Disable external grouping by\n" " set UseExternalGrouping off;", sort_call); } else { int freq, p1, p2, tokens; #define GROUP_REALLOC 16 while ((tokens = fscanf(pipe, "%d%d%d", &freq, &p1, &p2)) == 3) { if (freq > cutoff_freq) { if ((group->nr_cells % GROUP_REALLOC) == 0) { if (group->count_cells == NULL) { group->count_cells = (ID_Count_Mapping *)cl_malloc(GROUP_REALLOC * sizeof(ID_Count_Mapping)); } else { group->count_cells = (ID_Count_Mapping *)cl_realloc(group->count_cells, (group->nr_cells + GROUP_REALLOC) * sizeof(ID_Count_Mapping)); } assert(group->count_cells); } group->count_cells[group->nr_cells].s = p1; group->count_cells[group->nr_cells].t = p2; group->count_cells[group->nr_cells].freq = freq; group->nr_cells = group->nr_cells + 1; } } if (tokens != EOF) { Rprintf( "Warning: could not reach EOF of temporary file!\n"); } pclose(pipe); } if (GROUP_DEBUG) { Rprintf( "Keeping temporary file %s -- delete manually\n", temporary_name); } else if (unlink(temporary_name) != 0) { perror(temporary_name); Rprintf( "Can't remove temporary file %s -- \n\tI will continue, " "but you should remove that file.\n", temporary_name); } return group; }
Group * ComputeGroupInternally(Group *group) { ID_Count_Mapping node; ID_Count_Mapping *result; int i; size_t nr_nodes; int percentage, new_percentage; /* for ProgressBar */ int size = group->my_corpus->size; /* ---------------------------------------------------------------------- */ nr_nodes = 0; if (progress_bar) progress_bar_clear_line(); percentage = -1; EvaluationIsRunning = 1; for (i = 0; i < size; i++) { if (! EvaluationIsRunning) break; /* user abort (Ctrl-C) */ if (progress_bar) { new_percentage = floor(0.5 + (100.0 * i) / size); if (new_percentage > percentage) { percentage = new_percentage; progress_bar_percentage(1, 2, percentage); } } node.s = get_group_id(group, i, 0); /* source ID */ node.t = get_group_id(group, i, 1); /* target ID */ node.freq = 0; result = binsert_g(&node, (void **) &(group->count_cells), &nr_nodes, sizeof(ID_Count_Mapping), compare_st_cells); result->freq++; } if (EvaluationIsRunning) { group->nr_cells = sum_freqs(group->count_cells, nr_nodes, group->cutoff_frequency); if (progress_bar) progress_bar_clear_line(); if (group->nr_cells < nr_nodes) group->count_cells = cl_realloc(group->count_cells, (group->nr_cells * sizeof(ID_Count_Mapping))); } else { cqpmessage(Warning, "Group operation aborted by user."); if (which_app == cqp) install_signal_handler(); free_group(&group); /* sets return value to NULL to indicate failure */ } EvaluationIsRunning = 0; return group; }
__kernel void TRIPLE_DGEMM_UPDATE_192_96_PART2_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na) { // Ain is the non inverse matrix; the size of Ain is lda * na // offAin is the offset of Ain // d_dinvA is the inversed matrix. the size of d_invA is NB * (na-1)/NB + 1 // blk is subblock size, which is 48 here. // lda in leading dimension. Column major here // npages = (na-1)/12*2 + 1; for 96 this is 1 for 192 this is 2 //Work group size is [24, 2] //global work size is [48*number of blocks, 4] //each work item in each work group is responsible for 12 elements (1/4) in that row //each work group is responsible for 24 by 24 macro tile; ////////////// -invA11*invA12 const uint gidx = get_group_id(0); const uint gidy = get_group_id(1); const uint idx = get_local_id(0); const uint idy = get_local_id(1); //uint page = gidx / 2;//0-1 for 192; 0 for 96 //const uint page = (gidx/2)%2;//index of page within a page_block; 1 pages per page_block const uint page_block = gidx / 4; //#index of page_block; 4 WG per page; 4 WG per page_block __global double *A, *B, *C; __local double lA[24][48]; __local double lB[48][24]; double privateC[12] = { (double)0 }; //decide invA11 location for each page //each workgroup loads half of A (left or right) //A = d_dinvA + page*NB*NB + gidx%2*(blk/2); A = d_dinvA + page_block*NB*NB + gidx % 4 * (blk / 4); //decide invA12 (B) location for each page //actually it was saved in invA21 from last kernel //each workgroup loads half of B (up or down) //B = d_dinvA + page*NB*NB + blk*NB + gidy*(blk/2)*NB; B = d_dinvA + page_block*NB*NB + blk*NB + gidy*(blk / 4)*NB; //decide invA12 location for each page //each workgroup writes 1/4 of C //C = d_dinvA + page*NB*NB + blk * NB + gidx%2*(blk/2) + gidy*(blk/2)*NB; C = d_dinvA + page_block*NB*NB + blk*NB + gidx % 4 * (blk / 4) + gidy*(blk / 4)*NB; //read A and B into LDS no transpose operated here //each work item loads a half row of A and half column of B //idx 0-23 idy 0-1 uint block_k = blk / 48; //thus we need 2 iterations here do{ barrier(CLK_LOCAL_MEM_FENCE); lA[idx][0 + idy * 24] = A[idx + idy * 24 * NB]; lA[idx][1 + idy * 24] = A[idx + NB + idy * 24 * NB]; lA[idx][2 + idy * 24] = A[idx + NB * 2 + idy * 24 * NB]; lA[idx][3 + idy * 24] = A[idx + NB * 3 + idy * 24 * NB]; lA[idx][4 + idy * 24] = A[idx + NB * 4 + idy * 24 * NB]; lA[idx][5 + idy * 24] = A[idx + NB * 5 + idy * 24 * NB]; lA[idx][6 + idy * 24] = A[idx + NB * 6 + idy * 24 * NB]; lA[idx][7 + idy * 24] = A[idx + NB * 7 + idy * 24 * NB]; lA[idx][8 + idy * 24] = A[idx + NB * 8 + idy * 24 * NB]; lA[idx][9 + idy * 24] = A[idx + NB * 9 + idy * 24 * NB]; lA[idx][10 + idy * 24] = A[idx + NB * 10 + idy * 24 * NB]; lA[idx][11 + idy * 24] = A[idx + NB * 11 + idy * 24 * NB]; lA[idx][12 + idy * 24] = A[idx + NB * 12 + idy * 24 * NB]; lA[idx][13 + idy * 24] = A[idx + NB * 13 + idy * 24 * NB]; lA[idx][14 + idy * 24] = A[idx + NB * 14 + idy * 24 * NB]; lA[idx][15 + idy * 24] = A[idx + NB * 15 + idy * 24 * NB]; lA[idx][16 + idy * 24] = A[idx + NB * 16 + idy * 24 * NB]; lA[idx][17 + idy * 24] = A[idx + NB * 17 + idy * 24 * NB]; lA[idx][18 + idy * 24] = A[idx + NB * 18 + idy * 24 * NB]; lA[idx][19 + idy * 24] = A[idx + NB * 19 + idy * 24 * NB]; lA[idx][20 + idy * 24] = A[idx + NB * 20 + idy * 24 * NB]; lA[idx][21 + idy * 24] = A[idx + NB * 21 + idy * 24 * NB]; lA[idx][22 + idy * 24] = A[idx + NB * 22 + idy * 24 * NB]; lA[idx][23 + idy * 24] = A[idx + NB * 23 + idy * 24 * NB]; lB[0 + idy * 24][idx] = B[idx*NB + idy * 24]; lB[1 + idy * 24][idx] = B[idx*NB + idy * 24 + 1]; lB[2 + idy * 24][idx] = B[idx*NB + idy * 24 + 2]; lB[3 + idy * 24][idx] = B[idx*NB + idy * 24 + 3]; lB[4 + idy * 24][idx] = B[idx*NB + idy * 24 + 4]; lB[5 + idy * 24][idx] = B[idx*NB + idy * 24 + 5]; lB[6 + idy * 24][idx] = B[idx*NB + idy * 24 + 6]; lB[7 + idy * 24][idx] = B[idx*NB + idy * 24 + 7]; lB[8 + idy * 24][idx] = B[idx*NB + idy * 24 + 8]; lB[9 + idy * 24][idx] = B[idx*NB + idy * 24 + 9]; lB[10 + idy * 24][idx] = B[idx*NB + idy * 24 + 10]; lB[11 + idy * 24][idx] = B[idx*NB + idy * 24 + 11]; lB[12 + idy * 24][idx] = B[idx*NB + idy * 24 + 12]; lB[13 + idy * 24][idx] = B[idx*NB + idy * 24 + 13]; lB[14 + idy * 24][idx] = B[idx*NB + idy * 24 + 14]; lB[15 + idy * 24][idx] = B[idx*NB + idy * 24 + 15]; lB[16 + idy * 24][idx] = B[idx*NB + idy * 24 + 16]; lB[17 + idy * 24][idx] = B[idx*NB + idy * 24 + 17]; lB[18 + idy * 24][idx] = B[idx*NB + idy * 24 + 18]; lB[19 + idy * 24][idx] = B[idx*NB + idy * 24 + 19]; lB[20 + idy * 24][idx] = B[idx*NB + idy * 24 + 20]; lB[21 + idy * 24][idx] = B[idx*NB + idy * 24 + 21]; lB[22 + idy * 24][idx] = B[idx*NB + idy * 24 + 22]; lB[23 + idy * 24][idx] = B[idx*NB + idy * 24 + 23]; barrier(CLK_LOCAL_MEM_FENCE); //do math uint i = 0; do{ privateC[0] = mad(lA[idx][i], lB[i][0 + idy * 12], privateC[0]); privateC[1] = mad(lA[idx][i], lB[i][1 + idy * 12], privateC[1]); privateC[2] = mad(lA[idx][i], lB[i][2 + idy * 12], privateC[2]); privateC[3] = mad(lA[idx][i], lB[i][3 + idy * 12], privateC[3]); privateC[4] = mad(lA[idx][i], lB[i][4 + idy * 12], privateC[4]); privateC[5] = mad(lA[idx][i], lB[i][5 + idy * 12], privateC[5]); privateC[6] = mad(lA[idx][i], lB[i][6 + idy * 12], privateC[6]); privateC[7] = mad(lA[idx][i], lB[i][7 + idy * 12], privateC[7]); privateC[8] = mad(lA[idx][i], lB[i][8 + idy * 12], privateC[8]); privateC[9] = mad(lA[idx][i], lB[i][9 + idy * 12], privateC[9]); privateC[10] = mad(lA[idx][i], lB[i][10 + idy * 12], privateC[10]); privateC[11] = mad(lA[idx][i], lB[i][11 + idy * 12], privateC[11]); i = i + 1; } while (i < 48); A += 48 * NB; B += 48; } while (--block_k>0); uint i = 0; do{ C[NB*idy * 12 + NB*i + idx] = -1 * privateC[i]; i = i + 1; } while (i < 12); }
__kernel void TRIPLE_DGEMM_UPDATE_192_24_PART1_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, uint lda, int npages, int na) { // Ain is the non inverse matrix; the size of Ain is lda * na // offAin is the offset of Ain // d_dinvA is the inversed matrix. the size of d_invA is NB * (na-1)/NB + 1 // blk is subblock size, which is 24 here. // lda in leading dimension. Column major here // npages = (na-1)/12*2 + 1; for 96 this is 2 for 192 this is 4 //Work group size is [24, 2] //global work size is [96*number of blocks, 2] //each work item in each work group is responsible for 12 elements (half) in that row //each work group is responsible for one gemm; ////////////// A12*invA22 const uint gidx = get_group_id(0); const uint gidy = get_group_id(1); const uint idx = get_local_id(0); const uint idy = get_local_id(1); const uint page = gidx % npages; //0-3 for 192; 0-1 for 96 const uint page_block = page / 4; //4 pages per page block __global double *B, *C; __local double lA[24][24]; __local double lB[24][24]; double privateC[12] = { (double)0 }; //decide A12 location for each page Ain = Ain + offAin; Ain += (page*blk * 2 + blk) * lda + page * 2 * blk; //decide invA22 (B) location for each page B = d_dinvA + page_block*NB*NB + ((page % 4)*blk * 2 + blk) * NB + (page % 4) * 2 * blk + blk; //decide invA12 location for each page C = d_dinvA + page_block*NB*NB + ((page % 4)*blk * 2 + blk) * NB + (page % 4) * 2 * blk; //read A and B into LDS no transpose operated here //each work iteam loads half a row lA[idx][0 + idy * 12] = Ain[idx + idy * 12 * lda]; lA[idx][1 + idy * 12] = Ain[idx + lda + idy * 12 * lda]; lA[idx][2 + idy * 12] = Ain[idx + lda * 2 + idy * 12 * lda]; lA[idx][3 + idy * 12] = Ain[idx + lda * 3 + idy * 12 * lda]; lA[idx][4 + idy * 12] = Ain[idx + lda * 4 + idy * 12 * lda]; lA[idx][5 + idy * 12] = Ain[idx + lda * 5 + idy * 12 * lda]; lA[idx][6 + idy * 12] = Ain[idx + lda * 6 + idy * 12 * lda]; lA[idx][7 + idy * 12] = Ain[idx + lda * 7 + idy * 12 * lda]; lA[idx][8 + idy * 12] = Ain[idx + lda * 8 + idy * 12 * lda]; lA[idx][9 + idy * 12] = Ain[idx + lda * 9 + idy * 12 * lda]; lA[idx][10 + idy * 12] = Ain[idx + lda * 10 + idy * 12 * lda]; lA[idx][11 + idy * 12] = Ain[idx + lda * 11 + idy * 12 * lda]; lB[idx][0 + idy * 12] = B[idx + idy * 12 * NB]; lB[idx][1 + idy * 12] = B[idx + NB + idy * 12 * NB]; lB[idx][2 + idy * 12] = B[idx + NB * 2 + idy * 12 * NB]; lB[idx][3 + idy * 12] = B[idx + NB * 3 + idy * 12 * NB]; lB[idx][4 + idy * 12] = B[idx + NB * 4 + idy * 12 * NB]; lB[idx][5 + idy * 12] = B[idx + NB * 5 + idy * 12 * NB]; lB[idx][6 + idy * 12] = B[idx + NB * 6 + idy * 12 * NB]; lB[idx][7 + idy * 12] = B[idx + NB * 7 + idy * 12 * NB]; lB[idx][8 + idy * 12] = B[idx + NB * 8 + idy * 12 * NB]; lB[idx][9 + idy * 12] = B[idx + NB * 9 + idy * 12 * NB]; lB[idx][10 + idy * 12] = B[idx + NB * 10 + idy * 12 * NB]; lB[idx][11 + idy * 12] = B[idx + NB * 11 + idy * 12 * NB]; barrier(CLK_LOCAL_MEM_FENCE); //do math uint i = 0; do{ privateC[0] = mad(lA[idx][i], lB[i][0 + idy * 12], privateC[0]); privateC[1] = mad(lA[idx][i], lB[i][1 + idy * 12], privateC[1]); privateC[2] = mad(lA[idx][i], lB[i][2 + idy * 12], privateC[2]); privateC[3] = mad(lA[idx][i], lB[i][3 + idy * 12], privateC[3]); privateC[4] = mad(lA[idx][i], lB[i][4 + idy * 12], privateC[4]); privateC[5] = mad(lA[idx][i], lB[i][5 + idy * 12], privateC[5]); privateC[6] = mad(lA[idx][i], lB[i][6 + idy * 12], privateC[6]); privateC[7] = mad(lA[idx][i], lB[i][7 + idy * 12], privateC[7]); privateC[8] = mad(lA[idx][i], lB[i][8 + idy * 12], privateC[8]); privateC[9] = mad(lA[idx][i], lB[i][9 + idy * 12], privateC[9]); privateC[10] = mad(lA[idx][i], lB[i][10 + idy * 12], privateC[10]); privateC[11] = mad(lA[idx][i], lB[i][11 + idy * 12], privateC[11]); i = i + 1; } while (i < 24); i = 0; do{ C[NB*idy * 12 + NB*i + idx] = privateC[i]; i = i + 1; } while (i < 12); }
std::string marathon_app::get_group_id() const { return get_group_id(get_id()); }
uint offsetC) { float rC[2][4] = { (float)0 }; float rA[1][2]; float rB[1][4]; A += offsetA; B += offsetB; C += offsetC; __local float lA[528];//16*32+16 __local float lB[1040];//16*64+16 uint gidx = M / 64;//get_group_id(0); uint gidy = get_group_id(1); uint idx = get_local_id(0); uint idy = get_local_id(1); int CurrentOffSetA = gidx * 64 + idx; A += gidx * 64 + idx + idy*lda; B += gidy * 64 + idx + idy*ldb; uint block_k = K >> 4; do { __local float* plA = lA + idy * 33 + idx; __local float* plB = lB + idy * 65 + idx;
kernel void convolute(int4 imagesize, global unsigned char *input, global unsigned char *output, global kernf *filterG) { int4 gid = (int4)(get_global_id(0)*CONV_UNROLL, get_global_id(1), get_global_id(2), 0); int4 lid = (int4)(get_local_id(0), get_local_id(1), get_local_id(2), 0); int4 group = (int4)(get_group_id(0), get_group_id(1), get_group_id(2), 0); // First (?) pixel to process with this kernel int4 pixelid = gid; // Starting offset of the first pixel to process int imoffset = pixelid.s0 + imagesize.s0 * pixelid.s1 + imagesize.s0 * imagesize.s1 * pixelid.s2; int i,j; int dx,dy,dz; /* MAD performs a single convolution operation for each kernel, using the current 'raw' value as the input image 'ko' as an instance of an unrolled convolution filter 'pos' as the X-offset for each of the unrolled convolution filters Note that all the if statements dependent only on static values - meaning that they can be optimized away by the compiler */ #define MAD(ko,pos) {if(CONV_UNROLL>ko) { \ if(pos-ko >= 0 && pos-ko < kernsize) { \ val[ko] = mmad(val[ko],(kernf)(raw),filter[(pos-ko)+offset]); \ }}} #define MADS(pos) {if(pos<kernsize) { \ raw=input[imoffset2+pos]; \ MAD(0,pos); MAD(1,pos); MAD(2,pos); MAD(3,pos); MAD(4,pos); MAD(5,pos); MAD(6,pos); MAD(7,pos); \ MAD(8,pos); MAD(9,pos); MAD(10,pos); MAD(11,pos); MAD(12,pos); MAD(13,pos); MAD(14,pos); MAD(15,pos); \ MAD(16,pos); MAD(17,pos); MAD(18,pos); MAD(19,pos); MAD(20,pos); MAD(21,pos); MAD(22,pos); MAD(23,pos); \ MAD(24,pos); MAD(25,pos); MAD(26,pos); MAD(27,pos); MAD(28,pos); MAD(29,pos); MAD(30,pos); MAD(31,pos); \ MAD(32,pos); MAD(33,pos); MAD(34,pos); MAD(35,pos); MAD(36,pos); MAD(37,pos); MAD(38,pos); MAD(39,pos); \ }} kernf val[CONV_UNROLL]; for(j=0;j<CONV_UNROLL;j++) val[j]=(kernf)(0.0); int localSize = get_local_size(0) * get_local_size(1) * get_local_size(2); local kernf filter[kernsize*kernsize*kernsize]; /* Copy global filter to local memory */ event_t event = async_work_group_copy(filter,filterG,kernsize*kernsize*kernsize,0); wait_group_events(1, &event); if(gid.s0 + kernsize + CONV_UNROLL > imagesize.s0 || gid.s1 + kernsize > imagesize.s1 || gid.s2 + kernsize > imagesize.s2) return; for(dz=0;dz<kernsize;dz++) for(dy=0;dy<kernsize;dy++) { int offset = dy*kernsize*nkernels + dz*kernsize*kernsize*nkernels; int imoffset2 = imoffset+dy*imagesize.s0 + dz*imagesize.s0*imagesize.s1; unsigned char raw; /* kernsize + convolution_unroll < 42 */ MADS(0); MADS(1); MADS(2); MADS(3); MADS(4); MADS(5); MADS(6); MADS(7); MADS(8); MADS(9); MADS(10); MADS(11); MADS(12); MADS(13); MADS(14); MADS(15); MADS(16); MADS(17); MADS(18); MADS(19); MADS(20); MADS(21); MADS(22); MADS(23); MADS(24); MADS(25); MADS(26); MADS(27); MADS(28); MADS(29); MADS(30); MADS(31); MADS(32); MADS(33); MADS(34); MADS(35); MADS(36); MADS(37); MADS(38); MADS(39); MADS(40); MADS(41); } for(j=0;j<CONV_UNROLL;j++) { kernstore( convert_kernuc(val[j]), imoffset+j, output); } }
__kernel void TRIPLE_DGEMM_UPDATE_192_48_PART1_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n {\n // Ain is the non inverse matrix; the size of Ain is lda * na // offAin is the offset of Ain // d_dinvA is the inversed matrix. the size of d_invA is NB * (na-1)/NB + 1 // blk is subblock size, which is 48 here. // lda in leading dimension. Column major here // npages = (na-1)/12*2 + 1; for 96 this is 1 for 192 this is 2 //Work group size is [24, 2] //global work size is [96*number of blocks, 4] //each work item in each work group is responsible for 12 elements (1/4) in that row //each work group is responsible for 24 by 24 macro tile; ////////////// A12*invA22 const uint gidx = get_group_id(0);\n const uint gidy = get_group_id(1);\n const uint idx = get_local_id(0);\n const uint idy = get_local_id(1);\n //uint page = gidx / 2;//0-1 for 192; 0 for 96 const uint page = (gidx / 2) % 2; \n//index of page within a page_block; 2 pages per page_block const uint page_block = gidx / 4; \n//#index of page_block; 2 WG per page; 4 WG per page_block __global double *B, *C; \n __local double lA[24][48]; \n __local double lB[48][24]; \n double privateC[12] = { (double)0 }; \n //decide A12 location for each page //each workgroup loads half of A (left or right) Ain = Ain + offAin; \n Ain += page_block*NB*lda + page_block*NB + page*blk * 2 * lda + page*blk * 2 + blk*lda + gidx % 2 * (blk / 2); \n //decide invA22 (B) location for each page //each workgroup loads half of B (up or down) B = d_dinvA + page_block*NB*NB + page*blk * 2 * NB + page*blk * 2 + blk*NB + blk + gidy*(blk / 2)*NB; \n //decide invA12 location for each page; //Actually this will be stored in invA21 temporarily //each workgroup writes 1/4 of C C = d_dinvA + page_block*NB*NB + page*blk * 2 * NB + page*blk * 2 + blk*NB + gidx % 2 * (blk / 2) + gidy*(blk / 2)*NB; \n //read A and B into LDS no transpose operated here //each work item loads a half row of A and half column of B //idx 0-23 idy 0-1 lA[idx][0 + idy * 24] = Ain[idx + idy * 24 * lda]; \n lA[idx][1 + idy * 24] = Ain[idx + lda + idy * 24 * lda]; \n lA[idx][2 + idy * 24] = Ain[idx + lda * 2 + idy * 24 * lda]; \n lA[idx][3 + idy * 24] = Ain[idx + lda * 3 + idy * 24 * lda]; \n lA[idx][4 + idy * 24] = Ain[idx + lda * 4 + idy * 24 * lda]; \n lA[idx][5 + idy * 24] = Ain[idx + lda * 5 + idy * 24 * lda]; \n lA[idx][6 + idy * 24] = Ain[idx + lda * 6 + idy * 24 * lda]; \n lA[idx][7 + idy * 24] = Ain[idx + lda * 7 + idy * 24 * lda]; \n lA[idx][8 + idy * 24] = Ain[idx + lda * 8 + idy * 24 * lda]; \n lA[idx][9 + idy * 24] = Ain[idx + lda * 9 + idy * 24 * lda]; \n lA[idx][10 + idy * 24] = Ain[idx + lda * 10 + idy * 24 * lda];\n lA[idx][11 + idy * 24] = Ain[idx + lda * 11 + idy * 24 * lda];\n lA[idx][12 + idy * 24] = Ain[idx + lda * 12 + idy * 24 * lda];\n lA[idx][13 + idy * 24] = Ain[idx + lda * 13 + idy * 24 * lda];\n lA[idx][14 + idy * 24] = Ain[idx + lda * 14 + idy * 24 * lda];\n lA[idx][15 + idy * 24] = Ain[idx + lda * 15 + idy * 24 * lda];\n lA[idx][16 + idy * 24] = Ain[idx + lda * 16 + idy * 24 * lda];\n lA[idx][17 + idy * 24] = Ain[idx + lda * 17 + idy * 24 * lda];\n lA[idx][18 + idy * 24] = Ain[idx + lda * 18 + idy * 24 * lda];\n lA[idx][19 + idy * 24] = Ain[idx + lda * 19 + idy * 24 * lda];\n lA[idx][20 + idy * 24] = Ain[idx + lda * 20 + idy * 24 * lda];\n lA[idx][21 + idy * 24] = Ain[idx + lda * 21 + idy * 24 * lda];\n lA[idx][22 + idy * 24] = Ain[idx + lda * 22 + idy * 24 * lda];\n lA[idx][23 + idy * 24] = Ain[idx + lda * 23 + idy * 24 * lda];\n lB[0 + idy * 24][idx] = B[idx*NB + idy * 24]; \n lB[1 + idy * 24][idx] = B[idx*NB + idy * 24 + 1];\n lB[2 + idy * 24][idx] = B[idx*NB + idy * 24 + 2];\n lB[3 + idy * 24][idx] = B[idx*NB + idy * 24 + 3];\n lB[4 + idy * 24][idx] = B[idx*NB + idy * 24 + 4];\n lB[5 + idy * 24][idx] = B[idx*NB + idy * 24 + 5];\n lB[6 + idy * 24][idx] = B[idx*NB + idy * 24 + 6];\n lB[7 + idy * 24][idx] = B[idx*NB + idy * 24 + 7];\n lB[8 + idy * 24][idx] = B[idx*NB + idy * 24 + 8];\n lB[9 + idy * 24][idx] = B[idx*NB + idy * 24 + 9];\n lB[10 + idy * 24][idx] = B[idx*NB + idy * 24 + 10];\n lB[11 + idy * 24][idx] = B[idx*NB + idy * 24 + 11];\n lB[12 + idy * 24][idx] = B[idx*NB + idy * 24 + 12];\n lB[13 + idy * 24][idx] = B[idx*NB + idy * 24 + 13];\n lB[14 + idy * 24][idx] = B[idx*NB + idy * 24 + 14];\n lB[15 + idy * 24][idx] = B[idx*NB + idy * 24 + 15];\n lB[16 + idy * 24][idx] = B[idx*NB + idy * 24 + 16];\n lB[17 + idy * 24][idx] = B[idx*NB + idy * 24 + 17];\n lB[18 + idy * 24][idx] = B[idx*NB + idy * 24 + 18];\n lB[19 + idy * 24][idx] = B[idx*NB + idy * 24 + 19];\n lB[20 + idy * 24][idx] = B[idx*NB + idy * 24 + 20];\n lB[21 + idy * 24][idx] = B[idx*NB + idy * 24 + 21];\n lB[22 + idy * 24][idx] = B[idx*NB + idy * 24 + 22];\n lB[23 + idy * 24][idx] = B[idx*NB + idy * 24 + 23];\n barrier(CLK_LOCAL_MEM_FENCE); \n //do math uint i = 0; \n do{\n privateC[0] = mad(lA[idx][i], lB[i][0 + idy * 12], privateC[0]);\n privateC[1] = mad(lA[idx][i], lB[i][1 + idy * 12], privateC[1]);\n privateC[2] = mad(lA[idx][i], lB[i][2 + idy * 12], privateC[2]);\n privateC[3] = mad(lA[idx][i], lB[i][3 + idy * 12], privateC[3]);\n privateC[4] = mad(lA[idx][i], lB[i][4 + idy * 12], privateC[4]);\n privateC[5] = mad(lA[idx][i], lB[i][5 + idy * 12], privateC[5]);\n privateC[6] = mad(lA[idx][i], lB[i][6 + idy * 12], privateC[6]);\n privateC[7] = mad(lA[idx][i], lB[i][7 + idy * 12], privateC[7]);\n privateC[8] = mad(lA[idx][i], lB[i][8 + idy * 12], privateC[8]);\n privateC[9] = mad(lA[idx][i], lB[i][9 + idy * 12], privateC[9]);\n privateC[10] = mad(lA[idx][i], lB[i][10 + idy * 12], privateC[10]); \n privateC[11] = mad(lA[idx][i], lB[i][11 + idy * 12], privateC[11]); \n i = i + 1; \n } while (i < 48); \n i = 0; \n do{\n C[NB*idy * 12 + NB*i + idx] = privateC[i]; \n i = i + 1; \n } while (i < 12); \n }\n
__kernel void TRIPLE_DGEMM_UPDATE_192_12_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, uint lda, int npages, int na) { // Ain is the non inverse matrix; the size of Ain is lda * na // offAin is the offset of Ain // d_dinvA is the inversed matrix. the size of d_invA is NB * (na-1)/NB + 1 // blk is subblock size, which is 12 here. // lda in leading dimension. Column major here // npages = (na-1)/12*2 + 1; for 96 this is 4 for 192 this is 8 //Work group size is [12] //global work size is [96*number of blocks] //each work item in each work group is responsible for every element in that row //each work group is responsible for one gemm;\ ////////////// A12*invA22 const uint gidx = get_group_id(0); const uint idx = get_local_id(0); const uint page = gidx % npages; const uint page_block = page / 8;//8 pages per page block const uint page_index_in_block = page % 8; __global double *B, *C; __local double lA[12][12]; __local double lB[12][12]; double privateC[12] = { (double)0 }; //decide A12 location for each page Ain = Ain + offAin; Ain += (page*blk * 2 + blk) * lda + page * 2 * blk; //decide invA22 (B) location for each page B = d_dinvA + page_block*NB*NB + (page_index_in_block*blk * 2 + blk) * NB + page_index_in_block * 2 * blk + blk; //decide invA12 location for each page C = d_dinvA + page_block*NB*NB + (page_index_in_block*blk * 2 + blk) * NB + page_index_in_block * 2 * blk; //read A and B into LDS no transpose operated here lA[idx][0] = Ain[idx]; lA[idx][1] = Ain[idx + lda]; lA[idx][2] = Ain[idx + lda * 2]; lA[idx][3] = Ain[idx + lda * 3]; lA[idx][4] = Ain[idx + lda * 4]; lA[idx][5] = Ain[idx + lda * 5]; lA[idx][6] = Ain[idx + lda * 6]; lA[idx][7] = Ain[idx + lda * 7]; lA[idx][8] = Ain[idx + lda * 8]; lA[idx][9] = Ain[idx + lda * 9]; lA[idx][10] = Ain[idx + lda * 10]; lA[idx][11] = Ain[idx + lda * 11]; lB[idx][0] = B[idx]; lB[idx][1] = B[idx + NB]; lB[idx][2] = B[idx + NB * 2]; lB[idx][3] = B[idx + NB * 3]; lB[idx][4] = B[idx + NB * 4]; lB[idx][5] = B[idx + NB * 5]; lB[idx][6] = B[idx + NB * 6]; lB[idx][7] = B[idx + NB * 7]; lB[idx][8] = B[idx + NB * 8]; lB[idx][9] = B[idx + NB * 9]; lB[idx][10] = B[idx + NB * 10]; lB[idx][11] = B[idx + NB * 11]; barrier(CLK_LOCAL_MEM_FENCE); //do math uint i = 0; do{ privateC[0] = mad(lA[idx][i], lB[i][0], privateC[0]); privateC[1] = mad(lA[idx][i], lB[i][1], privateC[1]); privateC[2] = mad(lA[idx][i], lB[i][2], privateC[2]); privateC[3] = mad(lA[idx][i], lB[i][3], privateC[3]); privateC[4] = mad(lA[idx][i], lB[i][4], privateC[4]); privateC[5] = mad(lA[idx][i], lB[i][5], privateC[5]); privateC[6] = mad(lA[idx][i], lB[i][6], privateC[6]); privateC[7] = mad(lA[idx][i], lB[i][7], privateC[7]); privateC[8] = mad(lA[idx][i], lB[i][8], privateC[8]); privateC[9] = mad(lA[idx][i], lB[i][9], privateC[9]); privateC[10] = mad(lA[idx][i], lB[i][10], privateC[10]); privateC[11] = mad(lA[idx][i], lB[i][11], privateC[11]); //mem_fence(CLK_LOCAL_MEM_FENCE); i = i + 1; } while (i < 12); i = 0; do{ C[NB*i + idx] = privateC[i]; i = i + 1; } while (i < 12); ////////////// -invA11*invA12 barrier(CLK_GLOBAL_MEM_FENCE); //A is moving to invA11 __global double *A; A = d_dinvA + page_block*NB*NB + ((page % 4)*blk * 2) * NB + (page % 4) * 2 * blk; //both B and C are pointing at invA12 B = C; //read A and B into LDS no transpose operated here lA[idx][0] = A[idx]; lA[idx][1] = A[idx + NB]; lA[idx][2] = A[idx + NB * 2]; lA[idx][3] = A[idx + NB * 3]; lA[idx][4] = A[idx + NB * 4]; lA[idx][5] = A[idx + NB * 5]; lA[idx][6] = A[idx + NB * 6]; lA[idx][7] = A[idx + NB * 7]; lA[idx][8] = A[idx + NB * 8]; lA[idx][9] = A[idx + NB * 9]; lA[idx][10] = A[idx + NB * 10]; lA[idx][11] = A[idx + NB * 11]; lB[idx][0] = B[idx]; lB[idx][1] = B[idx + NB]; lB[idx][2] = B[idx + NB * 2]; lB[idx][3] = B[idx + NB * 3]; lB[idx][4] = B[idx + NB * 4]; lB[idx][5] = B[idx + NB * 5]; lB[idx][6] = B[idx + NB * 6]; lB[idx][7] = B[idx + NB * 7]; lB[idx][8] = B[idx + NB * 8]; lB[idx][9] = B[idx + NB * 9]; lB[idx][10] = B[idx + NB * 10]; lB[idx][11] = B[idx + NB * 11]; barrier(CLK_LOCAL_MEM_FENCE); //do math i = 0; privateC[0] = 0; privateC[1] = 0; privateC[2] = 0; privateC[3] = 0; privateC[4] = 0; privateC[5] = 0; privateC[6] = 0; privateC[7] = 0; privateC[8] = 0; privateC[9] = 0; privateC[10] = 0; privateC[11] = 0; do{ privateC[0] = mad(lA[idx][i], lB[i][0], privateC[0]); privateC[1] = mad(lA[idx][i], lB[i][1], privateC[1]); privateC[2] = mad(lA[idx][i], lB[i][2], privateC[2]); privateC[3] = mad(lA[idx][i], lB[i][3], privateC[3]); privateC[4] = mad(lA[idx][i], lB[i][4], privateC[4]); privateC[5] = mad(lA[idx][i], lB[i][5], privateC[5]); privateC[6] = mad(lA[idx][i], lB[i][6], privateC[6]); privateC[7] = mad(lA[idx][i], lB[i][7], privateC[7]); privateC[8] = mad(lA[idx][i], lB[i][8], privateC[8]); privateC[9] = mad(lA[idx][i], lB[i][9], privateC[9]); privateC[10] = mad(lA[idx][i], lB[i][10], privateC[10]); privateC[11] = mad(lA[idx][i], lB[i][11], privateC[11]); //mem_fence(CLK_LOCAL_MEM_FENCE); i = i + 1; } while (i < 12); i = 0; do{ C[NB*i + idx] = -1 * privateC[i]; i = i + 1; } while (i < 12); }
void step_bodies( struct Body * bodies, struct Pair * pairs, unsigned int * map, float dt, unsigned int num_bodies, // in float * velocity_ratio, // in/out float * mass_center, // in float mass, // in unsigned int * number_escaped // out ) { /* work group */ int local_block = num_bodies / get_num_groups(0); unsigned int i_group0 = get_group_id(0) * local_block; unsigned int i_group1 = i_group0 + local_block; if(get_group_id(0) == (get_num_groups(0) - 1)) i_group1 = num_bodies; /* work item */ int block = (i_group1 - i_group0) / get_local_size(0); unsigned int i_local0 = i_group0 + get_local_id(0) * block; unsigned int i_local1 = i_local0 + block; if(get_local_id(0) == (get_local_size(0) - 1)) i_local1 = i_group1; /* printf("local_block = %i\n", local_block); printf("block = %i\n", block); */ /* printf("i_local0 = %i\n", i_local0); printf("i_local1 = %i\n", i_local1); */ /* copy data for work group */ //__local struct Pair local_pairs[NUM_PAIRS]; //__local struct BodyMap local_bodymaps[NUM_BODIES / NUM_GROUPS]; //event_t e0 = async_work_group_copy((__local char *)local_pairs, (char *)pairs, NUM_PAIRS * sizeof(struct Pair), 0); //wait_group_events(1, &e0); //event_t e1 = async_work_group_copy((__local char *)local_bodymaps, (char *)(bodymaps + i_group0), (i_group1 - i_group0) * sizeof(struct BodyMap), 0); //wait_group_events(1, &e1); /* */ float f[3]; //__local struct BodyMap * pbm = 0; //struct BodyMap * pbm = 0; Body * pb = 0; for(unsigned int b = i_local0; b < i_local1; b++) { //pbm = local_bodymaps + b; //pbm = bodymaps + b; pb = bodies + b; if(pb->alive == 0) { //puts("body dead"); continue; } f[0] = 0; f[1] = 0; f[2] = 0; for(unsigned int i = 0; i < num_bodies; i++) { if(b == i) continue; //__local struct Pair * pp = &local_pairs[pbm->pair[p]]; Pair * pp = pairs + map[b * num_bodies + i]; if(pp->_M_alive == 0) continue; if(pp->b0 == b) { f[0] -= pp->u[0] * pp->f; f[1] -= pp->u[1] * pp->f; f[2] -= pp->u[2] * pp->f; } else if(pp->b1 == b) { f[0] += pp->u[0] * pp->f; f[1] += pp->u[1] * pp->f; f[2] += pp->u[2] * pp->f; } else { assert(0); } } float dv[3]; if(0) { dv[0] = dt * f[0] / pb->mass; dv[1] = dt * f[1] / pb->mass; dv[2] = dt * f[2] / pb->mass; } else { dv[0] = dt * pb->f[0] / pb->mass; dv[1] = dt * pb->f[1] / pb->mass; dv[2] = dt * pb->f[2] / pb->mass; } //print(pb->f); if( (!feq(pb->f[0], f[0])) || (!feq(pb->f[1], f[1])) || (!feq(pb->f[2], f[2])) ) { print(f); print(pb->f); abort(); } assert(std::isfinite(pb->mass)); assert(std::isfinite(dt)); assert(std::isfinite(pb->f[0])); assert(std::isfinite(pb->f[1])); assert(std::isfinite(pb->f[2])); // reset accumulating force pb->f[0] = 0; pb->f[1] = 0; pb->f[2] = 0; float e = 0.01; float rat[3]; rat[0] = fabs(dv[0] / pb->v[0]); rat[1] = fabs(dv[1] / pb->v[1]); rat[2] = fabs(dv[2] / pb->v[2]); // atomic if(std::isfinite(rat[0])) if(rat[0] > velocity_ratio[0]) velocity_ratio[0] = rat[0]; if(std::isfinite(rat[1])) if(rat[1] > velocity_ratio[1]) velocity_ratio[1] = rat[1]; if(std::isfinite(rat[2])) if(rat[2] > velocity_ratio[2]) velocity_ratio[2] = rat[2]; if(0) { if( ((std::isfinite(rat[0])) && (rat[0] > e)) || ((std::isfinite(rat[1])) && (rat[1] > e)) || ((std::isfinite(rat[2])) && (rat[2] > e)) ) { printf("% 12f % 12f % 12f\n", rat[0], rat[1], rat[2]); } } pb->v[0] += dv[0]; pb->v[1] += dv[1]; pb->v[2] += dv[2]; pb->x[0] += dt * pb->v[0]; pb->x[1] += dt * pb->v[1]; pb->x[2] += dt * pb->v[2]; // distance from mass center float r[3]; r[0] = pb->x[0] - mass_center[0]; r[1] = pb->x[1] - mass_center[1]; r[2] = pb->x[2] - mass_center[2]; float d = sqrt(r[0]*r[0] + r[1]*r[1] + r[2]*r[2]); float escape_speed2 = 2.0 * 6.67e-11 * mass / d; float s2 = pb->v[0]*pb->v[0] + pb->v[1]*pb->v[1] + pb->v[2]*pb->v[2]; // dot product of velocity and displacement vector float dot = pb->v[0] * r[0] + pb->v[1] * r[1] + pb->v[2] * r[2]; if(s2 > (escape_speed2)) // speed exceeds escape speed { if(dot > 0.0) // parallel componenet points away from mass_center { // atomic (*number_escaped)++; //printf("escape!\n"); } } } }
void fs_private_dev(void){ // install a new /dev directory if (arg_debug) printf("Mounting tmpfs on /dev\n"); // create DRI_DIR // keep a copy of dev directory mkdir_attr(RUN_DEV_DIR, 0755, 0, 0); if (mount("/dev", RUN_DEV_DIR, NULL, MS_BIND|MS_REC, NULL) < 0) errExit("mounting /dev"); // create DEVLOG_FILE int have_devlog = 0; struct stat s; if (stat("/dev/log", &s) == 0) { have_devlog = 1; FILE *fp = fopen(RUN_DEVLOG_FILE, "w"); if (!fp) have_devlog = 0; else { fprintf(fp, "\n"); fclose(fp); if (mount("/dev/log", RUN_DEVLOG_FILE, NULL, MS_BIND|MS_REC, NULL) < 0) errExit("mounting /dev/log"); } } // mount tmpfs on top of /dev if (mount("tmpfs", "/dev", "tmpfs", MS_NOSUID | MS_STRICTATIME | MS_REC, "mode=755,gid=0") < 0) errExit("mounting /dev"); fs_logger("tmpfs /dev"); // optional devices: sound, video cards etc... deventry_mount(); // bring back /dev/log if (have_devlog) { FILE *fp = fopen("/dev/log", "w"); if (fp) { fprintf(fp, "\n"); fclose(fp); if (mount(RUN_DEVLOG_FILE, "/dev/log", NULL, MS_BIND|MS_REC, NULL) < 0) errExit("mounting /dev/log"); fs_logger("clone /dev/log"); } } // bring forward the current /dev/shm directory if necessary if (arg_debug) printf("Process /dev/shm directory\n"); process_dev_shm(); if (mount(RUN_RO_DIR, RUN_DEV_DIR, "none", MS_BIND, "mode=400,gid=0") < 0) errExit("disable run dev directory"); // create default devices create_char_dev("/dev/zero", 0666, 1, 5); // mknod -m 666 /dev/zero c 1 5 fs_logger("mknod /dev/zero"); create_char_dev("/dev/null", 0666, 1, 3); // mknod -m 666 /dev/null c 1 3 fs_logger("mknod /dev/null"); create_char_dev("/dev/full", 0666, 1, 7); // mknod -m 666 /dev/full c 1 7 fs_logger("mknod /dev/full"); create_char_dev("/dev/random", 0666, 1, 8); // Mknod -m 666 /dev/random c 1 8 fs_logger("mknod /dev/random"); create_char_dev("/dev/urandom", 0666, 1, 9); // mknod -m 666 /dev/urandom c 1 9 fs_logger("mknod /dev/urandom"); create_char_dev("/dev/tty", 0666, 5, 0); // mknod -m 666 /dev/tty c 5 0 fs_logger("mknod /dev/tty"); #if 0 create_dev("/dev/tty0", "mknod -m 666 /dev/tty0 c 4 0"); create_dev("/dev/console", "mknod -m 622 /dev/console c 5 1"); #endif // pseudo-terminal mkdir_attr("/dev/pts", 0755, 0, 0); fs_logger("mkdir /dev/pts"); fs_logger("create /dev/pts"); create_char_dev("/dev/pts/ptmx", 0666, 5, 2); //"mknod -m 666 /dev/pts/ptmx c 5 2"); fs_logger("mknod /dev/pts/ptmx"); create_link("/dev/pts/ptmx", "/dev/ptmx"); // code before github issue #351 // mount -vt devpts -o newinstance -o ptmxmode=0666 devpts //dev/pts // if (mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL, "newinstance,ptmxmode=0666") < 0) // errExit("mounting /dev/pts"); // mount /dev/pts gid_t ttygid = get_group_id("tty"); char *data; if (asprintf(&data, "newinstance,gid=%d,mode=620,ptmxmode=0666", (int) ttygid) == -1) errExit("asprintf"); if (mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL, data) < 0) errExit("mounting /dev/pts"); free(data); fs_logger("clone /dev/pts"); // stdin, stdout, stderr #if 0 create_link("/proc/self/fd", "/dev/fd"); create_link("/proc/self/fd/0", "/dev/stdin"); create_link("/proc/self/fd/1", "/dev/stdout"); create_link("/proc/self/fd/2", "/dev/stderr"); #endif // symlinks for DVD/CD players if (stat("/dev/sr0", &s) == 0) { create_link("/dev/sr0", "/dev/cdrom"); create_link("/dev/sr0", "/dev/cdrw"); create_link("/dev/sr0", "/dev/dvd"); create_link("/dev/sr0", "/dev/dvdrw"); } }