void cluster_t::update_projection_gpu(){ #ifdef USE_GPU // // 1) Run kernel to copy U_project into U_project_prev // 2) Run kernel to compute sub_fnorms and store in // buffer_subject_variable_block_norms. // 3) Fetch sub_fnorm from GPU // int x_dim = BLOCK_WIDTH * variable_blocks; runKernel("store_U_project_prev",kernel_store_U_project_prev,x_dim,n,1,BLOCK_WIDTH,1,1); bool debug_gpu1 = false; if(debug_gpu1){ float testArr[n*p]; readFromBuffer(buffer_U_project_prev,n*p,testArr,"buffer_U_project_prev"); for(int i=0;i<n;++i){ for(int j=0;j<p;++j){ if(i>(n-2) && j>(p-10)){ cerr<<"GPU: U_project_prev: "<<i<<","<<j<<": "<<testArr[i*p+j]<<endl; } } } for(int i=0;i<n;++i){ for(int j=0;j<p;++j){ if(i>(n-2) && j>(p-10)){ cerr<<"CPU: U_project_prev: "<<i<<","<<j<<": "<<U_project_prev[i*p+j]<<endl; } } } } // for debugging only //writeToBuffer(buffer_V_project_coeff,triangle_dim,V_project_coeff,"buffer_V_project_coeff"); //writeToBuffer(buffer_U_project_orig,n*p,U_project_orig,"buffer_U_project_orig"); runKernel("iterate_projection",kernel_iterate_projection,x_dim,n,1,BLOCK_WIDTH,1,1); readFromBuffer(buffer_subject_variable_block_norms,n*variable_blocks,sub_fnorm,"buffer_subject_variable_block_norms"); bool debug_gpu = false; if (debug_gpu){ cerr<<"GPU iterate projection:\n"; float test_U[n*p]; readFromBuffer(buffer_U_project,n*p,test_U,"buffer_U_project"); for(int i=n-5;i<n;++i){ cerr<<i<<":"; for(int j=0;j<p;++j){ cerr<<" "<<test_U[i*p+j]; } cerr<<endl; } for(int i=(0);i<n;++i){ cerr<<i<<":"; for(int j=0;j<variable_blocks;++j){ cerr<<" "<<sub_fnorm[i*variable_blocks+j]; } cerr<<endl; } } #endif }
void OCLSample::timeKernel(cl::Kernel kernel, double& elapsed, double& average) { cl::Event* events = new cl::Event[numIterations_]; cl_int result; elapsed = 0.0; for(unsigned i = 0; i < numIterations_; ++i) { cl_ulong start, end; runKernel(kernel, &events[i]); queue_.flush(); events[i].wait(); result = events[i].getProfilingInfo<cl_ulong>(CL_PROFILING_COMMAND_START, &start); assert(result == CL_SUCCESS && "Unable to get profiling information"); result = events[i].getProfilingInfo<cl_ulong>(CL_PROFILING_COMMAND_END, &end); assert(result == CL_SUCCESS && "Unable to get profiling information"); elapsed += (double)1e-9 * (end - start); } average = elapsed / (double)numIterations_; delete [] events; }
static CALresult runNuStep(MWCALInfo* ci, SeparationCALMem* cm, const IntegralArea* ia, const SeparationCALChunks* chunks, CALint pollingMode, CALuint nuStep) { CALdomain domain; CALuint i; CALresult err = CAL_RESULT_OK; err = setNuKernelArgs(cm, ia, nuStep); if (err != CAL_RESULT_OK) return err; domain.x = 0; domain.width = ia->r_steps; for (i = 0; i < chunks->nChunkMu && err == CAL_RESULT_OK; ++i) { domain.y = chunks->chunkMuBorders[i]; domain.height = chunks->chunkMuBorders[i + 1] - chunks->chunkMuBorders[i]; mw_begin_critical_section(); err = runKernel(ci, &domain, pollingMode, chunks->chunkWaitTime); mw_end_critical_section(); } return err; }
void runKernel(cl_runtime_env env, std::string kernel_name, double* vars, double* out, int start_index, int out_len) { kernel kern; for (int i = 1; i < env.num_kerns; i++) { if (env.kernels[i].name == kernel_name) kern = env.kernels[i]; } runKernel(env.cv, env.cl_kernels[kernel_name], kern, env.gpu_data, vars); cl_int err; err = clEnqueueReadBuffer(env.cv.commands, env.gpu_data["out"].array, true, start_index, sizeof(double)*out_len, out, 0, NULL, NULL); CHK_ERR(err); err = clFlush(env.cv.commands); CHK_ERR(err); }
void cluster_t::evaluate_obj_gpu(){ #ifdef USE_GPU int x_dim = BLOCK_WIDTH * n; runKernel("evaluate_obj",kernel_evaluate_obj,x_dim,n,1,BLOCK_WIDTH,1,1); readFromBuffer(buffer_n_norms,n,norm1_arr,"buffer_n_norms"); readFromBuffer(buffer_n2_norms,triangle_dim,norm2_arr,"buffer_n_norms2"); bool debug_gpu = false; if(debug_gpu){ cerr<<"GPU NORM1:"; for(int i=0;i<n;++i){ cerr<<" "<<norm1_arr[i]; } cerr<<endl; for(int i1=0;i1<n-1;++i1){ cerr<<"GPU NORM2["<<i1<<"]:"; for(int i2=i1+1;i2<n;++i2){ if (i2>i1){ cerr<<" "<<norm2_arr[offsets[i1]+i2-i1]; } } cerr<<endl; } } #endif }
void AppManager::render(){ double dt = 1e-45f;//timer.elapsedAndRestart(); runKernel(dt); glViewport(0, 0, window_width*2, window_height*2); visualize->use(); float rx = (1.0f/(float)Nx); float ry = (1.0f/(float)Ny); glUniform1f(visualize->getUniform("rx"), rx); glUniform1f(visualize->getUniform("ry"), ry); glUniform1i(visualize->getUniform("QTex"), 0); glActiveTexture(GL_TEXTURE0); glBindTexture(GL_TEXTURE_2D, kernel0->getTexture()); glBindVertexArray(vao); glDrawElements(GL_TRIANGLES, 6, GL_UNSIGNED_BYTE, NULL); glBindVertexArray(0); visualize->disuse(); CHECK_GL_ERRORS(); }
int main(int argc, char ** argv) { long lower, upper; int WGS; if (argc != 4) { printf("not 2 arguments\n"); return 1; } sscanf(argv[1], "%ld", &lower); sscanf(argv[2], "%ld", &upper); sscanf(argv[3], "%d", &WGS); long results_size = (upper*(upper-1))/2; long* results = (long *) malloc(sizeof(long)*WGS); int i; for(i = 0; i < WGS; i ++) results[i] = 0L; printf("%ld\n", results_size); FILE *fp; char *KernelSource; cl_kernel kernel; fp = fopen("totient_kernel.cl", "r"); if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(1); } KernelSource = (char*)malloc(MAX_SOURCE_SIZE); fread( KernelSource, 1, MAX_SOURCE_SIZE, fp); fclose( fp ); size_t local[1]; size_t global[1]; local[0] = WGS; global[0] = results_size; initGPU(); // Fill in here: kernel = setupKernel( KernelSource, "totient", 2, LongArr, WGS, results, IntConst, WGS); // Fill in here: runKernel( kernel, 1, global, local); long tot = 0; int l; for(l = 0; l < WGS; l ++) tot += results[l]; printf("C: Sum of Totients between [%ld..%ld] is %ld\n", lower, upper, tot); return 0; }
void cluster_t::finalize_iteration_gpu(){ #ifdef USE_GPU int x_dim = BLOCK_WIDTH * n; runKernel("get_U_norm_diff",kernel_get_U_norm_diff,x_dim,1,1,BLOCK_WIDTH,1,1); readFromBuffer(buffer_n_norms,n,norm1_arr,"buffer_n_norms"); float gpu_U_norm_diff = 0; for(int i=0;i<n;++i){ gpu_U_norm_diff+=norm1_arr[i]; } U_norm_diff = sqrt(gpu_U_norm_diff); if(config->verbose)cerr<<"FINALIZE_ITERATION: GPU U_norm_diff: "<<U_norm_diff<<endl; #endif }
void GLWidget::drawPoints() { runKernel(); glEnable(GL_VERTEX_PROGRAM_POINT_SIZE); glEnable(GL_POINT_SPRITE); glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE); glDisable(GL_DEPTH_TEST); glEnable(GL_BLEND); glBlendFunc(GL_SRC_ALPHA, GL_ONE); glDepthMask(GL_FALSE); particleShaderProgram->bind(); particleShaderProgram->enableAttributeArray(particleVertexLocation); particleShaderProgram->enableAttributeArray(particleColorLocation); particlesVBO->bind(); int tupleSize = 4; particleShaderProgram->setAttributeBuffer(particleVertexLocation, GL_FLOAT, 0, tupleSize, sizeof(Particle)); particleShaderProgram->setAttributeBuffer(particleColorLocation, GL_FLOAT, tupleSize*sizeof(float), tupleSize, sizeof(Particle)); particleShaderProgram->setUniformValue(particleMatrixLocation, pMatrix * vMatrix); particleShaderProgram->setUniformValue(particleSamplerPSLocation, 0); particleShaderProgram->setUniformValue(particleSamplerVelLocation, 1); glActiveTexture(GL_TEXTURE0); glBindTexture(GL_TEXTURE_2D, psTexture); glActiveTexture(GL_TEXTURE1); glBindTexture(GL_TEXTURE_2D, velTexture); glDrawArrays(GL_POINTS, 0, vertexNumber); particleShaderProgram->disableAttributeArray(particleVertexLocation); particleShaderProgram->disableAttributeArray(particleColorLocation); particlesVBO->release(); particleShaderProgram->release(); glDisable(GL_POINT_SPRITE); glDepthMask(GL_TRUE); glEnable(GL_DEPTH_TEST); }
void cluster_t::store_U_projection_gpu(){ #ifdef USE_GPU int x_dim = BLOCK_WIDTH * variable_blocks; runKernel("store_U_project",kernel_store_U_project,x_dim,n,1,BLOCK_WIDTH,1,1); bool debug_gpu = false; if(debug_gpu){ float testArr[n*p]; readFromBuffer(buffer_U_project,n*p,testArr,"buffer_U_project"); for(int i=0;i<n;++i){ for(int j=0;j<p;++j){ if(i>(n-3) && j>(p-3)){ cerr<<"GPU store U_project for subject,var: "<<i<<","<<j<<": "<<testArr[i*p+j]<<endl; } } } } #endif }
void cluster_t::initialize_gpu(){ #ifdef USE_GPU int x_dim = BLOCK_WIDTH * variable_blocks; runKernel("init_U",kernel_init_U,x_dim,n,1,BLOCK_WIDTH,1,1); bool debug_gpu = false; if(debug_gpu){ float testArr[n*p]; readFromBuffer(buffer_U_project,n*p,testArr,"buffer_U_project"); for(int i=0;i<n;++i){ for(int j=0;j<p;++j){ if(i==(n-10) && j>(p-10)){ cerr<<"GPU: U_project_orig "<<i<<","<<j<<": "<<testArr[i*p+j]<<endl; } } } } #endif }
void cluster_t::init_v_project_coeff_gpu(){ #ifdef USE_GPU float unweighted_lambda = mu * dist_func / rho; writeToBuffer(buffer_unweighted_lambda, 1, &unweighted_lambda, "buffer_unweighted_lambda"); runKernel("init_v_project_coeff",kernel_init_v_project_coeff,BLOCK_WIDTH*n,n,1,BLOCK_WIDTH,1,1); bool debug_gpu = false; if(debug_gpu){ float * testv = new float[triangle_dim]; readFromBuffer(buffer_V_project_coeff,triangle_dim,testv,"buffer_V_project_coeff"); for(int index1=0;index1<n-1;++index1){ for(int index2=index1+1;index2<n;++index2){ float & scaler = testv[offsets[index1]+(index2-index1)]; if (scaler !=0 && scaler !=1 ) cerr<<"GPU Init_V Index: "<<index1<<","<<index2<<": "<<scaler<<endl; } } } #endif }
void cluster_t::update_u_gpu(){ #ifdef USE_GPU writeToBuffer(buffer_dist_func,1,&dist_func,"buffer_dist_func"); writeToBuffer(buffer_rho,1,&rho,"buffer_rho"); int x_dim = BLOCK_WIDTH * variable_blocks; runKernel("update_U",kernel_update_U,x_dim,n,1,BLOCK_WIDTH,1,1); bool debug_gpu = false; if(debug_gpu){ float testArr[n*p]; readFromBuffer(buffer_U,n*p,testArr,"buffer_U"); for(int i=0;i<n;++i){ for(int j=0;j<p;++j){ if(i==(n-10) && j>(p-10)){ cerr<<"update U GPU: U: "<<i<<","<<j<<": "<<testArr[i*p+j]<<endl; } } } } #endif }
void cluster_t::update_map_distance_gpu(){ #ifdef USE_GPU int x_dim = BLOCK_WIDTH * variable_blocks; runKernel("update_map_distance",kernel_update_map_distance,x_dim,1,1,BLOCK_WIDTH,1,1); float norm1_arr[variable_blocks]; float norm2_arr[variable_blocks]; readFromBuffer(buffer_variable_block_norms1,variable_blocks,norm1_arr,"buffer_variable_block_norms1"); readFromBuffer(buffer_variable_block_norms2,variable_blocks,norm2_arr,"buffer_variable_block_norms2"); float norm1=0,norm2=0; for(int i=0;i<variable_blocks;++i){ //cerr<<"GPU Block "<<i<<" norm: "<<norm1_arr[i]<<","<<norm2_arr[i]<<endl; norm1+=norm1_arr[i]; norm2+=norm2_arr[i]; } if(config->verbose) cerr<<"GPU Norm1 was "<<norm1<<" and norm2 was "<<norm2<<endl; float norm = norm1+norm2; this->map_distance = norm; this->dist_func = sqrt(this->map_distance+epsilon); if(config->verbose)cerr<<"GET_MAP_DISTANCE: New map distance is "<<norm<<" with U distance="<<norm1<<", V distance="<<norm2<<" dist_func: "<<dist_func<<endl; #endif }
Color* Engine::getPixels() { if(!ocl->isDone()) runKernel(); return colors; }
void Engine::dataChanged() { runKernel(); }
int main() { int width = 512; int height = 512; // Creation du device cutilSafeCall( cudaSetDevice( cutGetMaxGflopsDeviceId() ) ); // Creation des buffers sur CPU int * a = new int[width*height]; int * b = new int[width*height]; int * res = new int[width*height]; for(int i = 0; i < width*height; i++) { a[i] = (int)ceil(((double)rand()/ (double)RAND_MAX)*100); b[i] = (int)ceil(((double)rand()/ (double)RAND_MAX)*100); } // Allocation des objects on the device // *** data unsigned int size = width * height * sizeof(int); std::cout << "Allocation d'un buffer de taille " << width*height << "\n"; int* d_a = NULL; int* d_b = NULL; int* d_res = NULL; cutilSafeCall( cudaMalloc( (void**) &d_a, size)); cutilSafeCall( cudaMalloc( (void**) &d_b, size)); cutilSafeCall( cudaMalloc( (void**) &d_res, size)); // Copy des donnees cutilSafeCall( cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice)); cutilSafeCall( cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice)); // Lancer le calcul std::cout << "Lancer le kernel ... \n"; runKernel(d_a, d_b, d_res, width*height); cutilSafeCall( cutilDeviceSynchronize() ); // Copie DeviceHost cutilSafeCall( cudaMemcpy(res, d_res, size, cudaMemcpyDeviceToHost)); // Verification du test int i = 0; for(;i < width*height;i++) if(res[i] != a[i] + b[i]) std::cout << "Error : [" << i << "] " << res[i] << " != " << a[i] << " + " << b[i] << std::endl; // Liberation des ressources // *** Device cutilSafeCall(cudaFree(d_a)); cutilSafeCall(cudaFree(d_b)); cutilSafeCall(cudaFree(d_res)); // *** CPU delete[] res; delete[] a; delete[] b; // Close device cutilDeviceReset(); std::cout << "Test result : " << ((i == width*height) ? "Succes" : "Error" ) << std::endl; std::cout.flush(); return 0; }
int main() { int width = 800; int height = 600; int mesh_width = 256; int mesh_height = 256; // Creation du device cutilSafeCall( cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ) ); // Creation d'une fenetre C3::Window OpenGLWin; OpenGLWin.Create(C3::WindowMode(width,height),"CudaC3"); // Glew init GLenum err = glewInit(); if(err != GLEW_OK) std::cout << "Error on GLEW initialization.\n"; // Configuration OpenGL glClearColor(0.0, 0.0, 0.0, 1.0); glDisable(GL_DEPTH_TEST); glViewport(0, 0, width, height); glMatrixMode(GL_PROJECTION); glLoadIdentity(); gluPerspective(60.0, (GLfloat)width / (GLfloat) height, 0.1, 10.0); // VBO // *** Create GLuint vbo; glGenBuffers(1, &vbo); glBindBuffer(GL_ARRAY_BUFFER, vbo); // *** Initialize unsigned int size = mesh_width * mesh_height * 4 * sizeof(float); glBufferData(GL_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW); glBindBuffer(GL_ARRAY_BUFFER, 0); // *** Register in CUDA cudaGraphicsResource *cuda_vbo_resource = NULL; cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsWriteDiscard)); float g_fAnim = 0.f; int nbFrame = 0; float timeFPS = 0.f; while(OpenGLWin.IsOpened()) { // Events C3::Event event; while(OpenGLWin.PoolEvent(event)) { //std::cout << "Event !" << std::endl; if(event.Type == C3::Event::Closed) { std::cout << "Close ... " << std::endl; OpenGLWin.Close(); } else if(event.Type == C3::Event::KeyPressed) { if(event.Key.Code == C3::Key::Escape) { std::cout << "Close ... " << std::endl; OpenGLWin.Close(); } } } // Mise a jour du temps g_fAnim += OpenGLWin.GetFrameTime() / 1000.f; timeFPS += OpenGLWin.GetFrameTime() / 1000.f; nbFrame++; if(timeFPS > 1.0f) { std::stringstream ss; ss << "CudaC3 [" << (int)ceil( nbFrame / timeFPS ) << " FPS]"; OpenGLWin.SetTitle(ss.str()); timeFPS = 0.f; nbFrame = 0; } // Draw the scene glClear( GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT ); // Lancer le calcul CUDA // *** map OpenGL buffer object for writing from CUDA float4 *dptr; cutilSafeCall(cudaGraphicsMapResources(1, &cuda_vbo_resource, 0)); size_t num_bytes; cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void **)&dptr, &num_bytes, cuda_vbo_resource)); // *** Run kernel runKernel(dptr, mesh_width, mesh_height,g_fAnim); cutilSafeCall( cutilDeviceSynchronize() ); // *** Unmap cutilSafeCall(cudaGraphicsUnmapResources(1, &cuda_vbo_resource, 0)); // OpenGL // *** Make some transformation glMatrixMode(GL_MODELVIEW); glLoadIdentity(); glTranslatef(0.0, 0.0, -3.0); glRotatef(0.0, 1.0, 0.0, 0.0); glRotatef(0.0, 0.0, 1.0, 0.0); // *** Render VBO // --- Bind glBindBuffer(GL_ARRAY_BUFFER, vbo); glVertexPointer(4, GL_FLOAT, 0, 0); // --- Draw glEnableClientState(GL_VERTEX_ARRAY); glColor3f(1.0, 0.0, 0.0); glDrawArrays(GL_POINTS, 0, mesh_width * mesh_height); glDisableClientState(GL_VERTEX_ARRAY); // Swap buffers OpenGLWin.Display(); } // Liberation des ressources cudaGraphicsUnregisterResource(cuda_vbo_resource); glBindBuffer(1, vbo); glDeleteBuffers(1, &vbo); // Close device cutilDeviceReset(); return 0; }
static void mdlOutputs(SimStruct *S, int_T tid) { //printf("mdlOutputs at %g\n", ssGetT(S)); rtsys = (RTsys*) ssGetUserData(S); if (rtsys->init_phase) { /* Failure during initialization */ return; } real_T *y = ssGetOutputPortRealSignal(S,0); real_T *n = ssGetOutputPortRealSignal(S,1); real_T *s = ssGetOutputPortRealSignal(S,2); real_T *m = ssGetOutputPortRealSignal(S,3); real_T *energyConsumption = ssGetOutputPortRealSignal(S,4); int i, j, k, detected; double dTime; DataNode *dn; Task* task; UserTask* t; InterruptHandler* hdl; Monitor *mon; if (!rtsys->started && ssGetT(S) == 0.0) { rtsys->started = true; return; } if (!rtsys->mdlzerocalled) { printf("Zero crossing detection must be turned on in order to run TrueTime!\n"); ssSetErrorStatus(S, "Zero crossing detection must be turned on in order to run TrueTime!"); return; } /* Storing the time */ rtsys->time = ssGetT(S) * rtsys->clockDrift + rtsys->clockOffset; detected = 0; /* Check interrupts */ i = 0; dn = (DataNode*) rtsys->triggerList->getFirst(); while (dn != NULL) { if (fabs(rtsys->interruptinputs[i]-rtsys->oldinterruptinputs[i]) > 0.1) { hdl = (InterruptHandler*) dn->data; Trigger* trig = hdl->trigger; if (rtsys->time - trig->prevHit > trig->latency) { // Trigger interrupt handler if (hdl->myList == rtsys->readyQ) { // handler serving older interrupts hdl->pending++; } else { hdl->moveToList(rtsys->readyQ); detected = 1; } trig->prevHit = rtsys->time; } else { //printf("Call to interrupt handler %s ignored at time %f. Within interrupt latency!\n", hdl->name, rtsys->time); } rtsys->oldinterruptinputs[i] = rtsys->interruptinputs[i]; } i++; dn = (DataNode*) dn->getNext(); } /* Check network */ dn = (DataNode*) rtsys->networkList->getFirst(); while (dn != NULL) { hdl = (InterruptHandler*) dn->data; Network* network = hdl->network; i = network->networkID - 1; //printf("mdlOutputs: checking network #%d inp: %d oldinp: %d\n",i,rtsys->networkinputs[i],rtsys->oldnetworkinputs[i]); if (fabs(rtsys->networkinputs[i] - rtsys->oldnetworkinputs[i]) > 0.1) { hdl->moveToList(rtsys->readyQ); detected = 1; rtsys->oldnetworkinputs[i] = rtsys->networkinputs[i]; } dn = (DataNode*) dn->getNext(); } /* Run kernel? */ double externTime = (rtsys->time- rtsys->clockOffset) / rtsys->clockDrift; if ((externTime >= rtsys->nextHit) || (detected > 0)) { dTime = runKernel(ssGetT(S)); if (rtsys->error) { // Something went wrong executing a code function mxArray *bn[1]; mexCallMATLAB(1, bn, 0, 0, "gcs"); // get current system char buf[200]; mxGetString(bn[0], buf, 200); for (unsigned int i=0; i<strlen(buf); i++) if (buf[i]=='\n') buf[i]=' '; printf("In block ==> '%s'\nSimulation aborted!\n", buf); ssSetStopRequested(S, 1); } else { rtsys->nextHit = (rtsys->time + dTime - rtsys->clockOffset) / rtsys->clockDrift; } } /* Outputs */ for (i=0; i<rtsys->nbrOfOutputs; i++) { y[i] = rtsys->outputs[i]; } /* Network send */ for (i=0; i<rtsys->nbrOfNetworks; i++) { n[i] = rtsys->nwSnd[i]; } /* Task schedule */ i = 0; j = 0; dn = (DataNode*) rtsys->taskList->getFirst(); while (dn != NULL) { t = (UserTask*) dn->data; rtsys->taskSched[i] = (double) (j+1); if (t->display) j++; dn = (DataNode*) dn->getNext(); i++; } task = (Task*) rtsys->readyQ->getFirst(); while (task != NULL) { if (task->isUserTask()) { t = (UserTask*) task; rtsys->taskSched[t->taskID - 1] += 0.25; } task = (Task*) task->getNext(); } if ((rtsys->running != NULL) && (rtsys->running->isUserTask())) { t = (UserTask*) rtsys->running; rtsys->taskSched[t->taskID - 1] += 0.25; } i = 0; j = 0; dn = (DataNode*) rtsys->taskList->getFirst(); while (dn != NULL) { t = (UserTask*) dn->data; if (t->display) { s[j] = rtsys->taskSched[i]; j++; } dn = (DataNode*) dn->getNext(); i++; } /* Handler schedule */ i = 0; j = 0; dn = (DataNode*) rtsys->handlerList->getFirst(); while (dn != NULL) { rtsys->handlerSched[i] = (double) (j+rtsys->nbrOfSchedTasks+2); if (i==0 && rtsys->contextSwitchTime > EPS) { // Context switch schedule, move graph down to task level rtsys->handlerSched[i] = rtsys->handlerSched[i] - 1; } hdl = (InterruptHandler*) dn->data; if (hdl->display) j++; dn = (DataNode*) dn->getNext(); i++; } task = (Task*) rtsys->readyQ->getFirst(); while (task != NULL) { if (!(task->isUserTask())) { hdl = (InterruptHandler*) task; rtsys->handlerSched[hdl->handlerID - 1] += 0.25; } task = (Task*) task->getNext(); } if ((rtsys->running != NULL) && (!(rtsys->running->isUserTask()))) { hdl = (InterruptHandler*) rtsys->running; rtsys->handlerSched[hdl->handlerID - 1] += 0.25; } i = 0; j = 0; dn = (DataNode*) rtsys->handlerList->getFirst(); while (dn != NULL) { hdl = (InterruptHandler*) dn->data; if (hdl->display) { s[j+rtsys->nbrOfSchedTasks] = rtsys->handlerSched[i]; j++; } dn = (DataNode*) dn->getNext(); i++; } /* Monitor graph */ k = 0; dn = (DataNode*) rtsys->monitorList->getFirst(); while (dn != NULL) { mon = (Monitor*) dn->data; for (j=0; j<rtsys->nbrOfTasks; j++) rtsys->monitorGraph[j] = (double) (j+1+k*(1+rtsys->nbrOfTasks)); t = (UserTask*) mon->waitingQ->getFirst(); while (t != NULL) { i = t->taskID; rtsys->monitorGraph[i-1] += 0.25; t = (UserTask*) t->getNext(); } if (mon->heldBy != NULL) { i = mon->heldBy->taskID; rtsys->monitorGraph[i-1] += 0.5; } if (mon->display) { for (j=0; j<rtsys->nbrOfTasks; j++) m[j+k*rtsys->nbrOfTasks] = rtsys->monitorGraph[j]; k++; } dn = (DataNode*) dn->getNext(); } /* Energy consumption */ energyConsumption[0] = rtsys->energyConsumption; }
static void mdlOutputs(SimStruct *S, int_T tid) { debugPrintf("'%s': mdlOutputs at %.16f\n", rtsys->blockName, ssGetT(S)); rtsys = (RTsys*) ssGetUserData(S); if (rtsys->init_phase) { /* Failure during initialization */ return; } real_T *y = ssGetOutputPortRealSignal(S,0); real_T *n = ssGetOutputPortRealSignal(S,1); real_T *s = ssGetOutputPortRealSignal(S,2); real_T *e = ssGetOutputPortRealSignal(S,3); int i, shouldRunKernel = 0; double timestep; DataNode *dn; UserTask* t; InterruptHandler* hdl; if (!rtsys->started && ssGetT(S) == 0.0) { rtsys->started = true; } else { /* Storing the time */ rtsys->time = ssGetT(S) * rtsys->clockDrift + rtsys->clockOffset; shouldRunKernel = 0; /* Run kernel? */ double externTime = (rtsys->time- rtsys->clockOffset) / rtsys->clockDrift; if ((externTime >= rtsys->nextHit) || (shouldRunKernel > 0)) { timestep = runKernel(ssGetT(S)); if (rtsys->error) { mexPrintf("In block ==> '%s'\n", ssGetBlockName(S)); mexPrintf("Simulation aborted!\n"); ssSetErrorStatus(S, errbuf); return; } else { rtsys->nextHit = (rtsys->time + timestep - rtsys->clockOffset) / rtsys->clockDrift; } } } /* Analog outputs */ for (i=0; i<rtsys->nbrOfOutputs; i++) { y[i] = rtsys->outputs[i]; } /* Network send outputs */ for (i=0; i<rtsys->nbrOfNetworks; i++) { n[i] = rtsys->nwSnd[i]; rtsys->oldnwSnd[i] = rtsys->nwSnd[i]; } /* Usertask schedule outputs */ i = 0; dn = (DataNode*) rtsys->taskList->getFirst(); while (dn != NULL) { t = (UserTask*) dn->data; if (t->display) { double val = (double) (i+1); for (int j = 0; j < rtsys->nbrOfCPUs; j++) { s[i + j * rtsys->nbrOfSchedTasks] = val; } if (t->state == RUNNING) { val += 0.5; } else if (t->state == READY) { val += 0.25; } else if (t->state == WAITING) { val += 0.125; } s[i + t->affinity * rtsys->nbrOfSchedTasks] = val; i++; if (i > rtsys->nbrOfSchedTasks) { mexPrintf("FATAL ERROR: schedule output port out of bounds!\n"); ssSetErrorStatus(S, "error"); return; } } dn = (DataNode*) dn->getNext(); } /* Handler schedule outputs */ dn = (DataNode*) rtsys->handlerList->getFirst(); while (dn != NULL) { hdl = (InterruptHandler*) dn->data; if (hdl->display) { double val = (double) (i+1); if (hdl->state == RUNNING) { val += 0.5; } else if (hdl->state == READY) { val += 0.25; } s[i + hdl->affinity * rtsys->nbrOfSchedTasks] = val; i++; if (i > rtsys->nbrOfSchedTasks) { mexPrintf("FATAL ERROR: schedule output port out of bounds!\n"); ssSetErrorStatus(S, "error"); return; } } dn = (DataNode*) dn->getNext(); } /* Energy consumption output */ e[0] = rtsys->energyConsumption; }
void bluesteinsFFTGpu(const char* const argv[],const unsigned n, const unsigned orign,const unsigned size) { const unsigned powM = (unsigned) log2(n); printf("Compiling Bluesteins Program..\n"); compileProgram(argv, "fft.h", "kernels/bluesteins.cl"); printf("Creating Kernel\n"); for (unsigned i = 0; i < deviceCount; ++i) { createKernel(i, "bluesteins"); } const unsigned sizePerGPU = size / deviceCount; for (unsigned i = 0; i < deviceCount; ++i) { workSize[i] = (i != (deviceCount - 1)) ? sizePerGPU : (size - workOffset[i]); allocateDeviceMemoryBS(i , workSize[i], workOffset[i]); clSetKernelArg(kernel[i], 0, sizeof(cl_mem), (void*) &d_Hreal[i]); clSetKernelArg(kernel[i], 1, sizeof(cl_mem), (void*) &d_Himag[i]); clSetKernelArg(kernel[i], 2, sizeof(cl_mem), (void*) &d_Yreal[i]); clSetKernelArg(kernel[i], 3, sizeof(cl_mem), (void*) &d_Yimag[i]); clSetKernelArg(kernel[i], 4, sizeof(cl_mem), (void*) &d_Zreal[i]); clSetKernelArg(kernel[i], 5, sizeof(cl_mem), (void*) &d_Zimag[i]); clSetKernelArg(kernel[i], 6, sizeof(unsigned), &n); clSetKernelArg(kernel[i], 7, sizeof(unsigned), &orign); clSetKernelArg(kernel[i], 8, sizeof(unsigned), &powM); clSetKernelArg(kernel[i], 9, sizeof(unsigned), &blockSize); if ((i + 1) < deviceCount) { workOffset[i + 1] = workOffset[i] + workSize[i]; } } size_t localWorkSize[] = {blockSize}; for (unsigned i = 0; i < deviceCount; ++i) { size_t globalWorkSize[] = {shrRoundUp(blockSize, workSize[i])}; // kernel non blocking execution runKernel(i, localWorkSize, globalWorkSize); } h_Rreal = h_Hreal; h_Rimag = h_Himag; for (unsigned i = 0; i < deviceCount; ++i) { copyFromDevice(i, d_Hreal[i], h_Rreal + workOffset[i], workSize[i]); copyFromDevice(i, d_Himag[i], h_Rimag + workOffset[i], workSize[i]); } // wait for copy event const cl_int ciErrNum = clWaitForEvents(deviceCount, gpuDone); checkError(ciErrNum, CL_SUCCESS, "clWaitForEvents"); printGpuTime(); }