TEST(Profiler, one_timer) { Profiler::initialize(); double wait_time = clock_resolution(); double total=0; { // uninitialize can not be in the same block as the START_TIMER START_TIMER("test_tag"); EXPECT_EQ( 1, AC); total += wait(wait_time); END_TIMER("test_tag"); START_TIMER("test_tag"); EXPECT_EQ( total, ACT); EXPECT_EQ( 2, AC); total += wait(wait_time); total += wait(wait_time); END_TIMER("test_tag"); START_TIMER("test_tag"); EXPECT_EQ( total, ACT); EXPECT_EQ( 3, AC); } // test add_call { START_TIMER("add_call"); ADD_CALLS(1000); EXPECT_EQ(1000, AC); } Profiler::uninitialize(); }
void ProfilerTest::test_petsc_memory_monitor() { int ierr, mpi_rank; ierr = MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); EXPECT_EQ( ierr, 0 ); Profiler::initialize(); { PetscInt size = 10000; START_TIMER("A"); Vec tmp_vector; VecCreateSeq(PETSC_COMM_SELF, size, &tmp_vector); VecDestroy(&tmp_vector); START_TIMER("C"); END_TIMER("C"); END_TIMER("A"); START_TIMER("B"); Vec tmp_vector1, tmp_vector2; VecCreateSeq(PETSC_COMM_SELF, size, &tmp_vector1); VecCreateSeq(PETSC_COMM_SELF, size, &tmp_vector2); VecDestroy(&tmp_vector1); VecDestroy(&tmp_vector2); END_TIMER("B"); } PI->output(MPI_COMM_WORLD, cout); Profiler::uninitialize(); }
void ProfilerTest::test_one_timer() { const double TIMER_RESOLUTION = Profiler::get_resolution(); const double DELTA = TIMER_RESOLUTION*1000; double total=0; Profiler::initialize(); { // uninitialize can not be in the same block as the START_TIMER START_TIMER("test_tag"); // test that number of calls of current timer is EXPECT_EQ( 1, ACC); // wait a TIMER_RESOLUTION time total += wait_sec(TIMER_RESOLUTION); END_TIMER("test_tag"); START_TIMER("test_tag"); // test that number of calls of current timer is EXPECT_EQ( 2, ACC); // test whether difference between measured time and total time is within TIMER_RESOLUTION EXPECT_LE( abs(ACT-total), DELTA); cout << "difference: " << abs(total-ACT) << ", tolerance: " << DELTA << endl; // wait a TIMER_RESOLUTION time total += wait_sec (TIMER_RESOLUTION); total += wait_sec (TIMER_RESOLUTION); END_TIMER("test_tag"); START_TIMER("test_tag"); EXPECT_EQ( 3, ACC); EXPECT_LE( abs(ACT-total), DELTA); cout << "difference: " << abs(total-ACT) << ", tolerance: " << DELTA << endl; } // test add_call { START_TIMER("add_call"); ADD_CALLS(1000); EXPECT_EQ(1000, ACC); } // test absolute time { START_TIMER("one_second"); wait_sec(1); } std::stringstream sout; PI->output(MPI_COMM_WORLD, sout); PI->output(MPI_COMM_WORLD, cout); //EXPECT_NE( sout.str().find("\"tag\": \"Whole Program\""), string::npos ); Profiler::uninitialize(); }
void ProfilerTest::test_memory_propagation(){ const int SIZE = 25; int allocated_whole = 0; int allocated_A = 0; int allocated_B = 0; int allocated_C = 0; int allocated_D = 0; Profiler::initialize(); { allocated_whole = MALLOC; allocated_whole += alloc_and_dealloc<int>(SIZE); EXPECT_EQ(MALLOC, allocated_whole); START_TIMER("A"); allocated_A += alloc_and_dealloc<int>(10 * SIZE); EXPECT_EQ(MALLOC, allocated_A); START_TIMER("B"); allocated_B += alloc_and_dealloc<int>(100 * SIZE); START_TIMER("C"); EXPECT_EQ(MALLOC, allocated_C); END_TIMER("C"); allocated_B += allocated_C; END_TIMER("B"); allocated_A += allocated_B; allocated_A += alloc_and_dealloc<int>(10 * SIZE); for(int i = 0; i < 5; i++) { START_TIMER("D"); allocated_D += alloc_and_dealloc<int>(1 * SIZE); END_TIMER("D"); START_TIMER("D"); allocated_D += alloc_and_dealloc<int>(1 * SIZE); END_TIMER("D"); } allocated_A += allocated_D; END_TIMER("A"); allocated_whole += allocated_A; } PI->propagate_timers(); EXPECT_EQ(MALLOC, allocated_whole); EXPECT_EQ(MALLOC, DEALOC); Profiler::uninitialize(); }
// testing non-fatal functioning of Profiler when debug is off TEST(Profiler, test_calls_only) { Profiler::initialize(); START_TIMER("sub1"); END_TIMER("sub1"); PI->output(MPI_COMM_WORLD, cout); Profiler::uninitialize(); }
void upload(cl_command_queue commands, cl_mem dst, T* src, int N) { int err = clEnqueueWriteBuffer(commands, dst, CL_TRUE, 0, sizeof(T) * N, src, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "CFD Data Copy", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Unable to write memory to device"); }
void download(cl_command_queue commands, T* dst, cl_mem src, int N) { int err = clEnqueueReadBuffer(commands, src, CL_TRUE, 0, sizeof(T)*N, dst, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "CFD Data Copy", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Unable to read memory from device"); }
void ProfilerTest::test_memory_profiler() { const int ARR_SIZE = 1000; const int LOOP_CNT = 1000; Profiler::initialize(); { START_TIMER("memory-profiler-int"); // alloc and dealloc array of int for (int i = 0; i < LOOP_CNT; i++) alloc_and_dealloc<int>(ARR_SIZE); // test that we deallocated all allocated space EXPECT_EQ(MALLOC, DEALOC); // test that allocated space is correct size EXPECT_EQ(MALLOC, ARR_SIZE * LOOP_CNT * sizeof(int)); END_TIMER("memory-profiler-int"); START_TIMER("memory-profiler-double"); // alloc and dealloc array of float for (int i = 0; i < LOOP_CNT; i++) alloc_and_dealloc<double>(ARR_SIZE); // test that we deallocated all allocated space EXPECT_EQ(MALLOC, DEALOC); // test that allocated space is correct size EXPECT_EQ(MALLOC, ARR_SIZE * LOOP_CNT * sizeof(double)); END_TIMER("memory-profiler-double"); START_TIMER("memory-profiler-simple"); // alloc and dealloc array of float for (int i = 0; i < LOOP_CNT; i++) { int * j = new int; delete j; } // test that we deallocated all allocated space EXPECT_EQ(MALLOC, DEALOC); // test that allocated space is correct size EXPECT_EQ(MALLOC, LOOP_CNT * sizeof(int)); END_TIMER("memory-profiler-simple"); } PI->output(MPI_COMM_WORLD, cout); Profiler::uninitialize(); }
void ProfilerTest::test_propagate_values() { int allocated = 0; Profiler::initialize(); { START_TIMER("A"); START_TIMER("B"); START_TIMER("C"); allocated += alloc_and_dealloc<int>(25); END_TIMER("C"); END_TIMER("B"); START_TIMER("D"); END_TIMER("D"); PI->propagate_timers(); EXPECT_EQ(MALLOC, allocated); END_TIMER("A"); } PI->output(MPI_COMM_WORLD, cout); Profiler::uninitialize(); }
void IngredientInput::reloadCombos() { //these only needed to be loaded once if ( ingredientBox->count() == 0 ) { START_TIMER("Loading ingredient input auto-completion"); ingredientBox->reload(); END_TIMER(); } if ( headerBox->count() == 0 ) { START_TIMER("Loading ingredient header input auto-completion"); headerBox->reload(); END_TIMER(); } if ( prepMethodBox->count() == 0 ) { START_TIMER("Loading prep method input auto-completion"); prepMethodBox->reload(); END_TIMER(); } loadUnitListCombo(); }
void ProfilerTest::test_petsc_memory() { int ierr, mpi_rank; ierr = MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); EXPECT_EQ( ierr, 0 ); Profiler::initialize(); { PetscLogDouble mem; START_TIMER("A"); PetscInt size = 100*1000; PetscScalar value = 0.1; Vec tmp_vector; VecCreateSeq(PETSC_COMM_SELF, size, &tmp_vector); VecSet(tmp_vector, value); // VecSetRandom(tmp_vector, NULL); END_TIMER("A"); START_TIMER("A"); // allocated memory MUST be greater or equal to size * size of double EXPECT_GE(AN.petsc_memory_difference, size*sizeof(double)); END_TIMER("A"); START_TIMER("B"); PetscScalar sum; VecSum(tmp_vector, &sum); END_TIMER("B"); START_TIMER("C"); VecDestroy(&tmp_vector); END_TIMER("C"); START_TIMER("C"); // since we are destroying vector, we expect to see negative memory difference EXPECT_LE(AN.petsc_memory_difference, 0); END_TIMER("C"); } PI->output(MPI_COMM_WORLD, cout); Profiler::uninitialize(); }
void StableFitting::shrink(BoundingPolyhedron& poly, const std::vector<math::vec3>& points, math::vec3 backgroundPoint, float minimumDistance) const { START_TIMER(Shrinking); poly.positionAround(backgroundPoint, points); float step = poly.radius()/2.f; float minDistanceSquared = minimumDistance*minimumDistance; int originalPointsInside = countPointsInside(points, poly); for(int iIteration = 0; iIteration < mNoOfIterations; ++iIteration) { float stepSquared = step*step; for(auto vertex = poly.mVertices.rbegin(); vertex!=poly.mVertices.rend(); ++vertex) { //If too close, or move distance too great const float distanceCentreToVertex = poly.centre().distanceSquared(*vertex); if(distanceCentreToVertex < minDistanceSquared || distanceCentreToVertex < stepSquared) continue; math::vec3 moveNormal = (poly.centre() - *vertex).normalize(); const math::vec3 vec = moveNormal*step; //Move *vertex += vec; int newPointsInside = countPointsInside(points, poly); //Move back if movement violates rule if(newPointsInside<originalPointsInside) *vertex -= vec; } step *= 0.5f; } END_TIMER(Shrinking); }
void ProfilerTest::test_structure() { Profiler::initialize(); { START_TIMER("main"); EXPECT_EQ("main", ATN); START_TIMER("sub1"); EXPECT_EQ("sub1", ATN); START_TIMER("cross"); EXPECT_EQ("cross", ATN); END_TIMER("sub1"); EXPECT_EQ("main", ATN ); START_TIMER("sub2"); END_TIMER("cross"); START_TIMER("sub_sub"); START_TIMER("sub1"); END_TIMER("sub1"); END_TIMER("sub_sub"); END_TIMER("sub2"); START_TIMER("sub1"); END_TIMER("sub1"); } std::stringstream sout; PI->output(MPI_COMM_WORLD, cout); PI->output(MPI_COMM_WORLD, sout); int ierr, mpi_rank; ierr = MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); EXPECT_EQ( ierr, 0 ); // 0 processor will have valid profiler report // other processors should have empty string only if (mpi_rank == 0) { // when using find, we need to compare result to string::npos value (which indicates not found) EXPECT_NE( sout.str().find("\"tag\": \"Whole Program\""), string::npos ); EXPECT_NE( sout.str().find("\"tag\": \"sub1\""), string::npos ); } else { EXPECT_TRUE( sout.str().empty() ); } Profiler::uninitialize(); }
void countCandidatesStatic(size_t* globalWork, size_t* localWork, cl_mem episodeSupport, long eventSize, int level, int sType, int numCandidates, cl_mem candidateTex, cl_mem intervalTex, cl_mem eventTex, cl_mem timeTex, size_t sharedMemNeeded) { int errcode; errcode = clSetKernelArg(kernel_countCandidatesStatic, 0, sizeof(cl_mem), (void *) &episodeSupport); errcode |= clSetKernelArg(kernel_countCandidatesStatic, 1, sizeof(long), (void *) &eventSize); errcode |= clSetKernelArg(kernel_countCandidatesStatic, 2, sizeof(int), (void *) &level); errcode |= clSetKernelArg(kernel_countCandidatesStatic, 3, sizeof(int), (void *) &sType); errcode |= clSetKernelArg(kernel_countCandidatesStatic, 4, sizeof(int), (void *) &numCandidates); errcode |= clSetKernelArg(kernel_countCandidatesStatic, 5, sizeof(cl_mem), (void *) &candidateTex); errcode |= clSetKernelArg(kernel_countCandidatesStatic, 6, sizeof(cl_mem), (void *) &intervalTex); errcode |= clSetKernelArg(kernel_countCandidatesStatic, 7, sizeof(cl_mem), (void *) &eventTex); errcode |= clSetKernelArg(kernel_countCandidatesStatic, 8, sizeof(cl_mem), (void *) &timeTex); errcode |= clSetKernelArg(kernel_countCandidatesStatic, 9, sharedMemNeeded, NULL); CHKERR(errcode, "Unable to set arguments for countCandidates"); errcode = clEnqueueNDRangeKernel(commands, kernel_countCandidatesStatic, 3, NULL, globalWork, localWork, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "TDM Candidate Kernels", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(errcode, "Error running countCandidatesStatic"); }
TEST(Profiler, structure) { Profiler::initialize(); { START_TIMER("main"); EXPECT_EQ("main", AT ); START_TIMER("sub1"); EXPECT_EQ("sub1", AT); START_TIMER("cross"); EXPECT_EQ("cross", AT); END_TIMER("sub1"); EXPECT_EQ("main", AT ); START_TIMER("sub2"); END_TIMER("cross"); START_TIMER("sub_sub"); START_TIMER("sub1"); END_TIMER("sub1"); END_TIMER("sub_sub"); END_TIMER("sub2"); START_TIMER("sub1"); END_TIMER("sub1"); } std::stringstream sout; Profiler::instance()->output(MPI_COMM_WORLD, cout); Profiler::instance()->output(MPI_COMM_WORLD, sout); EXPECT_TRUE( sout.str().find("Whole Program 0") ); EXPECT_TRUE( sout.str().find(" sub1 2") ); Profiler::uninitialize(); }
int main(int argc, char** argv) { cl_int err; int usegpu = USEGPU; int do_verify = 0; int opt, option_index=0; unsigned int correct; size_t global_size; size_t local_size; cl_device_id device_id; cl_context context; cl_command_queue commands; cl_program program; cl_kernel kernel; stopwatch sw; cl_mem csr_ap; cl_mem csr_aj; cl_mem csr_ax; cl_mem x_loc; cl_mem y_loc; FILE *kernelFile; char *kernelSource; size_t kernelLength; size_t lengthRead; ocd_init(&argc, &argv, NULL); ocd_options opts = ocd_get_options(); platform_id = opts.platform_id; n_device = opts.device_id; while ((opt = getopt_long(argc, argv, "::vc::", long_options, &option_index)) != -1 ) { switch(opt){ //case 'i': //input_file = optarg; //break; case 'v': fprintf(stderr, "verify\n"); do_verify = 1; break; case 'c': fprintf(stderr, "using cpu\n"); usegpu = 0; break; default: fprintf(stderr, "Usage: %s [-v Warning: lots of output] [-c use CPU]\n", argv[0]); exit(EXIT_FAILURE); } } /* Fill input set with random float values */ int i; csr_matrix csr; csr = laplacian_5pt(512); int k = 0; for(k = 0; k < csr.num_nonzeros; k++){ csr.Ax[k] = 1.0 - 2.0 * (rand() / (RAND_MAX + 1.0)); } //The other arrays float * x_host = float_new_array(csr.num_cols); float * y_host = float_new_array(csr.num_rows); unsigned int ii; for(ii = 0; ii < csr.num_cols; ii++){ x_host[ii] = rand() / (RAND_MAX + 1.0); } for(ii = 0; ii < csr.num_rows; ii++){ y_host[ii] = rand() / (RAND_MAX + 2.0); } /* Retrieve an OpenCL platform */ device_id = GetDevice(platform_id, n_device); /* Create a compute context */ context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); CHKERR(err, "Failed to create a compute context!"); /* Create a command queue */ commands = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err); CHKERR(err, "Failed to create a command queue!"); /* Load kernel source */ kernelFile = fopen("spmv_csr_kernel.cl", "r"); fseek(kernelFile, 0, SEEK_END); kernelLength = (size_t) ftell(kernelFile); kernelSource = (char *) malloc(sizeof(char)*kernelLength); rewind(kernelFile); lengthRead = fread((void *) kernelSource, kernelLength, 1, kernelFile); fclose(kernelFile); /* Create the compute program from the source buffer */ program = clCreateProgramWithSource(context, 1, (const char **) &kernelSource, &kernelLength, &err); CHKERR(err, "Failed to create a compute program!"); /* Free kernel source */ free(kernelSource); /* Build the program executable */ err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err == CL_BUILD_PROGRAM_FAILURE) { char *buildLog; size_t logLen; err = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &logLen); buildLog = (char *) malloc(sizeof(char)*logLen); err = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, logLen, (void *) buildLog, NULL); fprintf(stderr, "CL Error %d: Failed to build program! Log:\n%s", err, buildLog); free(buildLog); exit(1); } CHKERR(err, "Failed to build program!"); /* Create the compute kernel in the program we wish to run */ kernel = clCreateKernel(program, "csr", &err); CHKERR(err, "Failed to create a compute kernel!"); /* Create the input and output arrays in device memory for our calculation */ csr_ap = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(unsigned int)*csr.num_rows+4, NULL, &err); CHKERR(err, "Failed to allocate device memory!"); csr_aj = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(unsigned int)*csr.num_nonzeros, NULL, &err); CHKERR(err, "Failed to allocate device memory!"); csr_ax = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float)*csr.num_nonzeros, NULL, &err); CHKERR(err, "Failed to allocate device memory!"); x_loc = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float)*csr.num_cols, NULL, &err); CHKERR(err, "Failed to allocate device memory!"); y_loc = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float)*csr.num_rows, NULL, &err); CHKERR(err, "Failed to allocate device memory!"); /* beginning of timing point */ stopwatch_start(&sw); /* Write our data set into the input array in device memory */ err = clEnqueueWriteBuffer(commands, csr_ap, CL_TRUE, 0, sizeof(unsigned int)*csr.num_rows+4, csr.Ap, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "CSR Data Copy", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to write to source array!"); err = clEnqueueWriteBuffer(commands, csr_aj, CL_TRUE, 0, sizeof(unsigned int)*csr.num_nonzeros, csr.Aj, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "CSR Data Copy", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to write to source array!"); err = clEnqueueWriteBuffer(commands, csr_ax, CL_TRUE, 0, sizeof(float)*csr.num_nonzeros, csr.Ax, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "CSR Data Copy", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to write to source array!"); err = clEnqueueWriteBuffer(commands, x_loc, CL_TRUE, 0, sizeof(float)*csr.num_cols, x_host, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "CSR Data Copy", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to write to source array!"); err = clEnqueueWriteBuffer(commands, y_loc, CL_TRUE, 0, sizeof(float)*csr.num_rows, y_host, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "CSR Data Copy", ocdTempTimer) CHKERR(err, "Failed to write to source array!"); END_TIMER(ocdTempTimer) /* Set the arguments to our compute kernel */ err = 0; err = clSetKernelArg(kernel, 0, sizeof(unsigned int), &csr.num_rows); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &csr_ap); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &csr_aj); err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &csr_ax); err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &x_loc); err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &y_loc); CHKERR(err, "Failed to set kernel arguments!"); /* Get the maximum work group size for executing the kernel on the device */ err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel work group info!"); /* Execute the kernel over the entire range of our 1d input data set */ /* using the maximum number of work group items for this device */ global_size = csr.num_rows; err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global_size, &local_size, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CSR Kernel", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel!"); /* Wait for the command commands to get serviced before reading back results */ float output[csr.num_rows]; /* Read back the results from the device to verify the output */ err = clEnqueueReadBuffer(commands, y_loc, CL_TRUE, 0, sizeof(float)*csr.num_rows, output, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "CSR Data Copy", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to read output array!"); /* end of timing point */ stopwatch_stop(&sw); printf("Time consumed(ms): %lf Gflops: %f \n", 1000*get_interval_by_sec(&sw), (2.0 * (double) csr.num_nonzeros / get_interval_by_sec(&sw)) / 1e9); /* Validate our results */ if(do_verify){ for (i = 0; i < csr.num_rows; i++){ printf("row: %d output: %f \n", i, output[i]); } } int row = 0; float sum = 0; int row_start = 0; int row_end = 0; for(row =0; row < csr.num_rows; row++){ sum = y_host[row]; row_start = csr.Ap[row]; row_end = csr.Ap[row+1]; unsigned int jj = 0; for (jj = row_start; jj < row_end; jj++){ sum += csr.Ax[jj] * x_host[csr.Aj[jj]]; } y_host[row] = sum; } for (i = 0; i < csr.num_rows; i++){ if((fabsf(y_host[i]) - fabsf(output[i])) > .001) printf("Possible error, difference greater then .001 at row %d \n", i); } /* Print a brief summary detailing the results */ ocd_finalize(); /* Shutdown and cleanup */ clReleaseMemObject(csr_ap); clReleaseMemObject(csr_aj); clReleaseMemObject(csr_ax); clReleaseMemObject(x_loc); clReleaseMemObject(y_loc); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); return 0; }
void IngredientMatcherDialog::findRecipes( void ) { KApplication::setOverrideCursor( Qt::WaitCursor ); START_TIMER("Ingredient Matcher: loading database data"); RecipeList rlist; database->loadRecipes( &rlist, RecipeDB::Title | RecipeDB::NamesOnly | RecipeDB::Ingredients | RecipeDB::IngredientAmounts ); END_TIMER(); START_TIMER("Ingredient Matcher: analyzing data for matching recipes"); // Clear the list recipeListView->listView() ->clear(); // Now show the recipes with ingredients that are contained in the previous set // of ingredients RecipeList incompleteRecipes; QList <int> missingNumbers; Q3ValueList <IngredientList> missingIngredients; RecipeList::Iterator it; for ( it = rlist.begin();it != rlist.end();++it ) { IngredientList il = ( *it ).ingList; if ( il.isEmpty() ) continue; IngredientList missing; if ( m_ingredientList.containsSubSet( il, missing, true, database ) ) { new CustomRecipeListItem( recipeListView->listView(), *it ); } else { incompleteRecipes.append( *it ); missingIngredients.append( missing ); missingNumbers.append( missing.count() ); } } END_TIMER(); //Check if the user wants to show missing ingredients if ( missingNumberSpinBox->value() == 0 ) { KApplication::restoreOverrideCursor(); return ; } //"None" START_TIMER("Ingredient Matcher: searching for and displaying partial matches"); IngredientList requiredIngredients; for ( Q3ListViewItem *it = ingListView->listView()->firstChild(); it; it = it->nextSibling() ) { if ( ((Q3CheckListItem*)it)->isOn() ) requiredIngredients << *m_item_ing_map[it]; } // Classify recipes with missing ingredients in different lists by amount QList<int>::Iterator nit; Q3ValueList<IngredientList>::Iterator ilit; int missingNoAllowed = missingNumberSpinBox->value(); if ( missingNoAllowed == -1 ) // "Any" { for ( nit = missingNumbers.begin();nit != missingNumbers.end();++nit ) if ( ( *nit ) > missingNoAllowed ) missingNoAllowed = ( *nit ); } for ( int missingNo = 1; missingNo <= missingNoAllowed; missingNo++ ) { nit = missingNumbers.begin(); ilit = missingIngredients.begin(); bool titleShownYet = false; for ( it = incompleteRecipes.begin();it != incompleteRecipes.end();++it, ++nit, ++ilit ) { if ( !( *it ).ingList.containsAny( m_ingredientList ) ) continue; if ( !( *it ).ingList.containsSubSet( requiredIngredients ) ) continue; if ( ( *nit ) == missingNo ) { if ( !titleShownYet ) { new SectionItem( recipeListView->listView(), i18ncp( "@label:textbox", "You are missing 1 ingredient for:", "You are missing %1 ingredients for:", missingNo ) ); titleShownYet = true; } new CustomRecipeListItem( recipeListView->listView(), *it, *ilit ); } } } END_TIMER(); KApplication::restoreOverrideCursor(); }
void allocateMemory(int npoints, int nfeatures, int nclusters, float **features) { cl_int errcode; size_t globalWorkSize; size_t localWorkSize; num_blocks = npoints / num_threads; if (npoints % num_threads > 0) /* defeat truncation */ num_blocks++; num_blocks_perdim = sqrt((double) num_blocks); while (num_blocks_perdim * num_blocks_perdim < num_blocks) // defeat truncation (should run once) num_blocks_perdim++; num_blocks = num_blocks_perdim*num_blocks_perdim; /* allocate memory for memory_new[] and initialize to -1 (host) */ membership_new = (int*) malloc(npoints * sizeof(int)); for(int i=0;i<npoints;i++) { membership_new[i] = -1; } /* allocate memory for block_new_centers[] (host) */ block_new_centers = (float *) malloc(nclusters*nfeatures*sizeof(float)); /* allocate memory for feature_flipped_d[][], feature_d[][] (device) */ feature_flipped_d = clCreateBuffer(clContext, CL_MEM_READ_ONLY, npoints*nfeatures*sizeof(float), NULL, &errcode); CHECKERR(errcode); errcode = clEnqueueWriteBuffer(clCommands, feature_flipped_d, CL_TRUE, 0, npoints*nfeatures*sizeof(float), features[0], 0, NULL, &ocdTempEvent); clFinish(clCommands); START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "Point/Feature Copy", ocdTempTimer) END_TIMER(ocdTempTimer) CHECKERR(errcode); feature_d = clCreateBuffer(clContext, CL_MEM_READ_WRITE, npoints*nfeatures*sizeof(float), NULL, &errcode); CHECKERR(errcode); /* invert the data array (kernel execution) */ unsigned int arg = 0; errcode = clSetKernelArg(clKernel_invert_mapping, arg++, sizeof(cl_mem), (void *) &feature_flipped_d); errcode |= clSetKernelArg(clKernel_invert_mapping, arg++, sizeof(cl_mem), (void *) &feature_d); errcode |= clSetKernelArg(clKernel_invert_mapping, arg++, sizeof(int), (void *) &npoints); errcode |= clSetKernelArg(clKernel_invert_mapping, arg++, sizeof(int), (void *) &nfeatures); CHECKERR(errcode); globalWorkSize = num_blocks*num_threads; localWorkSize = num_threads; errcode = clEnqueueNDRangeKernel(clCommands, clKernel_invert_mapping, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, &ocdTempEvent); clFinish(clCommands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "Invert Mapping Kernel", ocdTempTimer) END_TIMER(ocdTempTimer) CHECKERR(errcode); /* allocate memory for membership_d[] and clusters_d[][] (device) */ membership_d = clCreateBuffer(clContext, CL_MEM_READ_WRITE, npoints*sizeof(int), NULL, &errcode); CHECKERR(errcode); clusters_d = clCreateBuffer(clContext, CL_MEM_READ_ONLY, nclusters*nfeatures*sizeof(float), NULL, &errcode); CHECKERR(errcode); #ifdef BLOCK_DELTA_REDUCE // allocate array to hold the per block deltas on the gpu side block_deltas_d = clCreateBuffer(clContext, CL_MEM_READ_WRITE, num_blocks_perdim*num_blocks_perdim*sizeof(int), NULL, &errcode); CHECKERR(errcode); //cudaMemcpy(block_delta_d, &delta_h, sizeof(int), cudaMemcpyHostToDevice); #endif #ifdef BLOCK_CENTER_REDUCE // allocate memory and copy to card cluster array in which to accumulate center points for the next iteration block_clusters_d = clCreateBuffer(clContext, CL_MEM_READ_WRITE, num_blocks_perdim*num_blocks_perdim*nclusters*nfeatures*sizeof(float), NULL, &errcode); CHECKERR(errcode); //cudaMemcpy(new_clusters_d, new_centers[0], nclusters*nfeatures*sizeof(float), cudaMemcpyHostToDevice); #endif }
int main(int argc, char** argv) { ocd_init(&argc, &argv, NULL); ocd_initCL(); cl_int err; size_t global_size; size_t local_size; cl_program program; cl_kernel kernel_compute_flux; cl_kernel kernel_compute_flux_contributions; cl_kernel kernel_compute_step_factor; cl_kernel kernel_time_step; cl_kernel kernel_initialize_variables; cl_mem ff_variable; cl_mem ff_fc_momentum_x; cl_mem ff_fc_momentum_y; cl_mem ff_fc_momentum_z; cl_mem ff_fc_density_energy; if (argc < 2) { printf("Usage ./cfd <data input file>\n"); return 0; } const char* data_file_name = argv[1]; // set far field conditions and load them into constant memory on the gpu { float h_ff_variable[NVAR]; const float angle_of_attack = (float)(3.1415926535897931 / 180.0) * (float)(deg_angle_of_attack); h_ff_variable[VAR_DENSITY] = (float)(1.4); float ff_pressure = (float)(1.0); float ff_speed_of_sound = sqrt(GAMMA*ff_pressure / h_ff_variable[VAR_DENSITY]); float ff_speed = (float)(ff_mach)*ff_speed_of_sound; float3 ff_velocity; ff_velocity.x = ff_speed*(float)(cos((float)angle_of_attack)); ff_velocity.y = ff_speed*(float)(sin((float)angle_of_attack)); ff_velocity.z = 0.0; h_ff_variable[VAR_MOMENTUM+0] = h_ff_variable[VAR_DENSITY] * ff_velocity.x; h_ff_variable[VAR_MOMENTUM+1] = h_ff_variable[VAR_DENSITY] * ff_velocity.y; h_ff_variable[VAR_MOMENTUM+2] = h_ff_variable[VAR_DENSITY] * ff_velocity.z; h_ff_variable[VAR_DENSITY_ENERGY] = h_ff_variable[VAR_DENSITY]*((float)(0.5)*(ff_speed*ff_speed)) + (ff_pressure / (float)(GAMMA-1.0)); float3 h_ff_momentum; h_ff_momentum.x = *(h_ff_variable+VAR_MOMENTUM+0); h_ff_momentum.y = *(h_ff_variable+VAR_MOMENTUM+1); h_ff_momentum.z = *(h_ff_variable+VAR_MOMENTUM+2); float3 h_ff_fc_momentum_x; float3 h_ff_fc_momentum_y; float3 h_ff_fc_momentum_z; float3 h_ff_fc_density_energy; compute_flux_contribution(&h_ff_variable[VAR_DENSITY], &h_ff_momentum, &h_ff_variable[VAR_DENSITY_ENERGY], ff_pressure, &ff_velocity, &h_ff_fc_momentum_x, &h_ff_fc_momentum_y, &h_ff_fc_momentum_z, &h_ff_fc_density_energy); // copy far field conditions to the gpu ff_variable = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * NVAR, h_ff_variable, &err); CHKERR(err, "Unable to allocate ff data"); ff_fc_momentum_x = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_x, &err); CHKERR(err, "Unable to allocate ff data"); ff_fc_momentum_y = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_y, &err); CHKERR(err, "Unable to allocate ff data"); ff_fc_momentum_z = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_z, &err); CHKERR(err, "Unable to allocate ff data"); ff_fc_density_energy = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_density_energy, &err); CHKERR(err, "Unable to allocate ff data"); } int nel; int nelr; // read in domain geometry cl_mem areas; cl_mem elements_surrounding_elements; cl_mem normals; { std::ifstream file(data_file_name); file >> nel; nelr = block_length*((nel / block_length )+ std::min(1, nel % block_length)); //float* h_areas = new float[nelr]; //int* h_elements_surrounding_elements = new int[nelr*NNB]; //float* h_normals = new float[nelr*NDIM*NNB]; float* h_areas ; int* h_elements_surrounding_elements ; float* h_normals ; h_areas = (float*) memalign(AOCL_ALIGNMENT,nelr*sizeof(float)); h_elements_surrounding_elements = (int*) memalign(AOCL_ALIGNMENT,nelr*NNB*sizeof(int)); h_normals = (float *) memalign(AOCL_ALIGNMENT,nelr*NDIM*NNB*sizeof(float)); //posix_memalign(&h_areas , AOCL_ALIGNMENT, nelr); //posix_memalign(&h_elements_surrounding_elements , AOCL_ALIGNMENT, nelr*NNB); //posix_memalign(&h_normals , AOCL_ALIGNMENT, nelr*NDIM*NNB); // read in data for(int i = 0; i < nel; i++) { file >> h_areas[i]; for(int j = 0; j < NNB; j++) { file >> h_elements_surrounding_elements[i + j*nelr]; if(h_elements_surrounding_elements[i+j*nelr] < 0) h_elements_surrounding_elements[i+j*nelr] = -1; h_elements_surrounding_elements[i + j*nelr]--; //it's coming in with Fortran numbering for(int k = 0; k < NDIM; k++) { file >> h_normals[i + (j + k*NNB)*nelr]; h_normals[i + (j + k*NNB)*nelr] = -h_normals[i + (j + k*NNB)*nelr]; } } } // fill in remaining data int last = nel-1; for(int i = nel; i < nelr; i++) { h_areas[i] = h_areas[last]; for(int j = 0; j < NNB; j++) { // duplicate the last element h_elements_surrounding_elements[i + j*nelr] = h_elements_surrounding_elements[last + j*nelr]; for(int k = 0; k < NDIM; k++) h_normals[last + (j + k*NNB)*nelr] = h_normals[last + (j + k*NNB)*nelr]; } } areas = alloc<float>(context, nelr); upload<float>(commands, areas, h_areas, nelr); elements_surrounding_elements = alloc<int>(context, nelr*NNB); upload<int>(commands, elements_surrounding_elements, h_elements_surrounding_elements, nelr*NNB); normals = alloc<float>(context, nelr*NDIM*NNB); upload<float>(commands, normals, h_normals, nelr*NDIM*NNB); delete[] h_areas; delete[] h_elements_surrounding_elements; delete[] h_normals; } char* kernel_files; int num_kernels = 20; kernel_files = (char*) malloc(sizeof(char*)*num_kernels); strcpy(kernel_files,"cfd_kernel"); program = ocdBuildProgramFromFile(context,device_id,kernel_files, NULL); // Create the compute kernel in the program we wish to run kernel_compute_flux = clCreateKernel(program, "compute_flux", &err); CHKERR(err, "Failed to create a compute kernel!"); // Create the reduce kernel in the program we wish to run kernel_compute_flux_contributions = clCreateKernel(program, "compute_flux_contributions", &err); CHKERR(err, "Failed to create a compute_flux_contributions kernel!"); // Create the reduce kernel in the program we wish to run kernel_compute_step_factor = clCreateKernel(program, "compute_step_factor", &err); CHKERR(err, "Failed to create a compute_step_factor kernel!"); // Create the reduce kernel in the program we wish to run kernel_time_step = clCreateKernel(program, "time_step", &err); CHKERR(err, "Failed to create a time_step kernel!"); // Create the reduce kernel in the program we wish to run kernel_initialize_variables = clCreateKernel(program, "initialize_variables", &err); CHKERR(err, "Failed to create a initialize_variables kernel!"); // Create arrays and set initial conditions cl_mem variables = alloc<cl_float>(context, nelr*NVAR); err = 0; err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&variables); err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device //err = clGetKernelWorkGroupInfo(kernel_initialize_variables, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_initialize_variables work group info!"); local_size = 1;//std::min(local_size, (size_t)nelr); global_size = nelr; err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); err = clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 0"); cl_mem old_variables = alloc<float>(context, nelr*NVAR); cl_mem fluxes = alloc<float>(context, nelr*NVAR); cl_mem step_factors = alloc<float>(context, nelr); clFinish(commands); cl_mem fc_momentum_x = alloc<float>(context, nelr*NDIM); cl_mem fc_momentum_y = alloc<float>(context, nelr*NDIM); cl_mem fc_momentum_z = alloc<float>(context, nelr*NDIM); cl_mem fc_density_energy = alloc<float>(context, nelr*NDIM); clFinish(commands); // make sure all memory is floatly allocated before we start timing err = 0; err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&old_variables); err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_initialize_variables, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_initialize_variables work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 1"); err = 0; err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&fluxes); err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_compute_step_factor, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_compute_step_factor work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 2"); std::cout << "About to memcopy" << std::endl; err = clReleaseMemObject(step_factors); float temp[nelr]; for(int i = 0; i < nelr; i++) temp[i] = 0; step_factors = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * nelr, temp, &err); CHKERR(err, "Unable to memset step_factors"); // make sure CUDA isn't still doing something before we start timing clFinish(commands); // these need to be computed the first time in order to compute time step std::cout << "Starting..." << std::endl; // Begin iterations for(int i = 0; i < iterations; i++) { copy<float>(commands, old_variables, variables, nelr*NVAR); // for the first iteration we compute the time step err = 0; err = clSetKernelArg(kernel_compute_step_factor, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_compute_step_factor, 1, sizeof(cl_mem),&variables); err |= clSetKernelArg(kernel_compute_step_factor, 2, sizeof(cl_mem), &areas); err |= clSetKernelArg(kernel_compute_step_factor, 3, sizeof(cl_mem), &step_factors); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_compute_step_factor, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_compute_step_factor work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_compute_step_factor, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Step Factor Kernel", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel[kernel_compute_step_factor]!"); for(int j = 0; j < RK; j++) { err = 0; err = clSetKernelArg(kernel_compute_flux_contributions, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_compute_flux_contributions, 1, sizeof(cl_mem),&variables); err |= clSetKernelArg(kernel_compute_flux_contributions, 2, sizeof(cl_mem), &fc_momentum_x); err |= clSetKernelArg(kernel_compute_flux_contributions, 3, sizeof(cl_mem), &fc_momentum_y); err |= clSetKernelArg(kernel_compute_flux_contributions, 4, sizeof(cl_mem), &fc_momentum_z); err |= clSetKernelArg(kernel_compute_flux_contributions, 5, sizeof(cl_mem), &fc_density_energy); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_compute_flux_contributions, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_compute_flux_contributions work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_compute_flux_contributions, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Flux Contribution Kernel", ocdTempTimer) //compute_flux_contributions(nelr, variables, fc_momentum_x, fc_momentum_y, fc_momentum_z, fc_density_energy); END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_compute_flux_contributions]!"); err = 0; err = clSetKernelArg(kernel_compute_flux, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_compute_flux, 1, sizeof(cl_mem), &elements_surrounding_elements); err |= clSetKernelArg(kernel_compute_flux, 2, sizeof(cl_mem), &normals); err |= clSetKernelArg(kernel_compute_flux, 3, sizeof(cl_mem), &variables); err |= clSetKernelArg(kernel_compute_flux, 4, sizeof(cl_mem), &fc_momentum_x); err |= clSetKernelArg(kernel_compute_flux, 5, sizeof(cl_mem), &fc_momentum_y); err |= clSetKernelArg(kernel_compute_flux, 6, sizeof(cl_mem), &fc_momentum_z); err |= clSetKernelArg(kernel_compute_flux, 7, sizeof(cl_mem), &fc_density_energy); err |= clSetKernelArg(kernel_compute_flux, 8, sizeof(cl_mem), &fluxes); err |= clSetKernelArg(kernel_compute_flux, 9, sizeof(cl_mem), &ff_variable); err |= clSetKernelArg(kernel_compute_flux, 10, sizeof(cl_mem), &ff_fc_momentum_x); err |= clSetKernelArg(kernel_compute_flux, 11, sizeof(cl_mem), &ff_fc_momentum_y); err |= clSetKernelArg(kernel_compute_flux, 12, sizeof(cl_mem), &ff_fc_momentum_z); err |= clSetKernelArg(kernel_compute_flux, 13, sizeof(cl_mem), &ff_fc_density_energy); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_compute_flux, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_compute_flux work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_compute_flux, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Flux Kernel", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_compute_flux]!"); err = 0; err = clSetKernelArg(kernel_time_step, 0, sizeof(int), &j); err |= clSetKernelArg(kernel_time_step, 1, sizeof(int), &nelr); err |= clSetKernelArg(kernel_time_step, 2, sizeof(cl_mem), &old_variables); err |= clSetKernelArg(kernel_time_step, 3, sizeof(cl_mem), &variables); err |= clSetKernelArg(kernel_time_step, 4, sizeof(cl_mem), &step_factors); err |= clSetKernelArg(kernel_time_step, 5, sizeof(cl_mem), &fluxes); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_time_step, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_time_step work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_time_step, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Time Step Kernel", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_time_step]!"); } } clFinish(commands); std::cout << "Finished" << std::endl; std::cout << "Saving solution..." << std::endl; dump(commands, variables, nel, nelr); std::cout << "Saved solution..." << std::endl; std::cout << "Cleaning up..." << std::endl; clReleaseProgram(program); clReleaseKernel(kernel_compute_flux); clReleaseKernel(kernel_compute_flux_contributions); clReleaseKernel(kernel_compute_step_factor); clReleaseKernel(kernel_time_step); clReleaseKernel(kernel_initialize_variables); clReleaseCommandQueue(commands); clReleaseContext(context); dealloc<float>(areas); dealloc<int>(elements_surrounding_elements); dealloc<float>(normals); dealloc<float>(variables); dealloc<float>(old_variables); dealloc<float>(fluxes); dealloc<float>(step_factors); dealloc<float>(fc_momentum_x); dealloc<float>(fc_momentum_y); dealloc<float>(fc_momentum_z); dealloc<float>(fc_density_energy); std::cout << "Done..." << std::endl; ocd_finalize(); return 0; }
int main(int argc, char ** argv) { ocd_init(&argc, &argv, NULL); ocd_initCL(); if (argc < 3) { printf("Calculate similarities between two strings.\n"); printf("Maximum length of each string is: %d\n", MAX_LEN); printf("Usage: %s query database\n", argv[0]); printf("or: %s query database [openPenalty extensionPenalty block#]\n", argv[0]); printf("openPenalty (5.0), extensionPenalty (0.5)\n"); return 1; } ///////////////////////////////////// // 00 --> 01 // | | // 10 --> 11 //////////////////////////////////// char queryFilePathName[255], dbDataFilePathName[255], dbLenFilePathName[255]; int querySize, subSequenceNum, subSequenceSize; float openPenalty, extensionPenalty; int coalescedOffset = COALESCED_OFFSET; int nblosumWidth = 23; size_t blockSize = 64; size_t setZeroThreadNum, mfThreadNum; int blockNum = 14; cl_ulong maxLocalSize; int arraySize; struct timeval t1, t2; float tmpTime; FILE *pfile; //record time memset(&strTime, 0, sizeof(STRUCT_TIME)); timerStart(); openPenalty = 5.0f; extensionPenalty = 0.5; if (argc == 6) { openPenalty = atof(argv[3]); extensionPenalty = atof(argv[4]); blockNum = atoi(argv[5]); } //relocated to after MAX_COMPUTE_UNITS check //mfThreadNum = blockNum * blockSize; cl_program hProgram; cl_kernel hMatchStringKernel, hTraceBackKernel, hSetZeroKernel; size_t sourceFileSize; char *cSourceCL = NULL; //err = clGetPlatformIDs(1, &platformID, NULL); //CHKERR(err, "Get platform ID error!"); cl_int err; //check to make sure the device supports this block count //then scale threads appropriately cl_uint devBlockNum = 0; CHKERR(clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS,\ sizeof(cl_uint), &devBlockNum, 0), \ "Error while querying CL_DEVICE_MAX_COMPUTE_UNITS."); if (devBlockNum == MIN(blockNum, devBlockNum)) { printf("Scaling blocks from %d to %d to fit on device\n",\ blockNum, devBlockNum); blockNum = devBlockNum; } mfThreadNum = blockNum * blockSize; CHKERR(clGetDeviceInfo(device_id, CL_DEVICE_LOCAL_MEM_SIZE,\ sizeof(cl_ulong), &maxLocalSize, 0), \ "Error while querying CL_DEVICE_LOCAL_MEM_SIZE."); //load the source file char kernel_file[] = "kernels.cl"; cSourceCL = loadSource(kernel_file, &sourceFileSize); hProgram = clCreateProgramWithSource(context, 1, (const char **)&cSourceCL, &sourceFileSize, &err); CHKERR(err, "Create program with source error"); err = clBuildProgram(hProgram, 0, 0, 0, 0, 0); //debug================================ int logSize = 3000, i; size_t retSize; char logTxt[3000]; err = clGetProgramBuildInfo(hProgram, device_id, CL_PROGRAM_BUILD_LOG, logSize, logTxt, &retSize); for (i = 0; i < retSize; i++) { printf("%c", logTxt[i]); } //=================================== CHKERR(err, "Build program error"); hMatchStringKernel = clCreateKernel(hProgram, "MatchStringGPUSync", &err); CHKERR(err, "Create MatchString kernel error"); hTraceBackKernel = clCreateKernel(hProgram, "trace_back2", &err); CHKERR(err, "Create trace_back2 kernel error"); hSetZeroKernel = clCreateKernel(hProgram, "setZero", &err); CHKERR(err, "Create setZero kernel error"); sprintf(queryFilePathName, "%s", argv[1]); sprintf(dbDataFilePathName, "%s.data", argv[2]); sprintf(dbLenFilePathName, "%s.loc", argv[2]); char *allSequences, *querySequence, *subSequence; char *seq1, *seq2; cl_mem seq1D, seq2D; allSequences = new char[2 * (MAX_LEN)]; if (allSequences == NULL) { printf("Allocate sequence buffer error!\n"); return 1; } querySequence = allSequences; seq1D = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_char) * MAX_LEN, 0, &err); CHKERR(err, "Create seq1D memory"); seq2D = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_char) * MAX_LEN, 0, &err); CHKERR(err, "Create seq2D memory"); //read query sequence querySize = readQuerySequence(queryFilePathName, querySequence); if (querySize <= 0 || querySize > MAX_LEN) { printf("Query size %d is out of range (0, %d)\n", MAX_LEN, querySize); return 1; } encoding(querySequence, querySize); subSequence = allSequences + querySize; //allocate output sequence buffer char *outSeq1, *outSeq2; outSeq1 = new char[2 * MAX_LEN]; outSeq2 = new char[2 * MAX_LEN]; if (outSeq1 == NULL || outSeq2 == NULL) { printf("Allocate output sequence buffer on host error!\n"); return 1; } cl_mem outSeq1D, outSeq2D; outSeq1D = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_char) * MAX_LEN * 2, 0, &err); CHKERR(err, "Create outSeq1D memory"); outSeq2D = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_char) * MAX_LEN * 2, 0, &err); CHKERR(err, "Create outSeq2D memory"); //allocate thread number per launch and //location difference information int *threadNum, *diffPos; threadNum = new int[2 * MAX_LEN]; diffPos = new int[2 * MAX_LEN]; if (threadNum == NULL || diffPos == NULL) { printf("Allocate location buffer on host error!\n"); return 1; } cl_mem threadNumD, diffPosD; threadNumD = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_int) * (2 * MAX_LEN), 0, &err); CHKERR(err, "Create threadNumD memory"); diffPosD = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_int) * (2 * MAX_LEN), 0, &err); CHKERR(err, "Create diffPosD memory"); //allocate matrix buffer char *pathFlag, *extFlag; float *nGapDist, *hGapDist, *vGapDist; int maxElemNum = (MAX_LEN + 1) * (MAX_LEN + 1); pathFlag = new char[maxElemNum]; extFlag = new char[maxElemNum]; nGapDist = new float[maxElemNum]; hGapDist = new float[maxElemNum]; vGapDist = new float[maxElemNum]; if (pathFlag == NULL || extFlag == NULL || nGapDist == NULL || hGapDist == NULL || vGapDist == NULL) { printf("Allocate DP matrices on host error!\n"); return 1; } cl_mem pathFlagD, extFlagD, nGapDistD, hGapDistD, vGapDistD; pathFlagD = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_char) * maxElemNum, 0, &err); CHKERR(err, "Create pathFlagD memory"); extFlagD = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_char) * maxElemNum, 0, &err); CHKERR(err, "Create extFlagD memory"); nGapDistD = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float) * maxElemNum, 0, &err); CHKERR(err, "Create nGapDistD memory"); hGapDistD = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float) * maxElemNum, 0, &err); CHKERR(err, "Create hGapDistD memory"); vGapDistD = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float) * maxElemNum, 0, &err); CHKERR(err, "Create vGapDistD memory"); //Allocate the MAX INFO structure MAX_INFO *maxInfo; maxInfo = new MAX_INFO[1]; if (maxInfo == NULL) { printf("Alloate maxInfo on host error!\n"); return 1; } cl_mem maxInfoD; maxInfoD = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(MAX_INFO) * mfThreadNum, 0, &err); CHKERR(err, "Create maxInfoD memory"); //allocate the distance table cl_mem blosum62D; int nblosumHeight = 23; blosum62D = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_float) * nblosumWidth * nblosumHeight, 0, &err); err = clEnqueueWriteBuffer(commands, blosum62D, CL_TRUE, 0, nblosumWidth * nblosumHeight * sizeof(cl_float), blosum62[0], 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "SWAT Scoring Matrix Copy", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "copy blosum62 to device"); cl_mem mutexMem; mutexMem = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), 0, &err); CHKERR(err, "create mutex mem error!"); //copy the scoring matrix to the constant memory //copyScoringMatrixToConstant(); //open the database pDBDataFile = fopen(dbDataFilePathName, "rb"); if (pDBDataFile == NULL) { printf("DB data file %s open error!\n", dbDataFilePathName); return 1; } pDBLenFile = fopen(dbLenFilePathName, "rb"); if (pDBLenFile == NULL) { printf("DB length file %s open error!\n", dbLenFilePathName); return 1; } //record time timerEnd(); strTime.iniTime = elapsedTime(); //read the total number of sequences fread(&subSequenceNum, sizeof(int), 1, pDBLenFile); //get the larger and smaller of the row and colum number int subSequenceNo, launchNum, launchNo; int rowNum, columnNum, matrixIniNum; int DPMatrixSize; int seq1Pos, seq2Pos, nOffset, startPos; for (subSequenceNo = 0; subSequenceNo < subSequenceNum; subSequenceNo++) { //record time timerStart(); //read subject sequence fread(&subSequenceSize, sizeof(int), 1, pDBLenFile); if (subSequenceSize <= 0 || subSequenceSize > MAX_LEN) { printf("Size %d of bubject sequence %d is out of range!\n", subSequenceSize, subSequenceNo); break; } fread(subSequence, sizeof(char), subSequenceSize, pDBDataFile); gettimeofday(&t1, NULL); if (subSequenceSize > querySize) { seq1 = subSequence; seq2 = querySequence; rowNum = subSequenceSize + 1; columnNum = querySize + 1; } else { seq1 = querySequence; seq2 = subSequence; rowNum = querySize + 1; columnNum = subSequenceSize + 1; } launchNum = rowNum + columnNum - 1; //preprocessing for sequences DPMatrixSize = preProcessing(rowNum, columnNum, threadNum, diffPos, matrixIniNum); //record time timerEnd(); strTime.preprocessingTime += elapsedTime(); //record time timerStart(); //use a kernel to initialize the matrix arraySize = DPMatrixSize * sizeof(char); setZeroThreadNum = ((arraySize - 1) / blockSize + 1) * blockSize; err = clSetKernelArg(hSetZeroKernel, 0, sizeof(cl_mem), (void *)&pathFlagD); err |= clSetKernelArg(hSetZeroKernel, 1, sizeof(int), (void *)&arraySize); err |= clEnqueueNDRangeKernel(commands, hSetZeroKernel, 1, NULL, &setZeroThreadNum, &blockSize, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "SWAT DP Matrix Init", ocdTempTimer) END_TIMER(ocdTempTimer) err |= clSetKernelArg(hSetZeroKernel, 0, sizeof(cl_mem), (void *)&extFlagD); err |= clEnqueueNDRangeKernel(commands, hSetZeroKernel, 1, NULL, &setZeroThreadNum, &blockSize, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "SWAT DP Matrix Init", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Initialize flag matrice"); arraySize = matrixIniNum * sizeof(float); setZeroThreadNum = ((arraySize - 1) / blockSize + 1) * blockSize; err = clSetKernelArg(hSetZeroKernel, 0, sizeof(cl_mem), (void *)&nGapDistD); err |= clSetKernelArg(hSetZeroKernel, 1, sizeof(int), (void *)&arraySize); err |= clEnqueueNDRangeKernel(commands, hSetZeroKernel, 1, NULL, &setZeroThreadNum, &blockSize, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "SWAT Distance Matrix Init", ocdTempTimer) END_TIMER(ocdTempTimer) err |= clSetKernelArg(hSetZeroKernel, 0, sizeof(cl_mem), (void *)&hGapDistD); err |= clEnqueueNDRangeKernel(commands, hSetZeroKernel, 1, NULL, &setZeroThreadNum, &blockSize, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "SWAT Distance Matrix Init", ocdTempTimer) END_TIMER(ocdTempTimer) err |= clSetKernelArg(hSetZeroKernel, 0, sizeof(cl_mem), (void *)&vGapDistD); err |= clEnqueueNDRangeKernel(commands, hSetZeroKernel, 1, NULL, &setZeroThreadNum, &blockSize, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "SWAT Distance Matrix Init", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Initialize dist matrice"); arraySize = sizeof(MAX_INFO) * mfThreadNum; setZeroThreadNum = ((arraySize - 1) / blockSize + 1) * blockSize; err = clSetKernelArg(hSetZeroKernel, 0, sizeof(cl_mem), (void *)&maxInfoD); err |= clSetKernelArg(hSetZeroKernel, 1, sizeof(int), (void *)&arraySize); err |= clEnqueueNDRangeKernel(commands, hSetZeroKernel, 1, NULL, &setZeroThreadNum, &blockSize, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "SWAT Max Info Matrix Init", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Initialize max info"); arraySize = sizeof(int); setZeroThreadNum = ((arraySize - 1) / blockSize + 1) * blockSize; err = clSetKernelArg(hSetZeroKernel, 0, sizeof(cl_mem), (void *)&mutexMem); err |= clSetKernelArg(hSetZeroKernel, 1, sizeof(int), (void *)&arraySize); err |= clEnqueueNDRangeKernel(commands, hSetZeroKernel, 1, NULL, &setZeroThreadNum, &blockSize, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "SWAT Mutex Init", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Initialize mutex variable"); //copy input sequences to device err = clEnqueueWriteBuffer(commands, seq1D, CL_FALSE, 0, (rowNum - 1) * sizeof(cl_char), seq1, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "SWAT Sequence Copy", ocdTempTimer) END_TIMER(ocdTempTimer) err |= clEnqueueWriteBuffer(commands, seq2D, CL_FALSE, 0, (columnNum - 1) * sizeof(cl_char), seq2, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "SWAT Sequence Copy", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "copy input sequence"); err = clEnqueueWriteBuffer(commands, diffPosD, CL_FALSE, 0, launchNum * sizeof(cl_int), diffPos, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "SWAT Mutex Info Copy", ocdTempTimer) END_TIMER(ocdTempTimer) err |= clEnqueueWriteBuffer(commands, threadNumD, CL_FALSE, 0, launchNum * sizeof(cl_int), threadNum, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "SWAT Mutex Info Copy", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "copy diffpos and/or threadNum mutexMem info error!"); //record time timerEnd(); strTime.copyTimeHostToDevice += elapsedTime(); //record time timerStart(); //set arguments err = clSetKernelArg(hMatchStringKernel, 0, sizeof(cl_mem), (void *)&pathFlagD); err |= clSetKernelArg(hMatchStringKernel, 1, sizeof(cl_mem), (void *)&extFlagD); err |= clSetKernelArg(hMatchStringKernel, 2, sizeof(cl_mem), (void *)&nGapDistD); err |= clSetKernelArg(hMatchStringKernel, 3, sizeof(cl_mem), (void *)&hGapDistD); err |= clSetKernelArg(hMatchStringKernel, 4, sizeof(cl_mem), (void *)&vGapDistD); err |= clSetKernelArg(hMatchStringKernel, 5, sizeof(cl_mem), (void *)&diffPosD); err |= clSetKernelArg(hMatchStringKernel, 6, sizeof(cl_mem), (void *)&threadNumD); err |= clSetKernelArg(hMatchStringKernel, 7, sizeof(cl_int), (void *)&rowNum); err |= clSetKernelArg(hMatchStringKernel, 8, sizeof(cl_int), (void *)&columnNum); err |= clSetKernelArg(hMatchStringKernel, 9, sizeof(cl_mem), (void *)&seq1D); err |= clSetKernelArg(hMatchStringKernel, 10, sizeof(cl_mem), (void *)&seq2D); err |= clSetKernelArg(hMatchStringKernel, 11, sizeof(cl_int), (void *)&nblosumWidth); err |= clSetKernelArg(hMatchStringKernel, 12, sizeof(cl_float), (void *)&openPenalty); err |= clSetKernelArg(hMatchStringKernel, 13, sizeof(cl_float), (void *)&extensionPenalty); err |= clSetKernelArg(hMatchStringKernel, 14, sizeof(cl_mem), (void *)&maxInfoD); err |= clSetKernelArg(hMatchStringKernel, 15, sizeof(cl_mem), (void *)&blosum62D); err |= clSetKernelArg(hMatchStringKernel, 16, sizeof(cl_mem), (void *)&mutexMem); //err |= clSetKernelArg(hMatchStringKernel, 17, maxLocalSize, NULL); CHKERR(err, "Set match string argument error!"); err = clEnqueueNDRangeKernel(commands, hMatchStringKernel, 1, NULL, &mfThreadNum, &blockSize, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "SWAT Kernels", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Launch kernel match string error"); //record time timerEnd(); strTime.matrixFillingTime += elapsedTime(); //record time timerStart(); err = clSetKernelArg(hTraceBackKernel, 0, sizeof(cl_mem), (void *)&pathFlagD); err |= clSetKernelArg(hTraceBackKernel, 1, sizeof(cl_mem), (void *)&extFlagD); err |= clSetKernelArg(hTraceBackKernel, 2, sizeof(cl_mem), (void *)&diffPosD); err |= clSetKernelArg(hTraceBackKernel, 3, sizeof(cl_mem), (void *)&seq1D); err |= clSetKernelArg(hTraceBackKernel, 4, sizeof(cl_mem), (void *)&seq2D); err |= clSetKernelArg(hTraceBackKernel, 5, sizeof(cl_mem), (void *)&outSeq1D); err |= clSetKernelArg(hTraceBackKernel, 6, sizeof(cl_mem), (void *)&outSeq2D); err |= clSetKernelArg(hTraceBackKernel, 7, sizeof(cl_mem), (void *)&maxInfoD); err |= clSetKernelArg(hTraceBackKernel, 8, sizeof(int), (void *)&mfThreadNum); size_t tbGlobalSize[1] = {1}; size_t tbLocalSize[1] = {1}; err = clEnqueueNDRangeKernel(commands, hTraceBackKernel, 1, NULL, tbGlobalSize, tbLocalSize, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "SWAT Kernels", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Launch kernel trace back error"); clFinish(commands); //record time timerEnd(); strTime.traceBackTime += elapsedTime(); //record time timerStart(); //copy matrix score structure back err = clEnqueueReadBuffer(commands, maxInfoD, CL_FALSE, 0, sizeof(MAX_INFO), maxInfo, 0, 0, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "SWAT Max Info Copy", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Read maxInfo buffer error!"); int maxOutputLen = rowNum + columnNum - 2; err = clEnqueueReadBuffer(commands, outSeq1D, CL_FALSE, 0, maxOutputLen * sizeof(cl_char), outSeq1, 0, 0, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "SWAT Sequence Copy", ocdTempTimer) END_TIMER(ocdTempTimer) err = clEnqueueReadBuffer(commands, outSeq2D, CL_FALSE, 0, maxOutputLen * sizeof(cl_char), outSeq2, 0, 0, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "SWAT Sequence Copy", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Read output sequence error!"); //record time clFinish(commands); gettimeofday(&t2, NULL); timerEnd(); strTime.copyTimeDeviceToHost += elapsedTime(); //call the print function to print the match result printf("============================================================\n"); printf("Sequence pair %d:\n", subSequenceNo); int nlength = maxInfo->noutputlen; PrintAlignment(outSeq1, outSeq2, nlength, CHAR_PER_LINE, openPenalty, extensionPenalty); printf("Max alignment score (on device) is %.1f\n", maxInfo->fmaxscore); //obtain max alignment score on host //err = clEnqueueReadBuffer(commands, nGapDistD, CL_TRUE, 0, sizeof(cl_float) * DPMatrixSize, // nGapDist, 0, 0, 0); //printf("Max alignment score (on host) is %.1f\n", maxScore(nGapDist, DPMatrixSize)); printf("openPenalty = %.1f, extensionPenalty = %.1f\n", openPenalty, extensionPenalty); printf("Input sequence size, querySize: %d, subSequenceSize: %d\n", querySize, subSequenceSize); printf("Max position, seq1 = %d, seq2 = %d\n", maxInfo->nposi, maxInfo->nposj); } tmpTime = 1000.0 * (t2.tv_sec - t1.tv_sec) + (t2.tv_usec - t1.tv_usec) / 1000.0; pfile = fopen("../kernelTime.txt", "at"); fprintf(pfile, "verOpencl4:\t%.3f\n", tmpTime); fclose(pfile); //print time printTime_toStandardOutput(); printTime_toFile(); fclose(pDBLenFile); fclose(pDBDataFile); clReleaseKernel(hMatchStringKernel); clReleaseKernel(hTraceBackKernel); clReleaseKernel(hSetZeroKernel); delete allSequences; clReleaseMemObject(seq1D); clReleaseMemObject(seq2D); delete outSeq1; delete outSeq2; clReleaseMemObject(outSeq1D); clReleaseMemObject(outSeq2D); delete threadNum; clReleaseMemObject(threadNumD); delete diffPos; clReleaseMemObject(diffPosD); delete pathFlag; delete extFlag; delete nGapDist; delete hGapDist; delete vGapDist; clReleaseMemObject(pathFlagD); clReleaseMemObject(extFlagD); clReleaseMemObject(nGapDistD); clReleaseMemObject(hGapDistD); clReleaseMemObject(vGapDistD); delete maxInfo; clReleaseMemObject(maxInfoD); free(cSourceCL); clReleaseMemObject(blosum62D); clReleaseMemObject(mutexMem); clReleaseProgram(hProgram); clReleaseCommandQueue(commands); clReleaseContext(context); ocd_finalize(); return 0; }
KrecipesView::KrecipesView( QWidget *parent ) : QWidget( parent ), m_actionshandler( 0, 0 ) { new KrecipesAdaptor(this ); QDBusConnection::sessionBus().registerObject("/Krecipes", this); #ifndef NDEBUG QTime dbg_total_timer; dbg_total_timer.start(); #endif // Init the setup wizard if necessary kDebug() << "Beginning wizard" ; wizard(); kDebug() << "Wizard finished correctly" ; // Show Splash Screen KStartupLogo* start_logo = 0L; start_logo = new KStartupLogo(); start_logo -> setHideEnabled( true ); start_logo->show(); start_logo->raise(); // Initialize Database // Check if the database type is among those supported // and initialize the database in each case START_TIMER("Initializing database") initDatabase(); END_TIMER() // Design the GUI QHBoxLayout *layout = new QHBoxLayout; setLayout( layout ); splitter = new KHBox( this ); layout->addWidget( splitter ); // Create Left and Right Panels (splitter) leftPanelFrame = new QFrame( splitter ); leftPanel = new KreMenu; QHBoxLayout *leftPanelFrameLayout = new QHBoxLayout; leftPanelFrame->setLayout( leftPanelFrameLayout ); leftPanelFrameLayout->addWidget( leftPanel ); leftPanelFrameLayout->setMargin( 0 ); leftPanelFrame->setFrameStyle( QFrame::StyledPanel | QFrame::Raised ); leftPanelFrame->setFrameRect( QRect( 0, 0, 0, 0 ) ); rightPanel = new PanelDeco( splitter, i18n( "Find/Edit Recipes" ), "system-search" ); // Design Left Panel START_TIMER("Setting up buttons") // Buttons button0 = new KreMenuButton( leftPanel, SelectP ); button0->setIconSet( KIcon( "system-search" ) ); buttonsList.append( button0 ); button1 = new KreMenuButton( leftPanel, ShoppingP ); button1->setIconSet( KIcon( "view-pim-tasks" ) ); buttonsList.append( button1 ); button7 = new KreMenuButton( leftPanel, DietP ); button7->setIconSet( KIcon( "diet" ) ); buttonsList.append( button7 ); button8 = new KreMenuButton( leftPanel, MatcherP ); button8->setIconSet( KIcon( "view-filter" ) ); buttonsList.append( button8 ); // Submenus dataMenu = leftPanel->createSubMenu( i18n( "Data..." ), "server-database" ); recipeButton = new KreMenuButton( leftPanel, RecipeEdit ); recipeButton->setIconSet( KIcon( "document-save" ) ); buttonsList.append( recipeButton ); recipeButton->setEnabled( false ); recipeButton->hide(); button2 = new KreMenuButton( leftPanel, IngredientsP, dataMenu ); button2->setIconSet( KIcon( "ingredients" ) ); buttonsList.append(button2); button3 = new KreMenuButton( leftPanel, PropertiesP, dataMenu ); button3->setIconSet( KIcon( "properties" ) ); buttonsList.append( button3 ); button4 = new KreMenuButton( leftPanel, UnitsP, dataMenu ); button4->setIconSet( KIcon( "units" ) ); buttonsList.append( button4 ); button9 = new KreMenuButton( leftPanel, PrepMethodsP, dataMenu ); button9->setIconSet( KIcon( "methods" ) ); buttonsList.append( button9 ); button5 = new KreMenuButton( leftPanel, CategoriesP, dataMenu ); button5->setIconSet( KIcon( "folder-yellow" ) ); buttonsList.append( button5 ); button6 = new KreMenuButton( leftPanel, AuthorsP, dataMenu ); button6->setIconSet( KIcon( "authors" ) ); buttonsList.append( button6 ); contextButton = new QPushButton( leftPanel ); contextButton->setObjectName( "contextButton" ); contextButton->setIcon( KIcon( "system-help" ) ); contextButton->setGeometry( leftPanel->width() - 42, leftPanel->height() - 42, 32, 32 ); QPalette p = palette(); p.setColor(backgroundRole(), contextButton->palette().color(backgroundRole()).light( 140 ) ); contextButton->setPalette(p); contextButton->setFlat( true ); END_TIMER() KConfigGroup config(KGlobal::config(), "Performance" ); int limit = config.readEntry( "CategoryLimit", -1 ); database->updateCategoryCache(limit); // Right Panel Widgets START_TIMER("Creating input dialog") inputPanel = new RecipeInputDialog( rightPanel, database ); rightPanel->addStackWidget( inputPanel ); END_TIMER() START_TIMER("Creating recipe view") viewPanel = new RecipeViewDialog( rightPanel, database ); rightPanel->addStackWidget( viewPanel ); END_TIMER() START_TIMER("Creating recipe selection dialog") selectPanel = new SelectRecipeDialog( rightPanel, database ); rightPanel->addStackWidget( selectPanel ); END_TIMER() START_TIMER("Creating ingredients component") ingredientsPanel = new IngredientsDialog( rightPanel, database ); rightPanel->addStackWidget( ingredientsPanel ); END_TIMER() START_TIMER("Creating properties component") propertiesPanel = new PropertiesDialog( rightPanel, database ); rightPanel->addStackWidget( propertiesPanel ); END_TIMER() START_TIMER("Creating units component") unitsPanel = new UnitsDialog( rightPanel, database ); rightPanel->addStackWidget( unitsPanel ); END_TIMER() START_TIMER("Creating shopping list dialog") shoppingListPanel = new ShoppingListDialog( rightPanel, database ); rightPanel->addStackWidget( shoppingListPanel ); END_TIMER() START_TIMER("Creating diet wizard dialog") dietPanel = new DietWizardDialog( rightPanel, database ); rightPanel->addStackWidget( dietPanel ); END_TIMER() START_TIMER("Creating categories component") categoriesPanel = new CategoriesEditorDialog( rightPanel, database ); rightPanel->addStackWidget( categoriesPanel ); END_TIMER() START_TIMER("Creating authors component") authorsPanel = new AuthorsDialog( rightPanel, database ); rightPanel->addStackWidget( authorsPanel ); END_TIMER() START_TIMER("Creating prep methods component") prepMethodsPanel = new PrepMethodsDialog( rightPanel, database ); rightPanel->addStackWidget( prepMethodsPanel ); END_TIMER() START_TIMER("Creating ingredients matcher dialog") ingredientMatcherPanel = new IngredientMatcherDialog( rightPanel, database ); rightPanel->addStackWidget( ingredientMatcherPanel ); END_TIMER() database->clearCategoryCache(); // Use to keep track of the panels panelMap.insert( inputPanel, RecipeEdit ); panelMap.insert( viewPanel, RecipeView ); panelMap.insert( selectPanel, SelectP ); panelMap.insert( ingredientsPanel, IngredientsP ); panelMap.insert( propertiesPanel, PropertiesP ); panelMap.insert( unitsPanel, UnitsP ); panelMap.insert( shoppingListPanel, ShoppingP ); panelMap.insert( dietPanel, DietP ); panelMap.insert( categoriesPanel, CategoriesP ); panelMap.insert( authorsPanel, AuthorsP ); panelMap.insert( prepMethodsPanel, PrepMethodsP ); panelMap.insert( ingredientMatcherPanel, MatcherP ); m_activePanel = SelectP; m_previousActivePanel = SelectP; slotSetPanel( SelectP ); // i18n translate(); // Connect Signals from Left Panel to slotSetPanel() connect( leftPanel, SIGNAL( clicked( KrePanel ) ), this, SLOT( slotSetPanel( KrePanel ) ) ); connect( contextButton, SIGNAL( clicked() ), SLOT( activateContextHelp() ) ); connect( leftPanel, SIGNAL( resized( int, int ) ), this, SLOT( resizeRightPane( int, int ) ) ); // Retransmit signal to parent to Enable/Disable the Save Button connect ( inputPanel, SIGNAL( enableSaveOption( bool ) ), this, SLOT( enableSaveOptionSlot( bool ) ) ); // Create a new button when a recipe is unsaved connect ( inputPanel, SIGNAL( createButton( QWidget*, const QString & ) ), this, SLOT( addRecipeButton( QWidget*, const QString & ) ) ); // Connect Signals from selectPanel (SelectRecipeDialog) connect ( selectPanel, SIGNAL( recipeSelected( int, int ) ), this, SLOT( actionRecipe( int, int ) ) ); connect ( selectPanel, SIGNAL( recipesSelected( const QList<int>&, int ) ), this, SLOT( actionRecipes( const QList<int>&, int ) ) ); // Connect Signals from ingredientMatcherPanel (IngredientMatcherDialog) connect ( ingredientMatcherPanel, SIGNAL( recipeSelected( int, int ) ), SLOT( actionRecipe( int, int ) ) ); // Close a recipe when requested (just switch panels) connect( inputPanel, SIGNAL( closeRecipe() ), this, SLOT( closeRecipe() ) ); // Show a recipe when requested (just switch panels) connect( inputPanel, SIGNAL( showRecipe( int ) ), this, SLOT( showRecipe( int ) ) ); // Close the recipe view when requested (just switch panels) connect( viewPanel, SIGNAL( closeRecipeView() ), this, SLOT( closeRecipe() ) ); // Create a new shopping list when a new diet is generated and accepted connect( dietPanel, SIGNAL( dietReady() ), this, SLOT( createShoppingListFromDiet() ) ); // Place the Tip Button in correct position when the left pane is resized connect( leftPanel, SIGNAL( resized( int, int ) ), this, SLOT( moveTipButton( int, int ) ) ); connect( rightPanel, SIGNAL( panelRaised( QWidget*, QWidget* ) ), SLOT( panelRaised( QWidget*, QWidget* ) ) ); connect( selectPanel, SIGNAL( recipeSelected(bool) ), SIGNAL( recipeSelected(bool) ) ); connect( ingredientMatcherPanel, SIGNAL( recipeSelected(bool) ), SIGNAL( recipeSelected(bool) ) ); // Close Splash Screen delete start_logo; #ifndef NDEBUG kDebug()<<"Total time elapsed: "<<dbg_total_timer.elapsed()/1000<<" sec"; #endif }
int main(int argc, char *argv[]){ boost_po::options_description options("Allowed options"); std::string trainfname; std::string testfname; int get_train_err = 0; options.add_options() ("train", boost_po::value<std::string>(&trainfname)->required(), "train") ("test", boost_po::value<std::string>(&testfname)->required(), "test") ("terr", boost_po::value<int>(&get_train_err), "get train err"); boost_po::variables_map options_map; boost_po::store(boost_po::parse_command_line(argc, argv, options), options_map); boost_po::notify(options_map); arma::mat X; arma::icolvec Y; clock_t t; std::cout << "start loading " << trainfname << std::endl; START_TIMER(t); X.load(trainfname, arma::csv_ascii); END_TIMER(t, t); std::cout << "data matrix loaded, size: " << X.n_rows << ", " << X.n_cols << " time = " << TO_SEC(t) << " sec" << std::endl; Y = arma::conv_to<arma::icolvec>::from(X.col(X.n_cols - 1)); X.shed_col(X.n_cols - 1); convert_binary(Y); const int32_t num_samples = X.n_rows; const int32_t num_features = X.n_cols; arma::colvec mu(num_features); //gaussian mean arma::mat sigma(num_features, num_features); //gaussian covariance double xi = 2; double delta = 1; //initialize gaussian parameters mu.zeros(); sigma = sigma.eye()*COV_FAC; std::cout << "start inverting sigma" << std::endl; START_TIMER(t); arma::mat sigma_inv = sigma.i(); END_TIMER(t, t); std::cout << "finished inverting, time = " << TO_SEC(t) << " secs" << std::endl; clock_t ts; std::cout << "starts training" << std::endl; START_TIMER(ts); int iter_cnt = 0; while(std::abs(delta) > CONVERGE_THRE){ arma::mat post_sigma_inv; arma::mat post_sigma; arma::colvec post_mu; double xi_sqr; double post_xi; int32_t i; for(i = 0; i < num_samples; i++){ std::cout << "sample = i = " << i << std::endl; arma::colvec x = X.row(i).t(); std::cout << "x.print()" << std::endl; x.print(); post_sigma_inv = get_post_sigma_inv(xi, x, sigma_inv); post_sigma = post_sigma_inv.i(); post_mu = get_post_mu(x, sigma_inv, post_sigma, mu, Y(i)); xi_sqr = get_xi_sqr(x, post_sigma, post_mu); post_xi = sqrt(xi_sqr); sigma = post_sigma; sigma_inv = post_sigma_inv; mu = post_mu; delta = post_xi - xi; xi = post_xi; END_TIMER(t, ts); std::cout << "xi = " << xi << " delta = " << delta << " time = " << TO_SEC(t) << " sec" << std::endl; } ++iter_cnt; std::cout << "iteration " << iter_cnt << " done. xi = " << xi << " delta = " << delta << " std::abs(delta) = " << std::abs(delta) << " time = " << TO_SEC(t) << " sec" << std::endl; } END_TIMER(t, ts); std::cout << "time = " << TO_SEC(t) << " sec" << std::endl; std::cout << " training done" << std::endl; std::cout << "xi = " << xi << std::endl; arma::mat X_test; arma::icolvec Y_test; std::cout << "start loading test matrix " << testfname << std::endl; START_TIMER(t); X_test.load(testfname, arma::csv_ascii); END_TIMER(t, t); std::cout << "test data matrix loaded, size: " << X_test.n_rows << ", " << X_test.n_cols << " time = " << TO_SEC(t) << " sec" << std::endl; Y_test = arma::conv_to<arma::icolvec>::from(X_test.col(X_test.n_cols - 1)); X_test.shed_col(X_test.n_cols - 1); double test_err = classification_error(xi, sigma, sigma_inv, mu, X_test, Y_test); std::cout << "test error = " << test_err << std::endl; if(get_train_err){ double train_err = classification_error(xi, sigma, sigma_inv, mu, X, Y); std::cout << "train error = " << train_err << std::endl; } sigma.save(trainfname + ".sigma.out", arma::csv_ascii); mu.save(trainfname + ".mu.out", arma::csv_ascii); X.save(trainfname + ".out", arma::csv_ascii); }
void DietWizardDialog::createDiet( void ) { KApplication::setOverrideCursor( Qt::WaitCursor ); START_TIMER("Creating the diet"); RecipeList rlist; dietRList->clear(); // Get the whole list of recipes, detailed int flags = RecipeDB::Title | getNecessaryFlags(); database->loadRecipes( &rlist, flags ); // temporal iterator list so elements can be removed without reloading them again from the DB // this list prevents the same meal from showing up in the same day twice Q3ValueList <RecipeList::Iterator> tempRList; bool alert = false; for ( int day = 0;day < dayNumber;day++ ) // Create the diet for the number of days defined by the user { populateIteratorList( rlist, &tempRList ); // temporal iterator list so elements can be removed without reloading them again from the DB for ( int meal = 0;meal < mealNumber;meal++ ) { int dishNo = ( ( MealInput* ) ( mealTabs->widget( meal ) ) ) ->dishNo(); for ( int dish = 0;dish < dishNo;dish++ ) { bool found = false; Q3ValueList <RecipeList::Iterator> tempDishRList = tempRList; while ( ( !found ) && !tempDishRList.empty() ) { int random_index = ( int ) ( ( float ) ( KRandom::random() ) / ( float ) RAND_MAX * tempDishRList.count() ); Q3ValueList<RecipeList::Iterator>::Iterator iit = tempDishRList.at( random_index ); // note that at() retrieves an iterator to the iterator list, so we need to use * in order to get the RecipeList::Iterator RecipeList::Iterator rit = *iit; if ( found = ( ( ( !categoryFiltering( meal, dish ) ) || checkCategories( *rit, meal, dish ) ) && checkConstraints( *rit, meal, dish ) ) ) // Check that the recipe is inside the constraint limits and in the categories specified { dietRList->append( *rit ); // Add recipe to the diet list tempRList.remove( tempRList.find(*iit) ); //can't just remove()... the iterator isn't from this list (its an iterator from tempDishRList) } else { tempDishRList.remove( iit ); // Remove this analized recipe from teh list } } if ( !found ) alert = true; } } } if ( alert ) { KApplication::restoreOverrideCursor(); KMessageBox::sorry( this, i18nc( "@info", "Given the constraints, a full diet list could not be constructed. Either the recipe list is too short or the constraints are too demanding. " ) ); } else // show the resulting diet { // make a list of dishnumbers QList<int> dishNumbers; for ( int meal = 0;meal < mealNumber;meal++ ) { int dishNo = ( ( MealInput* ) ( mealTabs->widget( meal ) ) ) ->dishNo(); dishNumbers << dishNo; } KApplication::restoreOverrideCursor(); // display the list QPointer<DietViewDialog> dietDisplay = new DietViewDialog( this, *dietRList, dayNumber, mealNumber, dishNumbers ); connect( dietDisplay, SIGNAL( signalOk() ), this, SLOT( createShoppingList() ) ); dietDisplay->exec(); delete dietDisplay; } END_TIMER(); }
int // delta -- had problems when return value was of float type kmeansCuda(float **feature, /* in: [npoints][nfeatures] */ int nfeatures, /* number of attributes for each point */ int npoints, /* number of data points */ int nclusters, /* number of clusters */ int *membership, /* which cluster the point belongs to */ float **clusters, /* coordinates of cluster centers */ int *new_centers_len, /* number of elements in each cluster */ float **new_centers /* sum of elements in each cluster */ ) { cl_int errcode; int delta = 0; /* if point has moved */ int i,j; /* counters */ /* copy membership (host to device) */ errcode = clEnqueueWriteBuffer(clCommands, membership_d, CL_TRUE, 0, npoints*sizeof(int), (void *) membership_new, 0, NULL, &ocdTempEvent); clFinish(clCommands); START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "Membership Copy", ocdTempTimer) END_TIMER(ocdTempTimer) CHECKERR(errcode); /* copy clusters (host to device) */ errcode = clEnqueueWriteBuffer(clCommands, clusters_d, CL_TRUE, 0, nclusters*nfeatures*sizeof(float), (void *) clusters[0], 0, NULL, &ocdTempEvent); clFinish(clCommands); START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "Cluster Copy", ocdTempTimer) END_TIMER(ocdTempTimer) CHECKERR(errcode); /* set up texture */ /*cudaChannelFormatDesc chDesc0 = cudaCreateChannelDesc<float>(); t_features.filterMode = cudaFilterModePoint; t_features.normalized = false; t_features.channelDesc = chDesc0; if(cudaBindTexture(NULL, &t_features, feature_d, &chDesc0, npoints*nfeatures*sizeof(float)) != CUDA_SUCCESS) printf("Couldn't bind features array to texture!\n"); cudaChannelFormatDesc chDesc1 = cudaCreateChannelDesc<float>(); t_features_flipped.filterMode = cudaFilterModePoint; t_features_flipped.normalized = false; t_features_flipped.channelDesc = chDesc1; if(cudaBindTexture(NULL, &t_features_flipped, feature_flipped_d, &chDesc1, npoints*nfeatures*sizeof(float)) != CUDA_SUCCESS) printf("Couldn't bind features_flipped array to texture!\n"); cudaChannelFormatDesc chDesc2 = cudaCreateChannelDesc<float>(); t_clusters.filterMode = cudaFilterModePoint; t_clusters.normalized = false; t_clusters.channelDesc = chDesc2; if(cudaBindTexture(NULL, &t_clusters, clusters_d, &chDesc2, nclusters*nfeatures*sizeof(float)) != CUDA_SUCCESS) printf("Couldn't bind clusters array to texture!\n");*/ /* copy clusters to constant memory */ //cudaMemcpyToSymbol("c_clusters",clusters[0],nclusters*nfeatures*sizeof(float),0,cudaMemcpyHostToDevice); /* setup execution parameters. changed to 2d (source code on NVIDIA CUDA Programming Guide) */ size_t localWorkSize[2] = {num_threads_perdim*num_threads_perdim, 1}; size_t globalWorkSize[2] = {num_blocks_perdim*localWorkSize[0], num_blocks_perdim*localWorkSize[1]}; unsigned int arg = 0; errcode = clSetKernelArg(clKernel_kmeansPoint, arg++, sizeof(cl_mem), (void *) &feature_d); errcode |= clSetKernelArg(clKernel_kmeansPoint, arg++, sizeof(cl_mem), (void *) &feature_flipped_d); errcode |= clSetKernelArg(clKernel_kmeansPoint, arg++, sizeof(int), (void *) &nfeatures); errcode |= clSetKernelArg(clKernel_kmeansPoint, arg++, sizeof(int), (void *) &npoints); errcode |= clSetKernelArg(clKernel_kmeansPoint, arg++, sizeof(int), (void *) &nclusters); errcode |= clSetKernelArg(clKernel_kmeansPoint, arg++, sizeof(cl_mem), (void *) &membership_d); errcode |= clSetKernelArg(clKernel_kmeansPoint, arg++, sizeof(cl_mem), (void *) &clusters_d); #ifdef BLOCK_DELTA_REDUCE errcode |= clSetKernelArg(clKernel_kmeansPoint, arg++, sizeof(cl_mem), (void *) &block_clusters_d); #endif #ifdef BLOCK_CENTER_REDUCE errcode |= clSetKernelArg(clKernel_kmeansPoint, arg++, sizeof(cl_mem), (void *) &block_deltas_d); #endif CHECKERR(errcode); /* execute the kernel */ errcode = clEnqueueNDRangeKernel(clCommands, clKernel_kmeansPoint, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &ocdTempEvent); CHECKERR(errcode); errcode = clFinish(clCommands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "Point Kernel", ocdTempTimer) END_TIMER(ocdTempTimer) CHECKERR(errcode); /* copy back membership (device to host) */ errcode = clEnqueueReadBuffer(clCommands, membership_d, CL_TRUE, 0, npoints*sizeof(int), (void *) membership_new, 0, NULL, &ocdTempEvent); clFinish(clCommands); START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "Membership Copy", ocdTempTimer) END_TIMER(ocdTempTimer) CHECKERR(errcode); #ifdef BLOCK_CENTER_REDUCE /*** Copy back arrays of per block sums ***/ float * block_clusters_h = (float *) malloc( num_blocks_perdim * num_blocks_perdim * nclusters * nfeatures * sizeof(float)); errcode = clEnqueueReadBuffer(clCommands, block_clusters_d, CL_TRUE, 0, num_blocks_perdim*num_blocks_perdim*nclusters*nfeatures*sizeof(float), (void *) block_clusters_h, 0, NULL, &ocdTempEvent); clFinish(clCommands) END_TIMER COUNT_D2H CHECKERR(errcode); #endif #ifdef BLOCK_DELTA_REDUCE int * block_deltas_h = (int *) malloc( num_blocks_perdim * num_blocks_perdim * sizeof(int)); errcode = clEnqueueReadBuffer(clCommands, block_deltas_d, CL_TRUE, 0, num_blocks_perdim*num_blocks_perdim*sizeof(int), (void *) block_deltas_h, 0, NULL, &ocdTempEvent); clFinish(clCommands) END_TIMER COUNT_D2H CHECKERR(errcode); #endif /* for each point, sum data points in each cluster and see if membership has changed: if so, increase delta and change old membership, and update new_centers; otherwise, update new_centers */ delta = 0; for (i = 0; i < npoints; i++) { int cluster_id = membership_new[i]; new_centers_len[cluster_id]++; if (membership_new[i] != membership[i]) { #ifdef CPU_DELTA_REDUCE delta++; #endif membership[i] = membership_new[i]; } #ifdef CPU_CENTER_REDUCE for (j = 0; j < nfeatures; j++) { new_centers[cluster_id][j] += feature[i][j]; } #endif } #ifdef BLOCK_DELTA_REDUCE /*** calculate global sums from per block sums for delta and the new centers ***/ //debug //printf("\t \t reducing %d block sums to global sum \n",num_blocks_perdim * num_blocks_perdim); for(i = 0; i < num_blocks_perdim * num_blocks_perdim; i++) { //printf("block %d delta is %d \n",i,block_deltas_h[i]); delta += block_deltas_h[i]; } #endif #ifdef BLOCK_CENTER_REDUCE for(int j = 0; j < nclusters;j++) { for(int k = 0; k < nfeatures;k++) { block_new_centers[j*nfeatures + k] = 0.f; } } for(i = 0; i < num_blocks_perdim * num_blocks_perdim; i++) { for(int j = 0; j < nclusters;j++) { for(int k = 0; k < nfeatures;k++) { block_new_centers[j*nfeatures + k] += block_clusters_h[i * nclusters*nfeatures + j * nfeatures + k]; } } } #ifdef CPU_CENTER_REDUCE //debug /*for(int j = 0; j < nclusters;j++) { for(int k = 0; k < nfeatures;k++) { if(new_centers[j][k] > 1.001 * block_new_centers[j*nfeatures + k] || new_centers[j][k] < 0.999 * block_new_centers[j*nfeatures + k]) { printf("\t \t for %d:%d, normal value is %e and gpu reduced value id %e \n",j,k,new_centers[j][k],block_new_centers[j*nfeatures + k]); } } }*/ #endif #ifdef BLOCK_CENTER_REDUCE for(int j = 0; j < nclusters;j++) { for(int k = 0; k < nfeatures;k++) new_centers[j][k]= block_new_centers[j*nfeatures + k]; } #endif #endif return delta; }
//////////////////////////////////////////////////////////////////////////////// //! Run a simple test for CUDA //////////////////////////////////////////////////////////////////////////////// void runTest( int argc, char** argv) { ocd_options opts = ocd_get_options(); platform_id = opts.platform_id; n_device = opts.device_id; if ( argc != 8) { printf("Usage: GpuTemporalDataMining [<platform> <device> --] <file path> <temporal constraint path> <threads> <support> <(a)bsolute or (r)atio> <(s)tatic | (d)ynamic> <(m)ap and merge | (n)aive | (o)hybrid> \n"); return; } // CUT_DEVICE_INIT(); initGpu(); getDeviceVariables(device_id); printf("Dataset, Support Threshold, PTPE or MapMerge, A1 or A1+A2, Level, Episodes (N), Episodes Culled (X), A1 Counting Time, A2 Counting Time, Generation Time, Total Counting Time\n"); //CUT_SAFE_CALL( cutCreateTimer( &timer)); //CUT_SAFE_CALL( cutCreateTimer( &generating_timer)); //CUT_SAFE_CALL( cutCreateTimer( &a1_counting_timer)); //CUT_SAFE_CALL( cutCreateTimer( &a2_counting_timer)); //CUT_SAFE_CALL( cutCreateTimer( &total_timer)); //CUT_SAFE_CALL( cutStartTimer( total_timer)); //CUT_SAFE_CALL( cutStartTimer( timer)); //CUT_SAFE_CALL( cutStartTimer( generating_timer)); //CUT_SAFE_CALL( cutStartTimer( a1_counting_timer)); //CUT_SAFE_CALL( cutStartTimer( a2_counting_timer)); unsigned int num_threads = atoi(argv[3]); // allocate host memory //initEpisodeCandidates(); if ( loadData( argv[1] ) != 0 ) return; if ( loadTemporalConstraints(argv[2]) != 0 ) return; // Check whether value supplied is absolute or ratio support supportType = *(argv[5]) == 'a' ? ABSOLUTE : RATIO; memoryModel = *(argv[6]) == 's' ? STATIC : DYNAMIC; switch (*(argv[7])) { case 'm': algorithmType = MAP_AND_MERGE; break; case 'n': algorithmType = NAIVE; break; case 'o': algorithmType = OPTIMAL; break; } support = atof(argv[4]); dumpFile = fopen( "episode.txt", "w" ); //printf("Initializing GPU Data...\n"); setupGpu(); // setup execution parameters size_t grid[3]; size_t threads[3]; //printf("Event stream size: %i\n", eventSize); // BEGIN LOOP for ( int level = 1; level <= eventSize; level++ ) { printf("Generating episode candidates for level %i...\n", level); // CUT_SAFE_CALL( cutResetTimer( total_timer)); // CUT_SAFE_CALL( cutStartTimer( total_timer)); //CUDA_SAFE_CALL( cudaUnbindTexture( candidateTex ) ); if(level != 1){ unbindTexture(&candidateTex, d_episodeCandidates, numCandidates * (level-1) * sizeof(UBYTE) ); //CUDA_SAFE_CALL( cudaUnbindTexture( intervalTex ) ); unbindTexture(&intervalTex, d_episodeIntervals, numCandidates * (level-2) * 2 * sizeof(float)); } // CUT_SAFE_CALL( cutResetTimer( generating_timer)); // CUT_SAFE_CALL( cutStartTimer( generating_timer)); // int test1, test = numCandidates; // generateEpisodeCandidatesCPU( level ); // test1 = numCandidates; // numCandidates = test; printf("Generating Episodes\n"); #ifdef CPU_EPISODE_GENERATION generateEpisodeCandidatesCPU( level ); #else generateEpisodeCandidatesGPU( level, num_threads ); #endif // CUT_SAFE_CALL( cutStopTimer( generating_timer)); //printf( "\tGenerating time: %f (ms)\n", cutGetTimerValue( generating_timer)); if ( numCandidates == 0 ) break; printf("Writing to buffer\n"); // Copy candidates to GPU #ifdef CPU_EPISODE_GENERATION clEnqueueWriteBuffer(commands, d_episodeCandidates, CL_TRUE, 0, numCandidates * level * sizeof(UBYTE), h_episodeCandidates, 0, NULL, &ocdTempEvent); START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "TDM Episode Copy", ocdTempTimer) END_TIMER(ocdTempTimer) clEnqueueWriteBuffer(commands, d_episodeIntervals, CL_TRUE, 0, numCandidates * (level-1) * 2 * sizeof(float), h_episodeIntervals, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "TDM Episode Copy", ocdTempTimer) END_TIMER(ocdTempTimer) #endif bindTexture( 0, &candidateTex, d_episodeCandidates, numCandidates * level * sizeof(UBYTE), CL_UNSIGNED_INT8); bindTexture( 0, &intervalTex, d_episodeIntervals, numCandidates * (level-1) * 2 * sizeof(float), CL_FLOAT ); //printf("Executing kernel on %i candidates...\n", numCandidates, level); // execute the kernel calculateGrid(grid, num_threads, numCandidates); calculateBlock(threads, num_threads, numCandidates); int sections; unsigned int shared_mem_needed; //CUT_SAFE_CALL( cutStartTimer( counting_timer)); int aType = algorithmType; if ( algorithmType == OPTIMAL ) aType = chooseAlgorithmType( level, numCandidates, num_threads ); if ( memoryModel == DYNAMIC ) { if ( aType == NAIVE ) { shared_mem_needed = MaxListSize*level*threads[0]*sizeof(float); printf("Shared memory needed %d\n", shared_mem_needed); //CUT_SAFE_CALL( cutResetTimer( a1_counting_timer)); //CUT_SAFE_CALL( cutStartTimer( a1_counting_timer)); countCandidates(grid, threads, d_episodeSupport, eventSize, level, supportType, numCandidates, candidateTex, intervalTex, eventTex, timeTex, shared_mem_needed ); } else { printf("DYNAMIC MAP MERGE\n"); calculateLevelParameters(level, threads, grid, sections); shared_mem_needed = 16000; printf("numCandidates=%d\n", numCandidates); //CUT_SAFE_CALL( cutResetTimer( a1_counting_timer)); //CUT_SAFE_CALL( cutStartTimer( a1_counting_timer)); countCandidatesMapMerge(grid, threads, d_episodeSupport, padEventSize, level, supportType, sections, padEventSize / sections, numCandidates, candidateTex, intervalTex, eventTex, timeTex, shared_mem_needed ); //countCandidatesMapMergeStatic<<< grid, threads, shared_mem_needed >>>( d_episodeSupport, padEventSize, level, supportType, sections, padEventSize / sections, numCandidates ); } } else { if ( aType == NAIVE ) { shared_mem_needed = level*threads[0]*sizeof(float); } else { calculateLevelParameters(level, threads, grid, sections); shared_mem_needed = 16000; } //CUT_SAFE_CALL( cutResetTimer( a2_counting_timer)); //CUT_SAFE_CALL( cutStartTimer( a2_counting_timer)); if ( aType == NAIVE ) countCandidatesStatic(grid, threads, d_episodeSupport, eventSize, level, supportType, numCandidates, candidateTex, intervalTex, eventTex, timeTex, shared_mem_needed ); else countCandidatesMapMergeStatic(grid, threads, d_episodeSupport, padEventSize, level, supportType, sections, padEventSize / sections, numCandidates, candidateTex, intervalTex, eventTex, timeTex, shared_mem_needed ); clFinish(commands); //CUT_SAFE_CALL( cutStopTimer( a2_counting_timer)); int err; err = clEnqueueReadBuffer(commands,d_episodeSupport, CL_TRUE, 0, numCandidates * sizeof(float), h_episodeSupport, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "TDM Episode Copy", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Unable to read buffer from device."); unbindTexture(&candidateTex, d_episodeCandidates, numCandidates * level * sizeof(UBYTE) ); unbindTexture(&intervalTex, d_episodeIntervals, numCandidates * (level-1) * 2 * sizeof(float)); // Remove undersupported episodes cullCandidates( level ); if ( numCandidates == 0 ) break; unsigned int mmthreads = num_threads; if ( MaxListSize*level*num_threads*sizeof(float) > 16384 ) { if ( MaxListSize*level*96*sizeof(float) < 16384 ) mmthreads = 96; else if ( MaxListSize*level*64*sizeof(float) < 16384) mmthreads = 64; else if ( MaxListSize*level*32*sizeof(float) < 16384) mmthreads = 32; printf("More shared memory needed for %d threads. Changed to %d threads.\n", num_threads, mmthreads ); } #ifdef CPU_EPISODE_GENERATION err = clEnqueueWriteBuffer(commands, d_episodeCandidates, CL_TRUE, 0, numCandidates * level * sizeof(UBYTE), h_episodeCandidates, 0, NULL, &ocdTempEvent); START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "TDM Episode Copy", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Unable to write buffer 1."); if(numCandidates * (level - 1) * 2 * sizeof(float) != 0) err = clEnqueueWriteBuffer(commands, d_episodeIntervals, CL_TRUE, 0, numCandidates * (level-1) * 2 * sizeof(float), h_episodeIntervals, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "TDM Episode Copy", ocdTempTimer) CHKERR(err, "Unable to write buffer 2."); END_TIMER(ocdTempTimer) #endif bindTexture( 0, &candidateTex, d_episodeCandidates, numCandidates * level * sizeof(UBYTE), CL_UNSIGNED_INT8); bindTexture( 0, &intervalTex, d_episodeIntervals, numCandidates * (level-1) * 2 * sizeof(float), CL_FLOAT ); if ( algorithmType == OPTIMAL ) aType = chooseAlgorithmType( level, numCandidates, mmthreads ); // Run (T1,T2] algorithm if ( aType == NAIVE ) { shared_mem_needed = MaxListSize*level* mmthreads*sizeof(float); calculateGrid(grid, mmthreads, numCandidates ); calculateBlock(threads, mmthreads, numCandidates ); } else { calculateLevelParameters(level, threads, grid, sections); shared_mem_needed = 16000; } //CUT_SAFE_CALL( cutResetTimer( a1_counting_timer)); //CUT_SAFE_CALL( cutStartTimer( a1_counting_timer)); if ( aType == NAIVE ) countCandidates(grid, threads, d_episodeSupport, eventSize, level, supportType, numCandidates, candidateTex, intervalTex, eventTex, timeTex, shared_mem_needed ); else countCandidatesMapMerge(grid, threads, d_episodeSupport, padEventSize, level, supportType, sections, padEventSize / sections, numCandidates, candidateTex, intervalTex, eventTex, timeTex, shared_mem_needed ); } printf("Finishing\n"); clFinish(commands); //CUT_SAFE_CALL( cutStopTimer( a1_counting_timer)); //printf( "\tCounting time: %f (ms)\n", cutGetTimerValue( counting_timer)); // check if kernel execution generated an error //CUT_CHECK_ERROR("Kernel execution failed"); //printf("Copying result back to host...\n\n"); int err = clEnqueueReadBuffer(commands, d_episodeSupport, CL_TRUE, 0, numCandidates * sizeof(float), h_episodeSupport, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "TDM Episode Copy", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Unable to read memory 1."); err = clEnqueueReadBuffer(commands, d_episodeCandidates, CL_TRUE, 0, numCandidates * level * sizeof(UBYTE), h_episodeCandidates, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "TDM Episode Copy", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Unable to read memory 2."); //CUDA_SAFE_CALL( cudaMemcpy( h_mapRecords, d_mapRecords, 3 * numSections * maxLevel * maxCandidates * sizeof(float), cudaMemcpyDeviceToHost )); saveResult(level); fflush(dumpFile); // END LOOP //CUT_SAFE_CALL( cutStopTimer( total_timer)); // Print Statistics for this run printf("%s, %f, %s, %s, %d, %d, %d\n", argv[1], // Dataset support, // Support Threshold algorithmType == NAIVE ? "PTPE" : algorithmType == MAP_AND_MERGE ? "MapMerge" : "Episode-Based", // PTPE or MapMerge or Episode-Based memoryModel == STATIC ? "A1+A2" : "A1", // A1 or A1+A2 level, // Level numCandidates+episodesCulled, // Episodes counted episodesCulled // Episodes removed by A2 // cutGetTimerValue( a1_counting_timer), // Time for A1 // memoryModel == STATIC ? cutGetTimerValue( a2_counting_timer) : 0.0f, // Time for A2 // cutGetTimerValue( generating_timer), // Episode generation time // cutGetTimerValue( total_timer) ); // Time for total loop ); } printf("Done!\n"); cleanup(); //CUT_SAFE_CALL( cutStopTimer( timer)); //printf( "Processing time: %f (ms)\n", cutGetTimerValue( timer)); //CUT_SAFE_CALL( cutDeleteTimer( timer)); //CUT_SAFE_CALL( cutDeleteTimer( generating_timer)); //CUT_SAFE_CALL( cutDeleteTimer( a1_counting_timer)); //CUT_SAFE_CALL( cutDeleteTimer( a2_counting_timer)); //CUT_SAFE_CALL( cutDeleteTimer( total_timer)); }
void CDG::build(IN OUT OPT_CTX & oc, DGRAPH & cfg) { if (cfg.get_vertex_num() == 0) { return; } START_TIMER("CDG"); IS_TRUE0(OPTC_is_cfg_valid(oc)); m_ru->check_valid_and_recompute(&oc, OPT_PDOM, OPT_UNDEF); GRAPH pdom_tree; cfg.get_pdom_tree(pdom_tree); if (pdom_tree.get_vertex_num() == 0) { return; } SVECTOR<UINT> top_order; pdom_tree.sort_in_toplog_order(top_order, false); //dump_vec(top_order); BITSET_MGR bs_mgr; SVECTOR<BITSET*> cd_set; for (INT j = 0; j <= top_order.get_last_idx(); j++) { UINT ii = top_order.get(j); VERTEX * v = cfg.get_vertex(ii); IS_TRUE0(v != NULL); add_vertex(VERTEX_id(v)); BITSET * cd_of_v = cd_set.get(VERTEX_id(v)); if (cd_of_v == NULL) { cd_of_v = bs_mgr.create(); cd_set.set(VERTEX_id(v), cd_of_v); } EDGE_C * in = VERTEX_in_list(v); while (in != NULL) { VERTEX * pred = EDGE_from(EC_edge(in)); if (VERTEX_id(v) != ((DGRAPH&)cfg).get_ipdom(VERTEX_id(pred))) { cd_of_v->bunion(VERTEX_id(pred)); //if (pred != v) { add_edge(VERTEX_id(pred), VERTEX_id(v)); } } in = EC_next(in); } INT c; for (VERTEX * z = cfg.get_first_vertex(c); z != NULL; z = cfg.get_next_vertex(c)) { if (((DGRAPH&)cfg).get_ipdom(VERTEX_id(z)) == VERTEX_id(v)) { BITSET * cd = cd_set.get(VERTEX_id(z)); if (cd == NULL) { cd = bs_mgr.create(); cd_set.set(VERTEX_id(z), cd); } for (INT i = cd->get_first(); i != -1; i = cd->get_next(i)) { if (VERTEX_id(v) != ((DGRAPH&)cfg).get_ipdom(i)) { cd_of_v->bunion(i); //if (i != (INT)VERTEX_id(v)) { add_edge(i, VERTEX_id(v)); } } } } } } //end for OPTC_is_cdg_valid(oc) = true; END_TIMER(); }
int main ( int argc, char *argv[] ) { int matrix_dim = 32; /* default matrix_dim */ int opt, option_index=0; func_ret_t ret; const char *input_file = NULL; float *m, *mm; stopwatch sw; cl_device_id clDevice; cl_context clContext; cl_command_queue clCommands; cl_program clProgram; cl_kernel clKernel_diagonal; cl_kernel clKernel_perimeter; cl_kernel clKernel_internal; cl_int dev_type; cl_int errcode; FILE *kernelFile; char *kernelSource; size_t kernelLength; cl_mem d_m; ocd_init(&argc, &argv, NULL); ocd_options opts = ocd_get_options(); platform_id = opts.platform_id; device_id = opts.device_id; while ((opt = getopt_long(argc, argv, "::vs:i:", long_options, &option_index)) != -1 ) { switch(opt) { case 'i': input_file = optarg; break; case 'v': do_verify = 1; break; case 's': matrix_dim = atoi(optarg); fprintf(stderr, "Currently not supported, use -i instead\n"); fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file|-p platform|-d device]\n", argv[0]); exit(EXIT_FAILURE); case '?': fprintf(stderr, "invalid option\n"); break; case ':': fprintf(stderr, "missing argument\n"); break; default: fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file||-p platform|-d device]\n", argv[0]); exit(EXIT_FAILURE); } } if ( (optind < argc) || (optind == 1)) { fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file|-p platform|-d device]\n", argv[0]); exit(EXIT_FAILURE); } if (input_file) { printf("Reading matrix from file %s\n", input_file); ret = create_matrix_from_file(&m, input_file, &matrix_dim); if (ret != RET_SUCCESS) { m = NULL; fprintf(stderr, "error create matrix from file %s\n", input_file); exit(EXIT_FAILURE); } } else { printf("No input file specified!\n"); exit(EXIT_FAILURE); } if (do_verify) { printf("Before LUD\n"); print_matrix(m, matrix_dim); matrix_duplicate(m, &mm, matrix_dim); } // errcode = clGetPlatformIDs(NUM_PLATFORM, clPlatform, NULL); // CHECKERR(errcode); // // errcode = clGetDeviceIDs(clPlatform[PLATFORM_ID], USEGPU ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &clDevice, NULL); // CHECKERR(errcode); #ifdef USEGPU dev_type = CL_DEVICE_TYPE_GPU; #elif defined(USE_AFPGA) dev_type = CL_DEVICE_TYPE_ACCELERATOR; #else dev_type = CL_DEVICE_TYPE_CPU; #endif clDevice = GetDevice(platform_id, device_id,dev_type); size_t max_worksize[3]; errcode = clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES,sizeof(size_t)*3, &max_worksize, NULL); CHECKERR(errcode); while(BLOCK_SIZE*BLOCK_SIZE>max_worksize[0]) BLOCK_SIZE = BLOCK_SIZE/2; clContext = clCreateContext(NULL, 1, &clDevice, NULL, NULL, &errcode); CHECKERR(errcode); clCommands = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &errcode); CHECKERR(errcode); kernelFile = fopen("lud_kernel.cl", "r"); fseek(kernelFile, 0, SEEK_END); kernelLength = (size_t) ftell(kernelFile); kernelSource = (char *) malloc(sizeof(char)*kernelLength); rewind(kernelFile); fread((void *) kernelSource, kernelLength, 1, kernelFile); fclose(kernelFile); clProgram = clCreateProgramWithSource(clContext, 1, (const char **) &kernelSource, &kernelLength, &errcode); CHECKERR(errcode); free(kernelSource); char arg[100]; sprintf(arg,"-D BLOCK_SIZE=%d", (int)BLOCK_SIZE); errcode = clBuildProgram(clProgram, 1, &clDevice, arg, NULL, NULL); if (errcode == CL_BUILD_PROGRAM_FAILURE) { char *log; size_t logLength; errcode = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &logLength); log = (char *) malloc(sizeof(char)*logLength); errcode = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, logLength, (void *) log, NULL); fprintf(stderr, "Kernel build error! Log:\n%s", log); free(log); return 0; } CHECKERR(errcode); clKernel_diagonal = clCreateKernel(clProgram, "lud_diagonal", &errcode); CHECKERR(errcode); clKernel_perimeter = clCreateKernel(clProgram, "lud_perimeter", &errcode); CHECKERR(errcode); clKernel_internal = clCreateKernel(clProgram, "lud_internal", &errcode); CHECKERR(errcode); d_m = clCreateBuffer(clContext, CL_MEM_READ_WRITE, matrix_dim*matrix_dim*sizeof(float), NULL, &errcode); CHECKERR(errcode); /* beginning of timing point */ stopwatch_start(&sw); errcode = clEnqueueWriteBuffer(clCommands, d_m, CL_TRUE, 0, matrix_dim*matrix_dim*sizeof(float), (void *) m, 0, NULL, &ocdTempEvent); clFinish(clCommands); START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "Matrix Copy", ocdTempTimer) END_TIMER(ocdTempTimer) CHECKERR(errcode); int i=0; size_t localWorkSize[2]; size_t globalWorkSize[2]; //printf("BLOCK_SIZE: %d\n",BLOCK_SIZE); // printf("max Work-item Size: %d\n",(int)max_worksize[0]); #ifdef START_POWER for( int iter = 0; iter < 1000; iter++) #endif for (i=0; i < matrix_dim-BLOCK_SIZE; i += BLOCK_SIZE) { errcode = clSetKernelArg(clKernel_diagonal, 0, sizeof(cl_mem), (void *) &d_m); errcode |= clSetKernelArg(clKernel_diagonal, 1, sizeof(int), (void *) &matrix_dim); errcode |= clSetKernelArg(clKernel_diagonal, 2, sizeof(int), (void *) &i); CHECKERR(errcode); localWorkSize[0] = BLOCK_SIZE; globalWorkSize[0] = BLOCK_SIZE; errcode = clEnqueueNDRangeKernel(clCommands, clKernel_diagonal, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &ocdTempEvent); clFinish(clCommands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "Diagonal Kernels", ocdTempTimer) END_TIMER(ocdTempTimer) CHECKERR(errcode); errcode = clSetKernelArg(clKernel_perimeter, 0, sizeof(cl_mem), (void *) &d_m); errcode |= clSetKernelArg(clKernel_perimeter, 1, sizeof(int), (void *) &matrix_dim); errcode |= clSetKernelArg(clKernel_perimeter, 2, sizeof(int), (void *) &i); CHECKERR(errcode); localWorkSize[0] = BLOCK_SIZE*2; globalWorkSize[0] = ((matrix_dim-i)/BLOCK_SIZE-1)*localWorkSize[0]; errcode = clEnqueueNDRangeKernel(clCommands, clKernel_perimeter, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &ocdTempEvent); clFinish(clCommands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "Perimeter Kernel", ocdTempTimer) CHECKERR(errcode); END_TIMER(ocdTempTimer) errcode = clSetKernelArg(clKernel_internal, 0, sizeof(cl_mem), (void *) &d_m); errcode |= clSetKernelArg(clKernel_internal, 1, sizeof(int), (void *) &matrix_dim); errcode |= clSetKernelArg(clKernel_internal, 2, sizeof(int), (void *) &i); CHECKERR(errcode); localWorkSize[0] = BLOCK_SIZE; localWorkSize[1] = BLOCK_SIZE; globalWorkSize[0] = ((matrix_dim-i)/BLOCK_SIZE-1)*localWorkSize[0]; globalWorkSize[1] = ((matrix_dim-i)/BLOCK_SIZE-1)*localWorkSize[1]; errcode = clEnqueueNDRangeKernel(clCommands, clKernel_internal, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &ocdTempEvent); clFinish(clCommands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "Internal Kernel", ocdTempTimer) END_TIMER(ocdTempTimer) CHECKERR(errcode); } errcode = clSetKernelArg(clKernel_diagonal, 0, sizeof(cl_mem), (void *) &d_m); errcode |= clSetKernelArg(clKernel_diagonal, 1, sizeof(int), (void *) &matrix_dim); errcode |= clSetKernelArg(clKernel_diagonal, 2, sizeof(int), (void *) &i); CHECKERR(errcode); localWorkSize[0] = BLOCK_SIZE; globalWorkSize[0] = BLOCK_SIZE; errcode = clEnqueueNDRangeKernel(clCommands, clKernel_diagonal, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &ocdTempEvent); clFinish(clCommands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "Diagonal Kernels", ocdTempTimer) CHECKERR(errcode); END_TIMER(ocdTempTimer) errcode = clEnqueueReadBuffer(clCommands, d_m, CL_TRUE, 0, matrix_dim*matrix_dim*sizeof(float), (void *) m, 0, NULL, &ocdTempEvent); clFinish(clCommands); START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "Matrix copy", ocdTempTimer) END_TIMER(ocdTempTimer) /* end of timing point */ stopwatch_stop(&sw); printf("Time consumed(ms): %lf\n", 1000*get_interval_by_sec(&sw)); clReleaseMemObject(d_m); if (do_verify) { printf("After LUD\n"); print_matrix(m, matrix_dim); printf(">>>Verify<<<<\n"); printf("matrix_dim: %d\n",matrix_dim); lud_verify(mm, m, matrix_dim); free(mm); } clReleaseKernel(clKernel_diagonal); clReleaseKernel(clKernel_perimeter); clReleaseKernel(clKernel_internal); clReleaseProgram(clProgram); clReleaseCommandQueue(clCommands); clReleaseContext(clContext); free(m); ocd_finalize(); return EXIT_SUCCESS; } /* ---------- end of function main ---------- */
cv::Mat AlphaRayLocator::findAlphas( const BoundingPolyhedron* polyhedrons, const size_t polyhedronCount, const ia::InputAssembler& input) const { assert(polyhedronCount>1); START_TIMER(AlphaLocator); const cv::Mat& mat = input.mat(); const unsigned r = input.mat().rows, c = input.mat().cols; const math::vec3 background = input.background(); cv::Mat out; out.create(r, c, CV_32FC1); const SpherePolyhedron& outerPoly = polyhedrons[1]; const SpherePolyhedron& innerPoly = polyhedrons[0]; //For each point, send rays for (unsigned i = 0; i < r; ++i) { float* data = (float*)(mat.data + mat.step*i); float* dataOut = (float*)(out.data + out.step*i); for(unsigned j = 0; j < c; ++j) { math::vec3& point = *((math::vec3*)(data + j*3)); float alpha; //If right in the middle if(background==point) alpha = 0; else { //Prepare vector const math::vec3 vector = point - background; const float vectorLen = vector.length(); const math::vec3 vectorNorm = vector/vectorLen; const float distanceToPoint = point.distance(background); const float distanceToOuterPoly = outerPoly.findDistanceToPolyhedron(vectorNorm); //If inside outer poly, alpha < 1 if(distanceToPoint < distanceToOuterPoly) { float distanceToInnerPoly = innerPoly.findDistanceToPolyhedron(vectorNorm); //If does not intersect with inner, fully inside if(distanceToPoint < distanceToInnerPoly) alpha = 0; else //interpolate between inner and outer alpha = (vectorLen - distanceToInnerPoly) / (distanceToOuterPoly - distanceToInnerPoly); } else //If intersects with middle, it's outside. Alpha = 1. alpha = 1; } *(dataOut+j) = alpha; } } END_TIMER(AlphaLocator); return out; }
void StableFitting::expand(BoundingPolyhedron& poly, const std::vector<math::vec3>& points, IColourSegmenter* segmenter, math::vec3 backgroundPoint, float startRadius, float endRadius) const { START_TIMER(Expanding); auto innerouter = segmenter->segment(points, backgroundPoint, startRadius); //Indicates whether a vertex was unable to move at least once due to outer points. std::vector<bool> didVertexEncounterResistance; didVertexEncounterResistance.resize(poly.mVertices.size(), false); //Position the polygon around the inner points. poly.positionAround(backgroundPoint, innerouter.inner); //Make a copy that is to be scaled. //We need a copy to deal with the case where no resistance is met. BoundingPolyhedron newPoly = poly; //The starting step is set to be half way between the start and end distance. float step = (endRadius-startRadius)/2.f; //Count number of points outside for later reference. int originalPointsOutside = innerouter.outer.size()-countPointsInside(innerouter.outer, newPoly); //Iterate... for(int iIteration = 0; iIteration < mNoOfIterations; ++iIteration) { //Try moving each vertex outwards for(size_t iVertex = 0; iVertex < newPoly.mVertices.size(); ++iVertex) { //Find movement vector math::vec3 moveNormal = (newPoly.mVertices[iVertex]-newPoly.centre()).normalize(); const math::vec3 vec = moveNormal*step; //Move newPoly.mVertices[iVertex] += vec; //Find the number of points now outside after movement. int newPointsOutside = innerouter.outer.size()-countPointsInside(innerouter.outer, newPoly); //If there are now less points outside, we have gone through something. Move back and mark resistance. if(newPointsOutside < originalPointsOutside) { newPoly.mVertices[iVertex] -= vec; didVertexEncounterResistance[iVertex] = true; } } //halve the step and try again. step *= 0.5f; } //The idea here is that if a vertex did not encounter any resistance while moving, //Move it by the maximin movement of the vertices that _did_ encounter resistance. //You might try replacing maximum with mode, etc. (mean gave really low deltas) //Find max movement of vertices that found resistance: float maxMovement = poly.mVertices[0].distance(newPoly.mVertices[0]); for(size_t i = 1; i < didVertexEncounterResistance.size(); ++i) maxMovement = std::max(maxMovement, poly.mVertices[i].distance(newPoly.mVertices[i])); //If a vertex found resistance, keep it. Otherwise, restore it and move by the average. for(size_t i = 0; i < didVertexEncounterResistance.size(); ++i) if(didVertexEncounterResistance[i]) poly.mVertices[i] = newPoly.mVertices[i]; else poly.mVertices[i] = poly.mVertices[i]+(newPoly.mVertices[i]-poly.mVertices[i]).normalize()*maxMovement; END_TIMER(Expanding); }