/** * @brief Main principal * @param argc El número de argumentos del programa * @param argv Cadenas de argumentos del programa * @return Nada si es correcto o algún número negativo si es incorrecto */ int main( int argc, char** argv ) { if(argc != 2) return -1; // Medimos tiempo para el programa const double start_time = getCurrentTimestamp(); // Creamos el buffer para las partículas y reservamos espacio ALINEADO para los datos size_t N = atoi(argv[1]); particle *particulas = (particle*) _aligned_malloc(N * sizeof(particle), 64); // Inicializamos las partículas const double inicio = getCurrentTimestamp(); for(unsigned index = 0; index < N; ++index) { particulas[index].x = 0.0; particulas[index].y = 0.0; particulas[index].s = 1.0; particulas[index].xp = 0.0; particulas[index].yp = 0.0; particulas[index].sp = 1.0; particulas[index].x0 = 0.0; particulas[index].y0 = 0.0; particulas[index].width = 500; particulas[index].height = 500; particulas[index].w = 0.0f; } const double end_time = getCurrentTimestamp(); // Obtenemos el tiempo consumido por el programa y la suma de los pesos printf("\nTiempo total del programa: %0.3f ms\n", (end_time - start_time) * 1e3); printf("Tiempo total consumido por la inicializacion de las particulas: %0.3f ms\n", (end_time - inicio) * 1e3); }
/************************************************************************ method CompilerTrackingInfo::resetInterval start the new interval (at the current time, clock()) ************************************************************************/ inline void CompilerTrackingInfo::resetInterval() { beginIntervalTime_ = getCurrentTimestamp(); beginIntervalTimeUEpoch_ = getCurrentTimestampUEpoch(); beginIntervalClock_ = clock(); // // water marks for stmt and context heap back to 0 CmpCommon::statementHeap()->resetIntervalWaterMark(); CmpCommon::contextHeap()->resetIntervalWaterMark(); // // metadata cache counters maintained on each interval resetMetadataCacheCounters(); // // query cache resetQueryCacheCounters(); // // histogram cache counters reset on interval resetHistogramCacheCounters(); // // other counters largestStmtIntervalWaterMark_ = 0; systemHeapWaterMark_ = 0; longestCompileClock_ = 0; successfulQueryCount_ = 0; failedQueryCount_ = 0; caughtExceptionCount_ = 0; sessionCount_ = 0; }
/************************************************************************
method CompilerTrackingInfo::intervalExpired

Tell the caller whether the current interval has lasted at least
intervalLengthMins, i.e. whether CompilerTrackingInfo may be logged
again.

************************************************************************/
inline
NABoolean
CompilerTrackingInfo::intervalExpired(Int32 intervalLengthMins)
{
  // compare the duration measured up to "now" against the configured length
  const NABoolean expired =
    ( currentIntervalDuration(getCurrentTimestamp()) >= intervalLengthMins );

  return expired;
}
/** * @brief Main principal * @param argc El número de argumentos del programa * @param argv Cadenas de argumentos del programa * @return Nada si es correcto o algún número negativo si es incorrecto */ int main( int argc, char** argv ) { if(argc != 2) return -1; // Medimos tiempo para el programa const double start_time = getCurrentTimestamp(); // Creamos el buffer para las partículas y los pesos y reservamos espacio ALINEADO para los datos size_t N = atoi(argv[1]); particle *particulas = (particle*) _aligned_malloc(N * sizeof(particle), 64); int *pesos = (int*) _aligned_malloc(N * sizeof(int), 64); float sum = 0.0f; // Inicializamos las partículas (Me interesan los pesos) srand(time(NULL)); for(unsigned index = 0; index < N; ++index) { particulas[index].x = 0.0; particulas[index].y = 0.0; particulas[index].s = 0.0; particulas[index].xp = 0.0; particulas[index].yp = 0.0; particulas[index].sp = 0.0; particulas[index].x0 = 0.0; particulas[index].y0 = 0.0; particulas[index].width = 0; particulas[index].height = 0; particulas[index].w = (float) (rand() % 2000); sum+=particulas[index].w; } // Normalizamos los datos for(int i = 0; i < N; ++i) particulas[i].w /= sum; const double inicio = getCurrentTimestamp(); // Calculamos el número de partículas en base al peso de cada una for(unsigned index = 0; index < N; ++index) pesos[index] = cvRound( particulas[index].w * N ); const double end_time = getCurrentTimestamp(); // Obtenemos el tiempo consumido por el programa y la suma de los pesos printf("\nTiempo total del programa: %0.3f ms\n", (end_time - start_time) * 1e3); printf("Tiempo total consumido por la generacion del numero de particulas: %0.3f ms\n", (end_time - inicio) * 1e3); }
/************************************************************************
method CompilationStats::exitCmpPhase

Close out a compilation phase. Invalid phases are ignored.

************************************************************************/
void
CompilationStats::exitCmpPhase(CompilationPhase phase)
{
  if (isValidPhase(phase))
  {
    // stop this phase's monitor
    cpuMonitor_[phase].exit();

    // leaving the overall phase also stamps the end of the compilation
    if (phase == CMP_PHASE_ALL)
      compileEndTime_ = getCurrentTimestamp();
  }
}
/************************************************************************
method CompilerTrackingInfo::logCompilerStatusOnInterval

Once the tracking interval has expired, write this object's state to
the configured destinations (log file, private table, repository) and
start a new interval. Does nothing while the interval is still running.

************************************************************************/
void
CompilerTrackingInfo::logCompilerStatusOnInterval(Int32 intervalLengthMins)
{
  // interval still running: nothing to log yet
  if( !intervalExpired(intervalLengthMins) )
    return;

  // the interval ends now
  endIntervalTime_ = getCurrentTimestamp();

  // refresh the query cache statistics once per interval;
  // when they cannot be obtained, clear the cache counters instead
  if (!CURRENTQCACHE->getCompilationCacheStats(currentQCacheStats_))
    clearQCacheCounters();

  // file logging happens only when a log file name is configured
  if( getCompilerTrackingLogFilename() != NULL )
    printToFile();

  // optional logging straight into a private table through dynamic SQL,
  // bypassing the Repository infrastructure
  if (CmpCommon::getDefault(COMPILER_TRACKING_LOGTABLE) == DF_ON)
    logIntervalInPrivateTable();

  // the repository table is always written; a non-zero result is surfaced
  // as a warning that status/health information could not be logged
  const Int32 repositoryStatus = logIntervalInRepository();
  if (repositoryStatus)
  {
    *CmpCommon::diags() << DgSqlCode(2242);
  }

  // begin tracking the next interval
  resetInterval();
}
/************************************************************************
method CompilationStats::enterCmpPhase

Open a compilation phase. Invalid phases are ignored.

************************************************************************/
void
CompilationStats::enterCmpPhase(CompilationPhase phase)
{
  if (isValidPhase(phase))
  {
    // every entry starts this phase's monitor over from zero
    cpuMonitor_[phase].init(0);
    cpuMonitor_[phase].enter();

    // entering the overall phase also stamps the start of the compilation
    if (phase == CMP_PHASE_ALL)
      compileStartTime_ = getCurrentTimestamp();
  }
}
/************************************************************************
method CompilerTrackingInfo::logCompilerStatusOnInterval

When the tracking interval has expired, emit this object's state to
the configured destinations (log file, private table, log4cxx) and
begin a new interval. A still-running interval is left untouched.

************************************************************************/
void
CompilerTrackingInfo::logCompilerStatusOnInterval(Int32 intervalLengthMins)
{
  // bail out early while the interval has not run its course
  if( !intervalExpired(intervalLengthMins) )
    return;

  // close the interval at the current time
  endIntervalTime_ = getCurrentTimestamp();

  // pull fresh query cache statistics once per interval; if that is not
  // possible, zero the cache counters
  if (!CURRENTQCACHE->getCompilationCacheStats(currentQCacheStats_))
  {
    clearQCacheCounters();
  }

  // write to the tracking log file when one is configured
  if( getCompilerTrackingLogFilename() != NULL )
  {
    printToFile();
  }

  // write to a private table via dynamic SQL when enabled by default setting
  if (CmpCommon::getDefault(COMPILER_TRACKING_LOGTABLE) == DF_ON)
  {
    logIntervalInPrivateTable();
  }

  // the log4cxx log is written unconditionally
  logIntervalInLog4Cxx();

  // start over with a new interval
  resetInterval();
}
/**
 * Grabber main loop.
 *
 * Allocates and zeroes the image buffers of both m_rgbd_image and
 * m_current_image, then loops until m_should_exit is set. Each iteration
 * waits for an event, updates the OpenNI generators, copies depth (and
 * RGB or infrared, and user labels when enabled) into m_current_image,
 * timestamps it, swaps it with m_rgbd_image under m_lock and advertises
 * the new frame.
 */
void OpenniGrabber :: run()
{
  m_should_exit = false;

  // Both images share the same calibration data.
  m_current_image.setCalibration(m_calib_data);
  m_rgbd_image.setCalibration(m_calib_data);

  // Depth
  // Raw depth is allocated at the calibration's raw depth size and zeroed;
  // depthRef() is then assigned from rawDepthRef().
  m_rgbd_image.rawDepthRef() = Mat1f(m_calib_data->raw_depth_size);
  m_rgbd_image.rawDepthRef() = 0.f;
  m_rgbd_image.depthRef() = m_rgbd_image.rawDepthRef();
  m_current_image.rawDepthRef() = Mat1f(m_calib_data->raw_depth_size);
  m_current_image.rawDepthRef() = 0.f;
  m_current_image.depthRef() = m_current_image.rawDepthRef();

  // Color
  if (m_has_rgb)
  {
    // Raw RGB buffers, initialized to black.
    m_rgbd_image.rawRgbRef() = Mat3b(m_calib_data->rawRgbSize());
    m_rgbd_image.rawRgbRef() = Vec3b(0,0,0);
    m_rgbd_image.rgbRef() = m_rgbd_image.rawRgbRef();
    m_current_image.rawRgbRef() = Mat3b(m_calib_data->rawRgbSize());
    m_current_image.rawRgbRef() = Vec3b(0,0,0);
    m_current_image.rgbRef() = m_current_image.rawRgbRef();

    // Float intensity buffers with the same size as the raw RGB image.
    m_rgbd_image.rawIntensityRef() = Mat1f(m_calib_data->rawRgbSize());
    m_rgbd_image.rawIntensityRef() = 0.f;
    m_rgbd_image.intensityRef() = m_rgbd_image.rawIntensityRef();
    m_current_image.rawIntensityRef() = Mat1f(m_calib_data->rawRgbSize());
    m_current_image.rawIntensityRef() = 0.f;
    m_current_image.intensityRef() = m_current_image.rawIntensityRef();
  }

  // User tracking
  // Label masks are always allocated (raw depth size); skeleton data is
  // only created when user tracking is enabled.
  m_rgbd_image.userLabelsRef() = cv::Mat1b(m_calib_data->raw_depth_size);
  m_rgbd_image.userLabelsRef() = 0u;
  if (m_track_users)
    m_rgbd_image.setSkeletonData(new Skeleton());
  m_current_image.userLabelsRef() = cv::Mat1b(m_calib_data->raw_depth_size);
  m_current_image.userLabelsRef() = 0u;
  if (m_track_users)
    m_current_image.setSkeletonData(new Skeleton());

  if (m_has_rgb)
  {
    // Separate mapped buffers are only needed when RGB and depth sizes differ.
    bool mapping_required = m_calib_data->rawRgbSize() != m_calib_data->raw_depth_size;
    if (!mapping_required)
    {
      m_rgbd_image.mappedRgbRef() = m_rgbd_image.rawRgbRef();
      m_rgbd_image.mappedDepthRef() = m_rgbd_image.rawDepthRef();
      m_current_image.mappedRgbRef() = m_current_image.rawRgbRef();
      m_current_image.mappedDepthRef() = m_current_image.rawDepthRef();
    }
    else
    {
      // Mapped RGB is allocated at the depth size, mapped depth at the RGB size.
      // NOTE(review): this branch uses both `raw_depth_size` and
      // `rawDepthSize()` -- presumably equivalent; confirm.
      m_rgbd_image.mappedRgbRef() = Mat3b(m_calib_data->raw_depth_size);
      m_rgbd_image.mappedRgbRef() = Vec3b(0,0,0);
      m_rgbd_image.mappedDepthRef() = Mat1f(m_calib_data->rawRgbSize());
      m_rgbd_image.mappedDepthRef() = 0.f;
      m_current_image.mappedRgbRef() = Mat3b(m_calib_data->rawDepthSize());
      m_current_image.mappedRgbRef() = Vec3b(0,0,0);
      m_current_image.mappedDepthRef() = Mat1f(m_calib_data->rawRgbSize());
      m_current_image.mappedDepthRef() = 0.f;
    }
  }

  m_rgbd_image.setCameraSerial(cameraSerial());
  m_current_image.setCameraSerial(cameraSerial());

  // Metadata objects are created once and refilled on every iteration.
  xn::SceneMetaData sceneMD;
  xn::DepthMetaData depthMD;
  xn::ImageMetaData rgbMD;
  xn::IRMetaData irMD;

  ImageBayerGRBG bayer_decoder(ImageBayerGRBG::EdgeAware);

  // With subsampling enabled, depth and user labels are first written to a
  // buffer m_subsampling_factor times the raw depth size, then resized down
  // into m_current_image at the end of each iteration.
  RGBDImage oversampled_image;
  if (m_subsampling_factor != 1)
  {
    oversampled_image.rawDepthRef().create(m_calib_data->rawDepthSize()*m_subsampling_factor);
    oversampled_image.userLabelsRef().create(oversampled_image.rawDepth().size());
  }

  while (!m_should_exit)
  {
    waitForNewEvent();
    // NOTE(review): `this` is formatted with "%x", which expects an unsigned
    // int rather than a pointer -- confirm this is acceptable for format().
    ntk_dbg(2) << format("[%x] running iteration", this);

    {
      // OpenNI calls do not seem to be thread safe.
      QMutexLocker ni_locker(&m_ni_mutex);
      waitAndUpdateActiveGenerators();
    }

    if (m_track_users && m_body_event_detector)
      m_body_event_detector->update();

    // Fetch metadata for depth and for either the IR or the RGB stream.
    m_ni_depth_generator.GetMetaData(depthMD);
    if (m_has_rgb)
    {
      if (m_get_infrared)
      {
        m_ni_ir_generator.GetMetaData(irMD);
      }
      else
      {
        m_ni_rgb_generator.GetMetaData(rgbMD);
      }
    }

    // Depth and user labels are written to the oversampled buffer when
    // subsampling is active, otherwise straight into m_current_image.
    RGBDImage& temp_image =
        m_subsampling_factor == 1 ? m_current_image : oversampled_image;

    const XnDepthPixel* pDepth = depthMD.Data();
    ntk_assert((depthMD.XRes() == temp_image.rawDepth().cols)
               && (depthMD.YRes() == temp_image.rawDepth().rows),
               "Invalid image size.");

    // Convert to meters.
    // (the division by 1000 presumably means the source values are in
    // millimeters -- confirm against the OpenNI documentation)
    const float depth_correction_factor = 1.0;
    float* raw_depth_ptr = temp_image.rawDepthRef().ptr<float>();
    for (int i = 0; i < depthMD.XRes()*depthMD.YRes(); ++i)
      raw_depth_ptr[i] = depth_correction_factor * pDepth[i]/1000.f;

    if (m_has_rgb)
    {
      if (m_get_infrared)
      {
        // Infrared: copy the 16-bit grayscale pixels into the float
        // intensity image, (re)created at the IR resolution. This always
        // targets m_current_image, not temp_image.
        const XnGrayscale16Pixel* pImage = irMD.Data();
        m_current_image.rawIntensityRef().create(irMD.YRes(), irMD.XRes());
        float* raw_img_ptr = m_current_image.rawIntensityRef().ptr<float>();
        for (int i = 0; i < irMD.XRes()*irMD.YRes(); ++i)
        {
          raw_img_ptr[i] = pImage[i];
        }
      }
      else
      {
        if (m_custom_bayer_decoding)
        {
          // Decode the Bayer data into the raw RGB buffer, then convert
          // that buffer in place from RGB to BGR.
          uchar* raw_rgb_ptr = m_current_image.rawRgbRef().ptr<uchar>();
          bayer_decoder.fillRGB(rgbMD,
                                m_current_image.rawRgb().cols, m_current_image.rawRgb().rows,
                                raw_rgb_ptr);
          cvtColor(m_current_image.rawRgbRef(), m_current_image.rawRgbRef(), CV_RGB2BGR);
        }
        else
        {
          // Copy the RGB24 data while reversing the channel order of each
          // pixel (source channel 2-k goes to destination channel k).
          const XnUInt8* pImage = rgbMD.Data();
          ntk_assert(rgbMD.PixelFormat() == XN_PIXEL_FORMAT_RGB24, "Invalid RGB format.");
          uchar* raw_rgb_ptr = m_current_image.rawRgbRef().ptr<uchar>();
          for (int i = 0; i < rgbMD.XRes()*rgbMD.YRes()*3; i += 3)
            for (int k = 0; k < 3; ++k)
            {
              raw_rgb_ptr[i+k] = pImage[i+(2-k)];
            }
        }
      }
    }

    if (m_track_users)
    {
      // Copy the per-pixel user labels into the label mask; each label is
      // narrowed to uchar by the assignment.
      m_ni_user_generator.GetUserPixels(0, sceneMD);
      uchar* user_mask_ptr = temp_image.userLabelsRef().ptr<uchar>();
      const XnLabel* pLabel = sceneMD.Data();
      for (int i = 0; i < sceneMD.XRes()*sceneMD.YRes(); ++i)
      {
        user_mask_ptr[i] = pLabel[i];
      }

      // At most 15 user ids are requested; num_users is passed to GetUsers
      // and bounds the loop below.
      XnUserID user_ids[15];
      XnUInt16 num_users = 15;
      m_ni_user_generator.GetUsers(user_ids, num_users);

      // FIXME: only one user supported.
      // Joints are computed for the first tracked user only.
      for (int i = 0; i < num_users; ++i)
      {
        XnUserID user_id = user_ids[i];
        if (m_ni_user_generator.GetSkeletonCap().IsTracking(user_id))
        {
          m_current_image.skeletonRef()->computeJoints(user_id, m_ni_user_generator, m_ni_depth_generator);
          break;
        }
      }
    }

    if (m_subsampling_factor != 1)
    {
      // Cannot use interpolation here, since this would
      // spread the invalid depth values.
      cv::resize(oversampled_image.rawDepth(),
                 m_current_image.rawDepthRef(),
                 m_current_image.rawDepth().size(),
                 0, 0, INTER_NEAREST);
      // we have to repeat this, since resize can change the pointer.
      // m_current_image.depthRef() = m_current_image.rawDepthRef();
      cv::resize(oversampled_image.userLabels(),
                 m_current_image.userLabelsRef(),
                 m_current_image.userLabels().size(),
                 0, 0, INTER_NEAREST);
    }

    m_current_image.setTimestamp(getCurrentTimestamp());

    {
      // Publish the frame: swap the freshly filled image with m_rgbd_image
      // while holding the write lock.
      QWriteLocker locker(&m_lock);
      m_current_image.swap(m_rgbd_image);
    }

    advertiseNewFrame();
  }
  ntk_dbg(1) << format("[%x] finishing", this);
}
//run waifu2x void run(std::vector<float> &input, std::vector<float> &weight, std::vector<float> &output, std::vector<double> &bias, int iter, const int kernelSize, int r, int c) { unsigned int ipp[7][1] = { { 1 },{ 32 },{ 32 },{ 64 },{ 64 },{ 128 },{ 128 } }; unsigned int opp[7][1] = { { 32 },{ 32 },{ 64 },{ 64 },{ 128 },{ 128 },{ 1 } }; const unsigned nInputPlanes = ipp[iter][1]; const unsigned nOutputPlanes = opp[iter][1]; cl_int status; status = clEnqueueWriteBuffer(queue, input_buf, CL_FALSE, 0, r * c * nInputPlanes * sizeof(float), input.data(), 0, NULL, NULL); checkError(status, "Failed to transfer input"); status = clEnqueueWriteBuffer(queue, weight_buf, CL_FALSE, 0, kernelSize * nInputPlanes * nOutputPlanes * sizeof(float), weight.data(), 0, NULL, NULL); checkError(status, "Failed to transfer weight"); status = clEnqueueWriteBuffer(queue, bias_buf, CL_FALSE, 0, nOutputPlanes * sizeof(double), bias.data(), 0, NULL, NULL); checkError(status, "Failed to transfer bias"); clFinish(queue); cl_event kernel_event; const double start_time = getCurrentTimestamp(); unsigned argi = 0; status = clSetKernelArg(kernel, argi++, sizeof(cl_mem), &output_buf); checkError(status, "Failed to set argument %d", argi - 1); status = clSetKernelArg(kernel, argi++, sizeof(cl_mem), &input_buf); checkError(status, "Failed to set argument %d", argi - 1); status = clSetKernelArg(kernel, argi++, sizeof(cl_mem), &weight_buf); checkError(status, "Failed to set argument %d", argi - 1); status = clSetKernelArg(kernel, argi++, sizeof(cl_mem), &bias_buf); checkError(status, "Failed to set argument %d", argi - 1); status = clSetKernelArg(kernel, argi++, sizeof(nInputPlanes), &nInputPlanes); checkError(status, "Failed to set argument %d", argi - 1); status = clSetKernelArg(kernel, argi++, sizeof(nOutputPlanes), &nOutputPlanes); checkError(status, "Failed to set argument %d", argi - 1); status = clSetKernelArg(kernel, argi++, sizeof(r), &r); checkError(status, "Failed to set argument %d", argi - 
1); status = clSetKernelArg(kernel, argi++, sizeof(c), &c); checkError(status, "Failed to set argument %d", argi - 1); const size_t* global_work_size = opp[iter]; const size_t* local_work_size = ipp[iter]; printf("Iteration %d\n", iter); status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, &kernel_event); checkError(status, "Failed to launch kernel"); status = clFinish(queue); checkError(status, "Failed to finish"); const double end_time = getCurrentTimestamp(); const double total_time = end_time - start_time; // Wall-clock time taken. printf("\nTime: %0.3f ms\n", total_time * 1e3); // Get kernel times using the OpenCL event profiling API. cl_ulong time_ns = getStartEndTime(kernel_event); printf("Kernel time: %0.3f ms\n", double(time_ns) * 1e-6); clReleaseEvent(kernel_event); status = clEnqueueReadBuffer(queue, output_buf, CL_TRUE, 0, r - 1 * c - 1 * nOutputPlanes * sizeof(float), output.data(), 0, NULL, NULL); checkError(status, "Failed to read output matrix"); }
/** * @brief Main principal * @param argc El número de argumentos del programa * @param argv Cadenas de argumentos del programa * @return Nada si es correcto o algún número negativo si es incorrecto */ int main( int argc, char** argv ) { if(argc != 2) return -1; // Medimos tiempo para el programa const double start_time = getCurrentTimestamp(); FILE *kernels; char *source_str; size_t source_size, work_items; // OpenCL runtime configuration unsigned num_devices; cl_platform_id platform_ids[3]; cl_uint ret_num_platforms; cl_device_id device_id; cl_context context = NULL; cl_command_queue command_queue; cl_program program = NULL; cl_int ret; cl_kernel kernelNUM; cl_event kernel_event, finish_event; cl_mem objPARTICULAS, objPESOS; // Abrimos el fichero que contiene el kernel fopen_s(&kernels, "numparticulasCPU.cl", "r"); if (!kernels) { fprintf(stderr, "Fallo al cargar el kernel\n"); exit(-1); } source_str = (char *) malloc(0x100000); source_size = fread(source_str, 1, 0x100000, kernels); fclose(kernels); // Obtenemos los IDs de las plataformas disponibles if( clGetPlatformIDs(3, platform_ids, &ret_num_platforms) != CL_SUCCESS) { printf("No se puede obtener id de la plataforma"); return -1; } // Intentamos obtener un dispositivo CPU soportado if( clGetDeviceIDs(platform_ids[1], CL_DEVICE_TYPE_CPU, 1, &device_id, &num_devices) != CL_SUCCESS) { printf("No se puede obtener id del dispositivo"); return -1; } clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &work_items, NULL); // Creación de un contexto OpenCL context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); // Creación de una cola de comandos command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret); // Creación de un programa kernel desde un fichero de código program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); if (ret != 
CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: ¡Fallo al construir el programa ejecutable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s", buffer); exit(-1); } // Creación del kernel OpenCL kernelNUM = clCreateKernel(program, "calc_num_particulas", &ret); // Creamos el buffer para las partículas y reservamos espacio ALINEADO para los datos size_t N = atoi(argv[1]); particle *particulas = (particle*) _aligned_malloc(N * sizeof(particle), 64); int *pesos = (int*) _aligned_malloc(N * sizeof(int), 64); objPARTICULAS = clCreateBuffer(context, CL_MEM_READ_ONLY, N * sizeof(particle), NULL, &ret); objPESOS = clCreateBuffer(context, CL_MEM_WRITE_ONLY, N * sizeof(int), NULL, &ret); float sum = 0.0f; const size_t global = 2; const size_t local_work_size = 1; // Inicializamos las partículas (Me interesan los pesos) srand(time(NULL)); for(unsigned index = 0; index < N; ++index) { particulas[index].x = 0.0; particulas[index].y = 0.0; particulas[index].s = 0.0; particulas[index].xp = 0.0; particulas[index].yp = 0.0; particulas[index].sp = 0.0; particulas[index].x0 = 0.0; particulas[index].y0 = 0.0; particulas[index].width = 0; particulas[index].height = 0; particulas[index].w = (float) (rand() % 2000); sum+=particulas[index].w; } // Normalizamos los datos for(int i = 0; i < N; ++i) particulas[i].w /= sum; // Transferimos las partículas al dispositivo y los pesos cl_event write_event; ret = clEnqueueWriteBuffer(command_queue, objPARTICULAS, CL_FALSE, 0, N * sizeof(particle), particulas, 0, NULL, &write_event); // Establecemos los argumentos del kernel ret = clSetKernelArg(kernelNUM, 0, sizeof(cl_mem), &objPARTICULAS); ret = clSetKernelArg(kernelNUM, 1, sizeof(int), &N); ret = clSetKernelArg(kernelNUM, 2, sizeof(cl_mem), &objPESOS); // Ejecutamos el kernel. 
Un work-item por cada work-group o unidad de cómputo ret = clEnqueueNDRangeKernel(command_queue, kernelNUM, 1, NULL, &global, &local_work_size, 1, &write_event, &kernel_event); // Leemos los resultados ret = clEnqueueReadBuffer(command_queue, objPESOS, CL_FALSE, 0, N * sizeof(int), pesos, 1, &kernel_event, &finish_event); // Esperamos a que termine de leer los resultados clWaitForEvents(1, &finish_event); // Obtenemos el tiempo del kernel y de las transferencias CPU-RAM cl_ulong totalKernel = getStartEndTime(kernel_event); cl_ulong totalRam = getStartEndTime(write_event) + getStartEndTime(finish_event); const double end_time = getCurrentTimestamp(); // Obtenemos el tiempo consumido por el programa, el kernel y las transferencias de memoria printf("\nTiempo total del programa: %0.3f ms\n", (end_time - start_time) * 1e3); printf("Tiempo total consumido por el kernel: %0.3f ms\n", double(totalKernel) * 1e-6); printf("Tiempo total consumido en transferencias CPU-RAM: %0.3f ms\n", double(totalRam) * 1e-6); // Liberamos todos los recursos usados (kernels y objetos OpenCL) clReleaseEvent(kernel_event); clReleaseEvent(finish_event); clReleaseEvent(write_event); clReleaseMemObject(objPARTICULAS); clReleaseMemObject(objPESOS); clReleaseKernel(kernelNUM); clReleaseCommandQueue(command_queue); clReleaseProgram(program); clReleaseContext(context); }
/** * @brief Main principal * @param argc El número de argumentos del programa * @param argv Cadenas de argumentos del programa * @return Nada si es correcto o algún número negativo si es incorrecto */ int main( int argc, char** argv ) { if(argc != 2) return -1; // Medimos tiempo para el programa const double start_time = getCurrentTimestamp(); // Declaración de variables IplImage *first_frame; // Primer frame IplImage *frame, *hsv_frame; CvCapture *video; FILE *kernels; char *source_str; size_t source_size, work_items; // OpenCL runtime configuration unsigned num_devices; cl_platform_id platform_ids[3]; cl_uint ret_num_platforms; cl_device_id device_id; cl_context context = NULL; cl_command_queue command_queue; cl_program program = NULL; cl_int ret; cl_kernel kernelHISTO; cl_event kernel_event, finish_event; cl_mem objFRAME, objHISTO; // Abrimos el fichero que contiene el kernel fopen_s(&kernels, "histoGPU.cl", "r"); if (!kernels) { fprintf(stderr, "Fallo al cargar el kernel\n"); exit(-1); } source_str = (char *) malloc(0x100000); source_size = fread(source_str, 1, 0x100000, kernels); fclose(kernels); // Obtenemos los IDs de las plataformas disponibles if( clGetPlatformIDs(3, platform_ids, &ret_num_platforms) != CL_SUCCESS) { printf("No se puede obtener id de la plataforma"); return -1; } // Intentamos obtener un dispositivo GPU soportado if( clGetDeviceIDs(platform_ids[0], CL_DEVICE_TYPE_GPU, 1, &device_id, &num_devices) != CL_SUCCESS) { printf("No se puede obtener id del dispositivo"); return -1; } clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &work_items, NULL); // Creación de un contexto OpenCL context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); // Creación de una cola de comandos command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret); // Creación de un programa kernel desde un fichero de código program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t 
*)&source_size, &ret); ret = clBuildProgram(program, 1, &device_id, "-cl-nv-verbose", NULL, NULL); if (ret != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: ¡Fallo al construir el programa ejecutable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s", buffer); exit(-1); } // Creación del kernel OpenCL kernelHISTO = clCreateKernel(program, "calc_histo", &ret); // Abrimos el fichero de video y leemos el primer frame video = cvCaptureFromFile( argv[1] ); if( !video ) { printf("No se pudo abrir el fichero de video %s\n", &argv[1]); exit(-1); } first_frame = cvQueryFrame( video ); hsv_frame = cvCreateImage( cvGetSize(first_frame), IPL_DEPTH_32F, 3 ); cvConvertScale( first_frame, hsv_frame, 1.0 / 255.0, 0 ); cvCvtColor( hsv_frame, hsv_frame, CV_BGR2HSV ); // Creamos el buffer para los frames y el histograma float *histo = (float*) _aligned_malloc(HTAM * sizeof(float), 64); objFRAME = clCreateBuffer(context, CL_MEM_READ_ONLY, hsv_frame->imageSize, NULL, &ret); objHISTO = clCreateBuffer(context, CL_MEM_READ_WRITE, HTAM * sizeof(float), NULL, &ret); memset(histo, 0.0f, HTAM * sizeof(float)); const size_t global_work_size = work_items * 1024; const size_t local_work_size = 1024; // Transferimos el frame al dispositivo cl_event write_event[2]; ret = clEnqueueWriteBuffer(command_queue, objFRAME, CL_FALSE, 0, hsv_frame->imageSize, hsv_frame->imageData, 0, NULL, &write_event[0]); ret = clEnqueueWriteBuffer(command_queue, objHISTO, CL_FALSE, 0, HTAM * sizeof(float), histo, 0, NULL, &write_event[1]); // Establecemos los argumentos del kernel ret = clSetKernelArg(kernelHISTO, 0, sizeof(cl_mem), &objHISTO); ret = clSetKernelArg(kernelHISTO, 1, sizeof(cl_mem), &objFRAME); ret = clSetKernelArg(kernelHISTO, 2, sizeof(int), &hsv_frame->widthStep); ret = clSetKernelArg(kernelHISTO, 3, sizeof(int), &hsv_frame->height); ret = clSetKernelArg(kernelHISTO, 4, sizeof(int), &hsv_frame->width); // Ejecutamos el kernel. 
128 work-items por cada work-group o unidad de cómputo ret = clEnqueueNDRangeKernel(command_queue, kernelHISTO, 1, NULL, &global_work_size, &local_work_size, 2, write_event, &kernel_event); // Leemos los resultados ret = clEnqueueReadBuffer(command_queue, objHISTO, CL_FALSE, 0, HTAM * sizeof(float), histo, 1, &kernel_event, &finish_event); // Esperamos a que termine de leer los resultados clWaitForEvents(1, &finish_event); // Obtenemos el tiempo del kernel y de las transferencias Pcie cl_ulong totalKernel = getStartEndTime(kernel_event); cl_ulong totalPcie = getStartEndTime(write_event[0]) + getStartEndTime(write_event[1]) + getStartEndTime(finish_event); cvReleaseImage( &hsv_frame ); // Recordar que frame no se puede liberar debido al cvQueryFrame while( frame = cvQueryFrame( video ) ) { hsv_frame = cvCreateImage( cvGetSize(frame), IPL_DEPTH_32F, 3 ); cvConvertScale( frame, hsv_frame, 1.0 / 255.0, 0 ); cvCvtColor( hsv_frame, hsv_frame, CV_BGR2HSV ); memset(histo, 0.0f, HTAM * sizeof(float)); ret = clEnqueueWriteBuffer(command_queue, objFRAME, CL_FALSE, 0, hsv_frame->imageSize, hsv_frame->imageData, 0, NULL, &write_event[0]); ret = clSetKernelArg(kernelHISTO, 0, sizeof(cl_mem), &objHISTO); ret = clSetKernelArg(kernelHISTO, 1, sizeof(cl_mem), &objFRAME); ret = clSetKernelArg(kernelHISTO, 2, sizeof(int), &hsv_frame->widthStep); ret = clSetKernelArg(kernelHISTO, 3, sizeof(int), &hsv_frame->height); ret = clSetKernelArg(kernelHISTO, 4, sizeof(int), &hsv_frame->width); ret = clEnqueueNDRangeKernel(command_queue, kernelHISTO, 1, NULL, &global_work_size, &local_work_size, 2, write_event, &kernel_event); ret = clEnqueueReadBuffer(command_queue, objHISTO, CL_FALSE, 0, HTAM * sizeof(float), histo, 1, &kernel_event, &finish_event); clWaitForEvents(1, &finish_event); totalKernel += getStartEndTime(kernel_event); totalPcie += (getStartEndTime(write_event[0]) + getStartEndTime(write_event[1]) + getStartEndTime(finish_event)); cvReleaseImage( &hsv_frame ); } const double end_time 
= getCurrentTimestamp(); // Obtenemos el tiempo consumido por el programa, el kernel y las transferencias de memoria printf("\nTiempo total del programa: %0.3f ms\n", (end_time - start_time) * 1e3); printf("Tiempo total consumido por el kernel: %0.3f ms\n", double(totalKernel) * 1e-6); printf("Tiempo total consumido en transferencias Pcie: %0.3f ms\n", double(totalPcie) * 1e-6); // Liberamos todos los recursos usados (kernels, frames y objetos OpenCL) clReleaseEvent(kernel_event); clReleaseEvent(finish_event); clReleaseEvent(write_event[0]); clReleaseEvent(write_event[1]); cvReleaseCapture( &video ); clReleaseMemObject(objFRAME); clReleaseMemObject(objHISTO); clReleaseKernel(kernelHISTO); clReleaseCommandQueue(command_queue); clReleaseProgram(program); clReleaseContext(context); }
// Add new variables to the end of the ordering BOOST_FOREACH(const Values::ConstKeyValuePair& key_value, newTheta) { ordering_.push_back(key_value.key); } // Augment Delta delta_.insert(newTheta.zeroVectors()); // Add the new factors to the graph, updating the variable index insertFactors(newFactors); gttoc(augment_system); // Update the Timestamps associated with the factor keys updateKeyTimestampMap(timestamps); // Get current timestamp double current_timestamp = getCurrentTimestamp(); if (debug) std::cout << "Current Timestamp: " << current_timestamp << std::endl; // Find the set of variables to be marginalized out std::set<Key> marginalizableKeys = findKeysBefore( current_timestamp - smootherLag_); if (debug) { std::cout << "Marginalizable Keys: "; BOOST_FOREACH(Key key, marginalizableKeys) { std::cout << DefaultKeyFormatter(key) << " "; } std::cout << std::endl; } // Reorder