GLuint CGLUtil::gpuMapRgb2PixelBufferObj(const cv::cuda::GpuMat& cvgmRGB_ ){
		//http://rickarkin.blogspot.co.uk/2012/03/use-pbo-to-share-buffer-between-cuda.html
		int nPyrLevel_ = getLevel( cvgmRGB_.cols );
		GLuint uTexture;
		// map OpenGL buffer object for writing from CUDA
		if (cvgmRGB_.channels() == 3) {
			uTexture = _auTexture[nPyrLevel_];
			void *pDev;
			cudaSafeCall( cudaGraphicsMapResources(1, &_apResourceRGBPxielBO[nPyrLevel_], 0)); 
			size_t nSize; 
			cudaSafeCall( cudaGraphicsResourceGetMappedPointer((void **)&pDev, &nSize , _apResourceRGBPxielBO[nPyrLevel_]));
			cv::cuda::GpuMat cvgmRGBA( cvgmRGB_.size(), CV_8UC3, pDev); 
			cvgmRGB_.copyTo(cvgmRGBA); 
			cudaSafeCall( cudaGraphicsUnmapResources(1, &_apResourceRGBPxielBO[nPyrLevel_], 0) );
			//texture mapping
			glBindTexture( GL_TEXTURE_2D, uTexture);
			glBindBuffer ( GL_PIXEL_UNPACK_BUFFER_ARB, _auRGBPixelBO[nPyrLevel_]);
			glTexImage2D( GL_TEXTURE_2D, 0, GL_RGB, cvgmRGB_.cols, cvgmRGB_.rows, 0, GL_RGB, GL_UNSIGNED_BYTE, NULL);
			errorDetectorGL();
			glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
			glBindTexture(GL_TEXTURE_2D, 0);
		}
		else if (cvgmRGB_.channels()==1) {
			uTexture = _auGrayTexture[nPyrLevel_];
			void *pDev;
			cudaSafeCall( cudaGraphicsMapResources(1, &_apResourceGrayPxielBO[nPyrLevel_], 0)); 
			size_t nSize; 
			cudaSafeCall( cudaGraphicsResourceGetMappedPointer((void **)&pDev, &nSize , _apResourceGrayPxielBO[nPyrLevel_]));
			cv::cuda::GpuMat cvgmRGBA( cvgmRGB_.size(), CV_8UC1, pDev);
			cvgmRGB_.copyTo(cvgmRGBA); 
			cudaSafeCall( cudaGraphicsUnmapResources(1, &_apResourceGrayPxielBO[nPyrLevel_], 0) );
			//texture mapping
			glBindTexture(GL_TEXTURE_2D, uTexture);
			glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, _auGrayPixelBO[nPyrLevel_]);
			glTexImage2D(GL_TEXTURE_2D, 0, GL_RED, cvgmRGB_.cols, cvgmRGB_.rows, 0, GL_RED, GL_UNSIGNED_BYTE, NULL);
			errorDetectorGL();
			glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
			glBindTexture(GL_TEXTURE_2D, 0);
		}
		return uTexture;
	}//gpuMapRgb2PixelBufferObj
	void CGLUtil::gpuMapRGBResources(const cv::cuda::GpuMat& cvgmRGBs_){
		int nPyrLevel_ = getLevel( cvgmRGBs_.cols );
		// map OpenGL buffer object for writing from CUDA
		void *pDev;
		cudaGraphicsMapResources(1, &_apResourceRGBVBO[nPyrLevel_], 0);
		size_t nSize; 
		cudaGraphicsResourceGetMappedPointer((void **)&pDev, &nSize, _apResourceRGBVBO[nPyrLevel_] );
		cv::cuda::GpuMat cvgmRGBs(cvgmRGBs_.size(),CV_8UC3,pDev);
		cvgmRGBs_.copyTo(cvgmRGBs);
		cudaGraphicsUnmapResources(1, &_apResourceRGBVBO[nPyrLevel_], 0);
		// render from the vbo
		glBindBuffer(GL_ARRAY_BUFFER, _auRGBVBO[nPyrLevel_]);
		glColorPointer(3, GL_UNSIGNED_BYTE, 0, 0);
		glEnableClientState(GL_COLOR_ARRAY);//you cant move glEnableClientState infront of cuda GraphicsMapResources, otherwise, you will have weird problem
		//glDrawArrays(GL_POINTS, 0, btl::kinect::__aKinectWxH[usPyrLevel_] );
		//glDisableClientState(GL_COLOR_ARRAY);
		//glBindBuffer( GL_ARRAY_BUFFER, 0 );
	}
	void CGLUtil::gpuMapNlResources(const cv::cuda::GpuMat& cvgmNls_){
		int nPyrLevel_ = getLevel( cvgmNls_.cols );
		// map OpenGL buffer object for writing from CUDA
		void *pDev;
		cudaGraphicsMapResources(1, &_apResourceNlVBO[nPyrLevel_], 0);
		size_t nSize; 
		cudaGraphicsResourceGetMappedPointer((void **)&pDev, &nSize, _apResourceNlVBO[nPyrLevel_] );
		cv::cuda::GpuMat cvgmNls(cvgmNls_.size(),CV_32FC3,pDev);
		cvgmNls_.copyTo(cvgmNls);
		cudaGraphicsUnmapResources(1, &_apResourceNlVBO[nPyrLevel_], 0);
		// render from the vbo
		glBindBuffer(GL_ARRAY_BUFFER, _auNlVBO[nPyrLevel_]);
		glNormalPointer(GL_FLOAT, 12, 0); //12 is the stride = the number of bytes occupied by each normal
		glEnableClientState(GL_NORMAL_ARRAY);//you cant move glEnableClientState infront of cuda GraphicsMapResources, otherwise, you will have weird problem
		//glColor3f(1.0, 0.0, 0.0);
		//glDrawArrays(GL_POINTS, 0, btl::kinect::__aKinectWxH[usPyrLevel_] );
		//glDisableClientState(GL_NORMAL_ARRAY);
		//glBindBuffer( GL_ARRAY_BUFFER, 0 );
	}
	void CGLUtil::gpuMapPtResources(const cv::cuda::GpuMat& cvgmPts_){
		int nPyrLevel_ = getLevel( cvgmPts_.cols );
		// map OpenGL buffer object for writing from CUDA
		void *pDev;
		cudaGraphicsMapResources(1, &_apResourcePtVBO[nPyrLevel_], 0);
		size_t nSize; 
		cudaGraphicsResourceGetMappedPointer((void **)&pDev, &nSize, _apResourcePtVBO[nPyrLevel_] );
		cv::cuda::GpuMat cvgmPts( cvgmPts_.size(),CV_32FC3,pDev );
		cvgmPts_.copyTo(cvgmPts); // the operation of the Buffer must be done before cudaGraphicsUnmapResources(), otherwise, the buffer will affects each other
		cudaGraphicsUnmapResources(1, &_apResourcePtVBO[nPyrLevel_], 0);
		// render from the vbo
		glBindBuffer(GL_ARRAY_BUFFER, _auPtVBO[nPyrLevel_]);
		glVertexPointer(3, GL_FLOAT, 0, 0);
		glEnableClientState(GL_VERTEX_ARRAY);//you cant move glEnableClientState in front of cuda GraphicsMapResources, otherwise, you will have weird problem
		//glColor3f(1.0, 0.0, 0.0);
		//glDrawArrays(GL_POINTS, 0, btl::kinect::__aKinectWxH[usPyrLevel_] );
		//glDisableClientState(GL_VERTEX_ARRAY);
		//glBindBuffer( GL_ARRAY_BUFFER, 0 );
	}
    std::vector<bbox_t> tracking_flow(cv::Mat dst_mat, bool check_error = true)
    {
        if (sync_PyrLKOpticalFlow_gpu.empty()) {
            std::cout << "sync_PyrLKOpticalFlow_gpu isn't initialized \n";
            return cur_bbox_vec;
        }

        int const old_gpu_id = cv::cuda::getDevice();
        if(old_gpu_id != gpu_id)
            cv::cuda::setDevice(gpu_id);

        if (dst_mat_gpu.cols == 0) {
            dst_mat_gpu = cv::cuda::GpuMat(dst_mat.size(), dst_mat.type());
            dst_grey_gpu = cv::cuda::GpuMat(dst_mat.size(), CV_8UC1);
        }

        //dst_grey_gpu.upload(dst_mat, stream);    // use BGR
        dst_mat_gpu.upload(dst_mat, stream);
        cv::cuda::cvtColor(dst_mat_gpu, dst_grey_gpu, CV_BGR2GRAY, 1, stream);

        if (src_grey_gpu.rows != dst_grey_gpu.rows || src_grey_gpu.cols != dst_grey_gpu.cols) {
            stream.waitForCompletion();
            src_grey_gpu = dst_grey_gpu.clone();
            cv::cuda::setDevice(old_gpu_id);
            return cur_bbox_vec;
        }

        ////sync_PyrLKOpticalFlow_gpu.sparse(src_grey_gpu, dst_grey_gpu, prev_pts_flow_gpu, cur_pts_flow_gpu, status_gpu, &err_gpu);    // OpenCV 2.4.x
        sync_PyrLKOpticalFlow_gpu->calc(src_grey_gpu, dst_grey_gpu, prev_pts_flow_gpu, cur_pts_flow_gpu, status_gpu, err_gpu, stream);    // OpenCV 3.x

        cv::Mat cur_pts_flow_cpu;
        cur_pts_flow_gpu.download(cur_pts_flow_cpu, stream);

        dst_grey_gpu.copyTo(src_grey_gpu, stream);

        cv::Mat err_cpu, status_cpu;
        err_gpu.download(err_cpu, stream);
        status_gpu.download(status_cpu, stream);

        stream.waitForCompletion();

        std::vector<bbox_t> result_bbox_vec;

        if (err_cpu.cols == cur_bbox_vec.size() && status_cpu.cols == cur_bbox_vec.size())
        {
            for (size_t i = 0; i < cur_bbox_vec.size(); ++i)
            {
                cv::Point2f cur_key_pt = cur_pts_flow_cpu.at<cv::Point2f>(0, i);
                cv::Point2f prev_key_pt = prev_pts_flow_cpu.at<cv::Point2f>(0, i);

                float moved_x = cur_key_pt.x - prev_key_pt.x;
                float moved_y = cur_key_pt.y - prev_key_pt.y;

                if (abs(moved_x) < 100 && abs(moved_y) < 100 && good_bbox_vec_flags[i])
                    if (err_cpu.at<float>(0, i) < flow_error && status_cpu.at<unsigned char>(0, i) != 0 &&
                        ((float)cur_bbox_vec[i].x + moved_x) > 0 && ((float)cur_bbox_vec[i].y + moved_y) > 0)
                    {
                        cur_bbox_vec[i].x += moved_x + 0.5;
                        cur_bbox_vec[i].y += moved_y + 0.5;
                        result_bbox_vec.push_back(cur_bbox_vec[i]);
                    }
                    else good_bbox_vec_flags[i] = false;
                else good_bbox_vec_flags[i] = false;

                //if(!check_error && !good_bbox_vec_flags[i]) result_bbox_vec.push_back(cur_bbox_vec[i]);
            }
        }

        cur_pts_flow_gpu.swap(prev_pts_flow_gpu);
        cur_pts_flow_cpu.copyTo(prev_pts_flow_cpu);

        if (old_gpu_id != gpu_id)
            cv::cuda::setDevice(old_gpu_id);

        return result_bbox_vec;
    }