Esempio n. 1
0
// Launches the "FillVector" kernel from the given program on a one-dimensional range.
//
// The kernel receives, in order: n, inc and offset (each narrowed to int), the device handle of
// 'dest', and the constant value converted through GetRealArg. Presumably the kernel writes
// 'constant_value' to n elements of 'dest' starting at 'offset' with stride 'inc' (confirm against
// the kernel source). The unnamed Databases argument is not used.
void FillVector(Queue &queue, const Device &device,
                const Program &program, const Databases &,
                EventPointer event, const std::vector<Event> &waitForEvents,
                const size_t n, const size_t inc, const size_t offset,
                const Buffer<T> &dest,
                const T constant_value) {
  // Fixed work-group size for this kernel
  const size_t work_group_size = 64;

  // Binds the kernel arguments in their positional order
  auto fill_kernel = Kernel(program, "FillVector");
  size_t arg_index = 0;
  fill_kernel.SetArgument(arg_index++, static_cast<int>(n));
  fill_kernel.SetArgument(arg_index++, static_cast<int>(inc));
  fill_kernel.SetArgument(arg_index++, static_cast<int>(offset));
  fill_kernel.SetArgument(arg_index++, dest());
  fill_kernel.SetArgument(arg_index++, GetRealArg(constant_value));

  // Global size is derived from n through Ceil (presumably rounding up to a whole number of
  // work-groups)
  const auto global_size = std::vector<size_t>{Ceil(n, work_group_size)};
  const auto local_size = std::vector<size_t>{work_group_size};
  RunKernel(fill_kernel, queue, device, global_size, local_size, event, waitForEvents);
}
Esempio n. 2
0
// Configures and launches the "im2col" kernel.
//
// Parameters:
//   channels, height, width   - input dimensions; each must be non-zero
//   kernel_h, kernel_w        - filter extents
//   pad_h, pad_w              - padding added on both sides of each spatial dimension
//   stride_h, stride_w        - filter strides; each must be non-zero
//   dilation_h, dilation_w    - filter dilation factors
//   im_buffer, im_offset      - device buffer and offset passed to the kernel as the image
//   col_buffer, col_offset    - device buffer and offset passed to the kernel as the column output
//
// Throws BLASError(StatusCode::kInvalidDimension) when channels, height, width or one of the
// strides is zero.
void Xim2col<T>::DoIm2col(const size_t channels, const size_t height, const size_t width,
                          const size_t kernel_h, const size_t kernel_w, const size_t pad_h,
                          const size_t pad_w, const size_t stride_h, const size_t stride_w,
                          const size_t dilation_h, const size_t dilation_w,
                          const Buffer<T> &im_buffer, const size_t im_offset,
                          const Buffer<T> &col_buffer, const size_t col_offset) {

  // Makes sure all dimensions are larger than zero
  if ((channels == 0) || (height == 0) || (width == 0)) { throw BLASError(StatusCode::kInvalidDimension); }

  // The strides are used as divisors below: a zero stride would be an integer division by zero
  if ((stride_h == 0) || (stride_w == 0)) { throw BLASError(StatusCode::kInvalidDimension); }

  // Sets the output height and width. When the dilated filter does not fit in the padded input the
  // output extent falls back to 1.
  const auto size_h = height + 2 * pad_h;
  const auto padding_h = dilation_h * (kernel_h - 1) + 1;
  const auto output_h = (size_h >= padding_h) ? (size_h - padding_h) / stride_h + 1 : 1;
  const auto size_w = width + 2 * pad_w;
  const auto padding_w = dilation_w * (kernel_w - 1) + 1;
  const auto output_w = (size_w >= padding_w) ? (size_w - padding_w) / stride_w + 1 : 1;

  // Retrieves the im2col kernel from the compiled binary
  auto kernel = Kernel(program_, "im2col");

  // Sets the kernel arguments (all sizes are narrowed to int for the device side)
  kernel.SetArgument(0, static_cast<int>(height));
  kernel.SetArgument(1, static_cast<int>(width));
  kernel.SetArgument(2, static_cast<int>(channels));
  kernel.SetArgument(3, static_cast<int>(output_h));
  kernel.SetArgument(4, static_cast<int>(output_w));
  kernel.SetArgument(5, static_cast<int>(kernel_h));
  kernel.SetArgument(6, static_cast<int>(kernel_w));
  kernel.SetArgument(7, static_cast<int>(pad_h));
  kernel.SetArgument(8, static_cast<int>(pad_w));
  kernel.SetArgument(9, static_cast<int>(stride_h));
  kernel.SetArgument(10, static_cast<int>(stride_w));
  kernel.SetArgument(11, static_cast<int>(dilation_h));
  kernel.SetArgument(12, static_cast<int>(dilation_w));
  kernel.SetArgument(13, im_buffer());
  kernel.SetArgument(14, static_cast<int>(im_offset));
  kernel.SetArgument(15, col_buffer());
  kernel.SetArgument(16, static_cast<int>(col_offset));

  // Launches the kernel: the first dimension covers the output width, the second covers the output
  // height once per channel. Work-group sizes come from the COPY_DIMX/COPY_DIMY database entries.
  const auto w_ceiled = Ceil(output_w, db_["COPY_DIMX"]);
  const auto h_ceiled = Ceil(output_h, db_["COPY_DIMY"]);
  const auto global = std::vector<size_t>{w_ceiled, h_ceiled * channels};
  const auto local = std::vector<size_t>{db_["COPY_DIMX"], db_["COPY_DIMY"]};
  RunKernel(kernel, queue_, device_, global, local, event_);
}
Esempio n. 3
0
	// Per-frame render callback.
	//
	// Updates the time step m_dt from the supplied timestamp, draws all bodies as GL points from
	// the vertex buffer m_VB, swaps buffers, runs the kernel unless paused, and prints the frame
	// counter once more than 1000 time units have passed since the previous report.
	virtual void OnRender(ulong time)
	{
		// Refresh the time step only when the timestamp has moved since the last update
		const auto elapsed = time - m_last_time_dt;
		if (elapsed > 0)
		{
			m_dt = (elapsed / static_cast<float>(m_time_shift)) / 1000.0f;
			m_last_time_dt = time;
		}
		// The time shift is consumed (or discarded) every frame
		m_time_shift = 1;

		// Draw the bodies: attribute 0 is two floats at offset 0, attribute 1 is one float at
		// byte offset 8, both with a stride of sizeof(Body)
		glClear(GL_COLOR_BUFFER_BIT);
		glEnableVertexAttribArray(0);
		glEnableVertexAttribArray(1);

		glBindBuffer(GL_ARRAY_BUFFER, m_VB);
		glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, sizeof(Body), 0);
		glVertexAttribPointer(1, 1, GL_FLOAT, GL_FALSE, sizeof(Body), reinterpret_cast<const GLvoid*>(8));
		glDrawArrays(GL_POINTS, 0, m_bodies.size());

		glDisableVertexAttribArray(0);
		glDisableVertexAttribArray(1);
		m_backend->SwapBuffers();

		// Run the kernel unless paused (presumably this advances the simulation; confirm)
		if (!m_pause)
		{
			RunKernel();
		}

		// Frame statistics: keep counting until the reporting interval has elapsed
		if (!(time - m_last_time > 1000))
		{
			++m_frame_count;
			return;
		}

		printf("FPS: %d\n", m_frame_count);
		m_frame_count = 0;
		m_last_time = time;
	}
Esempio n. 4
0
// Selects, configures and launches one of the copy/transpose kernels for a single matrix.
//
// Kernel selection:
//   - "TransposeMatrixFast" / "CopyMatrixFast" when source and destination have identical shape
//     and leading dimension, both offsets are zero, no conjugation or triangle handling is
//     requested, and the sizes are multiples of the relevant database tuning parameters.
//   - Otherwise "TransposePadMatrix" / "CopyPadMatrix" when do_pad is set, and
//     "TransposeMatrix" / "CopyMatrix" when it is not.
//
// The fast kernels take (src_ld, src, dest, alpha). The general kernels take the full set of
// sizes, leading dimensions and offsets, followed by do_conjugate (pad variants) or by
// upper/lower/diagonal_imag_zero (non-pad variants).
void PadCopyTransposeMatrix(Queue &queue, const Device &device,
                            const Databases &db,
                            EventPointer event, const std::vector<Event> &waitForEvents,
                            const size_t src_one, const size_t src_two,
                            const size_t src_ld, const size_t src_offset,
                            const Buffer<T> &src,
                            const size_t dest_one, const size_t dest_two,
                            const size_t dest_ld, const size_t dest_offset,
                            const Buffer<T> &dest,
                            const T alpha,
                            const Program &program, const bool do_pad,
                            const bool do_transpose, const bool do_conjugate,
                            const bool upper = false, const bool lower = false,
                            const bool diagonal_imag_zero = false) {

  // The request itself must be a plain same-shape copy before a fast kernel can be considered
  const bool plain_request = (src_offset == 0) && (dest_offset == 0) && !do_conjugate &&
                             (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld) &&
                             !upper && !lower && !diagonal_imag_zero;

  // On top of that, the sizes have to be multiples of the tuning parameters of the fast kernel
  bool fast_path = false;
  if (do_transpose) {
    fast_path = plain_request &&
                IsMultiple(src_ld, db["TRA_WPT"]) &&
                IsMultiple(src_one, db["TRA_WPT"]*db["TRA_DIM"]) &&
                IsMultiple(src_two, db["TRA_WPT"]*db["TRA_DIM"]);
  }
  else {
    fast_path = plain_request &&
                IsMultiple(src_ld, db["COPY_VW"]) &&
                IsMultiple(src_one, db["COPY_VW"]*db["COPY_DIMX"]) &&
                IsMultiple(src_two, db["COPY_WPT"]*db["COPY_DIMY"]);
  }

  // Picks the kernel name matching the chosen path
  auto selected_name = std::string{};
  if (fast_path) {
    selected_name = (do_transpose) ? "TransposeMatrixFast" : "CopyMatrixFast";
  }
  else if (do_transpose) {
    selected_name = (do_pad) ? "TransposePadMatrix" : "TransposeMatrix";
  }
  else {
    selected_name = (do_pad) ? "CopyPadMatrix" : "CopyMatrix";
  }

  // Retrieves the kernel from the compiled binary
  auto kernel = Kernel(program, selected_name);

  // Sets the kernel arguments in positional order
  size_t arg_index = 0;
  if (fast_path) {
    kernel.SetArgument(arg_index++, static_cast<int>(src_ld));
    kernel.SetArgument(arg_index++, src());
    kernel.SetArgument(arg_index++, dest());
    kernel.SetArgument(arg_index++, GetRealArg(alpha));
  }
  else {
    kernel.SetArgument(arg_index++, static_cast<int>(src_one));
    kernel.SetArgument(arg_index++, static_cast<int>(src_two));
    kernel.SetArgument(arg_index++, static_cast<int>(src_ld));
    kernel.SetArgument(arg_index++, static_cast<int>(src_offset));
    kernel.SetArgument(arg_index++, src());
    kernel.SetArgument(arg_index++, static_cast<int>(dest_one));
    kernel.SetArgument(arg_index++, static_cast<int>(dest_two));
    kernel.SetArgument(arg_index++, static_cast<int>(dest_ld));
    kernel.SetArgument(arg_index++, static_cast<int>(dest_offset));
    kernel.SetArgument(arg_index++, dest());
    kernel.SetArgument(arg_index++, GetRealArg(alpha));
    if (do_pad) {
      kernel.SetArgument(arg_index++, static_cast<int>(do_conjugate));
    }
    else {
      kernel.SetArgument(arg_index++, static_cast<int>(upper));
      kernel.SetArgument(arg_index++, static_cast<int>(lower));
      kernel.SetArgument(arg_index++, static_cast<int>(diagonal_imag_zero));
    }
  }

  // Derives the global and local thread sizes from the database parameters of the chosen kernel
  auto global = std::vector<size_t>{};
  auto local = std::vector<size_t>{};
  if (do_transpose && fast_path) {
    global = std::vector<size_t>{dest_one / db["TRA_WPT"],
                                 dest_two / db["TRA_WPT"]};
    local = std::vector<size_t>{db["TRA_DIM"], db["TRA_DIM"]};
  }
  else if (do_transpose) {
    global = std::vector<size_t>{Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]),
                                 Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"])};
    local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"]};
  }
  else if (fast_path) {
    global = std::vector<size_t>{dest_one / db["COPY_VW"],
                                 dest_two / db["COPY_WPT"]};
    local = std::vector<size_t>{db["COPY_DIMX"], db["COPY_DIMY"]};
  }
  else {
    global = std::vector<size_t>{Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]),
                                 Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"])};
    local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"]};
  }

  // Launches the kernel
  RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
Esempio n. 5
0
// Batched counterpart of the pad/copy/transpose launcher: one kernel launch processes
// 'batch_count' matrices, with per-matrix source and destination offsets supplied as device
// buffers of int.
//
// The kernel is "TransposePadMatrixBatched" / "TransposeMatrixBatched" when do_transpose is set and
// "CopyPadMatrixBatched" / "CopyMatrixBatched" otherwise; the "Pad" variants are chosen by do_pad
// and are the only ones receiving the do_conjugate flag. The third launch dimension is the batch.
void PadCopyTransposeMatrixBatched(Queue &queue, const Device &device,
                                   const Databases &db,
                                   EventPointer event, const std::vector<Event> &waitForEvents,
                                   const size_t src_one, const size_t src_two,
                                   const size_t src_ld, const Buffer<int> &src_offsets,
                                   const Buffer<T> &src,
                                   const size_t dest_one, const size_t dest_two,
                                   const size_t dest_ld, const Buffer<int> &dest_offsets,
                                   const Buffer<T> &dest,
                                   const Program &program, const bool do_pad,
                                   const bool do_transpose, const bool do_conjugate,
                                   const size_t batch_count) {

  // Picks the kernel name from the transpose and pad flags
  auto selected_name = std::string{};
  if (do_transpose && do_pad)  { selected_name = "TransposePadMatrixBatched"; }
  else if (do_transpose)       { selected_name = "TransposeMatrixBatched"; }
  else if (do_pad)             { selected_name = "CopyPadMatrixBatched"; }
  else                         { selected_name = "CopyMatrixBatched"; }

  // Retrieves the kernel from the compiled binary
  auto kernel = Kernel(program, selected_name);

  // Sets the kernel arguments in positional order
  size_t arg_index = 0;
  kernel.SetArgument(arg_index++, static_cast<int>(src_one));
  kernel.SetArgument(arg_index++, static_cast<int>(src_two));
  kernel.SetArgument(arg_index++, static_cast<int>(src_ld));
  kernel.SetArgument(arg_index++, src_offsets());
  kernel.SetArgument(arg_index++, src());
  kernel.SetArgument(arg_index++, static_cast<int>(dest_one));
  kernel.SetArgument(arg_index++, static_cast<int>(dest_two));
  kernel.SetArgument(arg_index++, static_cast<int>(dest_ld));
  kernel.SetArgument(arg_index++, dest_offsets());
  kernel.SetArgument(arg_index++, dest());
  if (do_pad) {
    kernel.SetArgument(arg_index++, static_cast<int>(do_conjugate));
  }

  // Derives the thread sizes from the database parameters of the chosen kernel family
  auto global = std::vector<size_t>{};
  auto local = std::vector<size_t>{};
  if (do_transpose) {
    global = std::vector<size_t>{Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]),
                                 Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"]),
                                 batch_count};
    local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"], 1};
  }
  else {
    global = std::vector<size_t>{Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]),
                                 Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"]),
                                 batch_count};
    local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"], 1};
  }

  // Launches the kernel
  RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
Esempio n. 6
0
// Validates the inputs and launches the "Xher2" kernel.
//
// Returns:
//   - StatusCode::kInvalidDimension when n is zero
//   - the status reported by the matrix/vector validity tests or by RunKernel when it is an error
//   - StatusCode::kInvalidKernel when anything throws while fetching the program, creating the
//     kernel, setting its arguments or launching it
//   - StatusCode::kSuccess otherwise
//
// 'packed' selects the packed-matrix validity test (TestMatrixAP) instead of TestMatrixA.
StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
                            const size_t n,
                            const T alpha,
                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
                            const bool packed) {

  // An empty problem is rejected up front
  if (n == 0) { return StatusCode::kInvalidDimension; }

  // Row-major storage swaps which triangle is treated as the upper one
  const auto is_rowmajor = (layout == Layout::kRowMajor);
  const auto is_upper = is_rowmajor ? (triangle == Triangle::kLower)
                                    : (triangle == Triangle::kUpper);

  // Validates the matrix first, then both vectors, stopping at the first error
  StatusCode status = packed ? TestMatrixAP(n, a_buffer, a_offset)
                             : TestMatrixA(n, n, a_buffer, a_offset, a_ld);
  if (ErrorIn(status)) { return status; }
  status = TestVectorX(n, x_buffer, x_offset, x_inc);
  if (ErrorIn(status)) { return status; }
  status = TestVectorY(n, y_buffer, y_offset, y_inc);
  if (ErrorIn(status)) { return status; }

  // The scalar is passed through a one-element device buffer (needed for half-precision)
  auto alpha_buffer = Buffer<T>(context_, 1);
  alpha_buffer.Write(queue_, 1, &alpha);

  try {
    // Retrieves the kernel from the cached program
    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
    auto kernel = Kernel(program, "Xher2");

    // Sets the kernel arguments in positional order
    size_t arg_index = 0;
    kernel.SetArgument(arg_index++, static_cast<int>(n));
    kernel.SetArgument(arg_index++, alpha_buffer());
    kernel.SetArgument(arg_index++, x_buffer());
    kernel.SetArgument(arg_index++, static_cast<int>(x_offset));
    kernel.SetArgument(arg_index++, static_cast<int>(x_inc));
    kernel.SetArgument(arg_index++, y_buffer());
    kernel.SetArgument(arg_index++, static_cast<int>(y_offset));
    kernel.SetArgument(arg_index++, static_cast<int>(y_inc));
    kernel.SetArgument(arg_index++, a_buffer());
    kernel.SetArgument(arg_index++, static_cast<int>(a_offset));
    kernel.SetArgument(arg_index++, static_cast<int>(a_ld));
    kernel.SetArgument(arg_index++, static_cast<int>(is_upper));
    kernel.SetArgument(arg_index++, static_cast<int>(is_rowmajor));

    // Two-dimensional launch: both dimensions cover n / WPT items, each padded to its own
    // work-group size
    const auto items_per_dim = CeilDiv(n, db_["WPT"]);
    const auto global = std::vector<size_t>{Ceil(items_per_dim, db_["WGS1"]),
                                            Ceil(items_per_dim, db_["WGS2"])};
    const auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
    status = RunKernel(kernel, queue_, device_, global, local, event_);
    if (ErrorIn(status)) { return status; }

    // Successfully finished the computation
    return StatusCode::kSuccess;
  } catch (...) { return StatusCode::kInvalidKernel; }
}
Esempio n. 7
0
// The process code that the host sees.
//
// Renders the requested window at the requested time by running the plugin's OpenCL kernel with
// the current r/g/b gain parameters.
//
// Two paths exist:
//   - Host OpenCL rendering enabled: the host's command queue is used and the image data pointers
//     are passed to the kernel as cl_mem objects.
//   - Otherwise: a plugin-side context and command queue are used, the source is copied into a
//     temporary device buffer, the kernel is run, and the result is copied back to the output.
//
// Returns kOfxStatOK on success (including an empty render window) and kOfxStatFailed when the
// source or output image could not be fetched.
static OfxStatus render( OfxImageEffectHandle  instance,
                         OfxPropertySetHandle inArgs,
                         OfxPropertySetHandle outArgs)
{
  // get the render window and the time from the inArgs
  OfxTime time;
  OfxRectI renderWindow;
  OfxStatus status = kOfxStatOK;

  gPropHost->propGetDouble(inArgs, kOfxPropTime, 0, &time);
  gPropHost->propGetIntN(inArgs, kOfxImageEffectPropRenderWindow, 4, &renderWindow.x1);

  // Retrieve instance data associated with this effect
  MyInstanceData *myData = getMyInstanceData(instance);

  // property handles and members of each image
  OfxPropertySetHandle sourceImg = NULL, outputImg = NULL;
  int srcRowBytes, srcBitDepth, dstRowBytes, dstBitDepth;
  bool srcIsAlpha, dstIsAlpha;
  OfxRectI dstRect, srcRect;
  void *src = NULL, *dst = NULL;

  DPRINT(("Render: window = [%d, %d - %d, %d]\n",
	  renderWindow.x1, renderWindow.y1,
	  renderWindow.x2, renderWindow.y2));

  int isOpenCLEnabled = 0;
  if (gHostSupportsOpenCL)
  {
      gPropHost->propGetInt(inArgs, kOfxImageEffectPropOpenCLEnabled, 0, &isOpenCLEnabled);
      DPRINT(("render: OpenCL rendering %s\n", isOpenCLEnabled ? "enabled" : "DISABLED"));
  }

  cl_context clContext = NULL;
  cl_command_queue cmdQ = NULL;
  cl_device_id deviceId = NULL;
  if (isOpenCLEnabled)
  {
      // The queue belongs to the host: it is only borrowed here and must not be released
      void* voidPtrCmdQ;
      gPropHost->propGetPointer(inArgs, kOfxImageEffectPropOpenCLCommandQueue, 0, &voidPtrCmdQ);
      cmdQ = reinterpret_cast<cl_command_queue>(voidPtrCmdQ);

      clGetCommandQueueInfo(cmdQ, CL_QUEUE_CONTEXT, sizeof(cl_context), &clContext, NULL);
      clGetCommandQueueInfo(cmdQ, CL_QUEUE_DEVICE, sizeof(cl_device_id), &deviceId, NULL);
  }
  else
  {
      // The queue is created here and therefore released before returning.
      // NOTE(review): GetContext presumably returns a shared/cached context and fills in
      // deviceId; the context is left untouched - confirm its ownership.
      clContext = GetContext(deviceId);
      cmdQ = clCreateCommandQueue(clContext, deviceId, 0, NULL);
  }

  char deviceName[128] = "";
  clGetDeviceInfo(deviceId, CL_DEVICE_NAME, 128, deviceName, NULL);
  DPRINT(("Using %s for plugin\n", deviceName));

  cl_kernel kernel = GetKernel(clContext);

  // get the source image
  sourceImg = ofxuGetImage(myData->sourceClip, time, srcRowBytes, srcBitDepth, srcIsAlpha, srcRect, src);

  // get the output image
  outputImg = ofxuGetImage(myData->outputClip, time, dstRowBytes, dstBitDepth, dstIsAlpha, dstRect, dst);

  // get the scale parameter
  double rGain = 1, gGain = 1, bGain = 1;
  gParamHost->paramGetValueAtTime(myData->rGainParam, time, &rGain);
  gParamHost->paramGetValueAtTime(myData->gGainParam, time, &gGain);
  gParamHost->paramGetValueAtTime(myData->bGainParam, time, &bGain);
  DPRINT(("Gain(%f %f %f)\n", rGain, gGain, bGain));

  // Window extents are kept as integers so the buffer size below is computed exactly
  const int windowWidth = renderWindow.x2 - renderWindow.x1;
  const int windowHeight = renderWindow.y2 - renderWindow.y1;
  float w = windowWidth;
  float h = windowHeight;

  if (!sourceImg || !outputImg)
  {
      // Without both images there is no valid data pointer to process
      DPRINT(("Render: failed to fetch the source or output image\n"));
      status = kOfxStatFailed;
  }
  else if (windowWidth <= 0 || windowHeight <= 0)
  {
      // Empty render window: nothing to do
      DPRINT(("Render: empty render window\n"));
  }
  else if (isOpenCLEnabled)
  {
      DPRINT(("Using OpenCL transfers (same device)\n"));

      RunKernel(cmdQ, deviceId, kernel, w, h, rGain, gGain, bGain, (cl_mem)src, (cl_mem)dst);
  }
  else
  {
      DPRINT(("Using CPU transfers\n"));

      // Four float channels per pixel, computed in integer arithmetic to avoid float rounding.
      // NOTE(review): this assumes src/dst are tightly packed and cover exactly the render
      // window; srcRowBytes/dstRowBytes and the image rects are not consulted.
      const size_t bufferSize = static_cast<size_t>(windowWidth) *
                                static_cast<size_t>(windowHeight) * 4 * sizeof(float);

      // Allocate the temporary buffers on the plugin device
      cl_mem inBuffer = clCreateBuffer(clContext, CL_MEM_READ_ONLY, bufferSize, NULL, NULL);
      cl_mem outBuffer = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, bufferSize, NULL, NULL);

      // Copy the buffer from the CPU to the plugin device
      clEnqueueWriteBuffer(cmdQ, inBuffer, CL_TRUE, 0, bufferSize, src, 0, NULL, NULL);

      RunKernel(cmdQ, deviceId, kernel, w, h, rGain, gGain, bGain, inBuffer, outBuffer);

      // Copy the buffer from the plugin device to the CPU
      clEnqueueReadBuffer(cmdQ, outBuffer, CL_TRUE, 0, bufferSize, dst, 0, NULL, NULL);

      clFinish(cmdQ);

      // Free the temporary buffers on the plugin device
      clReleaseMemObject(inBuffer);
      clReleaseMemObject(outBuffer);
  }

  // Release the queue created above; a host-provided queue is left alone
  if (!isOpenCLEnabled && cmdQ)
  {
      clReleaseCommandQueue(cmdQ);
  }

  if (sourceImg)
  {
      gEffectHost->clipReleaseImage(sourceImg);
  }

  if (outputImg)
  {
      gEffectHost->clipReleaseImage(outputImg);
  }

  return status;
}
Esempio n. 8
0
// Launches a caller-selected GEMM kernel with a caller-defined argument order.
//
// Parameters:
//   global, local   - thread sizes passed straight to RunKernel
//   kernelName      - name of the kernel to retrieve from the compiled program
//   argumentOrder   - comma-separated list of argument names; the position of each name in the
//                     list is the kernel argument index. Recognised names (after trimming) are
//                     "m", "n", "k" (sizes, narrowed to int) and "a", "b", "c" (device buffers).
//                     Any other name leaves that argument index unset.
//   m, n, k         - problem dimensions; each must be non-zero
//   a_buffer, b_buffer, c_buffer - device buffers bound for "a", "b" and "c"
//
// The remaining parameters (layout, transpose options, alpha, beta, offsets and leading
// dimensions) are accepted but not forwarded to the kernel by this function.
//
// Throws BLASError(StatusCode::kInvalidDimension) when m, n or k is zero.
void Xgemm<T>::DoGemm(const std::vector<size_t>& global,
                      const std::vector<size_t>& local,
                      const std::string& kernelName,
                      std::string argumentOrder,
                      const Layout layout,
                      const Transpose a_transpose, const Transpose b_transpose,
                      const size_t m, const size_t n, const size_t k,
                      const T alpha,
                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
                      const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
                      const T beta,
                      const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {

  // Makes sure all dimensions are larger than zero
  if ((m == 0) || (n == 0) || (k == 0)) { throw BLASError(StatusCode::kInvalidDimension); }

  // Retrieves the requested kernel from the compiled binary
  auto kernel = Kernel(*program_, kernelName);

  // Binds one named argument at the current index. The index advances for every token, including
  // unrecognised ones, so positions in 'argumentOrder' map one-to-one onto kernel argument slots.
  size_t arg_index = 0;
  auto setArg = [&](std::string arg) {
    arg = trim(arg);
    if (arg == "m") { kernel.SetArgument(arg_index, static_cast<int>(m)); }
    else if (arg == "n") { kernel.SetArgument(arg_index, static_cast<int>(n)); }
    else if (arg == "k") { kernel.SetArgument(arg_index, static_cast<int>(k)); }
    else if (arg == "a") { kernel.SetArgument(arg_index, a_buffer()); }
    else if (arg == "b") { kernel.SetArgument(arg_index, b_buffer()); }
    else if (arg == "c") { kernel.SetArgument(arg_index, c_buffer()); }
    ++arg_index;
  };

  // Splits the argument list on commas without modifying it; the text after the last comma (or the
  // whole string when there is no comma) is always processed as the final token
  const std::string delimiter(",");
  size_t token_begin = 0;
  for (;;) {
    const auto token_end = argumentOrder.find(delimiter, token_begin);
    if (token_end == std::string::npos) { break; }
    setArg(argumentOrder.substr(token_begin, token_end - token_begin));
    token_begin = token_end + delimiter.length();
  }
  setArg(argumentOrder.substr(token_begin)); // set last arg

  // Launches the kernel
  RunKernel(kernel, queue_, device_, global, local, event_);
}