Beispiel #1
0
static int perf_cladsyn(CSOUND *csound, CLADSYN *p){

  uint32_t offset = p->h.insdshead->ksmps_offset;
  uint32_t early  = p->h.insdshead->ksmps_no_end;
  uint32_t n, nsmps = CS_KSMPS;
  float *out_ = (float *) p->out_.auxp;
  MYFLT      *asig = p->asig;
  int count = p->count,  vsamps = p->vsamps;
  p->fp = (float *) (p->fsig->frame.auxp);

  if (UNLIKELY(offset)) memset(asig, '\0', offset*sizeof(MYFLT));
  if (UNLIKELY(early)) {
    nsmps -= early;
    memset(&asig[nsmps], '\0', early*sizeof(MYFLT));
   }

  for(n=offset; n < nsmps; n++){
    if(count == 0) {
     int err;
     float freq = *p->kfreq;
     clSetKernelArg(p->kernel1, 4, sizeof(cl_float), &freq);
     clSetKernelArg(p->kernel2, 4, sizeof(cl_float), &freq);

     clEnqueueWriteBuffer(p->commands,p->frame, CL_TRUE, 0,
                          sizeof(cl_float)*p->bins*2,
                          p->fp, 0, NULL, NULL);

     err = clEnqueueNDRangeKernel(p->commands,p->kernel1, 1, NULL, &p->threads, 
            &p->wgs1, 0, NULL, NULL);
     if(err) 
       csound->Message(csound,"Error: Failed to compute sample kernel! %s\n", 
                             cl_error_string(err));
     clFinish(p->commands);
     clEnqueueReadBuffer(p->commands,p->out, 
               CL_TRUE, 0,vsamps*sizeof(cl_float),out_, 0, NULL, NULL);
     err = clEnqueueNDRangeKernel(p->commands,p->kernel2, 1, NULL, &p->mthreads, 
          &p->wgs2, 0, NULL, NULL);
     if(err) 
       csound->Message(csound,"Error: Failed to compute update kernel!%s\n", 
                             cl_error_string(err));
     count = vsamps;
    }
    asig[n] = (MYFLT) out_[vsamps - count];
    count--;
  }
  p->count = count;
  return OK;
}
static const char *estr(CLBlastStatusCode err) {
  if (err > -1024)
    return cl_error_string((cl_int)err);
  switch (err) {
  case CLBlastNotImplemented:
    return "Unimplemented feature";
  case CLBlastInvalidMatrixA:
    return "matrix A is not a valid memory object";
  case CLBlastInvalidMatrixB:
    return "matrix B is not a valid memory object";
  case CLBlastInvalidMatrixC:
    return "matrix C is not a valid memory object";
  case CLBlastInvalidVectorX:
    return "vector X is not a valid memory object";
  case CLBlastInvalidVectorY:
    return "vector Y is not a valid memory object";
  case CLBlastInvalidDimension:
    return "An input dimension (M, N, K) is invalid";
  case CLBlastInvalidLeadDimA:
    return "leading dimension for A must not be less than the size of the first  dimension";
  case CLBlastInvalidLeadDimB:
    return "leading dimension for B must not be less than the size of the second dimension";
  case CLBlastInvalidLeadDimC:
    return "leading dimension for C must not be less than the size of the third dimension";
  case CLBlastInvalidIncrementX:
    return "increment for X must not be 0";
  case CLBlastInvalidIncrementY:
    return "increment for Y must not be 0";
  case CLBlastInsufficientMemoryA:
    return "memory object for matrix A is too small";
  case CLBlastInsufficientMemoryB:
    return "memory object for matrix B is too small";
  case CLBlastInsufficientMemoryC:
    return "memory object for matrix C is too small";
  case CLBlastInsufficientMemoryX:
    return "memory object for vector X is too small";
  case CLBlastInsufficientMemoryY:
    return "memory object for vector Y is too small";
  case CLBlastInvalidLocalMemUsage:
    return "not enough local memory on the device";
  case CLBlastNoHalfPrecision:
    return "float16 is not supported on this device";
  case CLBlastNoDoublePrecision:
    return "float64 is not supported on this device";
  case CLBlastInvalidVectorScalar:
    return "unit-sized vector is not a valid memory object";
  case CLBlastInsufficientMemoryScalar:
    return "memory object for unit-sized vector is too small";
  case CLBlastDatabaseError:
    return "device entry not in database";
  case CLBlastUnknownError:
    return "Unspecified error";
  case CLBlastUnexpectedError:
    return "Unexpected error";
  default:
    return "Unknow error";
  }
}
static const char *estr(clblasStatus err) {
  if (err > -1024)
    return cl_error_string((cl_int)err);
  switch (err) {
  case clblasNotImplemented:
    return "Unimplemented feature";
  case clblasNotInitialized:
    return "Library not initialized";
  case clblasInvalidMatA:
    return "matrix A is not a valid memory object";
  case clblasInvalidMatB:
    return "matrix B is not a valid memory object";
  case clblasInvalidMatC:
    return "matrix C is not a valid memory object";
  case clblasInvalidVecX:
    return "vector X is not a valid memory object";
  case clblasInvalidVecY:
    return "vector Y is not a valid memory object";
  case clblasInvalidDim:
    return "An input dimension (M, N, K) is invalid";
  case clblasInvalidLeadDimA:
    return "leading dimension for A must not be less than the size of the first dimension";
  case clblasInvalidLeadDimB:
    return "leading dimension for B must not be less than the size of the second dimension";
  case clblasInvalidLeadDimC:
    return "leading dimension for C must not be less than the size of the third dimension";
  case clblasInvalidIncX:
    return "increment for X must not be 0";
  case clblasInvalidIncY:
    return "increment for Y must not be 0";
  case clblasInsufficientMemMatA:
    return "memory object for matrix A is too small";
  case clblasInsufficientMemMatB:
    return "memory object for matrix B is too small";
  case clblasInsufficientMemMatC:
    return "memory object for matrix C is too small";
  case clblasInsufficientMemVecX:
    return "memory object for vector X is too small";
  case clblasInsufficientMemVecY:
    return "memory object for vector Y is too small";
  default:
    return "Unknow error";
  }
}
Beispiel #4
0
static int init_cladsyn(CSOUND *csound, CLADSYN *p){

  int asize, ipsize, fpsize, err;
  cl_device_id device_ids[32], device_id;             
  cl_context context;                
  cl_command_queue commands;          
  cl_program program;                
  cl_kernel kernel1, kernel2;                 
  cl_uint num = 0, nump =  0;
  cl_platform_id platforms[16];
    uint i;

  if(p->fsig->overlap > 1024)
     return csound->InitError(csound, "overlap is too large\n");



  err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, 32, device_ids, &num);
  if (err != CL_SUCCESS){
    clGetPlatformIDs(16, platforms, &nump);
    int devs = 0;
    for(i=0; i < nump && devs < 32; i++){
     char name[128];
     clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 128, name, NULL);
     csound->Message(csound, "available platform[%d] %s\n",i, name);
     err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 32-devs, &device_ids[devs], &num);
    if (err != CL_SUCCESS)
     csound->InitError(csound, "failed to find an OpenCL device! %s \n", cl_error_string(err));
    }
    devs += num;
  }

  
  for(i=0; i < num; i++){
  char name[128];
  cl_device_type type;
  clGetDeviceInfo(device_ids[i], CL_DEVICE_NAME, 128, name, NULL);
  clGetDeviceInfo(device_ids[i], CL_DEVICE_TYPE, sizeof(cl_device_type), &type, NULL);
  if(type & CL_DEVICE_TYPE_CPU)
  csound->Message(csound, "available CPU[device %d] %s\n",i, name);
  else  if(type & CL_DEVICE_TYPE_GPU)
  csound->Message(csound, "available GPU[device %d] %s\n",i, name);
  else  if(type & CL_DEVICE_TYPE_ACCELERATOR)
  csound->Message(csound, "available ACCELLERATOR[device %d] %s\n",i, name);
  else 
  csound->Message(csound, "available generic [device %d] %s\n",i, name);;
  }

  // SELECT THE GPU HERE
  if(*p->idev < num)
   device_id = device_ids[(int)*p->idev];
  else
   device_id = device_ids[num-1];

   context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
   if (!context)
     return csound->InitError(csound, "Failed to create a compute context! %s\n", 
                             cl_error_string(err));
  
    // Create a command commands
    //
    commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (!commands)
       return csound->InitError(csound, "Failed to create a command commands! %s\n", 
                             cl_error_string(err));
    // Create the compute program from the source buffer
    //
    program = clCreateProgramWithSource(context, 1, (const char **) &code, NULL, &err);
    if (!program)
       return csound->InitError(csound, "Failed to create compute program! %s\n", 
                             cl_error_string(err));
  
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];
        csound->Message(csound, "Failed to build program executable! %s\n", 
                             cl_error_string(err));
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        return csound->InitError(csound, "%s\n", buffer);
    }

    kernel1 = clCreateKernel(program, "sample", &err);
    if (!kernel1 || err != CL_SUCCESS)
      return csound->InitError(csound, "Failed to create sample compute kernel! %s\n", 
                             cl_error_string(err));

   kernel2 = clCreateKernel(program, "update", &err);
    if (!kernel2 || err != CL_SUCCESS)
      return csound->InitError(csound,"Failed to create update compute kernel! %s\n", 
                             cl_error_string(err));
 
  char name[128];
  clGetDeviceInfo(device_id, CL_DEVICE_NAME, 128, name, NULL);
  csound->Message(csound, "using device: %s\n",name);

  p->bins = (p->fsig->N)/2;

  if(*p->inum > 0 && *p->inum < p->bins) p->bins = *p->inum;

  p->vsamps = p->fsig->overlap;
  p->threads = p->bins*p->vsamps;
  p->mthreads = (p->bins > p->vsamps ? p->bins : p->vsamps);

  asize =  p->vsamps*sizeof(cl_float);
  ipsize = (p->bins > p->vsamps ? p->bins : p->vsamps)*sizeof(cl_long);
  fpsize = p->fsig->N*sizeof(cl_float);

  p->out = clCreateBuffer(context,0, asize, NULL, NULL);
  p->frame =   clCreateBuffer(context, CL_MEM_READ_ONLY, fpsize, NULL, NULL);
  p->ph =  clCreateBuffer(context,0, ipsize, NULL, NULL);
  p->amps =  clCreateBuffer(context,0,(p->bins > p->vsamps ? p->bins : p->vsamps)*sizeof(cl_float), NULL, NULL);
 
  // memset needed?

  asize = p->vsamps*sizeof(float);
  if(p->out_.auxp == NULL ||
      p->out_.size < (unsigned long) asize)
    csound->AuxAlloc(csound, asize , &p->out_);

  csound->RegisterDeinitCallback(csound, p, destroy_cladsyn);
  p->count = 0;
  p->context = context;
  p->program = program;
  p->commands = commands;
  p->kernel1 = kernel1;
  p->kernel2 = kernel2;
 
  clGetKernelWorkGroupInfo(p->kernel1, 
       device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(p->wgs1), &p->wgs1, NULL);
  clGetKernelWorkGroupInfo(p->kernel2, 
       device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(p->wgs1), &p->wgs2, NULL);
 
  p->sr = csound->GetSr(csound); 
  clSetKernelArg(p->kernel1, 0, sizeof(cl_mem), &p->out);
  clSetKernelArg(p->kernel1, 1, sizeof(cl_mem), &p->frame);
  clSetKernelArg(p->kernel1, 2, sizeof(cl_mem), &p->ph);
  clSetKernelArg(p->kernel1, 3, sizeof(cl_mem), &p->amps);
  clSetKernelArg(p->kernel1, 5, sizeof(cl_int), &p->bins);
  clSetKernelArg(p->kernel1, 6, sizeof(cl_int), &p->vsamps);
  clSetKernelArg(p->kernel1, 7, sizeof(cl_float), &p->sr);

  clSetKernelArg(p->kernel2, 0, sizeof(cl_mem), &p->out);
  clSetKernelArg(p->kernel2, 1, sizeof(cl_mem), &p->frame);
  clSetKernelArg(p->kernel2, 2, sizeof(cl_mem), &p->ph);
  clSetKernelArg(p->kernel2, 3, sizeof(cl_mem), &p->amps);
  clSetKernelArg(p->kernel2, 5, sizeof(cl_int), &p->bins);
  clSetKernelArg(p->kernel2, 6, sizeof(cl_int), &p->vsamps);
  clSetKernelArg(p->kernel2, 7, sizeof(cl_float),  &p->sr); 
  return OK;
}